all repos — videocr @ 63873af476c8a732abae625f9464931cf23d2e59

Extract hardcoded subtitles from videos using machine learning

videocr/video.py (view raw)

 1from __future__ import annotations
 2from concurrent import futures
 3import pytesseract
 4import cv2
 5import timeit
 6
 7from .models import PredictedFrame
 8
 9
class Video:
    """A video file whose frames are run through Tesseract OCR.

    Attributes are filled in by ``__init__``; see the comments on each field.
    """

    # Path handed to ``cv2.VideoCapture``.
    path: str
    # Language code forwarded to pytesseract as ``lang=``.
    lang: str
    # Frame count as reported by OpenCV's ``CAP_PROP_FRAME_COUNT``.
    num_frames: int

    def __init__(self, path, lang):
        """Record the path/language and query the frame count.

        Args:
            path: location of the video file.
            lang: Tesseract language code used for every frame.

        Raises:
            IOError: if OpenCV cannot open ``path``.
        """
        self.path = path
        self.lang = lang
        v = cv2.VideoCapture(path)
        try:
            # Fail loudly here instead of silently recording 0 frames.
            if not v.isOpened():
                raise IOError(f'cannot open video: {path}')
            self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # Release the capture handle even if the query above raises.
            v.release()

    def _frame_ocr(self, img):
        """Return the raw ``pytesseract.image_to_data`` output for one frame."""
        return pytesseract.image_to_data(img, lang=self.lang)

    @staticmethod
    def _read_frames(capture, limit):
        """Yield up to ``limit`` frames, stopping at the first failed read."""
        for _ in range(limit):
            ok, frame = capture.read()
            if not ok:
                # End of stream (or decode error): do not hand None to OCR.
                break
            yield frame

    def run_ocr(self, max_frames=40):
        """OCR the leading frames of the video and print each frame's text.

        Frames are distributed over a process pool; results are consumed in
        frame order, wrapped in ``PredictedFrame`` and their ``text`` printed.

        Args:
            max_frames: upper bound on the number of frames to process.
                Fewer are processed when the video ends earlier.
        """
        v = cv2.VideoCapture(self.path)
        try:
            print(self.num_frames)
            frames = self._read_frames(v, max_frames)

            with futures.ProcessPoolExecutor() as pool:
                frames_ocr = pool.map(self._frame_ocr, frames, chunksize=1)
                for i, data in enumerate(frames_ocr):
                    pred = PredictedFrame(i, data)
                    print(pred.text)
        finally:
            # Always release the capture, even when OCR or a worker fails.
            v.release()
38
39
# Ad-hoc benchmark: time a full OCR run over the sample video.
# The guard is required because run_ocr uses a process pool: on platforms
# that spawn workers, each worker re-imports this module, and unguarded
# top-level code would start the benchmark again inside every worker.
if __name__ == '__main__':
    time_start = timeit.default_timer()
    v = Video('1.mp4', 'HanS')
    v.run_ocr()
    time_stop = timeit.default_timer()
    print(time_stop - time_start)