all repos — videocr @ efd72236249e706cbf68760944f29e61e40a6e48

Extract hardcoded subtitles from videos using machine learning

make sim_threshold adjustable through api
Yi Ge me@yige.ch
Mon, 29 Apr 2019 03:50:06 +0200
commit

efd72236249e706cbf68760944f29e61e40a6e48

parent

77362dce1a38c7cdc3674b2b42883203a277531b

3 files changed, 20 insertions(+), 16 deletions(-)

jump to
M videocr/api.py

@@ -5,8 +5,9 @@ from . import constants

from .video import Video -def get_subtitles(video_path: str, lang='eng', time_start='0:00', time_end='', - conf_threshold=65, use_fullframe=False) -> str: +def get_subtitles( + video_path: str, lang='eng', time_start='0:00', time_end='', + conf_threshold=65, sim_threshold=90, use_fullframe=False) -> str: # download tesseract data file to ~/tessdata if necessary fpath = constants.TESSDATA_DIR / '{}.traineddata'.format(lang) if not fpath.is_file():

@@ -20,14 +21,14 @@ shutil.copyfileobj(res, f)

v = Video(video_path) v.run_ocr(lang, time_start, time_end, conf_threshold, use_fullframe) - return v.get_subtitles() + return v.get_subtitles(sim_threshold) def save_subtitles_to_file( video_path: str, file_path='subtitle.srt', lang='eng', - time_start='0:00', time_end='', conf_threshold=65, + time_start='0:00', time_end='', conf_threshold=65, sim_threshold=90, use_fullframe=False) -> None: with open(file_path, 'w+') as f: f.write(get_subtitles( video_path, lang, time_start, time_end, conf_threshold, - use_fullframe)) + sim_threshold, use_fullframe))
M videocr/models.py

@@ -54,10 +54,12 @@

class PredictedSubtitle: frames: List[PredictedFrame] + sim_threshold: int text: str - def __init__(self, frames: List[PredictedFrame]): + def __init__(self, frames: List[PredictedFrame], sim_threshold: int): self.frames = [f for f in frames if f.confidence > 0] + self.sim_threshold = sim_threshold if self.frames: self.text = max(self.frames, key=lambda f: f.confidence).text

@@ -76,8 +78,8 @@ if self.frames:

return self.frames[-1].index return 0 - def is_similar_to(self, other: PredictedSubtitle, threshold=90) -> bool: - return fuzz.partial_ratio(self.text, other.text) >= threshold + def is_similar_to(self, other: PredictedSubtitle) -> bool: + return fuzz.partial_ratio(self.text, other.text) >= self.sim_threshold def __repr__(self): return '{} - {}. {}'.format(self.index_start, self.index_end, self.text)
M videocr/video.py

@@ -29,7 +29,7 @@ self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))

v.release() def run_ocr(self, lang: str, time_start: str, time_end: str, - conf_threshold: int, use_fullframe: bool) -> None: + conf_threshold:int, use_fullframe: bool) -> None: self.lang = lang self.use_fullframe = use_fullframe

@@ -80,8 +80,8 @@ img = img[self.height // 2:, :]

config = '--tessdata-dir "{}"'.format(constants.TESSDATA_DIR) return pytesseract.image_to_data(img, lang=self.lang, config=config) - def get_subtitles(self) -> str: - self._generate_subtitles() + def get_subtitles(self, sim_threshold: int) -> str: + self._generate_subtitles(sim_threshold) return ''.join( '{}\n{} --> {}\n{}\n\n'.format( i,

@@ -90,7 +90,7 @@ self._srt_timestamp(sub.index_end),

sub.text) for i, sub in enumerate(self.pred_subs)) - def _generate_subtitles(self) -> None: + def _generate_subtitles(self, sim_threshold: int) -> None: self.pred_subs = [] if self.pred_frames is None:

@@ -112,8 +112,8 @@ bound -= 1

else: # divide subtitle paragraphs para_new = j - WIN_BOUND - self._append_sub( - PredictedSubtitle(self.pred_frames[i:para_new])) + self._append_sub(PredictedSubtitle( + self.pred_frames[i:para_new], sim_threshold)) i = para_new j = i bound = WIN_BOUND

@@ -122,7 +122,8 @@ j += 1

# also handle the last remaining frames if i < len(self.pred_frames) - 1: - self._append_sub(PredictedSubtitle(self.pred_frames[i:])) + self._append_sub(PredictedSubtitle( + self.pred_frames[i:], sim_threshold)) def _append_sub(self, sub: PredictedSubtitle) -> None: if len(sub.text) == 0:

@@ -132,7 +133,7 @@ # merge new sub to the last subs if they are similar

while self.pred_subs and sub.is_similar_to(self.pred_subs[-1]): ls = self.pred_subs[-1] del self.pred_subs[-1] - sub = PredictedSubtitle(ls.frames + sub.frames) + sub = PredictedSubtitle(ls.frames + sub.frames, sub.sim_threshold) self.pred_subs.append(sub)