all repos — videocr @ 720c9d479ffc8e6f314d823102d99ef2b581cf66

Extract hardcoded subtitles from videos using machine learning

move download_lang_data to utils.py
Yi Ge me@yige.ch
Sun, 15 Dec 2019 20:56:09 +0800
commit

720c9d479ffc8e6f314d823102d99ef2b581cf66

parent

8a56cbf746e5d550853ca53409b0fdda29ac706d

2 files changed, 23 insertions(+), 15 deletions(-)

jump to
M videocr/api.py

@@ -1,24 +1,11 @@

-from urllib.request import urlopen -import shutil - -from . import constants +from . import utils from .video import Video def get_subtitles( video_path: str, lang='eng', time_start='0:00', time_end='', conf_threshold=65, sim_threshold=90, use_fullframe=False) -> str: - # download tesseract data files to ~/tessdata if necessary - constants.TESSDATA_DIR.mkdir(parents=True, exist_ok=True) - for fname in lang.split('+'): - fpath = constants.TESSDATA_DIR / '{}.traineddata'.format(fname) - if not fpath.is_file(): - if fname[0].isupper(): - url = constants.TESSDATA_SCRIPT_URL.format(fname) - else: - url = constants.TESSDATA_URL.format(fname) - with urlopen(url) as res, open(fpath, 'w+b') as f: - shutil.copyfileobj(res, f) + utils.download_lang_data(lang) v = Video(video_path) v.run_ocr(lang, time_start, time_end, conf_threshold, use_fullframe)
A videocr/utils.py

@@ -0,0 +1,21 @@

from urllib.request import urlopen
import shutil

from . import constants


def download_lang_data(lang: str):
    """Download any missing language data files into constants.TESSDATA_DIR.

    ``lang`` is a '+'-separated list of data file names (e.g. ``'eng+deu'``).
    For each name, ``<name>.traineddata`` is fetched only when no such file
    already exists in ``constants.TESSDATA_DIR`` (``~/tessdata`` according to
    the original comment -- confirm against ``constants``). Empty components,
    as produced by ``''`` or a stray ``'+'``, are skipped.

    Names whose first character is uppercase are fetched from
    ``constants.TESSDATA_SCRIPT_URL``; all others from
    ``constants.TESSDATA_URL``.

    Errors raised by ``urlopen`` or by the file operations propagate to the
    caller. A failed download never leaves a file at the final path.
    """
    constants.TESSDATA_DIR.mkdir(parents=True, exist_ok=True)

    for lang_name in lang.split('+'):
        if not lang_name:
            # Nothing to fetch, and lang_name[0] below would raise IndexError.
            continue

        filepath = constants.TESSDATA_DIR / '{}.traineddata'.format(lang_name)
        if filepath.is_file():
            continue

        # download needed file
        if lang_name[0].isupper():
            url = constants.TESSDATA_SCRIPT_URL.format(lang_name)
        else:
            url = constants.TESSDATA_URL.format(lang_name)

        # Stream into a sibling ".part" file and rename it only after the
        # transfer completes. Writing directly to filepath would leave a
        # truncated file behind on an interrupted download, and the is_file()
        # check above would then accept it as valid on every later run.
        partial_path = filepath.with_name(filepath.name + '.part')
        try:
            with urlopen(url) as res, open(partial_path, 'wb') as f:
                shutil.copyfileobj(res, f)
            partial_path.replace(filepath)
        finally:
            # After a successful rename the partial file no longer exists;
            # on any failure this removes the incomplete download.
            if partial_path.exists():
                partial_path.unlink()