all repos — videocr @ 3f73cb9bcafbd639ef5791a846b861d633cdb9dc

Extract hardcoded subtitles from videos using machine learning

videocr/models.py (view raw)

 1from __future__ import annotations
 2from typing import List
 3from dataclasses import dataclass
 4from fuzzywuzzy import fuzz
 5
 6
# Minimum confidence a predicted word needs in order to be kept by
# PredictedFrame; word predictions with lower confidence will be filtered out.
CONF_THRESHOLD = 60
 9
10
@dataclass
class PredictedWord:
    """A single predicted word together with its confidence."""

    # fixed attribute set, so instances carry no per-instance __dict__
    __slots__ = ('confidence', 'text')

    confidence: int  # confidence attached to this word
    text: str  # the predicted text of the word
16
17
class PredictedFrame:
    """Word predictions for one video frame, parsed from tabular OCR output.

    Each row of ``pred_data`` is split on whitespace. The third field is read
    as the block number and the last two fields as the word confidence and the
    word text. Rows with fewer than 12 fields are treated as "no word
    predicted" and skipped.
    """

    index: int  # 0-based index of the frame
    words: List[PredictedWord]
    confidence: int  # total confidence of all words
    text: str

    # Chars that are obviously ocr errors are removed and '|' is read as 'I'.
    # Built once for the class instead of once per frame.
    _OCR_FIXES = str.maketrans('|', 'I', '<>{}[];`@#$%^*_=~\\')

    def __init__(self, index: int, pred_data: str):
        """Parse ``pred_data`` into words, total confidence and cleaned text.

        Args:
            index: 0-based index of the frame.
            pred_data: prediction rows, one per line. The first line is
                skipped (presumably a column header -- confirm against the
                producer of this data).

        Raises:
            ValueError: if a row with 12 or more fields has a block number or
                confidence that is not numeric.
        """
        self.index = index
        self.words = []

        block = 0  # keep track of line breaks

        for row in pred_data.splitlines()[1:]:
            word_data = row.split()
            if len(word_data) < 12:
                # no word is predicted
                continue
            _, _, block_num, *_, conf, text = word_data
            block_num = int(block_num)
            conf = self._parse_confidence(conf)

            # handle line breaks: a higher block number starts a new line,
            # marked by a zero-confidence '\n' word (never first, never
            # doubled)
            if block < block_num:
                block = block_num
                if self.words and self.words[-1].text != '\n':
                    self.words.append(PredictedWord(0, '\n'))

            if conf >= CONF_THRESHOLD:
                self.words.append(PredictedWord(conf, text))

        self.confidence = sum(word.confidence for word in self.words)

        joined = ' '.join(word.text for word in self.words)
        self.text = joined.translate(self._OCR_FIXES).strip()

    @staticmethod
    def _parse_confidence(raw: str) -> int:
        """Return a confidence field as an int, truncating any fraction.

        Both integer ('87') and float ('95.400002') spellings are accepted;
        newer Tesseract releases emit the latter, which a plain ``int(raw)``
        rejects with ValueError.
        """
        return int(float(raw))

    def is_similar_to(self, other: PredictedFrame, threshold=70) -> bool:
        """Return True if ``fuzz.ratio`` of both texts is >= ``threshold``."""
        return fuzz.ratio(self.text, other.text) >= threshold
57
58
class PredictedSubtitle:
    """A group of frames treated as one subtitle.

    Frames whose confidence is not positive are dropped. The subtitle text is
    the text of the first remaining frame with the highest confidence, or the
    empty string when no frame remains.
    """

    frames: List[PredictedFrame]

    def __init__(self, frames: List[PredictedFrame]):
        kept = []
        for frame in frames:
            if frame.confidence > 0:
                kept.append(frame)
        self.frames = kept

        if not kept:
            self.text = ''
            return

        # max() yields the first frame among equally confident ones
        best = max(kept, key=lambda frame: frame.confidence)
        self.text = best.text

    @property
    def index_start(self) -> int:
        """Index of the first kept frame, or 0 when there is none."""
        return self.frames[0].index if self.frames else 0

    @property
    def index_end(self) -> int:
        """Index of the last kept frame, or 0 when there is none."""
        return self.frames[-1].index if self.frames else 0

    def is_similar_to(self, other: PredictedSubtitle, threshold=90) -> bool:
        """Return True if ``fuzz.partial_ratio`` of both texts is >= ``threshold``."""
        score = fuzz.partial_ratio(self.text, other.text)
        return score >= threshold

    def __repr__(self):
        first, last = self.index_start, self.index_end
        return '{} - {}. {}'.format(first, last, self.text)