Commit 0171255

improved text detection code for higher accuracy (#39)
1 parent 07c32a3 · commit 0171255

3 files changed (+27, -21 lines)

src/config/config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@
 logger = logging.getLogger(__name__)
 
 APP_NAME = "meikipop"
-APP_VERSION = "v.1.5.1"
+APP_VERSION = "v.1.5.2"
 MAX_DICT_ENTRIES = 10
 IS_LINUX = sys.platform.startswith('linux')
 IS_WINDOWS = sys.platform.startswith('win')
```

src/ocr/ocr.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -39,7 +39,7 @@ def run(self):
         start_time = time.perf_counter()
         ocr_result = self.ocr_backend.scan(screenshot)
         logger.info(
-            f"{self.ocr_backend.NAME} found {len(ocr_result)} paragraphs in {(time.perf_counter() - start_time):.3f}s.")
+            f"{self.ocr_backend.NAME} found {len(ocr_result) if ocr_result else 0} paragraphs in {(time.perf_counter() - start_time):.3f}s.")
         # todo keep last ocr result?
 
         self.shared_state.hit_scan_queue.put((True, ocr_result))
```
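This guard matters because a backend's `scan` can return `None` (the meikiocr provider's signature below is `Optional[List[Paragraph]]`), and `len(None)` would raise a `TypeError` inside the log statement. A minimal sketch of the pattern, using a hypothetical `describe` helper rather than the project's actual code:

```python
# Minimal sketch (hypothetical helper): `result` may be a list or None,
# so len() must not be called on it unconditionally.
from typing import Optional

def describe(result: Optional[list]) -> str:
    # `len(result) if result else 0` reports both None and [] as zero hits
    return f"found {len(result) if result else 0} paragraphs"

assert describe(None) == "found 0 paragraphs"
assert describe([1, 2]) == "found 2 paragraphs"
```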

src/ocr/providers/meikiocr/provider.py

Lines changed: 25 additions & 19 deletions
```diff
@@ -84,9 +84,9 @@ def scan(self, image: Image.Image) -> Optional[List[Paragraph]]:
             return None
 
         # --- 1. run detection stage ---
-        det_input, sx, sy = self._preprocess_for_detection(image_np)
-        det_raw = self._run_detection_inference(det_input)
-        text_boxes = self._postprocess_detection_results(det_raw, sx, sy)
+        det_input, scale = self._preprocess_for_detection(image_np)
+        det_raw = self._run_detection_inference(det_input, scale)
+        text_boxes = self._postprocess_detection_results(det_raw)
 
         if not text_boxes:
             return []
@@ -159,32 +159,38 @@ def _to_meikipop_paragraphs(self, ocr_results: list, img_width: int, img_height:
 
     def _preprocess_for_detection(self, image: np.ndarray):
         h_orig, w_orig = image.shape[:2]
-        resized = cv2.resize(image, (INPUT_DET_WIDTH, INPUT_DET_HEIGHT), interpolation=cv2.INTER_LINEAR)
-        tensor = resized.astype(np.float32) / 255.0
+
+        scale = min(INPUT_DET_WIDTH / w_orig, INPUT_DET_HEIGHT / h_orig)
+        w_resized, h_resized = int(w_orig * scale), int(h_orig * scale)
+
+        resized = cv2.resize(image, (w_resized, h_resized), interpolation=cv2.INTER_LINEAR)
+        normalized_resized = resized.astype(np.float32) / 255.0
+
+        tensor = np.zeros((INPUT_DET_HEIGHT, INPUT_DET_WIDTH, 3), dtype=np.float32)
+        tensor[:h_resized, :w_resized] = normalized_resized
         tensor = np.transpose(tensor, (2, 0, 1))
         tensor = np.expand_dims(tensor, axis=0)
-        return tensor, w_orig / INPUT_DET_WIDTH, h_orig / INPUT_DET_HEIGHT
 
-    def _run_detection_inference(self, tensor: np.ndarray):
+        return tensor, scale
+
+    def _run_detection_inference(self, tensor: np.ndarray, scale: float):
         inputs = {
             self.det_session.get_inputs()[0].name: tensor,
-            self.det_session.get_inputs()[1].name: np.array([[INPUT_DET_WIDTH, INPUT_DET_HEIGHT]], dtype=np.int64)
+            self.det_session.get_inputs()[1].name: np.array([[INPUT_DET_WIDTH / scale, INPUT_DET_HEIGHT / scale]],
+                                                            dtype=np.int64)
         }
         return self.det_session.run(None, inputs)
 
-    def _postprocess_detection_results(self, raw_outputs: list, scale_x: float, scale_y: float):
+    def _postprocess_detection_results(self, raw_outputs: list):
         _, boxes, scores = raw_outputs
         boxes, scores = boxes[0], scores[0]
-        text_boxes = []
-        for box, score in zip(boxes, scores):
-            if score < DET_CONFIDENCE_THRESHOLD: continue
-            x1, y1, x2, y2 = box
-            text_boxes.append({'bbox': [
-                max(0, int(x1 * scale_x)),
-                max(0, int(y1 * scale_y)),
-                max(0, int(x2 * scale_x)),
-                max(0, int(y2 * scale_y))
-            ]})
+        confident_boxes = boxes[scores > DET_CONFIDENCE_THRESHOLD]
+        if confident_boxes.shape[0] == 0:
+            return []
+
+        clamped_boxes = np.maximum(0, confident_boxes.astype(np.int32))
+
+        text_boxes = [{'bbox': box.tolist()} for box in clamped_boxes]
        return text_boxes
 
     def _preprocess_for_recognition(self, image: np.ndarray, text_boxes: list):
```
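The substantive change is how the detector input is built. The old code stretched the screenshot to the fixed detector resolution, distorting the aspect ratio (and with it, character shapes), then rescaled boxes back per axis. The new code resizes with a single uniform scale factor and zero-pads the short side ("letterboxing"), which is presumably the source of the accuracy gain in the commit message. A self-contained sketch of that technique, with made-up values standing in for the provider's `INPUT_DET_WIDTH`/`INPUT_DET_HEIGHT` constants:

```python
# Sketch of letterbox preprocessing (not the provider's exact code).
# The input-size constants here are hypothetical.
import cv2
import numpy as np

INPUT_DET_WIDTH, INPUT_DET_HEIGHT = 1280, 736  # hypothetical detector input size

def letterbox(image: np.ndarray) -> tuple[np.ndarray, float]:
    h, w = image.shape[:2]
    # one uniform scale factor instead of independent x/y stretch factors
    scale = min(INPUT_DET_WIDTH / w, INPUT_DET_HEIGHT / h)
    resized = cv2.resize(image, (int(w * scale), int(h * scale)),
                         interpolation=cv2.INTER_LINEAR)
    # zero-pad right/bottom so the tensor shape stays fixed for the model
    canvas = np.zeros((INPUT_DET_HEIGHT, INPUT_DET_WIDTH, 3), dtype=np.float32)
    canvas[:resized.shape[0], :resized.shape[1]] = resized.astype(np.float32) / 255.0
    # HWC -> NCHW, as ONNX detectors typically expect
    return np.expand_dims(np.transpose(canvas, (2, 0, 1)), 0), scale
```

Note how the second session input changes in step: `[INPUT_DET_WIDTH / scale, INPUT_DET_HEIGHT / scale]` is the padded canvas extent expressed in original-image pixels, so the model apparently maps its boxes straight into original-image coordinates. That would explain why `_postprocess_detection_results` no longer needs `scale_x`/`scale_y`.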

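The postprocessing was also vectorized: the per-box Python loop with four `max(0, int(...))` calls becomes one boolean-mask selection plus one clamped cast. A short sketch of the same NumPy pattern, with made-up boxes and an illustrative threshold:

```python
import numpy as np

DET_CONFIDENCE_THRESHOLD = 0.5  # hypothetical value

boxes = np.array([[10.2, 5.1, 90.7, 40.3],
                  [-3.0, 8.0, 55.0, 30.0],   # negative coordinate gets clamped
                  [12.0, 12.0, 20.0, 18.0]])
scores = np.array([0.9, 0.8, 0.1])

# boolean mask keeps whole rows whose score clears the threshold
confident = boxes[scores > DET_CONFIDENCE_THRESHOLD]
# clamp left/top to 0 and cast once, instead of per-element max()
clamped = np.maximum(0, confident.astype(np.int32))
text_boxes = [{'bbox': b.tolist()} for b in clamped]
print(text_boxes)  # [{'bbox': [10, 5, 90, 40]}, {'bbox': [0, 8, 55, 30]}]
```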