Commit 4b9a748

replaced meikiocr pipeline implementation with meikiocr lib (#40)
* replaced meikiocr pipeline implementation with meikiocr lib
* Update requirements.txt
1 parent 0171255 commit 4b9a748

File tree: 3 files changed (+32, -202 lines)

requirements.txt

Lines changed: 2 additions & 4 deletions
@@ -9,7 +9,5 @@ pyobjc-framework-Quartz>=10.0; sys_platform == 'darwin'
 pyobjc-framework-Cocoa>=10.0; sys_platform == 'darwin'
 python-xlib~=0.33; sys_platform == 'linux'
 websockets>=15.0.1
-numpy>=2.2.6
-opencv-python>=4.12.0.88
-onnxruntime==1.20.1
-huggingface-hub>=1.0.1
+meikiocr>=0.1.2
+onnxruntime==1.20.1; sys_platform == 'win32' # newer versions cause issues on windows
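
The platform gate on the new onnxruntime pin is a PEP 508 environment marker: pip evaluates the expression on the installing machine and skips the requirement everywhere except Windows, so other platforms are not pinned to 1.20.1. A minimal sketch of that evaluation, using the third-party packaging library (an illustration only, not a dependency of this commit):

    # Sketch: how a PEP 508 environment marker decides whether a requirement applies.
    # 'packaging' is assumed here for illustration; pip uses it internally.
    from packaging.markers import Marker

    marker = Marker("sys_platform == 'win32'")
    # True on Windows, False on Linux/macOS, so the onnxruntime==1.20.1 pin
    # only takes effect on Windows installs.
    print(marker.evaluate())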

src/config/config.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 logger = logging.getLogger(__name__)
 
 APP_NAME = "meikipop"
-APP_VERSION = "v.1.5.2"
+APP_VERSION = "v.1.5.3"
 MAX_DICT_ENTRIES = 10
 IS_LINUX = sys.platform.startswith('linux')
 IS_WINDOWS = sys.platform.startswith('win')
Lines changed: 29 additions & 197 deletions
@@ -1,105 +1,73 @@
 import logging
+import re
 from typing import List, Optional
 
-import cv2
 import numpy as np
-import onnxruntime as ort
-from huggingface_hub import hf_hub_download
 from PIL import Image
-import re
 
-# the "contract" classes that a new provider MUST use for its return value.
+# Import the MeikiOCR library
+from meikiocr import MeikiOCR
+
+# Import the "contract" classes from your application's interface
 from src.ocr.interface import BoundingBox, OcrProvider, Paragraph, Word
 
 logger = logging.getLogger(__name__)
 
-# --- model configuration ---
-DET_MODEL_REPO = "rtr46/meiki.text.detect.v0"
-DET_MODEL_NAME = "meiki.text.detect.v0.1.960x544.onnx"
-REC_MODEL_REPO = "rtr46/meiki.txt.recognition.v0"
-REC_MODEL_NAME = "meiki.text.rec.v0.960x32.onnx"
-
 # --- pipeline configuration ---
-INPUT_DET_WIDTH = 960
-INPUT_DET_HEIGHT = 544
-INPUT_REC_HEIGHT = 32
-INPUT_REC_WIDTH = 960
+# These thresholds are passed to the library's run_ocr method.
 DET_CONFIDENCE_THRESHOLD = 0.5
 REC_CONFIDENCE_THRESHOLD = 0.1
-X_OVERLAP_THRESHOLD = 0.3
-EPSILON = 1e-6
 
 JAPANESE_REGEX = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]')
 
 
 class MeikiOcrProvider(OcrProvider):
     """
-    An OCR provider that uses the high-performance meikiocr pipeline.
+    An OCR provider that uses the high-performance meikiocr library.
     This provider is specifically optimized for recognizing Japanese text from video games.
     """
     NAME = "meikiocr (local)"
 
     def __init__(self):
         """
-        Initializes the provider and lazy-loads the ONNX models.
-        This is called once when the provider is selected in MeikiPop.
+        Initializes the provider by creating an instance of the MeikiOCR client.
+        The library handles the model downloading and session management internally.
         """
         logger.info(f"initializing {self.NAME} provider...")
-        self.det_session = None
-        self.rec_session = None
+        self.ocr_client = None
         try:
-            det_model_path = hf_hub_download(repo_id=DET_MODEL_REPO, filename=DET_MODEL_NAME)
-            rec_model_path = hf_hub_download(repo_id=REC_MODEL_REPO, filename=REC_MODEL_NAME)
-
-            # prioritize gpu if available, fallback to cpu.
-            available_providers = ort.get_available_providers()
-            desired_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
-            providers_to_use = [p for p in desired_providers if p in available_providers]
-            ort.set_default_logger_severity(3)  # suppress verbose logs
-
-            self.det_session = ort.InferenceSession(det_model_path, providers=providers_to_use)
-            self.rec_session = ort.InferenceSession(rec_model_path, providers=providers_to_use)
-
-            active_provider = self.det_session.get_providers()[0]
-            logger.info(f"{self.NAME} initialized successfully, running on: {active_provider}")
+            self.ocr_client = MeikiOCR()
+            logger.info(f"{self.NAME} initialized successfully, running on: {self.ocr_client.active_provider}")
 
         except Exception as e:
             logger.error(f"failed to initialize {self.NAME}: {e}", exc_info=True)
 
     def scan(self, image: Image.Image) -> Optional[List[Paragraph]]:
         """
-        Performs OCR on the given image using the full meikiocr pipeline.
+        Performs OCR on the given image by calling the meikiocr library.
        """
-        if not self.det_session or not self.rec_session:
-            logger.error(f"{self.NAME} was not initialized correctly. cannot perform scan.")
+        if not self.ocr_client:
+            logger.error(f"{self.NAME} was not initialized correctly. Cannot perform scan.")
             return None
 
         try:
+            # Convert PIL (RGB) image to the OpenCV (BGR) format expected by the library.
+            image_np_rgb = np.array(image.convert("RGB"))
+            image_np_bgr = image_np_rgb[:, :, ::-1]  # Convert RGB to BGR
+            img_height, img_width = image_np_bgr.shape[:2]
 
-            # convert pil (rgb) image to numpy array for opencv processing.
-            image_np = np.array(image.convert("RGB"))
-            img_height, img_width = image_np.shape[:2]
             if img_width == 0 or img_height == 0:
                 logger.error("invalid image dimensions received.")
                 return None
 
-            # --- 1. run detection stage ---
-            det_input, scale = self._preprocess_for_detection(image_np)
-            det_raw = self._run_detection_inference(det_input, scale)
-            text_boxes = self._postprocess_detection_results(det_raw)
-
-            if not text_boxes:
-                return []
-
-            # --- 2. run recognition stage ---
-            rec_batch, valid_indices, crop_meta = self._preprocess_for_recognition(image_np, text_boxes)
-            if rec_batch is None:
-                return []
-
-            rec_raw = self._run_recognition_inference(rec_batch)
-            ocr_results = self._postprocess_recognition_results(rec_raw, valid_indices, crop_meta, len(text_boxes))
+            # --- 1. Run the entire OCR pipeline with a single library call ---
+            ocr_results = self.ocr_client.run_ocr(
+                image_np_bgr,
+                det_threshold=DET_CONFIDENCE_THRESHOLD,
+                rec_threshold=REC_CONFIDENCE_THRESHOLD
+            )
 
-            # --- 3. transform data to meikipop's format ---
+            # --- 2. Transform the library's output to MeikiPop's format ---
             return self._to_meikipop_paragraphs(ocr_results, img_width, img_height)
 
         except Exception as e:
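
The only image preprocessing left in scan() is the channel swap: reversing the last axis of the numpy array turns Pillow's RGB layout into the BGR layout the library expects. A small self-contained check of that slice trick (numpy and Pillow are already imports of this file):

    # Sketch: confirm that [:, :, ::-1] reverses the channel order.
    import numpy as np
    from PIL import Image

    img = Image.new("RGB", (2, 2), (255, 0, 0))  # a pure-red image
    rgb = np.array(img)
    bgr = rgb[:, :, ::-1]
    assert tuple(rgb[0, 0]) == (255, 0, 0)  # R, G, B
    assert tuple(bgr[0, 0]) == (0, 0, 255)  # B, G, R

Note that the reversed slice is a view rather than a copy; if the library ever needs a contiguous buffer, np.ascontiguousarray(bgr) would materialize one.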
@@ -139,150 +107,14 @@ def _to_meikipop_paragraphs(self, ocr_results: list, img_width: int, img_height:
             min_y = min(c['bbox'][1] for c in chars)
             max_x = max(c['bbox'][2] for c in chars)
             max_y = max(c['bbox'][3] for c in chars)
-            line_pixel_bbox = [min_x, min_y, max_x, max_y]
-            line_box = self._to_normalized_bbox(line_pixel_bbox, img_width, img_height)
-
-            # meikiocr currently only supports horizontal text.
-            is_vertical = False
+            line_box = self._to_normalized_bbox([min_x, min_y, max_x, max_y], img_width, img_height)
 
             paragraph = Paragraph(
                 full_text=full_text,
                 words=words_in_para,
                 box=line_box,
-                is_vertical=is_vertical
+                is_vertical=False  # meikiocr currently only supports horizontal text.
             )
             paragraphs.append(paragraph)
 
-        return paragraphs
-
-    # --- meikiocr pipeline methods (adapted from meiki_ocr.py) ---
-
-    def _preprocess_for_detection(self, image: np.ndarray):
-        h_orig, w_orig = image.shape[:2]
-
-        scale = min(INPUT_DET_WIDTH / w_orig, INPUT_DET_HEIGHT / h_orig)
-        w_resized, h_resized = int(w_orig * scale), int(h_orig * scale)
-
-        resized = cv2.resize(image, (w_resized, h_resized), interpolation=cv2.INTER_LINEAR)
-        normalized_resized = resized.astype(np.float32) / 255.0
-
-        tensor = np.zeros((INPUT_DET_HEIGHT, INPUT_DET_WIDTH, 3), dtype=np.float32)
-        tensor[:h_resized, :w_resized] = normalized_resized
-        tensor = np.transpose(tensor, (2, 0, 1))
-        tensor = np.expand_dims(tensor, axis=0)
-
-        return tensor, scale
-
-    def _run_detection_inference(self, tensor: np.ndarray, scale: float):
-        inputs = {
-            self.det_session.get_inputs()[0].name: tensor,
-            self.det_session.get_inputs()[1].name: np.array([[INPUT_DET_WIDTH / scale, INPUT_DET_HEIGHT / scale]],
-                                                            dtype=np.int64)
-        }
-        return self.det_session.run(None, inputs)
-
-    def _postprocess_detection_results(self, raw_outputs: list):
-        _, boxes, scores = raw_outputs
-        boxes, scores = boxes[0], scores[0]
-        confident_boxes = boxes[scores > DET_CONFIDENCE_THRESHOLD]
-        if confident_boxes.shape[0] == 0:
-            return []
-
-        clamped_boxes = np.maximum(0, confident_boxes.astype(np.int32))
-
-        text_boxes = [{'bbox': box.tolist()} for box in clamped_boxes]
-        return text_boxes
-
-    def _preprocess_for_recognition(self, image: np.ndarray, text_boxes: list):
-        tensors, valid_indices, crop_meta = [], [], []
-        for i, tb in enumerate(text_boxes):
-            x1, y1, x2, y2 = tb['bbox']
-            w, h = x2 - x1, y2 - y1
-            if w < h or w <= 0 or h <= 0: continue
-            crop = image[y1:y2, x1:x2]
-            ch, cw = crop.shape[:2]
-            nh, nw = INPUT_REC_HEIGHT, int(round(cw * (INPUT_REC_HEIGHT / ch)))
-            if nw > INPUT_REC_WIDTH:
-                scale = INPUT_REC_WIDTH / nw
-                nw, nh = INPUT_REC_WIDTH, int(round(nh * scale))
-            resized = cv2.resize(crop, (nw, nh), interpolation=cv2.INTER_LINEAR)
-            pw, ph = INPUT_REC_WIDTH - nw, INPUT_REC_HEIGHT - nh
-            padded = np.pad(resized, ((0, ph), (0, pw), (0, 0)), constant_values=0)
-            tensor = (padded.astype(np.float32) / 255.0)
-            tensor = np.transpose(tensor, (2, 0, 1))
-            tensors.append(tensor)
-            valid_indices.append(i)
-            crop_meta.append({'orig_bbox': [x1, y1, x2, y2], 'effective_w': nw})
-        if not tensors: return None, [], []
-        return np.stack(tensors, axis=0), valid_indices, crop_meta
-
-    def _run_recognition_inference(self, batch_tensor: np.ndarray):
-        inputs = {
-            "images": batch_tensor,
-            "orig_target_sizes": np.array([[INPUT_REC_WIDTH, INPUT_REC_HEIGHT]], dtype=np.int64)
-        }
-        return self.rec_session.run(None, inputs)
-
-    def _postprocess_recognition_results(self, raw_outputs: list, valid_indices: list, crop_meta: list, num_boxes: int):
-        labels_batch, boxes_batch, scores_batch = raw_outputs
-        results = [{'text': '', 'chars': []} for _ in range(num_boxes)]
-        for i, (labels, boxes, scores) in enumerate(zip(labels_batch, boxes_batch, scores_batch)):
-            meta = crop_meta[i]
-            gx1, gy1, gx2, gy2 = meta['orig_bbox']
-            cw, ch = gx2 - gx1, gy2 - gy1
-            ew = meta['effective_w']
-
-            candidates = []
-            for lbl, box, scr in zip(labels, boxes, scores):
-                if scr < REC_CONFIDENCE_THRESHOLD: continue
-
-                char = chr(lbl)
-                rx1, ry1, rx2, ry2 = box
-                rx1, rx2 = min(rx1, ew), min(rx2, ew)
-
-                # map: recognition space -> crop space -> global image
-                cx1 = (rx1 / ew) * cw
-                cx2 = (rx2 / ew) * cw
-                cy1 = (ry1 / INPUT_REC_HEIGHT) * ch
-                cy2 = (ry2 / INPUT_REC_HEIGHT) * ch
-
-                gx1_char = gx1 + int(cx1)
-                gy1_char = gy1 + int(cy1)
-                gx2_char = gx1 + int(cx2)
-                gy2_char = gy1 + int(cy2)
-
-                candidates.append({
-                    'char': char,
-                    'bbox': [gx1_char, gy1_char, gx2_char, gy2_char],
-                    'conf': float(scr),
-                    'x_interval': (gx1_char, gx2_char)
-                })
-
-            # sort by confidence (descending) to prepare for deduplication
-            candidates.sort(key=lambda c: c['conf'], reverse=True)
-
-            # spatial deduplication on x-axis (non-maximum suppression)
-            accepted = []
-            for cand in candidates:
-                x1_c, x2_c = cand['x_interval']
-                width_c = x2_c - x1_c + EPSILON
-                keep = True
-                for acc in accepted:
-                    x1_a, x2_a = acc['x_interval']
-                    overlap = max(0, min(x2_c, x2_a) - max(x1_c, x1_a))
-                    if overlap / width_c > X_OVERLAP_THRESHOLD:
-                        keep = False
-                        break
-                if keep:
-                    accepted.append(cand)
-
-            # sort by x for final reading order
-            accepted.sort(key=lambda c: c['x_interval'][0])
-
-            text = ''.join(c['char'] for c in accepted)
-            # keep the confidence score in the final output as it can be useful
-            final_chars = [{'char': c['char'], 'bbox': c['bbox'], 'conf': c['conf']} for c in accepted]
-
-            results[valid_indices[i]] = {'text': text, 'chars': final_chars}
-
-        return results
+        return paragraphs
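
With the pipeline internals deleted, the provider reduces to a thin adapter around the library. A hypothetical end-to-end sketch using only names visible in this diff (the provider's module path is not shown, so the import is left as a comment):

    # Sketch: exercising the slimmed-down provider.
    # from <provider module> import MeikiOcrProvider  # module path not shown in this diff
    from PIL import Image

    provider = MeikiOcrProvider()          # meikiocr downloads and loads its own models
    image = Image.open("screenshot.png")   # hypothetical input image
    paragraphs = provider.scan(image)      # None if initialization failed
    for p in paragraphs or []:
        print(p.full_text, p.box, p.is_vertical)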
