11import logging
2+ import re
23from typing import List , Optional
34
4- import cv2
55import numpy as np
6- import onnxruntime as ort
7- from huggingface_hub import hf_hub_download
86from PIL import Image
9- import re
107
11- # the "contract" classes that a new provider MUST use for its return value.
8+ # Import the MeikiOCR library
9+ from meikiocr import MeikiOCR
10+
11+ # Import the "contract" classes from your application's interface
1212from src .ocr .interface import BoundingBox , OcrProvider , Paragraph , Word
1313
logger = logging.getLogger(__name__)

# --- model configuration ---
# Hugging Face repositories and file names of the detection / recognition ONNX models.
# Only read by the in-file pipeline that downloads the models via hf_hub_download.
DET_MODEL_REPO = "rtr46/meiki.text.detect.v0"
DET_MODEL_NAME = "meiki.text.detect.v0.1.960x544.onnx"
REC_MODEL_REPO = "rtr46/meiki.txt.recognition.v0"
REC_MODEL_NAME = "meiki.text.rec.v0.960x32.onnx"

# --- pipeline configuration ---
# Fixed input sizes (in pixels) used by the in-file preprocessing helpers when they
# build the detection and recognition tensors.
INPUT_DET_WIDTH = 960
INPUT_DET_HEIGHT = 544
INPUT_REC_HEIGHT = 32
INPUT_REC_WIDTH = 960

# These thresholds are passed to the library's run_ocr method.
DET_CONFIDENCE_THRESHOLD = 0.5
REC_CONFIDENCE_THRESHOLD = 0.1

# Used by the in-file recognition post-processing: a character candidate is dropped when
# more than this fraction of its width overlaps an already accepted character.
X_OVERLAP_THRESHOLD = 0.3
# Added to a candidate's width so the overlap ratio never divides by zero.
EPSILON = 1e-6

# Matches a single hiragana (U+3040-U+309F), katakana (U+30A0-U+30FF) or
# CJK ideograph in the range U+4E00-U+9FAF.
JAPANESE_REGEX = re.compile(r'[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FAF]')
3322
3423
3524class MeikiOcrProvider (OcrProvider ):
3625 """
37- An OCR provider that uses the high-performance meikiocr pipeline .
26+ An OCR provider that uses the high-performance meikiocr library .
3827 This provider is specifically optimized for recognizing Japanese text from video games.
3928 """
4029 NAME = "meikiocr (local)"
4130
def __init__(self):
    """
    Initializes the provider by creating an instance of the MeikiOCR client.
    The library handles the model downloading and session management internally.

    A failure while constructing the client is logged instead of raised. In that case
    ``self.ocr_client`` stays ``None``, which ``scan`` checks before doing any work.
    """
    logger.info(f"initializing {self.NAME} provider...")
    self.ocr_client = None

    # Keep the try body down to the one call that can legitimately fail, so that a
    # problem in the informational logging below is never reported as a failed init.
    try:
        client = MeikiOCR()
    except Exception as e:
        logger.error(f"failed to initialize {self.NAME}: {e}", exc_info=True)
        return

    self.ocr_client = client

    # The execution provider is only reported for diagnostics; tolerate client
    # versions that do not expose the attribute.
    active_provider = getattr(client, "active_provider", "unknown")
    logger.info(f"{self.NAME} initialized successfully, running on: {active_provider}")
6844
6945 def scan (self , image : Image .Image ) -> Optional [List [Paragraph ]]:
7046 """
71- Performs OCR on the given image using the full meikiocr pipeline .
47+ Performs OCR on the given image by calling the meikiocr library .
7248 """
73- if not self .det_session or not self . rec_session :
74- logger .error (f"{ self .NAME } was not initialized correctly. cannot perform scan." )
49+ if not self .ocr_client :
50+ logger .error (f"{ self .NAME } was not initialized correctly. Cannot perform scan." )
7551 return None
7652
7753 try :
54+ # Convert PIL (RGB) image to the OpenCV (BGR) format expected by the library.
55+ image_np_rgb = np .array (image .convert ("RGB" ))
56+ image_np_bgr = image_np_rgb [:, :, ::- 1 ] # Convert RGB to BGR
57+ img_height , img_width = image_np_bgr .shape [:2 ]
7858
79- # convert pil (rgb) image to numpy array for opencv processing.
80- image_np = np .array (image .convert ("RGB" ))
81- img_height , img_width = image_np .shape [:2 ]
8259 if img_width == 0 or img_height == 0 :
8360 logger .error ("invalid image dimensions received." )
8461 return None
8562
86- # --- 1. run detection stage ---
87- det_input , scale = self ._preprocess_for_detection (image_np )
88- det_raw = self ._run_detection_inference (det_input , scale )
89- text_boxes = self ._postprocess_detection_results (det_raw )
90-
91- if not text_boxes :
92- return []
93-
94- # --- 2. run recognition stage ---
95- rec_batch , valid_indices , crop_meta = self ._preprocess_for_recognition (image_np , text_boxes )
96- if rec_batch is None :
97- return []
98-
99- rec_raw = self ._run_recognition_inference (rec_batch )
100- ocr_results = self ._postprocess_recognition_results (rec_raw , valid_indices , crop_meta , len (text_boxes ))
63+ # --- 1. Run the entire OCR pipeline with a single library call ---
64+ ocr_results = self .ocr_client .run_ocr (
65+ image_np_bgr ,
66+ det_threshold = DET_CONFIDENCE_THRESHOLD ,
67+ rec_threshold = REC_CONFIDENCE_THRESHOLD
68+ )
10169
102- # --- 3. transform data to meikipop 's format ---
70+ # --- 2. Transform the library's output to MeikiPop 's format ---
10371 return self ._to_meikipop_paragraphs (ocr_results , img_width , img_height )
10472
10573 except Exception as e :
@@ -139,150 +107,14 @@ def _to_meikipop_paragraphs(self, ocr_results: list, img_width: int, img_height:
139107 min_y = min (c ['bbox' ][1 ] for c in chars )
140108 max_x = max (c ['bbox' ][2 ] for c in chars )
141109 max_y = max (c ['bbox' ][3 ] for c in chars )
142- line_pixel_bbox = [min_x , min_y , max_x , max_y ]
143- line_box = self ._to_normalized_bbox (line_pixel_bbox , img_width , img_height )
144-
145- # meikiocr currently only supports horizontal text.
146- is_vertical = False
110+ line_box = self ._to_normalized_bbox ([min_x , min_y , max_x , max_y ], img_width , img_height )
147111
148112 paragraph = Paragraph (
149113 full_text = full_text ,
150114 words = words_in_para ,
151115 box = line_box ,
152- is_vertical = is_vertical
116+ is_vertical = False # meikiocr currently only supports horizontal text.
153117 )
154118 paragraphs .append (paragraph )
155119
156- return paragraphs
157-
158- # --- meikiocr pipeline methods (adapted from meiki_ocr.py) ---
159-
def _preprocess_for_detection(self, image: np.ndarray):
    """
    Builds the detection input tensor from an image array.

    The image is scaled (aspect ratio preserved) so that it fits inside
    INPUT_DET_WIDTH x INPUT_DET_HEIGHT, normalized to the 0..1 range and pasted into the
    top-left corner of a zero-filled canvas of exactly that size.

    :param image: image array of shape (height, width, 3); both dimensions must be non-zero.
    :return: a tuple ``(tensor, scale)`` where ``tensor`` is a float32 array of shape
             (1, 3, INPUT_DET_HEIGHT, INPUT_DET_WIDTH) and ``scale`` is the resize factor
             that was applied to the original image.
    """
    h_orig, w_orig = image.shape[:2]
    scale = min(INPUT_DET_WIDTH / w_orig, INPUT_DET_HEIGHT / h_orig)

    # int() truncates, so for very elongated images the short side can become 0,
    # which cv2.resize rejects. Keep at least one pixel per dimension.
    w_resized = max(1, int(w_orig * scale))
    h_resized = max(1, int(h_orig * scale))

    resized = cv2.resize(image, (w_resized, h_resized), interpolation=cv2.INTER_LINEAR)

    canvas = np.zeros((INPUT_DET_HEIGHT, INPUT_DET_WIDTH, 3), dtype=np.float32)
    canvas[:h_resized, :w_resized] = resized.astype(np.float32) / 255.0

    # HWC -> CHW, then add the leading batch axis.
    tensor = np.expand_dims(np.transpose(canvas, (2, 0, 1)), axis=0)
    return tensor, scale
175-
def _run_detection_inference(self, tensor: np.ndarray, scale: float):
    """
    Runs the detection session on a preprocessed tensor.

    The session's first input receives ``tensor``. Its second input receives the
    detection canvas size divided by ``scale`` (i.e. the canvas size expressed in
    original-image pixels), truncated to int64.
    NOTE(review): presumably this makes the model report boxes in original-image
    coordinates - confirm against the model's documentation.

    :return: the raw output list of ``self.det_session.run``.
    """
    session_inputs = self.det_session.get_inputs()
    target_size = np.array(
        [[INPUT_DET_WIDTH / scale, INPUT_DET_HEIGHT / scale]],
        dtype=np.int64,
    )
    feeds = {
        session_inputs[0].name: tensor,
        session_inputs[1].name: target_size,
    }
    return self.det_session.run(None, feeds)
183-
def _postprocess_detection_results(self, raw_outputs: list):
    """
    Turns raw detection outputs into a list of text boxes.

    :param raw_outputs: three-element sequence; the second and third elements are the
                        batched boxes and scores (only batch entry 0 is used).
    :return: a list of ``{'bbox': [...]}`` dicts for every box whose score is above
             DET_CONFIDENCE_THRESHOLD, with coordinates cast to int32 and negative
             values clamped to 0. Empty list when nothing passes the threshold.
    """
    _, batched_boxes, batched_scores = raw_outputs
    first_boxes = batched_boxes[0]
    first_scores = batched_scores[0]

    kept = first_boxes[first_scores > DET_CONFIDENCE_THRESHOLD]
    if kept.shape[0] == 0:
        return []

    non_negative = np.maximum(0, kept.astype(np.int32))
    return [{'bbox': row.tolist()} for row in non_negative]
195-
def _preprocess_for_recognition(self, image: np.ndarray, text_boxes: list):
    """
    Crops every usable text box out of the image and batches the crops for recognition.

    Each crop is resized to INPUT_REC_HEIGHT (shrunk further if it would be wider than
    INPUT_REC_WIDTH), zero-padded on the right/bottom to the full recognition input size,
    normalized to 0..1 and converted to channel-first layout.

    Boxes are skipped when they are taller than wide, have a non-positive size, or
    produce an empty crop because they lie outside the image.

    :param image: image array of shape (height, width, 3).
    :param text_boxes: list of dicts with a ``'bbox'`` entry ``[x1, y1, x2, y2]``.
    :return: ``(batch, valid_indices, crop_meta)``. ``batch`` is the stacked float32
             tensor (or ``None`` when no box was usable), ``valid_indices`` holds the
             index into ``text_boxes`` of every batch row, and ``crop_meta`` holds the
             original bbox and the unpadded width (``'effective_w'``) of every row.
    """
    tensors, valid_indices, crop_meta = [], [], []

    for i, tb in enumerate(text_boxes):
        x1, y1, x2, y2 = tb['bbox']
        w, h = x2 - x1, y2 - y1
        if w < h or w <= 0 or h <= 0:
            continue

        crop = image[y1:y2, x1:x2]
        ch, cw = crop.shape[:2]
        # The bbox is not clamped to the image bounds, so slicing can yield an empty
        # crop; skipping it avoids a division by zero below.
        if ch == 0 or cw == 0:
            continue

        nh = INPUT_REC_HEIGHT
        nw = int(round(cw * (INPUT_REC_HEIGHT / ch)))
        if nw > INPUT_REC_WIDTH:
            scale = INPUT_REC_WIDTH / nw
            nw, nh = INPUT_REC_WIDTH, int(round(nh * scale))
        # Rounding can collapse a dimension to 0 for extreme aspect ratios; cv2.resize
        # needs at least one pixel, and 'effective_w' is later used as a divisor.
        nw, nh = max(1, nw), max(1, nh)

        resized = cv2.resize(crop, (nw, nh), interpolation=cv2.INTER_LINEAR)
        pw, ph = INPUT_REC_WIDTH - nw, INPUT_REC_HEIGHT - nh
        padded = np.pad(resized, ((0, ph), (0, pw), (0, 0)), constant_values=0)

        # HWC uint8 -> CHW float32 in 0..1
        tensor = np.transpose(padded.astype(np.float32) / 255.0, (2, 0, 1))

        tensors.append(tensor)
        valid_indices.append(i)
        crop_meta.append({'orig_bbox': [x1, y1, x2, y2], 'effective_w': nw})

    if not tensors:
        return None, [], []
    return np.stack(tensors, axis=0), valid_indices, crop_meta
218-
def _run_recognition_inference(self, batch_tensor: np.ndarray):
    """
    Runs the recognition session on a batch of preprocessed line crops.

    Feeds the batch as ``"images"`` together with the fixed recognition input size
    ``[[INPUT_REC_WIDTH, INPUT_REC_HEIGHT]]`` (int64) as ``"orig_target_sizes"``.

    :return: the raw output list of ``self.rec_session.run``.
    """
    target_size = np.array([[INPUT_REC_WIDTH, INPUT_REC_HEIGHT]], dtype=np.int64)
    feeds = {
        "images": batch_tensor,
        "orig_target_sizes": target_size,
    }
    return self.rec_session.run(None, feeds)
225-
def _postprocess_recognition_results(self, raw_outputs: list, valid_indices: list, crop_meta: list, num_boxes: int):
    """
    Converts raw recognition outputs into per-text-box results.

    For every batch row, character candidates below REC_CONFIDENCE_THRESHOLD are dropped,
    the remaining boxes are mapped from recognition space back to global image pixels,
    overlapping candidates are suppressed along the x axis (highest confidence wins) and
    the survivors are ordered left to right.

    :param raw_outputs: three-element sequence of batched labels, boxes and scores.
    :param valid_indices: for each batch row, the index of its text box.
    :param crop_meta: for each batch row, a dict with ``'orig_bbox'`` and ``'effective_w'``.
    :param num_boxes: total number of detected text boxes.
    :return: a list of length ``num_boxes``; each entry is ``{'text': str, 'chars': [...]}``
             where every char is ``{'char', 'bbox', 'conf'}``. Boxes that were not part of
             the batch keep an empty text and char list.
    """
    labels_batch, boxes_batch, scores_batch = raw_outputs
    results = [{'text': '', 'chars': []} for _ in range(num_boxes)]

    for row, (labels, boxes, scores) in enumerate(zip(labels_batch, boxes_batch, scores_batch)):
        meta = crop_meta[row]
        line_x1, line_y1, line_x2, line_y2 = meta['orig_bbox']
        crop_w = line_x2 - line_x1
        crop_h = line_y2 - line_y1
        eff_w = meta['effective_w']

        candidates = []
        for label, box, score in zip(labels, boxes, scores):
            if score < REC_CONFIDENCE_THRESHOLD:
                continue

            char = chr(label)
            rx1, ry1, rx2, ry2 = box
            # x coordinates inside the padding area are pulled back to the crop's edge
            rx1 = min(rx1, eff_w)
            rx2 = min(rx2, eff_w)

            # recognition space -> crop space -> global image
            left = line_x1 + int((rx1 / eff_w) * crop_w)
            top = line_y1 + int((ry1 / INPUT_REC_HEIGHT) * crop_h)
            right = line_x1 + int((rx2 / eff_w) * crop_w)
            bottom = line_y1 + int((ry2 / INPUT_REC_HEIGHT) * crop_h)

            candidates.append({
                'char': char,
                'bbox': [left, top, right, bottom],
                'conf': float(score),
                'x_interval': (left, right),
            })

        # highest confidence first, so stronger candidates claim their x range first
        candidates.sort(key=lambda c: c['conf'], reverse=True)

        accepted = []
        for cand in candidates:
            cand_lo, cand_hi = cand['x_interval']
            cand_width = cand_hi - cand_lo + EPSILON
            suppressed = any(
                max(0, min(cand_hi, acc['x_interval'][1]) - max(cand_lo, acc['x_interval'][0]))
                / cand_width > X_OVERLAP_THRESHOLD
                for acc in accepted
            )
            if not suppressed:
                accepted.append(cand)

        # left-to-right reading order
        accepted.sort(key=lambda c: c['x_interval'][0])

        # the confidence is kept in the output; the helper-only 'x_interval' is dropped
        results[valid_indices[row]] = {
            'text': ''.join(c['char'] for c in accepted),
            'chars': [{'char': c['char'], 'bbox': c['bbox'], 'conf': c['conf']} for c in accepted],
        }

    return results
120+ return paragraphs
0 commit comments