@@ -84,9 +84,9 @@ def scan(self, image: Image.Image) -> Optional[List[Paragraph]]:
8484 return None
8585
8686 # --- 1. run detection stage ---
87- det_input , sx , sy = self ._preprocess_for_detection (image_np )
88- det_raw = self ._run_detection_inference (det_input )
89- text_boxes = self ._postprocess_detection_results (det_raw , sx , sy )
87+ det_input , scale = self ._preprocess_for_detection (image_np )
88+ det_raw = self ._run_detection_inference (det_input , scale )
89+ text_boxes = self ._postprocess_detection_results (det_raw )
9090
9191 if not text_boxes :
9292 return []
@@ -159,32 +159,38 @@ def _to_meikipop_paragraphs(self, ocr_results: list, img_width: int, img_height:
159159
def _preprocess_for_detection(self, image: np.ndarray):
    """Letterbox ``image`` into the detector's fixed-size input tensor.

    The image is resized with its aspect ratio preserved so that it fits
    inside ``INPUT_DET_WIDTH`` x ``INPUT_DET_HEIGHT``, divided by 255, and
    pasted into the top-left corner of a zero-filled canvas. The remaining
    canvas area stays zero (padding).

    Args:
        image: Pixel array whose first two axes are (height, width).
            The canvas is allocated with 3 channels, so a 3-channel image
            is assumed -- TODO confirm against the caller.

    Returns:
        A tuple ``(tensor, scale)``: ``tensor`` is a float32 array of shape
        ``(1, 3, INPUT_DET_HEIGHT, INPUT_DET_WIDTH)`` and ``scale`` is the
        single resize factor applied to both axes.
    """
    h_orig, w_orig = image.shape[:2]

    # One factor for both axes keeps the aspect ratio intact.
    scale = min(INPUT_DET_WIDTH / w_orig, INPUT_DET_HEIGHT / h_orig)

    # int() truncation can yield 0 for very elongated images, which
    # cv2.resize rejects; always keep at least one pixel per axis.
    w_resized = max(1, int(w_orig * scale))
    h_resized = max(1, int(h_orig * scale))

    resized = cv2.resize(image, (w_resized, h_resized), interpolation=cv2.INTER_LINEAR)
    normalized_resized = resized.astype(np.float32) / 255.0

    # Zero canvas of the model's input size; the image occupies the top-left.
    tensor = np.zeros((INPUT_DET_HEIGHT, INPUT_DET_WIDTH, 3), dtype=np.float32)
    tensor[:h_resized, :w_resized] = normalized_resized

    # HWC -> CHW, then add a leading batch axis.
    tensor = np.transpose(tensor, (2, 0, 1))
    tensor = np.expand_dims(tensor, axis=0)

    return tensor, scale
175+
def _run_detection_inference(self, tensor: np.ndarray, scale: float):
    """Run the detection session on a preprocessed input tensor.

    Args:
        tensor: Array fed to the session's first input.
        scale: Resize factor applied during preprocessing. The session's
            second input receives the canvas size divided by this factor,
            i.e. the canvas extent expressed in source-image pixels
            (presumably so the model reports boxes in source coordinates
            -- TODO confirm against the model's input spec).

    Returns:
        The list returned by ``self.det_session.run(None, inputs)``.
    """
    session_inputs = self.det_session.get_inputs()

    # Casting floats to int64 truncates; round first so floating-point
    # error in the division cannot shave a pixel off the target size.
    target_size = np.array(
        [[int(round(INPUT_DET_WIDTH / scale)), int(round(INPUT_DET_HEIGHT / scale))]],
        dtype=np.int64,
    )

    inputs = {
        session_inputs[0].name: tensor,
        session_inputs[1].name: target_size,
    }
    return self.det_session.run(None, inputs)
174183
def _postprocess_detection_results(self, raw_outputs: list):
    """Turn raw detector outputs into a list of ``{'bbox': [...]}`` dicts.

    Args:
        raw_outputs: Three-element sequence; the first element is ignored,
            the second holds boxes and the third holds scores, each with a
            leading batch axis of which only index 0 is read.

    Returns:
        One dict per box whose score is strictly greater than
        ``DET_CONFIDENCE_THRESHOLD``. Each ``'bbox'`` value is the box's
        coordinates cast to int32 with negatives clamped to 0 (presumably
        ``[x1, y1, x2, y2]`` -- TODO confirm). Empty list if nothing passes.
    """
    _, batch_boxes, batch_scores = raw_outputs

    # NOTE(review): the comparison is strict, so a score exactly equal to
    # the threshold is dropped -- confirm this is intended.
    keep_mask = batch_scores[0] > DET_CONFIDENCE_THRESHOLD
    kept = batch_boxes[0][keep_mask]
    if kept.shape[0] == 0:
        return []

    # Truncate to integer pixels, then clamp negative coordinates to zero.
    pixel_boxes = np.maximum(0, kept.astype(np.int32))
    return [{'bbox': row.tolist()} for row in pixel_boxes]
189195
190196 def _preprocess_for_recognition (self , image : np .ndarray , text_boxes : list ):
0 commit comments