@@ -140,7 +140,9 @@ def mock_classification(collection_data, *args, **kwargs):
140140 f"({ rate :.1f} domains/second)"
141141 )
142142
143- def test_cache_effectiveness (self ):
143+ @patch ("piedomains.data_collector.DataCollector.collect" )
144+ @patch ("piedomains.text.TextClassifier.classify_from_data" )
145+ def test_cache_effectiveness (self , mock_classify , mock_collect ):
144146 """Test that caching improves performance."""
145147 # Create some test cache files
146148 cache_html_dir = os .path .join (self .temp_dir , "html" )
@@ -151,29 +153,65 @@ def test_cache_effectiveness(self):
151153 with open (os .path .join (cache_html_dir , "example.com.html" ), "w" ) as f :
152154 f .write (test_html )
153155
154- # Mock the actual prediction to isolate cache performance
155- with patch (
156- "piedomains.text.TextClassifier._predict_text"
157- ) as mock_predict :
158- mock_predict .return_value = {
159- "text_label" : "news" ,
160- "text_prob" : 0.8 ,
161- "text_domain_probs" : {"news" : 0.8 , "other" : 0.2 }
156+ # Mock data collection to simulate cache behavior
def mock_collection(domains, *args, **kwargs):
    """Fake collection result used as the ``side_effect`` of the patched collector.

    The payload is fixed: it always describes one successful fetch of
    ``example.com``, regardless of what ``domains`` contains. Extra
    positional arguments are accepted and ignored so the stub tolerates
    whatever call signature the code under test uses.

    Args:
        domains: Requested domains (ignored; the payload is canned).
        *args: Ignored.
        **kwargs: Only ``use_cache`` is read (defaults to ``True``).

    Returns:
        dict: Collection metadata with a single entry under ``"domains"``
        whose ``"cached"`` flag mirrors ``use_cache``.
    """
    use_cache = kwargs.get("use_cache", True)
    return {
        "collection_id": "test_collection",
        "timestamp": "2025-12-17T12:00:00Z",
        "domains": [
            {
                "url": "example.com",
                "domain": "example.com",
                "text_path": "html/example.com.html",
                "image_path": "images/example.com.png",
                "date_time_collected": "2025-12-17T12:00:00Z",
                "fetch_success": True,
                # Echo the caller's cache preference so the test can
                # simulate a cache hit without touching the network.
                "cached": use_cache,
                "error": None,
            }
        ],
    }
163175
164- # First call (should use cache)
165- start_time = time .time ()
166- self .classifier .classify_by_text (["example.com" ], use_cache = True )
167- cached_time = time .time () - start_time
176+ # Mock classification
def mock_classification(collection_data, *args, **kwargs):
    """Fake classification result used as the ``side_effect`` of the patched classifier.

    The output is canned: a single ``news`` prediction for ``example.com``
    with confidence 0.8. ``collection_data`` and any further arguments are
    accepted but never inspected.

    Args:
        collection_data: Collection payload (ignored).
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        list[dict]: One result record for ``example.com``.
    """
    return [
        {
            "url": "example.com",
            "domain": "example.com",
            "text_path": "html/example.com.html",
            "image_path": "images/example.com.png",
            "date_time_collected": "2025-12-17T12:00:00Z",
            "model_used": "text/shallalist_ml",
            "category": "news",
            "confidence": 0.8,
            "reason": None,
            "error": None,
            "raw_predictions": {"news": 0.8, "other": 0.2},
        }
    ]
168193
169- # Second call (should also use cache)
170- start_time = time .time ()
171- self .classifier .classify_by_text (["example.com" ], use_cache = True )
172- cached_time2 = time .time () - start_time
194+ mock_collect .side_effect = mock_collection
195+ mock_classify .side_effect = mock_classification
196+
197+ # First call (should use cache)
198+ start_time = time .time ()
199+ result1 = self .classifier .classify_by_text (["example.com" ], use_cache = True )
200+ cached_time = time .time () - start_time
201+
202+ # Second call (should also use cache)
203+ start_time = time .time ()
204+ result2 = self .classifier .classify_by_text (["example.com" ], use_cache = True )
205+ cached_time2 = time .time () - start_time
206+
207+ # Both should be fast since we're using cache
208+ self .assertLess (cached_time , 1.0 )
209+ self .assertLess (cached_time2 , 1.0 )
173210
174- # Both should be fast since we're using cache
175- self .assertLess (cached_time , 1.0 )
176- self .assertLess (cached_time2 , 1.0 )
211+ # Verify results are correct
212+ self .assertEqual (len (result1 ), 1 )
213+ self .assertEqual (len (result2 ), 1 )
214+ self .assertEqual (result1 [0 ]["category" ], "news" )
177215
178216 def test_memory_usage_batch_processing (self ):
179217 """Test memory usage doesn't grow excessively in batch processing."""
0 commit comments