@@ -68,27 +68,53 @@ def test_text_processing_performance(self):
6868 self .assertLess (cleaning_time , 1.0 )
6969 self .assertIsInstance (cleaned_text , str )
7070
71- @patch ("piedomains.text.TextClassifier.predict" )
72- def test_batch_processing_scalability (self , mock_predict ):
71+ @patch ("piedomains.data_collector.DataCollector.collect" )
72+ @patch ("piedomains.text.TextClassifier.classify_from_data" )
73+ def test_batch_processing_scalability (self , mock_classify , mock_collect ):
7374 """Test scalability of batch processing."""
7475
75- # Mock prediction results
76- def mock_batch_predict (domains , * args , ** kwargs ):
77- return pd .DataFrame (
78- [
76+ # Mock data collection results
def mock_collection(domains, *args, **kwargs):
    """Build a canned collection payload with one successful record per domain.

    Assigned as the ``side_effect`` of the patched
    ``DataCollector.collect`` in this test, so the classifier receives
    deterministic data.

    Args:
        domains: Iterable of domain strings to fabricate records for.
        *args: Ignored; accepted so any positional call shape works.
        **kwargs: Ignored; accepted so any keyword call shape works.

    Returns:
        A dict with ``collection_id``, ``timestamp`` and a ``domains``
        list holding one record per input domain, in input order.
    """
    # Single source for the timestamp used at both payload and record level.
    collected_at = "2025-12-17T12:00:00Z"
    return {
        "collection_id": "test_collection",
        "timestamp": collected_at,
        "domains": [
            {
                "url": domain,
                "domain": domain,
                "text_path": f"html/{domain}.html",
                "image_path": f"images/{domain}.png",
                "date_time_collected": collected_at,
                "fetch_success": True,
                "cached": False,
                "error": None,
            }
            for domain in domains
        ],
    }
95+
96+ # Mock classification results
def mock_classification(collection_data, *args, **kwargs):
    """Return a canned "news" classification for every collected domain.

    Assigned as the ``side_effect`` of the patched
    ``TextClassifier.classify_from_data`` in this test.

    Args:
        collection_data: Mapping whose ``"domains"`` entry is a list of
            records carrying ``url``, ``domain``, ``text_path``,
            ``image_path`` and ``date_time_collected``.
        *args: Ignored; accepted so any positional call shape works.
        **kwargs: Ignored; accepted so any keyword call shape works.

    Returns:
        A list with one result dict per input record, in input order.
        Empty when ``"domains"`` is missing, ``None`` or empty.
    """
    # ``or []`` also covers an explicit ``"domains": None`` payload.
    domains_data = collection_data.get("domains") or []
    return [
        {
            "url": domain_data["url"],
            "domain": domain_data["domain"],
            "text_path": domain_data["text_path"],
            "image_path": domain_data["image_path"],
            "date_time_collected": domain_data["date_time_collected"],
            "model_used": "text/shallalist_ml",
            "category": "news",
            "confidence": 0.8,
            "reason": None,
            "error": None,
            "raw_predictions": {"news": 0.8, "other": 0.2},
        }
        for domain_data in domains_data
    ]
115+
116+ mock_collect .side_effect = mock_collection
117+ mock_classify .side_effect = mock_classification
92118
93119 # Test different batch sizes
94120 test_sizes = [10 , 50 , 100 ]
@@ -97,9 +123,7 @@ def mock_batch_predict(domains, *args, **kwargs):
97123 domains = [f"test{ i } .com" for i in range (size )]
98124
99125 start_time = time .time ()
100- result = self .classifier .classify_batch (
101- domains , method = "text" , batch_size = 25 , show_progress = False
102- )
126+ result = self .classifier .classify_by_text (domains )
103127 total_time = time .time () - start_time
104128
105129 # Verify results
@@ -129,11 +153,13 @@ def test_cache_effectiveness(self):
129153
130154 # Mock the actual prediction to isolate cache performance
131155 with patch (
132- "piedomains.text.TextClassifier.predict "
156+ "piedomains.text.TextClassifier._predict_text "
133157 ) as mock_predict :
134- mock_predict .return_value = pd .DataFrame (
135- [{"domain" : "example.com" , "text_label" : "news" , "text_prob" : 0.8 }]
136- )
158+ mock_predict .return_value = {
159+ "text_label" : "news" ,
160+ "text_prob" : 0.8 ,
161+ "text_domain_probs" : {"news" : 0.8 , "other" : 0.2 }
162+ }
137163
138164 # First call (should use cache)
139165 start_time = time .time ()
@@ -160,22 +186,54 @@ def test_memory_usage_batch_processing(self):
160186 initial_memory = process .memory_info ().rss / 1024 / 1024 # MB
161187
162188 # Mock to avoid actual model loading
163- with patch (
164- "piedomains.text.TextClassifier.predict"
165- ) as mock_predict :
166- mock_predict .return_value = pd .DataFrame (
167- [
168- {"domain" : f"test{ i } .com" , "text_label" : "news" , "text_prob" : 0.8 }
169- for i in range (50 )
189+ with patch ("piedomains.data_collector.DataCollector.collect" ) as mock_collect , \
190+ patch ("piedomains.text.TextClassifier.classify_from_data" ) as mock_classify :
191+
def mock_collection(domains, *args, **kwargs):
    """Fabricate a collection payload with one successful record per domain.

    Assigned as the ``side_effect`` of the patched
    ``DataCollector.collect`` for the memory-usage test, so each batch is
    served from canned data.

    Args:
        domains: Iterable of domain strings to fabricate records for.
        *args: Ignored; accepted so any positional call shape works.
        **kwargs: Ignored; accepted so any keyword call shape works.

    Returns:
        A dict with ``collection_id``, ``timestamp`` and a ``domains``
        list holding one record per input domain, in input order.
    """
    # One literal feeds both the payload-level and per-record timestamps.
    collected_at = "2025-12-17T12:00:00Z"
    return {
        "collection_id": "test_collection",
        "timestamp": collected_at,
        "domains": [
            {
                "url": domain,
                "domain": domain,
                "text_path": f"html/{domain}.html",
                "image_path": f"images/{domain}.png",
                "date_time_collected": collected_at,
                "fetch_success": True,
                "cached": False,
                "error": None,
            }
            for domain in domains
        ],
    }
210+
def mock_classification(collection_data, *args, **kwargs):
    """Produce a fixed "news" result for each record in a collection payload.

    Assigned as the ``side_effect`` of the patched
    ``TextClassifier.classify_from_data`` for the memory-usage test.

    Args:
        collection_data: Mapping whose ``"domains"`` entry is a list of
            records carrying ``url``, ``domain``, ``text_path``,
            ``image_path`` and ``date_time_collected``.
        *args: Ignored; accepted so any positional call shape works.
        **kwargs: Ignored; accepted so any keyword call shape works.

    Returns:
        A list with one result dict per input record, in input order.
        Empty when ``"domains"`` is missing, ``None`` or empty.
    """
    # Treat a missing or ``None`` "domains" entry as "nothing collected".
    domains_data = collection_data.get("domains") or []
    return [
        {
            "url": domain_data["url"],
            "domain": domain_data["domain"],
            "text_path": domain_data["text_path"],
            "image_path": domain_data["image_path"],
            "date_time_collected": domain_data["date_time_collected"],
            "model_used": "text/shallalist_ml",
            "category": "news",
            "confidence": 0.8,
            "reason": None,
            "error": None,
            "raw_predictions": {"news": 0.8, "other": 0.2},
        }
        for domain_data in domains_data
    ]
171- )
229+
230+ mock_collect .side_effect = mock_collection
231+ mock_classify .side_effect = mock_classification
172232
173233 # Process multiple batches
174234 for batch_num in range (5 ):
175235 domains = [f"batch{ batch_num } _test{ i } .com" for i in range (50 )]
176- self .classifier .classify_batch (
177- domains , method = "text" , batch_size = 10 , show_progress = False
178- )
236+ self .classifier .classify_by_text (domains )
179237
180238 # Force garbage collection
181239 gc .collect ()
0 commit comments