
Commit e969a8f

fix tests
1 parent f542d08

File tree

5 files changed: +182, -68 lines changed

CHANGELOG.md

Lines changed: 49 additions & 0 deletions
@@ -5,6 +5,55 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] - 2025-12-17
+
+### 💥 BREAKING CHANGES
+- **API Modernization**: Complete removal of DataFrame outputs in favor of pure JSON responses
+- **Deprecated Method Removal**: Removed `collect_data()` → Use `collect_content()`
+- **Deprecated Parameter Removal**: Removed `latest_models` → Use `latest`
+- **Deprecated Alias Removal**: Removed `classify_from_data()` → Use `classify_from_collection()`
+- **No Backward Compatibility**: Clean break from v0.5.x for a cleaner, maintainable codebase
+
+### 🎯 API Improvements
+- **Consistent Parameter Naming**: Unified `latest` parameter across all classification methods
+- **JSON-Only Responses**: All methods now return `List[Dict]` with a consistent schema
+- **Separated Workflow**: Clear distinction between data collection and inference phases
+- **Method Naming**: More intuitive method names following verb-noun patterns
+
+### 📋 Comprehensive Documentation
+- **JSON Schema Documentation**: Complete schema definitions for all API responses
+- **Field Documentation**: Detailed field descriptions with data types and examples
+- **Supported Categories**: Full list of 41 Shallalist categories with examples
+- **Updated Examples**: All examples updated to demonstrate the new JSON-only API
+
+### 🧪 Testing & Quality
+- **Updated Test Suite**: All tests migrated to the new API methods and JSON expectations
+- **Linting Compliance**: Full `ruff` compliance with automatic formatting
+- **Example Updates**: All demonstration scripts updated for the new API
+- **Documentation Sync**: README, examples, and docstrings fully synchronized
+
+### 🏗️ Code Quality
+- **Removed Dead Code**: Eliminated all deprecated compatibility shims and warnings
+- **Cleaner Imports**: Removed unused imports and circular dependency risks
+- **Consistent Error Messages**: Standardized error messages and exception handling
+- **Type Consistency**: Better type hints and consistent return types
+
+### 🚀 Migration Guide
+For users upgrading from v0.5.x:
+
+```python
+# OLD (v0.5.x) - No longer supported
+result = classifier.classify(domains)
+df = pd.DataFrame(result)  # DataFrame access
+data = classifier.collect_data(domains)  # Deprecated method
+classifier.classify_from_data(data, latest_models=True)  # Deprecated parameter
+
+# NEW (v0.6.0) - Required changes
+results = classifier.classify(domains)  # Returns List[Dict] directly
+collection = classifier.collect_content(domains)  # New method name
+classifier.classify_from_collection(collection, latest=True)  # New parameter name
+```
+
 ## [0.5.0] - 2025-12-17
 
 ### 🚀 Major Features
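
The shape of the new `List[Dict]` return value is easiest to see in one place. The sketch below assembles a single result entry from the fields mocked by the updated tests in this commit (see `tests/test_performance.py` further down); it is illustrative only, not the library's published schema.

```python
# One entry of the List[Dict] that classification methods now return.
# Field names mirror the mocks in tests/test_performance.py in this commit;
# treat this as an illustrative sketch, not the authoritative schema.
example_result = {
    "url": "example.com",
    "domain": "example.com",
    "text_path": "html/example.com.html",
    "image_path": "images/example.com.png",
    "date_time_collected": "2025-12-17T12:00:00Z",
    "model_used": "text/shallalist_ml",
    "category": "news",
    "confidence": 0.8,
    "reason": None,
    "error": None,
    "raw_predictions": {"news": 0.8, "other": 0.2},
}
```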

piedomains/__init__.py

Lines changed: 24 additions & 23 deletions
@@ -8,36 +8,37 @@
 
 
 def __getattr__(name):
     """Lazy import handler for piedomains modules."""
-    if name == "DomainClassifier":
-        from .api import DomainClassifier
+    match name:
+        case "DomainClassifier":
+            from .api import DomainClassifier
 
-        return DomainClassifier
-    elif name == "classify_domains":
-        from .api import classify_domains
+            return DomainClassifier
+        case "classify_domains":
+            from .api import classify_domains
 
-        return classify_domains
-    elif name == "DataCollector":
-        from .data_collector import DataCollector
+            return classify_domains
+        case "DataCollector":
+            from .data_collector import DataCollector
 
-        return DataCollector
-    elif name == "TextClassifier":
-        from .text import TextClassifier
+            return DataCollector
+        case "TextClassifier":
+            from .text import TextClassifier
 
-        return TextClassifier
-    elif name == "ImageClassifier":
-        from .image import ImageClassifier
+            return TextClassifier
+        case "ImageClassifier":
+            from .image import ImageClassifier
 
-        return ImageClassifier
-    elif name == "LLMClassifier":
-        from .llm_classifier import LLMClassifier
+            return ImageClassifier
+        case "LLMClassifier":
+            from .llm_classifier import LLMClassifier
 
-        return LLMClassifier
-    elif name == "LLMConfig":
-        from .llm.config import LLMConfig
+            return LLMClassifier
+        case "LLMConfig":
+            from .llm.config import LLMConfig
 
-        return LLMConfig
-    else:
-        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+            return LLMConfig
+        case _:
+            raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 
 __all__ = [
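
The `match`-based `__getattr__` keeps the package's lazy-import behavior (PEP 562): a submodule is imported only when its attribute is first accessed, and unknown names fall through to `case _`. A minimal usage sketch, assuming the package is importable:

```python
import piedomains

# First access triggers `from .api import DomainClassifier` inside __getattr__.
classifier_cls = piedomains.DomainClassifier

# Unknown names reach `case _` and raise AttributeError.
try:
    getattr(piedomains, "NotARealAttribute")
except AttributeError as exc:
    print(exc)  # module 'piedomains' has no attribute 'NotARealAttribute'
```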

tests/test_integration_critical.py

Lines changed: 3 additions & 3 deletions
@@ -177,7 +177,7 @@ def classify_worker(domains):
             result_type, result_data = results_queue.get()
             if result_type == "success":
                 success_count += 1
-                self.assertIsInstance(result_data, pd.DataFrame)
+                self.assertIsInstance(result_data, list)
 
         self.assertEqual(success_count, 5, "All concurrent operations should succeed")

@@ -200,8 +200,8 @@ def test_input_sanitization_security(self):
             try:
                 # Should either handle gracefully or fail safely
                 result = classifier.classify_by_text([malicious_input])
-                # If it succeeds, should return valid DataFrame
-                self.assertIsInstance(result, pd.DataFrame)
+                # If it succeeds, should return valid list
+                self.assertIsInstance(result, list)
             except Exception as e:
                 # If it fails, should have proper error message
                 # Accept validation errors, Playwright browser errors, or general exceptions

tests/test_llm_classifier.py

Lines changed: 15 additions & 9 deletions
@@ -234,22 +234,28 @@ def test_classify_by_llm_mock(self, mock_litellm):
             provider="openai", model="gpt-4o", api_key="test-key"
         )
 
-        # Mock text classifier to avoid actual network calls
-        with patch(
-            "piedomains.text.TextClassifier.predict"
-        ) as mock_text:
-            mock_text.return_value = pd.DataFrame(
-                [
+        # Mock data collection and classification to avoid actual network calls
+        with patch("piedomains.data_collector.DataCollector.collect") as mock_collect:
+            mock_collect.return_value = {
+                "collection_id": "test_collection",
+                "timestamp": "2025-12-17T12:00:00Z",
+                "domains": [
                     {
+                        "url": "example.com",
                         "domain": "example.com",
-                        "extracted_text": "This is news content about current events.",
+                        "text_path": "html/example.com.html",
+                        "image_path": "images/example.com.png",
+                        "date_time_collected": "2025-12-17T12:00:00Z",
+                        "fetch_success": True,
+                        "cached": False,
+                        "error": None
                     }
                 ]
-            )
+            }
 
             result = self.classifier.classify_by_llm(["example.com"])
 
-        self.assertIsInstance(result, pd.DataFrame)
+        self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
 
         # Check that litellm.completion was called
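
The mocked `DataCollector.collect` payload above is the intermediate "collection" object that the separated workflow hands from the collection phase to the inference phase. A rough end-to-end sketch using the method names from the migration guide; the constructor and the exact return fields are assumptions based on this commit's CHANGELOG and test mocks:

```python
from piedomains import DomainClassifier

classifier = DomainClassifier()

# Phase 1: fetch and cache raw content (HTML, screenshots) per domain.
# The returned dict resembles the mocked payload above:
# {"collection_id": ..., "timestamp": ..., "domains": [{"url": ..., "text_path": ..., ...}]}
collection = classifier.collect_content(["example.com"])

# Phase 2: run inference over the stored collection.
results = classifier.classify_from_collection(collection, latest=True)
for entry in results:  # List[Dict], one entry per domain
    print(entry["domain"], entry.get("category"), entry.get("confidence"))
```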

tests/test_performance.py

Lines changed: 91 additions & 33 deletions
@@ -68,27 +68,53 @@ def test_text_processing_performance(self):
         self.assertLess(cleaning_time, 1.0)
         self.assertIsInstance(cleaned_text, str)
 
-    @patch("piedomains.text.TextClassifier.predict")
-    def test_batch_processing_scalability(self, mock_predict):
+    @patch("piedomains.data_collector.DataCollector.collect")
+    @patch("piedomains.text.TextClassifier.classify_from_data")
+    def test_batch_processing_scalability(self, mock_classify, mock_collect):
         """Test scalability of batch processing."""
 
-        # Mock prediction results
-        def mock_batch_predict(domains, *args, **kwargs):
-            return pd.DataFrame(
-                [
+        # Mock data collection results
+        def mock_collection(domains, *args, **kwargs):
+            return {
+                "collection_id": "test_collection",
+                "timestamp": "2025-12-17T12:00:00Z",
+                "domains": [
                     {
+                        "url": domain,
                         "domain": domain,
-                        "text_label": "news",
-                        "text_prob": 0.8,
-                        "error": None,
+                        "text_path": f"html/{domain}.html",
+                        "image_path": f"images/{domain}.png",
+                        "date_time_collected": "2025-12-17T12:00:00Z",
+                        "fetch_success": True,
+                        "cached": False,
+                        "error": None
                     }
-                    for domain in [
-                        self.classifier._parse_domain_name(d) for d in domains
-                    ]
+                    for domain in domains
                 ]
-            )
-
-        mock_predict.side_effect = mock_batch_predict
+            }
+
+        # Mock classification results
+        def mock_classification(collection_data, *args, **kwargs):
+            domains_data = collection_data.get("domains", [])
+            return [
+                {
+                    "url": domain_data["url"],
+                    "domain": domain_data["domain"],
+                    "text_path": domain_data["text_path"],
+                    "image_path": domain_data["image_path"],
+                    "date_time_collected": domain_data["date_time_collected"],
+                    "model_used": "text/shallalist_ml",
+                    "category": "news",
+                    "confidence": 0.8,
+                    "reason": None,
+                    "error": None,
+                    "raw_predictions": {"news": 0.8, "other": 0.2}
+                }
+                for domain_data in domains_data
+            ]
+
+        mock_collect.side_effect = mock_collection
+        mock_classify.side_effect = mock_classification
 
         # Test different batch sizes
         test_sizes = [10, 50, 100]

@@ -97,9 +123,7 @@ def mock_batch_predict(domains, *args, **kwargs):
             domains = [f"test{i}.com" for i in range(size)]
 
             start_time = time.time()
-            result = self.classifier.classify_batch(
-                domains, method="text", batch_size=25, show_progress=False
-            )
+            result = self.classifier.classify_by_text(domains)
             total_time = time.time() - start_time
 
             # Verify results

@@ -129,11 +153,13 @@ def test_cache_effectiveness(self):
 
         # Mock the actual prediction to isolate cache performance
         with patch(
-            "piedomains.text.TextClassifier.predict"
+            "piedomains.text.TextClassifier._predict_text"
        ) as mock_predict:
-            mock_predict.return_value = pd.DataFrame(
-                [{"domain": "example.com", "text_label": "news", "text_prob": 0.8}]
-            )
+            mock_predict.return_value = {
+                "text_label": "news",
+                "text_prob": 0.8,
+                "text_domain_probs": {"news": 0.8, "other": 0.2}
+            }
 
             # First call (should use cache)
             start_time = time.time()

@@ -160,22 +186,54 @@ def test_memory_usage_batch_processing(self):
         initial_memory = process.memory_info().rss / 1024 / 1024  # MB
 
         # Mock to avoid actual model loading
-        with patch(
-            "piedomains.text.TextClassifier.predict"
-        ) as mock_predict:
-            mock_predict.return_value = pd.DataFrame(
-                [
-                    {"domain": f"test{i}.com", "text_label": "news", "text_prob": 0.8}
-                    for i in range(50)
+        with patch("piedomains.data_collector.DataCollector.collect") as mock_collect, \
+             patch("piedomains.text.TextClassifier.classify_from_data") as mock_classify:
+
+            def mock_collection(domains, *args, **kwargs):
+                return {
+                    "collection_id": "test_collection",
+                    "timestamp": "2025-12-17T12:00:00Z",
+                    "domains": [
+                        {
+                            "url": domain,
+                            "domain": domain,
+                            "text_path": f"html/{domain}.html",
+                            "image_path": f"images/{domain}.png",
+                            "date_time_collected": "2025-12-17T12:00:00Z",
+                            "fetch_success": True,
+                            "cached": False,
+                            "error": None
+                        }
+                        for domain in domains
+                    ]
+                }
+
+            def mock_classification(collection_data, *args, **kwargs):
+                domains_data = collection_data.get("domains", [])
+                return [
+                    {
+                        "url": domain_data["url"],
+                        "domain": domain_data["domain"],
+                        "text_path": domain_data["text_path"],
+                        "image_path": domain_data["image_path"],
+                        "date_time_collected": domain_data["date_time_collected"],
+                        "model_used": "text/shallalist_ml",
+                        "category": "news",
+                        "confidence": 0.8,
+                        "reason": None,
+                        "error": None,
+                        "raw_predictions": {"news": 0.8, "other": 0.2}
+                    }
+                    for domain_data in domains_data
                 ]
-            )
+
+            mock_collect.side_effect = mock_collection
+            mock_classify.side_effect = mock_classification
 
             # Process multiple batches
             for batch_num in range(5):
                 domains = [f"batch{batch_num}_test{i}.com" for i in range(50)]
-                self.classifier.classify_batch(
-                    domains, method="text", batch_size=10, show_progress=False
-                )
+                self.classifier.classify_by_text(domains)
 
             # Force garbage collection
             gc.collect()
