
Commit e969a8f

fix tests
1 parent f542d08

File tree

5 files changed: +182, -68 lines changed

CHANGELOG.md

Lines changed: 49 additions & 0 deletions
@@ -5,6 +5,55 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] - 2025-12-17
+
+### 💥 BREAKING CHANGES
+- **API Modernization**: Complete removal of DataFrame outputs in favor of pure JSON responses
+- **Deprecated Method Removal**: Removed `collect_data()` → Use `collect_content()`
+- **Deprecated Parameter Removal**: Removed `latest_models` → Use `latest`
+- **Deprecated Alias Removal**: Removed `classify_from_data()` → Use `classify_from_collection()`
+- **No Backward Compatibility**: Clean break from v0.5.x for a cleaner, maintainable codebase
+
+### 🎯 API Improvements
+- **Consistent Parameter Naming**: Unified `latest` parameter across all classification methods
+- **JSON-Only Responses**: All methods now return `List[Dict]` with a consistent schema
+- **Separated Workflow**: Clear distinction between data collection and inference phases
+- **Method Naming**: More intuitive method names following verb-noun patterns
+
+### 📋 Comprehensive Documentation
+- **JSON Schema Documentation**: Complete schema definitions for all API responses
+- **Field Documentation**: Detailed field descriptions with data types and examples
+- **Supported Categories**: Full list of 41 Shallalist categories with examples
+- **Updated Examples**: All examples updated to demonstrate the new JSON-only API
+
+### 🧪 Testing & Quality
+- **Updated Test Suite**: All tests migrated to the new API methods and JSON expectations
+- **Linting Compliance**: Full `ruff` compliance with automatic formatting
+- **Example Updates**: All demonstration scripts updated for the new API
+- **Documentation Sync**: README, examples, and docstrings fully synchronized
+
+### 🏗️ Code Quality
+- **Removed Dead Code**: Eliminated all deprecated compatibility shims and warnings
+- **Cleaner Imports**: Removed unused imports and circular dependency risks
+- **Consistent Error Messages**: Standardized error messages and exception handling
+- **Type Consistency**: Better type hints and consistent return types
+
+### 🚀 Migration Guide
+For users upgrading from v0.5.x:
+
+```python
+# OLD (v0.5.x) - No longer supported
+result = classifier.classify(domains)
+df = pd.DataFrame(result)  # DataFrame access
+data = classifier.collect_data(domains)  # Deprecated method
+classifier.classify_from_data(data, latest_models=True)  # Deprecated parameter
+
+# NEW (v0.6.0) - Required changes
+results = classifier.classify(domains)  # Returns List[Dict] directly
+collection = classifier.collect_content(domains)  # New method name
+classifier.classify_from_collection(collection, latest=True)  # New parameter name
+```
+
 ## [0.5.0] - 2025-12-17
 
 ### 🚀 Major Features
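
The shape of the new `List[Dict]` return value is easiest to see in one place. The sketch below assembles a single result entry from the fields mocked by the updated tests in this commit (see `tests/test_performance.py` further down); it is illustrative only, not the library's published schema.

```python
# One entry of the List[Dict] that classification methods now return.
# Field names mirror the mocks in tests/test_performance.py in this commit;
# treat this as an illustrative sketch, not the authoritative schema.
example_result = {
    "url": "example.com",
    "domain": "example.com",
    "text_path": "html/example.com.html",
    "image_path": "images/example.com.png",
    "date_time_collected": "2025-12-17T12:00:00Z",
    "model_used": "text/shallalist_ml",
    "category": "news",
    "confidence": 0.8,
    "reason": None,
    "error": None,
    "raw_predictions": {"news": 0.8, "other": 0.2},
}
```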

piedomains/__init__.py

Lines changed: 24 additions & 23 deletions
@@ -8,36 +8,37 @@
 
 
 def __getattr__(name):
     """Lazy import handler for piedomains modules."""
-    if name == "DomainClassifier":
-        from .api import DomainClassifier
+    match name:
+        case "DomainClassifier":
+            from .api import DomainClassifier
 
-        return DomainClassifier
-    elif name == "classify_domains":
-        from .api import classify_domains
+            return DomainClassifier
+        case "classify_domains":
+            from .api import classify_domains
 
-        return classify_domains
-    elif name == "DataCollector":
-        from .data_collector import DataCollector
+            return classify_domains
+        case "DataCollector":
+            from .data_collector import DataCollector
 
-        return DataCollector
-    elif name == "TextClassifier":
-        from .text import TextClassifier
+            return DataCollector
+        case "TextClassifier":
+            from .text import TextClassifier
 
-        return TextClassifier
-    elif name == "ImageClassifier":
-        from .image import ImageClassifier
+            return TextClassifier
+        case "ImageClassifier":
+            from .image import ImageClassifier
 
-        return ImageClassifier
-    elif name == "LLMClassifier":
-        from .llm_classifier import LLMClassifier
+            return ImageClassifier
+        case "LLMClassifier":
+            from .llm_classifier import LLMClassifier
 
-        return LLMClassifier
-    elif name == "LLMConfig":
-        from .llm.config import LLMConfig
+            return LLMClassifier
+        case "LLMConfig":
+            from .llm.config import LLMConfig
 
-        return LLMConfig
-    else:
-        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+            return LLMConfig
+        case _:
+            raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
 
 
 __all__ = [
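
The `match`-based `__getattr__` keeps the package's lazy-import behavior (PEP 562): a submodule is imported only when its attribute is first accessed, and unknown names fall through to `case _`. A minimal usage sketch, assuming the package is importable:

```python
import piedomains

# First access triggers `from .api import DomainClassifier` inside __getattr__.
classifier_cls = piedomains.DomainClassifier

# Unknown names reach `case _` and raise AttributeError.
try:
    getattr(piedomains, "NotARealAttribute")
except AttributeError as exc:
    print(exc)  # module 'piedomains' has no attribute 'NotARealAttribute'
```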

tests/test_integration_critical.py

Lines changed: 3 additions & 3 deletions
@@ -177,7 +177,7 @@ def classify_worker(domains):
             result_type, result_data = results_queue.get()
             if result_type == "success":
                 success_count += 1
-                self.assertIsInstance(result_data, pd.DataFrame)
+                self.assertIsInstance(result_data, list)
 
         self.assertEqual(success_count, 5, "All concurrent operations should succeed")

@@ -200,8 +200,8 @@ def test_input_sanitization_security(self):
             try:
                 # Should either handle gracefully or fail safely
                 result = classifier.classify_by_text([malicious_input])
-                # If it succeeds, should return valid DataFrame
-                self.assertIsInstance(result, pd.DataFrame)
+                # If it succeeds, should return valid list
+                self.assertIsInstance(result, list)
             except Exception as e:
                 # If it fails, should have proper error message
                 # Accept validation errors, Playwright browser errors, or general exceptions

tests/test_llm_classifier.py

Lines changed: 15 additions & 9 deletions
@@ -234,22 +234,28 @@ def test_classify_by_llm_mock(self, mock_litellm):
             provider="openai", model="gpt-4o", api_key="test-key"
         )
 
-        # Mock text classifier to avoid actual network calls
-        with patch(
-            "piedomains.text.TextClassifier.predict"
-        ) as mock_text:
-            mock_text.return_value = pd.DataFrame(
-                [
+        # Mock data collection and classification to avoid actual network calls
+        with patch("piedomains.data_collector.DataCollector.collect") as mock_collect:
+            mock_collect.return_value = {
+                "collection_id": "test_collection",
+                "timestamp": "2025-12-17T12:00:00Z",
+                "domains": [
                     {
+                        "url": "example.com",
                         "domain": "example.com",
-                        "extracted_text": "This is news content about current events.",
+                        "text_path": "html/example.com.html",
+                        "image_path": "images/example.com.png",
+                        "date_time_collected": "2025-12-17T12:00:00Z",
+                        "fetch_success": True,
+                        "cached": False,
+                        "error": None
                     }
                 ]
-            )
+            }
 
             result = self.classifier.classify_by_llm(["example.com"])
 
-        self.assertIsInstance(result, pd.DataFrame)
+        self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
 
         # Check that litellm.completion was called
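
The mocked `DataCollector.collect` payload above is the intermediate "collection" object that the separated workflow hands from the collection phase to the inference phase. A rough end-to-end sketch using the method names from the migration guide; the constructor and the exact return fields are assumptions based on this commit's CHANGELOG and test mocks:

```python
from piedomains import DomainClassifier

classifier = DomainClassifier()

# Phase 1: fetch and cache raw content (HTML, screenshots) per domain.
# The returned dict resembles the mocked payload above:
# {"collection_id": ..., "timestamp": ..., "domains": [{"url": ..., "text_path": ..., ...}]}
collection = classifier.collect_content(["example.com"])

# Phase 2: run inference over the stored collection.
results = classifier.classify_from_collection(collection, latest=True)
for entry in results:  # List[Dict], one entry per domain
    print(entry["domain"], entry.get("category"), entry.get("confidence"))
```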

tests/test_performance.py

Lines changed: 91 additions & 33 deletions
@@ -68,27 +68,53 @@ def test_text_processing_performance(self):
         self.assertLess(cleaning_time, 1.0)
         self.assertIsInstance(cleaned_text, str)
 
-    @patch("piedomains.text.TextClassifier.predict")
-    def test_batch_processing_scalability(self, mock_predict):
+    @patch("piedomains.data_collector.DataCollector.collect")
+    @patch("piedomains.text.TextClassifier.classify_from_data")
+    def test_batch_processing_scalability(self, mock_classify, mock_collect):
         """Test scalability of batch processing."""
 
-        # Mock prediction results
-        def mock_batch_predict(domains, *args, **kwargs):
-            return pd.DataFrame(
-                [
+        # Mock data collection results
+        def mock_collection(domains, *args, **kwargs):
+            return {
+                "collection_id": "test_collection",
+                "timestamp": "2025-12-17T12:00:00Z",
+                "domains": [
                     {
+                        "url": domain,
                         "domain": domain,
-                        "text_label": "news",
-                        "text_prob": 0.8,
-                        "error": None,
+                        "text_path": f"html/{domain}.html",
+                        "image_path": f"images/{domain}.png",
+                        "date_time_collected": "2025-12-17T12:00:00Z",
+                        "fetch_success": True,
+                        "cached": False,
+                        "error": None
                     }
-                    for domain in [
-                        self.classifier._parse_domain_name(d) for d in domains
-                    ]
+                    for domain in domains
                 ]
-            )
-
-        mock_predict.side_effect = mock_batch_predict
+            }
+
+        # Mock classification results
+        def mock_classification(collection_data, *args, **kwargs):
+            domains_data = collection_data.get("domains", [])
+            return [
+                {
+                    "url": domain_data["url"],
+                    "domain": domain_data["domain"],
+                    "text_path": domain_data["text_path"],
+                    "image_path": domain_data["image_path"],
+                    "date_time_collected": domain_data["date_time_collected"],
+                    "model_used": "text/shallalist_ml",
+                    "category": "news",
+                    "confidence": 0.8,
+                    "reason": None,
+                    "error": None,
+                    "raw_predictions": {"news": 0.8, "other": 0.2}
+                }
+                for domain_data in domains_data
+            ]
+
+        mock_collect.side_effect = mock_collection
+        mock_classify.side_effect = mock_classification
 
         # Test different batch sizes
         test_sizes = [10, 50, 100]

@@ -97,9 +123,7 @@ def mock_batch_predict(domains, *args, **kwargs):
             domains = [f"test{i}.com" for i in range(size)]
 
             start_time = time.time()
-            result = self.classifier.classify_batch(
-                domains, method="text", batch_size=25, show_progress=False
-            )
+            result = self.classifier.classify_by_text(domains)
             total_time = time.time() - start_time
 
             # Verify results

@@ -129,11 +153,13 @@ def test_cache_effectiveness(self):
 
         # Mock the actual prediction to isolate cache performance
         with patch(
-            "piedomains.text.TextClassifier.predict"
+            "piedomains.text.TextClassifier._predict_text"
        ) as mock_predict:
-            mock_predict.return_value = pd.DataFrame(
-                [{"domain": "example.com", "text_label": "news", "text_prob": 0.8}]
-            )
+            mock_predict.return_value = {
+                "text_label": "news",
+                "text_prob": 0.8,
+                "text_domain_probs": {"news": 0.8, "other": 0.2}
+            }
 
             # First call (should use cache)
             start_time = time.time()

@@ -160,22 +186,54 @@ def test_memory_usage_batch_processing(self):
         initial_memory = process.memory_info().rss / 1024 / 1024  # MB
 
         # Mock to avoid actual model loading
-        with patch(
-            "piedomains.text.TextClassifier.predict"
-        ) as mock_predict:
-            mock_predict.return_value = pd.DataFrame(
-                [
-                    {"domain": f"test{i}.com", "text_label": "news", "text_prob": 0.8}
-                    for i in range(50)
+        with patch("piedomains.data_collector.DataCollector.collect") as mock_collect, \
+             patch("piedomains.text.TextClassifier.classify_from_data") as mock_classify:
+
+            def mock_collection(domains, *args, **kwargs):
+                return {
+                    "collection_id": "test_collection",
+                    "timestamp": "2025-12-17T12:00:00Z",
+                    "domains": [
+                        {
+                            "url": domain,
+                            "domain": domain,
+                            "text_path": f"html/{domain}.html",
+                            "image_path": f"images/{domain}.png",
+                            "date_time_collected": "2025-12-17T12:00:00Z",
+                            "fetch_success": True,
+                            "cached": False,
+                            "error": None
+                        }
+                        for domain in domains
+                    ]
+                }
+
+            def mock_classification(collection_data, *args, **kwargs):
+                domains_data = collection_data.get("domains", [])
+                return [
+                    {
+                        "url": domain_data["url"],
+                        "domain": domain_data["domain"],
+                        "text_path": domain_data["text_path"],
+                        "image_path": domain_data["image_path"],
+                        "date_time_collected": domain_data["date_time_collected"],
+                        "model_used": "text/shallalist_ml",
+                        "category": "news",
+                        "confidence": 0.8,
+                        "reason": None,
+                        "error": None,
+                        "raw_predictions": {"news": 0.8, "other": 0.2}
+                    }
+                    for domain_data in domains_data
                 ]
-            )
+
+            mock_collect.side_effect = mock_collection
+            mock_classify.side_effect = mock_classification
 
             # Process multiple batches
             for batch_num in range(5):
                 domains = [f"batch{batch_num}_test{i}.com" for i in range(50)]
-                self.classifier.classify_batch(
-                    domains, method="text", batch_size=10, show_progress=False
-                )
+                self.classifier.classify_by_text(domains)
 
             # Force garbage collection
             gc.collect()
