
Commit 1ca00fd: simplify the API

Parent: 29201d4

25 files changed (+3038, -1330 lines)

README.md

Lines changed: 21 additions & 20 deletions
````diff
@@ -5,12 +5,13 @@
 [![Downloads](https://pepy.tech/badge/piedomains)](https://pepy.tech/project/piedomains)
 [![Documentation](https://img.shields.io/badge/docs-github.io-blue)](https://themains.github.io/piedomains/)
 
-## 🚀 What's New in v0.5.0
+## 🚀 What's New in v0.6.0
 
-- **Playwright Migration**: Complete transition from Selenium to modern Playwright for faster, more reliable web content extraction
-- **12.8x Performance Boost**: Optimized parallel processing (13.2s → 1.0s per domain)
-- **Enhanced Docker Security**: Production-ready containerization with security sandboxing and resource limits
-- **Unified Content Pipeline**: Text and image extraction now use the same Playwright engine for consistency
+- **Streamlined JSON API**: Simple, consistent JSON responses for easy integration with any workflow
+- **Enhanced LLM Support**: Built-in support for OpenAI, Anthropic, and Google AI models with custom category definitions
+- **Advanced Archive Analysis**: Analyze historical website versions from archive.org with intelligent rate limiting
+- **Separated Data Collection**: Collect website content once, run multiple classification approaches (ML + LLM + ensemble)
+- **41 Content Categories**: Comprehensive classification including news, shopping, social media, education, finance, and more
 
 ## Installation
````
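The "(ML + LLM + ensemble)" phrasing in the new bullet list suggests combining per-method predictions into one answer. A toy sketch of such a combination, assuming only the `domain`/`category`/`confidence` record shape the updated README documents; the max-confidence rule and the `method` key are our illustration, not the library's own ensemble:

```python
def ensemble(text_result: dict, image_result: dict) -> dict:
    """Pick the higher-confidence of two per-method predictions.

    Illustrative combination rule; not piedomains' built-in ensemble.
    """
    return max((text_result, image_result), key=lambda r: r["confidence"])

# Hypothetical per-method outputs for the same domain (values are made up).
text_r = {"domain": "cnn.com", "category": "news", "confidence": 0.876, "method": "text"}
image_r = {"domain": "cnn.com", "category": "news", "confidence": 0.640, "method": "images"}

best = ensemble(text_r, image_r)
print(f"{best['domain']}: {best['category']} via {best['method']}")
```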

````diff
@@ -23,17 +24,18 @@ Requires Python 3.11+
 ## Basic Usage
 
 ```python
-from piedomains import DomainClassifier
+from piedomains import DomainClassifier, DataCollector
 
 classifier = DomainClassifier()
-result = classifier.classify(["cnn.com", "amazon.com", "wikipedia.org"])
-print(result[['domain', 'pred_label', 'pred_prob']])
+results = classifier.classify(["cnn.com", "amazon.com", "wikipedia.org"])
+
+for result in results:
+    print(f"{result['domain']}: {result['category']} ({result['confidence']:.3f})")
 
 # Output:
-#           domain pred_label  pred_prob
-# 0        cnn.com       news   0.876543
-# 1     amazon.com   shopping   0.923456
-# 2  wikipedia.org  education   0.891234
+# cnn.com: news (0.876)
+# amazon.com: shopping (0.923)
+# wikipedia.org: education (0.891)
 ```
 
 ## Classification Methods
````
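Because the new API returns plain JSON-style records, results can be post-processed with nothing but the standard library. A minimal sketch, assuming only the `domain`/`category`/`confidence` keys shown in the README diff above; the sample values are illustrative, not real model output:

```python
import json

# Sample records shaped like the README output above; values are made up.
results = [
    {"domain": "cnn.com", "category": "news", "confidence": 0.876},
    {"domain": "amazon.com", "category": "shopping", "confidence": 0.923},
    {"domain": "wikipedia.org", "category": "education", "confidence": 0.891},
]

# Keep only high-confidence predictions and serialize them as JSON lines.
confident = [r for r in results if r["confidence"] >= 0.9]
lines = [json.dumps(r, sort_keys=True) for r in confident]
for line in lines:
    print(line)
```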
````diff
@@ -48,8 +50,10 @@ result = classifier.classify_by_text(["news.google.com"])
 
 # Image-only classification
 result = classifier.classify_by_images(["instagram.com"])
 
-# Batch processing
-results = classifier.classify_batch(domains, method="text", batch_size=50)
+# Batch processing with separated workflow
+collector = DataCollector()
+collection = collector.collect_batch(domains, batch_size=50)
+results = classifier.classify_from_collection(collection, method="text")
 ```
 
 ## Historical Analysis
````
````diff
@@ -60,12 +64,9 @@ old_result = classifier.classify(["facebook.com"], archive_date="20100101")
 
 # Batch processing with archive.org (respects rate limits)
 domains = ["google.com", "wikipedia.org", "cnn.com"]
-historical_results = classifier.classify_batch(
-    domains,
-    archive_date="20050101",
-    method="text",
-    batch_size=10  # Archive.org uses conservative defaults
-)
+collector = DataCollector(archive_date="20050101")
+collection = collector.collect_batch(domains, batch_size=10)  # Archive.org uses conservative defaults
+historical_results = classifier.classify_from_collection(collection, method="text")
 ```
 
 ### Archive.org Rate Limits & Best Practices
````
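The `archive_date` values above are compact `YYYYMMDD` strings. One way to catch malformed dates before any network call is a small stdlib check; this is a sketch, the helper name is ours, and nothing here calls piedomains:

```python
from datetime import datetime


def is_valid_archive_date(value: str) -> bool:
    """Return True if value is a real calendar date in YYYYMMDD form."""
    try:
        datetime.strptime(value, "%Y%m%d")
        return True
    except ValueError:
        return False


print(is_valid_archive_date("20050101"))    # well-formed snapshot date
print(is_valid_archive_date("2005-01-01"))  # dashes are not accepted
```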
Lines changed: 13 additions & 29 deletions
````diff
@@ -1,45 +1,29 @@
-piedomains.classifiers package
-==============================
+piedomains classifiers
+======================
 
-Submodules
-----------
+Domain Classification Modules
+-----------------------------
 
-piedomains.classifiers.combined\_classifier module
---------------------------------------------------
+piedomains.text module
+----------------------
 
-.. automodule:: piedomains.classifiers.combined_classifier
+.. automodule:: piedomains.text
    :members:
    :show-inheritance:
    :undoc-members:
 
-piedomains.classifiers.image\_classifier module
------------------------------------------------
+piedomains.image module
+-----------------------
 
-.. automodule:: piedomains.classifiers.image_classifier
+.. automodule:: piedomains.image
    :members:
    :show-inheritance:
    :undoc-members:
 
-piedomains.classifiers.llm\_classifier module
---------------------------------------------- 
+piedomains.llm module
+---------------------
 
-.. automodule:: piedomains.classifiers.llm_classifier
-   :members:
-   :show-inheritance:
-   :undoc-members:
-
-piedomains.classifiers.text\_classifier module
-----------------------------------------------
-
-.. automodule:: piedomains.classifiers.text_classifier
-   :members:
-   :show-inheritance:
-   :undoc-members:
-
-Module contents
----------------
-
-.. automodule:: piedomains.classifiers
+.. automodule:: piedomains.llm
    :members:
    :show-inheritance:
    :undoc-members:
````

examples/README.md

Lines changed: 46 additions & 15 deletions
````diff
@@ -1,25 +1,56 @@
-# Examples
+# Piedomains Examples
 
-This directory contains example scripts demonstrating piedomains functionality:
+This directory contains examples demonstrating the piedomains library's capabilities.
 
-## Traditional ML Classification
-- `new_api_demo.py`: Modern DomainClassifier API demonstration
-- `archive_demo.py`: Basic archive.org classification demo
-- `archive_functionality_demo.py`: Archive functionality testing
-- `final_archive_demo.py`: Final archive integration test
-- `jupyter_demo.py`: Jupyter notebook demonstration
+## 🚀 Quick Start - New JSON API
 
-## LLM-Powered Classification
-- `llm_demo.py`: LLM-based classification with OpenAI, Anthropic, Google models
+The piedomains library now features a clean JSON-only API that separates data collection from inference:
 
-## Running Examples
+```python
+from piedomains import DomainClassifier
 
-```bash
-cd examples
-python new_api_demo.py
-python llm_demo.py  # Requires API key
+# Simple classification - returns JSON instead of DataFrames
+classifier = DomainClassifier()
+results = classifier.classify(["cnn.com", "github.com"])
+
+for result in results:
+    print(f"{result['domain']}: {result['category']} ({result['confidence']:.3f})")
+    print(f"   Model: {result['model_used']}")
+    print(f"   Data: {result['text_path']}, {result['image_path']}")
+```
+
+## 🔧 Separated Workflow
+
+For advanced use cases, separate data collection from inference:
+
+```python
+from piedomains import DataCollector, DomainClassifier
+
+# Step 1: Collect data (can be reused)
+collector = DataCollector()
+data = collector.collect(["example.com"])
+
+# Step 2: Run inference (try different models on same data)
+classifier = DomainClassifier()
+text_results = classifier.classify_from_collection(data, method="text")
+image_results = classifier.classify_from_collection(data, method="images")
 ```
 
+## 📁 Available Examples
+
+### Core Functionality
+- `json_only_demo.py` - **NEW**: JSON-only API demonstration
+- `separated_workflow_demo.py` - **NEW**: Data collection & inference separation
+- `new_api_demo.py` - Traditional API (now returns JSON)
+- `jupyter_demo.py` - Jupyter notebook examples
+
+### Archive & Historical Analysis
+- `final_archive_demo.py` - Archive.org integration
+- Historical snapshots with `archive_date="20200101"`
+
+### LLM-Powered Classification
+- `llm_demo.py` - LLM-based classification with multiple providers
+
 ## 🔒 Security & Sandbox Examples
 
 **⚠️ Important**: For unknown/suspicious domains, use the sandbox examples to protect your system:
````

examples/json_only_demo.py

Lines changed: 152 additions & 0 deletions
New file:

```python
#!/usr/bin/env python3
"""
Demo of the new JSON-only classification architecture.

This example shows the clean JSON API that replaces DataFrames.
"""

import sys
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    from piedomains import DataCollector, DomainClassifier
except ImportError as e:
    print(f"Import error: {e}")
    print("This demo requires the piedomains package to be installed.")
    sys.exit(1)


def demo_json_api():
    """Demonstrate the new JSON-only API."""
    print("🚀 Piedomains JSON-Only Architecture Demo")
    print("=" * 50)

    # Test domains
    domains = ["example.com", "httpbin.org"]

    # Create classifier
    classifier = DomainClassifier(cache_dir="demo_cache")

    print(f"\n🔤 Testing JSON API with {len(domains)} domains...")
    print("Domains:", domains)

    try:
        # Test the new JSON-only classify method
        results = classifier.classify(domains)

        print("\n✅ Classification complete!")
        print(f"Result type: {type(results)}")
        print(f"Number of results: {len(results)}")

        print("\n📊 Results:")
        for i, result in enumerate(results):
            print(f"\n{i+1}. Domain: {result.get('domain', 'unknown')}")
            print(f"   URL: {result.get('url', 'unknown')}")
            print(f"   Category: {result.get('category', 'unknown')}")
            print(f"   Confidence: {result.get('confidence', 0.0):.3f}")
            print(f"   Model Used: {result.get('model_used', 'unknown')}")
            print(
                f"   Data Collection Time: {result.get('date_time_collected', 'unknown')}"
            )
            print(f"   Text Path: {result.get('text_path', 'none')}")
            print(f"   Image Path: {result.get('image_path', 'none')}")

            if result.get("error"):
                print(f"   ❌ Error: {result['error']}")
            else:
                print("   ✅ Success")

        # Test different classification methods
        print("\n🔤 Testing text-only classification...")
        text_results = classifier.classify_by_text(domains)
        print(f"Text results: {len(text_results)} domains")
        for result in text_results:
            print(
                f"   {result['domain']}: {result.get('category', 'error')} "
                f"({result.get('confidence', 0):.3f}) - {result.get('model_used', 'unknown')}"
            )

        print("\n🖼️ Testing image-only classification...")
        image_results = classifier.classify_by_images(domains)
        print(f"Image results: {len(image_results)} domains")
        for result in image_results:
            print(
                f"   {result['domain']}: {result.get('category', 'error')} "
                f"({result.get('confidence', 0):.3f}) - {result.get('model_used', 'unknown')}"
            )

        # Show JSON structure
        print("\n📋 JSON Schema Example:")
        if results:
            example_result = results[0]
            import json

            print(json.dumps(example_result, indent=2))

        print("\n✅ Demo completed successfully!")
        print("\nKey improvements:")
        print("- 🗂️ Pure JSON output (no pandas dependency)")
        print("- 🔄 Unified data collection → inference pipeline")
        print("- 📁 Clear data file paths for debugging")
        print("- ♻️ Data reuse across multiple classification approaches")
        print("- 🌐 Language-agnostic JSON format")

    except Exception as e:
        print(f"❌ Demo failed: {e}")
        import traceback

        traceback.print_exc()
        print("\nThis is expected if:")
        print("- Dependencies are missing")
        print("- Network is unavailable")
        print("- ML models aren't downloaded")
        print("\nThe demo shows the API structure even without full functionality.")


def demo_separated_workflow():
    """Show the separated data collection and inference workflow."""
    print("\n" + "=" * 50)
    print("🔧 Separated Data Collection & Inference Demo")
    print("=" * 50)

    domains = ["httpbin.org"]

    try:
        print("\n📦 Step 1: Data Collection")
        collector = DataCollector(cache_dir="demo_separated")
        collection_data = collector.collect(domains)

        print("✅ Collection complete!")
        print(f"   Collection ID: {collection_data['collection_id']}")
        print(f"   Successful: {collection_data['summary']['successful']}")
        print(f"   Failed: {collection_data['summary']['failed']}")

        print("\n🧠 Step 2: Classification")
        classifier = DomainClassifier()

        print("Running text classification on collected data...")
        results = classifier.classify_from_collection(collection_data, method="text")

        print("✅ Inference complete!")
        for result in results:
            print(
                f"   {result['domain']}: {result.get('category', 'error')} "
                f"({result.get('confidence', 0):.3f})"
            )

        print("\n♻️ Data Reuse: The same collected data can now be used with:")
        print("   - Different ML model versions")
        print("   - LLM-based classification")
        print("   - Ensemble approaches")
        print("   - External analysis tools")

    except Exception as e:
        print(f"❌ Separated workflow demo failed: {e}")


if __name__ == "__main__":
    demo_json_api()
    demo_separated_workflow()
```
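The demo above reads `collection_id` and a `summary` dict with `successful`/`failed` counts from the collector's output. Those counts can be turned into a quick health check without the library at all; a sketch over a stand-in dict where only the keys the demo accesses are assumed and the values are made up:

```python
# Stand-in for the structure demo_separated_workflow() reads from
# collector.collect(); keys mirror what the demo accesses, values are fake.
collection_data = {
    "collection_id": "demo-0001",
    "summary": {"successful": 2, "failed": 1},
}

summary = collection_data["summary"]
total = summary["successful"] + summary["failed"]
success_rate = summary["successful"] / total if total else 0.0
print(f"{collection_data['collection_id']}: "
      f"{summary['successful']}/{total} collected ({success_rate:.0%})")
```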
