#!/usr/bin/env python3
"""
Demonstration of archive functionality using direct URLs.
Since the archive.org API is currently down, we use known archive URLs.
"""

import sys

# Make the local package importable when running from the repo root
sys.path.insert(0, '.')

import requests
import pandas as pd
from bs4 import BeautifulSoup

def test_direct_archive_urls():
    """Test with known working archive URLs."""
    print("=== Archive.org Direct URL Test ===")

    # Known working archive URLs from different time periods
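    # Wayback URL format: https://web.archive.org/web/<YYYYMMDDHHMMSS>/<original URL>.
    # The Wayback Machine redirects to the capture closest to the requested timestamp.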
    test_cases = [
        {
            'domain': 'google.com',
            'archive_url': 'https://web.archive.org/web/20200101120000/https://google.com',
            'date': '2020-01-01'
        },
        {
            'domain': 'amazon.com',
            'archive_url': 'https://web.archive.org/web/20150101120000/https://amazon.com',
            'date': '2015-01-01'
        },
        {
            'domain': 'facebook.com',
            'archive_url': 'https://web.archive.org/web/20100101120000/https://facebook.com',
            'date': '2010-01-01'
        },
        {
            'domain': 'twitter.com',
            'archive_url': 'https://web.archive.org/web/20120101120000/https://twitter.com',
            'date': '2012-01-01'
        },
        {
            'domain': 'youtube.com',
            'archive_url': 'https://web.archive.org/web/20080101120000/https://youtube.com',
            'date': '2008-01-01'
        },
        {
            'domain': 'reddit.com',
            'archive_url': 'https://web.archive.org/web/20100601120000/https://reddit.com',
            'date': '2010-06-01'
        },
        {
            'domain': 'netflix.com',
            'archive_url': 'https://web.archive.org/web/20100101120000/https://netflix.com',
            'date': '2010-01-01'
        },
        {
            'domain': 'github.com',
            'archive_url': 'https://web.archive.org/web/20100401120000/https://github.com',
            'date': '2010-04-01'
        },
        {
            'domain': 'stackoverflow.com',
            'archive_url': 'https://web.archive.org/web/20100101120000/https://stackoverflow.com',
            'date': '2010-01-01'
        },
        {
            'domain': 'wikipedia.org',
            'archive_url': 'https://web.archive.org/web/20050101120000/https://wikipedia.org',
            'date': '2005-01-01'
        }
    ]

    results = []

    for i, case in enumerate(test_cases, 1):
        print(f"\n[{i}/{len(test_cases)}] Testing {case['domain']} from {case['date']}...")

        try:
            # Test if we can fetch the archived page
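            # requests follows Wayback's redirect to the nearest snapshot by default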
            response = requests.get(case['archive_url'], timeout=15)

            if response.status_code == 200:
                content = response.text
                content_length = len(content)

                # Parse content to check if it's valid
                soup = BeautifulSoup(content, 'html.parser')
                title = soup.find('title')
                title_text = title.get_text().strip() if title else "No title"

                # Check for archive.org wrapper content
                has_wayback_toolbar = 'wayback' in content.lower() or 'archive.org' in content.lower()
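                # Note: a loose heuristic; a page that merely mentions 'wayback'
                # or 'archive.org' in its text will also match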

                results.append({
                    'domain': case['domain'],
                    'date': case['date'],
                    'archive_url': case['archive_url'],
                    'content_length': content_length,
                    'title': title_text[:100],  # Truncate long titles
                    'has_wayback_toolbar': has_wayback_toolbar,
                    'status': 'success'
                })

                print(f"  ✓ Fetched {content_length} chars")
                print(f"  ✓ Title: {title_text[:60]}...")
                print(f"  ✓ Wayback toolbar detected: {has_wayback_toolbar}")

            else:
                results.append({
                    'domain': case['domain'],
                    'date': case['date'],
                    'archive_url': case['archive_url'],
                    'content_length': 0,
                    'title': '',
                    'has_wayback_toolbar': False,
                    'status': f'http_error_{response.status_code}'
                })
                print(f"  ✗ HTTP Error: {response.status_code}")

        except requests.exceptions.Timeout:
            results.append({
                'domain': case['domain'],
                'date': case['date'],
                'archive_url': case['archive_url'],
                'content_length': 0,
                'title': '',
                'has_wayback_toolbar': False,
                'status': 'timeout'
            })
            print("  ✗ Timeout")

        except Exception as e:
            results.append({
                'domain': case['domain'],
                'date': case['date'],
                'archive_url': case['archive_url'],
                'content_length': 0,
                'title': '',
                'has_wayback_toolbar': False,
                'status': f'error: {str(e)[:50]}'
            })
            print(f"  ✗ Error: {e}")

    # Results summary
    df = pd.DataFrame(results)
    print("\n=== Results Summary ===")
    print(f"Total domains tested: {len(test_cases)}")
    print(f"Successful fetches: {(df['status'] == 'success').sum()}")
    print(f"Average content length: {df[df['status'] == 'success']['content_length'].mean():.0f} chars")

    successful = df[df['status'] == 'success']
    if len(successful) > 0:
        print("\n=== Successful Archive Fetches ===")
        for _, row in successful.iterrows():
            print(f"✓ {row['domain']} ({row['date']}): {row['content_length']} chars")
            print(f"  Title: {row['title']}")
            print(f"  URL: {row['archive_url']}")

    failed = df[df['status'] != 'success']
    if len(failed) > 0:
        print("\n=== Failed Fetches ===")
        for _, row in failed.iterrows():
            print(f"✗ {row['domain']}: {row['status']}")

    return df

def demo_usage():
    """Show how to use the archive functionality."""
    print("\n=== Usage Example ===")
    print("# Import archive functions")
    print("from piedomains import pred_shalla_cat_archive, pred_shalla_cat_with_text_archive")
    print()
    print("# Classify domains using content from January 1, 2020")
    print("domains = ['google.com', 'amazon.com', 'facebook.com']")
    print("result = pred_shalla_cat_archive(domains, '20200101')")
    print("print(result[['domain', 'pred_label', 'pred_prob', 'archive_date']])")
    print()
    print("# Text-only classification from archive")
    print("text_result = pred_shalla_cat_with_text_archive(domains, '20200101')")
    print("print(text_result[['domain', 'text_label', 'text_prob']])")

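# test_package_imports() is called below but was not defined in this script.
# A minimal sketch is provided here, assuming the piedomains archive entry points
# named in demo_usage(); adjust the import if the installed package differs.
def test_package_imports():
    """Check that the piedomains archive functions can be imported."""
    print("=== Package Import Test ===")
    try:
        # Hypothetical names taken from demo_usage() above
        from piedomains import pred_shalla_cat_archive, pred_shalla_cat_with_text_archive  # noqa: F401
        print("✓ piedomains archive functions imported")
        return True
    except ImportError as e:
        print(f"✗ Import failed: {e}")
        return False
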
if __name__ == "__main__":
    print("Archive.org Integration Test for Piedomains")
    print("=" * 50)

    # Test package imports first
    imports_work = test_package_imports()

    if imports_work:
        # Run comprehensive test
        test_results = test_direct_archive_urls()

        # Show usage example
        demo_usage()

        print("\n=== Next Steps ===")
        if (test_results['status'] == 'success').any():
            print("✓ Archive integration is working!")
            print("✓ Ready to test with piedomains classification")
            print("✓ Try the Jupyter Lab commands shown above")
        else:
            print("⚠ Archive.org may be experiencing issues")
            print("⚠ Try again later or use cached content for testing")
    else:
        print("✗ Package import issues - check archive_support.py")