@@ -234,77 +234,7 @@ async def filters_and_scorers():
     print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
     print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
 
-# 4️⃣ Wrap-Up and Key Takeaways
-async def wrap_up():
-    """
-    PART 4: Wrap-Up and Key Takeaways
-
-    Summarize the key concepts learned in this tutorial.
-    """
-    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
-    print("Combining filters, scorers, and streaming for an optimized crawl")
-
-    # Create a sophisticated filter chain
-    filter_chain = FilterChain(
-        [
-            DomainFilter(
-                allowed_domains=["docs.crawl4ai.com"],
-                blocked_domains=["old.docs.crawl4ai.com"],
-            ),
-            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
-            ContentTypeFilter(allowed_types=["text/html"]),
-        ]
-    )
-
-    # Create a composite scorer that combines multiple scoring strategies
-    keyword_scorer = KeywordRelevanceScorer(
-        keywords=["crawl", "example", "async", "configuration"], weight=0.7
-    )
-    # Set up the configuration
-    config = CrawlerRunConfig(
-        deep_crawl_strategy=BestFirstCrawlingStrategy(
-            max_depth=1,
-            include_external=False,
-            filter_chain=filter_chain,
-            url_scorer=keyword_scorer,
-        ),
-        scraping_strategy=LXMLWebScrapingStrategy(),
-        stream=True,
-        verbose=True,
-    )
-
-    # Execute the crawl
-    results = []
-    start_time = time.perf_counter()
-
-    async with AsyncWebCrawler() as crawler:
-        async for result in await crawler.arun(
-            url="https://docs.crawl4ai.com", config=config
-        ):
-            results.append(result)
-            score = result.metadata.get("score", 0)
-            depth = result.metadata.get("depth", 0)
-            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
-
-    duration = time.perf_counter() - start_time
-
-    # Summarize the results
-    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
-    print(
-        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
-    )
-
-    # Group by depth
-    depth_counts = {}
-    for result in results:
-        depth = result.metadata.get("depth", 0)
-        depth_counts[depth] = depth_counts.get(depth, 0) + 1
-
-    print("\n📊 Pages crawled by depth:")
-    for depth, count in sorted(depth_counts.items()):
-        print(f"  Depth {depth}: {count} pages")
-
-# 5️⃣ Advanced Filters
+# 4️⃣ Advanced Filters
 async def advanced_filters():
     """
     PART 4: Demonstrates advanced filtering techniques for specialized crawling.
@@ -367,7 +297,7 @@ async def advanced_filters():
         relevance_score = result.metadata.get("relevance_score", 0)
         print(f"  → Score: {relevance_score:.2f} | {result.url}")
 
-# Main function to run the entire tutorial
+# 5️⃣ Max Pages and Score Thresholds
 async def max_pages_and_thresholds():
     """
     PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
@@ -466,6 +396,77 @@ async def max_pages_and_thresholds():
     print(f"  ✅ Average score: {avg_score:.2f}")
     print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
 
+# 6️⃣ Wrap-Up and Key Takeaways
+async def wrap_up():
+    """
+    PART 6: Wrap-Up and Key Takeaways
+
+    Summarize the key concepts learned in this tutorial.
+    """
+    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
+    print("Combining filters, scorers, and streaming for an optimized crawl")
+
+    # Create a sophisticated filter chain
+    filter_chain = FilterChain(
+        [
+            DomainFilter(
+                allowed_domains=["docs.crawl4ai.com"],
+                blocked_domains=["old.docs.crawl4ai.com"],
+            ),
+            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
+            ContentTypeFilter(allowed_types=["text/html"]),
+        ]
+    )
+
+    # Create a composite scorer that combines multiple scoring strategies
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["crawl", "example", "async", "configuration"], weight=0.7
+    )
+    # Set up the configuration
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=1,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=keyword_scorer,
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        stream=True,
+        verbose=True,
+    )
+
+    # Execute the crawl
+    results = []
+    start_time = time.perf_counter()
+
+    async with AsyncWebCrawler() as crawler:
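+        # With stream=True, arun() yields each result as soon as it is crawled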
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+    duration = time.perf_counter() - start_time
+
+    # Summarize the results
+    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
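+    # Note: the average below assumes at least one page was crawled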
+    print(
+        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
+    )
+
+    # Group by depth
+    depth_counts = {}
+    for result in results:
+        depth = result.metadata.get("depth", 0)
+        depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+    print("\n📊 Pages crawled by depth:")
+    for depth, count in sorted(depth_counts.items()):
+        print(f"  Depth {depth}: {count} pages")
+
+
 async def run_tutorial():
     """
     Executes all tutorial sections in sequence.