Commit df6a6d5

refactor(docs): reorganize tutorial sections and update wrap-up example
1 parent e896c08 commit df6a6d5

File tree

1 file changed: +73 -72 lines changed

docs/examples/deepcrawl_example.py

Lines changed: 73 additions & 72 deletions
@@ -234,77 +234,7 @@ async def filters_and_scorers():
         print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
         print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
 
-# 4️⃣ Wrap-Up and Key Takeaways
-async def wrap_up():
-    """
-    PART 4: Wrap-Up and Key Takeaways
-
-    Summarize the key concepts learned in this tutorial.
-    """
-    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
-    print("Combining filters, scorers, and streaming for an optimized crawl")
-
-    # Create a sophisticated filter chain
-    filter_chain = FilterChain(
-        [
-            DomainFilter(
-                allowed_domains=["docs.crawl4ai.com"],
-                blocked_domains=["old.docs.crawl4ai.com"],
-            ),
-            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
-            ContentTypeFilter(allowed_types=["text/html"]),
-        ]
-    )
-
-    # Create a composite scorer that combines multiple scoring strategies
-    keyword_scorer = KeywordRelevanceScorer(
-        keywords=["crawl", "example", "async", "configuration"], weight=0.7
-    )
-    # Set up the configuration
-    config = CrawlerRunConfig(
-        deep_crawl_strategy=BestFirstCrawlingStrategy(
-            max_depth=1,
-            include_external=False,
-            filter_chain=filter_chain,
-            url_scorer=keyword_scorer,
-        ),
-        scraping_strategy=LXMLWebScrapingStrategy(),
-        stream=True,
-        verbose=True,
-    )
-
-    # Execute the crawl
-    results = []
-    start_time = time.perf_counter()
-
-    async with AsyncWebCrawler() as crawler:
-        async for result in await crawler.arun(
-            url="https://docs.crawl4ai.com", config=config
-        ):
-            results.append(result)
-            score = result.metadata.get("score", 0)
-            depth = result.metadata.get("depth", 0)
-            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
-
-    duration = time.perf_counter() - start_time
-
-    # Summarize the results
-    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
-    print(
-        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
-    )
-
-    # Group by depth
-    depth_counts = {}
-    for result in results:
-        depth = result.metadata.get("depth", 0)
-        depth_counts[depth] = depth_counts.get(depth, 0) + 1
-
-    print("\n📊 Pages crawled by depth:")
-    for depth, count in sorted(depth_counts.items()):
-        print(f" Depth {depth}: {count} pages")
-
-# 5️⃣ Advanced Filters
+# 4️⃣ Advanced Filters
 async def advanced_filters():
     """
     PART 5: Demonstrates advanced filtering techniques for specialized crawling.
@@ -367,7 +297,7 @@ async def advanced_filters():
             relevance_score = result.metadata.get("relevance_score", 0)
             print(f" → Score: {relevance_score:.2f} | {result.url}")
 
-# Main function to run the entire tutorial
+# 5️⃣ Max Pages and Score Thresholds
 async def max_pages_and_thresholds():
     """
     PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
@@ -466,6 +396,77 @@ async def max_pages_and_thresholds():
         print(f" ✅ Average score: {avg_score:.2f}")
         print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
 
+# 6️⃣ Wrap-Up and Key Takeaways
+async def wrap_up():
+    """
+    PART 4: Wrap-Up and Key Takeaways
+
+    Summarize the key concepts learned in this tutorial.
+    """
+    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
+    print("Combining filters, scorers, and streaming for an optimized crawl")
+
+    # Create a sophisticated filter chain
+    filter_chain = FilterChain(
+        [
+            DomainFilter(
+                allowed_domains=["docs.crawl4ai.com"],
+                blocked_domains=["old.docs.crawl4ai.com"],
+            ),
+            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
+            ContentTypeFilter(allowed_types=["text/html"]),
+        ]
+    )
+
+    # Create a composite scorer that combines multiple scoring strategies
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["crawl", "example", "async", "configuration"], weight=0.7
+    )
+    # Set up the configuration
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=1,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=keyword_scorer,
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        stream=True,
+        verbose=True,
+    )
+
+    # Execute the crawl
+    results = []
+    start_time = time.perf_counter()
+
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+    duration = time.perf_counter() - start_time
+
+    # Summarize the results
+    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+    print(
+        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
+    )
+
+    # Group by depth
+    depth_counts = {}
+    for result in results:
+        depth = result.metadata.get("depth", 0)
+        depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+    print("\n📊 Pages crawled by depth:")
+    for depth, count in sorted(depth_counts.items()):
+        print(f" Depth {depth}: {count} pages")
+
+
 async def run_tutorial():
     """
     Executes all tutorial sections in sequence.
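
The diff shows run_tutorial()'s docstring but not its body, so the exact sequencing is not visible in this commit. As a rough, hypothetical sketch of the tail of docs/examples/deepcrawl_example.py after the reorganization (assuming run_tutorial simply awaits each section coroutine defined earlier in the file), the sections would run in their new numbered order, with wrap_up() last:

# Hypothetical sketch only; the actual run_tutorial() body is not part of this diff.
# It assumes the section coroutines shown in the diff are defined above in the same file.
import asyncio


async def run_tutorial():
    """Executes all tutorial sections in sequence."""
    # Earlier sections (1-3, not shown in this diff) would run first, e.g.:
    await filters_and_scorers()       # filters, scorers, and streaming basics

    # New ordering introduced by this commit:
    await advanced_filters()          # 4️⃣ Advanced Filters
    await max_pages_and_thresholds()  # 5️⃣ Max Pages and Score Thresholds
    await wrap_up()                   # 6️⃣ Wrap-Up and Key Takeaways (now runs last)


if __name__ == "__main__":
    asyncio.run(run_tutorial())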
