# A Coding Implementation of Crawl4AI for Web Crawling, Markdown Generation, JavaScript Execution, and LLM-Based Structured Extraction


import subprocess
import sys


# ---- Environment setup (best-effort; targets Debian/Ubuntu hosts such as Colab) ----
# apt/playwright output is captured to keep the notebook log clean, but each
# step's exit code is now checked so a failure is reported instead of being
# silently swallowed by capture_output=True.
print("📦 Installing system dependencies...")
subprocess.run(['apt-get', 'update', '-qq'], capture_output=True)
_apt = subprocess.run(['apt-get', 'install', '-y', '-qq',
                       'libnss3', 'libnspr4', 'libatk1.0-0', 'libatk-bridge2.0-0',
                       'libcups2', 'libdrm2', 'libxkbcommon0', 'libxcomposite1',
                       'libxdamage1', 'libxfixes3', 'libxrandr2', 'libgbm1',
                       'libasound2', 'libpango-1.0-0', 'libcairo2'],
                      capture_output=True)
if _apt.returncode == 0:
    print("✅ System dependencies installed!")
else:
    # Headless Chromium may still work if the libraries are already present.
    print(f"⚠️ apt-get install exited with {_apt.returncode} — "
          "Playwright may be missing shared libraries.")


print("\n📦 Installing Python packages...")
_pip = subprocess.run([sys.executable, '-m', 'pip', 'install', '-U',
                       'crawl4ai', 'nest_asyncio', 'pydantic', '-q'])
if _pip.returncode == 0:
    print("✅ Python packages installed!")
else:
    print(f"⚠️ pip install exited with {_pip.returncode}")


print("\n📦 Installing Playwright browsers (this may take a minute)...")
_pw = subprocess.run([sys.executable, '-m', 'playwright', 'install', 'chromium'],
                     capture_output=True)
subprocess.run([sys.executable, '-m', 'playwright', 'install-deps', 'chromium'],
               capture_output=True)
if _pw.returncode == 0:
    print("✅ Playwright browsers installed!")
else:
    print(f"⚠️ playwright install exited with {_pw.returncode}")


# nest_asyncio patches the already-running notebook event loop so that
# asyncio.run() can be called from inside it (required in Colab/Jupyter,
# where an event loop is always active).
import nest_asyncio
nest_asyncio.apply()


import asyncio
import json
from typing import List, Optional
from pydantic import BaseModel, Field


# Banner: installation phase finished.
print("\n" + "="*60)
print("✅ INSTALLATION COMPLETE! Ready to crawl!")
print("="*60)


# Banner: begin the first demo section.
print("\n" + "="*60)
print("📖 PART 2: BASIC CRAWLING")
print("="*60)


from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode


async def basic_crawl():
    """Fetch example.com with all-default settings and print a markdown preview.

    Returns:
        The crawl4ai result object, so callers can inspect it further.
    """
    print("\n🔍 Running basic crawl on example.com...")

    # The async context manager handles browser startup and teardown.
    async with AsyncWebCrawler() as crawler:
        page = await crawler.arun(url="https://example.com")

        markdown_text = page.markdown.raw_markdown
        print(f"\n✅ Crawl successful: {page.success}")
        print(f"📄 Title: {page.metadata.get('title', 'N/A')}")
        print(f"📝 Markdown length: {len(markdown_text)} characters")
        print("\n--- First 500 chars of markdown ---")
        print(markdown_text[:500])

    return page


# Drive the coroutine to completion (nest_asyncio makes this safe inside
# an already-running notebook event loop).
result = asyncio.run(basic_crawl())


# Banner: begin the configured-crawl demo section.
print("\n" + "="*60)
print("⚙️ PART 3: CONFIGURED CRAWLING")
print("="*60)


async def configured_crawl():
    """Crawl httpbin.org/html using explicit browser and run configurations.

    Demonstrates BrowserConfig (headless mode, viewport, user agent) and
    CrawlerRunConfig (cache bypass, word-count threshold, timeout, wait
    condition); prints a status summary and a short content preview.

    Returns:
        The crawl4ai result object.
    """
    print("\n🔧 Running configured crawl with custom settings...")

    # Desktop-sized viewport with an explicit user-agent string.
    browser = BrowserConfig(
        headless=True,
        verbose=True,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    # Always refetch (no cache) and wait for network idle before extracting.
    settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10,
        page_timeout=30000,
        wait_until="networkidle",
        verbose=True,
    )

    async with AsyncWebCrawler(config=browser) as crawler:
        outcome = await crawler.arun(url="https://httpbin.org/html", config=settings)

        print(f"\n✅ Success: {outcome.success}")
        print(f"📊 Status code: {outcome.status_code}")
        print("\n--- Content Preview ---")
        print(outcome.markdown.raw_markdown[:400])

    return outcome


# Execute the configured-crawl demo.
result = asyncio.run(configured_crawl())


# Banner: begin the markdown-generation demo section.
print("\n" + "="*60)
print("📝 PART 4: MARKDOWN GENERATION")
print("="*60)


# Content filters decide which DOM nodes survive into the "fit" markdown.
# NOTE: BM25ContentFilter is imported but only PruningContentFilter is used below.
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def markdown_generation_demo():
    """Demonstrates raw vs fit markdown with content filtering.

    Crawls a Wikipedia article, producing both the raw markdown and a "fit"
    markdown pruned by PruningContentFilter, then prints a size comparison
    and a preview of the filtered output.

    Returns:
        The crawl4ai result object.
    """
    print("\n🎯 Demonstrating markdown generation strategies...")

    browser_config = BrowserConfig(headless=True, verbose=False)

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # fetch fresh so the filter comparison is reproducible
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.4,          # prune nodes scoring below this value
                threshold_type="fixed",
                min_word_threshold=20   # ignore very short text blocks
            )
        )
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Web_scraping",
            config=run_config
        )

        raw_len = len(result.markdown.raw_markdown)
        fit_len = len(result.markdown.fit_markdown) if result.markdown.fit_markdown else 0
        # Guard against an empty raw document (e.g. a failed fetch) so the
        # reduction percentage never divides by zero.
        reduction = ((raw_len - fit_len) / raw_len * 100) if raw_len else 0.0

        print(f"\n📊 Markdown Comparison:")
        print(f"   Raw Markdown:  {raw_len:,} characters")
        print(f"   Fit Markdown:  {fit_len:,} characters")
        print(f"   Reduction:     {reduction:.1f}%")

        print(f"\n--- Fit Markdown Preview (first 600 chars) ---")
        print(result.markdown.fit_markdown[:600] if result.markdown.fit_markdown else "N/A")

    return result


# Execute the markdown-generation demo and keep its result for inspection.
result = asyncio.run(markdown_generation_demo())



# --- Article boilerplate carried over from the original web page (not code) ---
# Source link
#
#   • Related Posts
#
#     TinyFish AI Releases Full Web Infrastructure Platform for AI Agents: Search, Fetch, Browser, and Agent Under One API Key
#
#     AI agents struggle with tasks that require interacting with the live web — fetching a competitor’s pricing page, extracting structured data from a JavaScript-heavy dashboard, or automating a multi-step workflow…
#
#     TinyFish Launches Full Web Infrastructure Platform for AI Agents — Search, Fetch, Browser, and Agent Under One API Key
#
#     AI agents struggle with tasks that require interacting with the live web — fetching a competitor’s pricing page, extracting structured data from a JavaScript-heavy dashboard, or automating a multi-step workflow…
#
#     Leave a Reply
#
#     Your email address will not be published. Required fields are marked *