Build a Web Scraping AI Agent in Python
Traditional web scrapers are brittle: they rely on CSS selectors and XPath expressions that break every time a site redesigns its layout. An AI-powered scraping agent takes a fundamentally different approach — it reads the page like a human, understands what data is present, and extracts it into a structured schema. This makes it resilient to layout changes and capable of extracting data from sites where you never defined a schema.
This tutorial builds a web scraping agent that handles both static HTML and JavaScript-rendered pages, uses an LLM to extract structured data from unstructured content, handles pagination and multi-page crawls, and outputs clean JSON ready for downstream processing. You will also learn how to respect robots.txt and implement ethical rate limiting.
What You'll Learn#
- How to fetch and clean web pages using requests and BeautifulSoup
- How to use Playwright for JavaScript-rendered pages
- How to extract structured JSON from unstructured HTML using LLM parsing
- How to build a multi-page crawling loop with pagination detection
- How to implement ethical scraping with rate limiting and robots.txt compliance
Prerequisites#
- Python 3.10+
- OpenAI API key
- Familiarity with LangChain agent tools
- Understanding of AI agent concepts
Architecture Overview#
The agent has three layers:
- Fetcher — Gets page HTML using either requests (static) or Playwright (dynamic/JS-rendered)
- Cleaner — Strips HTML tags, ads, navigation, and boilerplate to leave content-only text
- Extractor — Passes clean text to an LLM with a schema description to produce structured JSON
The agent wraps these layers with LangChain tools and a ReAct loop that can decide to: fetch a URL, follow a pagination link, or stop when extraction criteria are met.
Step 1: Install Dependencies#
pip install langchain==0.3.0 langchain-openai==0.2.0 \
requests==2.32.3 beautifulsoup4==4.12.3 \
playwright==1.47.0 python-dotenv==1.0.1 \
urllib3==2.2.3 lxml==5.3.0 fake-useragent==1.5.1
Install Playwright browsers:
playwright install chromium
Step 2: Static Page Fetcher#
# fetchers/static_fetcher.py
import time
import random
import urllib.robotparser
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
ua = UserAgent()
_robots_cache: dict[str, urllib.robotparser.RobotFileParser] = {}
def check_robots(url: str, user_agent: str = "*") -> bool:
"""Check if scraping the URL is permitted by robots.txt."""
parsed = urlparse(url)
base = f"{parsed.scheme}://{parsed.netloc}"
if base not in _robots_cache:
rp = urllib.robotparser.RobotFileParser()
rp.set_url(f"{base}/robots.txt")
try:
rp.read()
except Exception:
rp = None # If robots.txt is inaccessible, assume allowed
_robots_cache[base] = rp
rp = _robots_cache[base]
return rp is None or rp.can_fetch(user_agent, url)
def fetch_static(url: str, delay_range: tuple = (1.0, 3.0)) -> dict:
"""Fetch a static HTML page with rate limiting and robots.txt compliance."""
if not check_robots(url):
return {"url": url, "html": None, "error": "Blocked by robots.txt"}
# Respectful rate limiting
time.sleep(random.uniform(*delay_range))
headers = {
"User-Agent": ua.random,
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
}
try:
response = requests.get(url, headers=headers, timeout=15)
response.raise_for_status()
return {"url": url, "html": response.text, "status": response.status_code, "error": None}
except requests.RequestException as e:
return {"url": url, "html": None, "error": str(e)}
def clean_html(html: str, url: str = "") -> str:
"""Strip navigation, scripts, ads, and boilerplate from HTML."""
soup = BeautifulSoup(html, "lxml")
# Remove non-content elements
for tag in soup(["script", "style", "nav", "footer", "header",
"aside", "iframe", "form", "button", "noscript"]):
tag.decompose()
# Remove common ad/nav class patterns
for cls in ["nav", "menu", "header", "footer", "sidebar", "ad", "ads",
"advertisement", "cookie", "popup", "modal", "banner"]:
for el in soup.find_all(class_=lambda c: c and cls in c.lower()):
el.decompose()
# Extract main content areas first if available
main = soup.find("main") or soup.find("article") or soup.find(id="content")
if main:
text = main.get_text(separator="\n", strip=True)
else:
text = soup.get_text(separator="\n", strip=True)
# Collapse excessive whitespace
import re
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
Step 3: Dynamic Page Fetcher (JavaScript-Rendered Sites)#
# fetchers/dynamic_fetcher.py
import asyncio
from playwright.async_api import async_playwright
async def fetch_dynamic_async(url: str, wait_for: str = None) -> dict:
"""Fetch a JavaScript-rendered page using Playwright."""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (compatible; ResearchBot/1.0)",
java_script_enabled=True,
)
page = await context.new_page()
try:
await page.goto(url, wait_until="networkidle", timeout=30000)
if wait_for:
await page.wait_for_selector(wait_for, timeout=5000)
html = await page.content()
return {"url": url, "html": html, "error": None}
except Exception as e:
return {"url": url, "html": None, "error": str(e)}
finally:
await browser.close()
def fetch_dynamic(url: str, wait_for: str = None) -> dict:
"""Sync wrapper for the async Playwright fetcher."""
return asyncio.run(fetch_dynamic_async(url, wait_for))
Step 4: LLM-Powered Data Extractor#
This is the core innovation: instead of CSS selectors, you describe what you want in plain English and the LLM extracts it.
# extractor.py
import json
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
EXTRACTOR_SYSTEM = """You are a data extraction specialist.
Extract structured data from the provided web page text according to the schema description.
Return ONLY valid JSON matching the schema. Do not add commentary.
If a field is not found, use null. If a field is a list and nothing is found, use [].
Extract ALL instances if multiple records are present."""
extractor_llm = ChatOpenAI(model="gpt-4o", temperature=0)
extract_chain = ChatPromptTemplate.from_messages([
("system", EXTRACTOR_SYSTEM),
("human", "Schema to extract:\n{schema}\n\nPage content:\n{content}"),
]) | extractor_llm
def extract_structured_data(content: str, schema_description: str) -> dict | list:
"""
Extract structured data from page text using the given schema description.
schema_description examples:
- "List of products with fields: name, price, rating, in_stock"
- "Article with fields: title, author, published_date, content_summary, tags"
- "Job listing with fields: title, company, location, salary_range, requirements"
"""
# Truncate content to avoid token limits
truncated = content[:6000] if len(content) > 6000 else content
result = extract_chain.invoke({
"schema": schema_description,
"content": truncated,
})
try:
return json.loads(result.content)
except json.JSONDecodeError:
# Try to find JSON in the response
import re
json_match = re.search(r'(\[.*\]|\{.*\})', result.content, re.DOTALL)
if json_match:
return json.loads(json_match.group())
return {"error": "Could not parse JSON", "raw": result.content}
Step 5: Build the Scraping Agent#
# agent.py
import json
from langchain.tools import tool
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_tools_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from fetchers.static_fetcher import fetch_static, clean_html
from fetchers.dynamic_fetcher import fetch_dynamic
from extractor import extract_structured_data
from bs4 import BeautifulSoup
@tool
def scrape_page(url: str, javascript_required: bool = False) -> str:
"""Fetch and clean a web page. Set javascript_required=True for SPAs and dynamic sites."""
if javascript_required:
result = fetch_dynamic(url)
else:
result = fetch_static(url)
if result.get("error"):
return f"Error fetching {url}: {result['error']}"
cleaned = clean_html(result["html"], url)
return cleaned[:5000] # Return first 5000 chars for the agent to inspect
@tool
def extract_data_from_page(url: str, schema_description: str, javascript_required: bool = False) -> str:
"""
Fetch a page and extract structured data according to a schema description.
Returns JSON string with the extracted data.
"""
if javascript_required:
result = fetch_dynamic(url)
else:
result = fetch_static(url)
if result.get("error"):
return f"Error: {result['error']}"
cleaned = clean_html(result["html"], url)
extracted = extract_structured_data(cleaned, schema_description)
return json.dumps(extracted, indent=2)
@tool
def find_pagination_links(url: str, javascript_required: bool = False) -> str:
"""Find pagination links on a page (next page, page numbers)."""
if javascript_required:
result = fetch_dynamic(url)
else:
result = fetch_static(url)
if result.get("error") or not result.get("html"):
return "Could not load page to find pagination."
soup = BeautifulSoup(result["html"], "lxml")
pagination_links = []
# Common pagination patterns
for selector in ["a[rel='next']", ".next a", ".pagination a", "[aria-label='Next']"]:
links = soup.select(selector)
for link in links[:3]:
href = link.get("href", "")
if href:
from urllib.parse import urljoin
pagination_links.append(urljoin(url, href))
if pagination_links:
return f"Pagination links found: {json.dumps(pagination_links)}"
return "No pagination links found on this page."
def build_scraping_agent() -> AgentExecutor:
llm = ChatOpenAI(model="gpt-4o", temperature=0)
tools = [scrape_page, extract_data_from_page, find_pagination_links]
prompt = ChatPromptTemplate.from_messages([
("system", """You are a web scraping agent.
Given a URL and extraction goal, fetch pages and extract structured data.
Guidelines:
- Always check page content before extracting to understand the structure
- For pagination, extract all pages up to the requested limit
- Respect rate limits — don't fetch the same URL twice
- If extraction fails, try adjusting the schema description and retry once
- Return final data as a JSON array"""),
MessagesPlaceholder(variable_name="chat_history"),
("human", "{input}"),
MessagesPlaceholder(variable_name="agent_scratchpad"),
])
agent = create_openai_tools_agent(llm, tools, prompt)
return AgentExecutor(agent=agent, tools=tools, verbose=True, max_iterations=15)
Step 6: CLI Runner#
# cli.py
import argparse
import json
from agent import build_scraping_agent
def main():
parser = argparse.ArgumentParser(description="Web Scraping AI Agent")
parser.add_argument("url", help="Starting URL to scrape")
parser.add_argument("schema", help="Schema description: e.g. 'list of products with name, price, rating'")
parser.add_argument("--pages", type=int, default=1, help="Number of pages to crawl")
parser.add_argument("--js", action="store_true", help="Use JavaScript rendering (Playwright)")
parser.add_argument("--output", help="Output JSON file path")
args = parser.parse_args()
executor = build_scraping_agent()
task = (
f"Scrape {args.url} "
f"{'using JavaScript rendering ' if args.js else ''}"
f"and extract: {args.schema}. "
f"{'Crawl up to ' + str(args.pages) + ' pages using pagination.' if args.pages > 1 else ''}"
f" Return all results as a JSON array."
)
result = executor.invoke({"input": task, "chat_history": []})
print("\nExtracted Data:")
print(result["output"])
if args.output:
with open(args.output, "w") as f:
f.write(result["output"])
print(f"\nSaved to {args.output}")
if __name__ == "__main__":
main()
Example usage:
# Scrape static site
python cli.py "https://books.toscrape.com" \
"list of books with title, price, rating, availability" \
--pages 3 --output books.json
# Scrape JavaScript-rendered site
python cli.py "https://example-spa.com/products" \
"product listings with name, price, SKU" \
--js --output products.json
Ethical Scraping Checklist#
Before running your agent:
- Check the site's robots.txt and Terms of Service
- Use delays between requests (built in — minimum 1 second)
- Identify your bot with a descriptive User-Agent
- Do not scrape personal data without a legitimate legal basis
- Cache results to avoid re-requesting the same pages
- Stop if you receive 429 (Too Many Requests) responses
For production scraping at scale, add Langfuse observability to track success rates and extraction quality. Deploy as a scheduled job using the Docker deployment guide.
What's Next#
- Feed scraped data into an AI data analyst agent for automatic analysis
- Build an agentic RAG system that retrieves from scraped content
- Review AI agent security best practices for handling scraped data safely
- Add human review gates using HITL patterns before processing scraped personal data
- Explore AI agent testing patterns to build regression tests for your extraction schemas