Build an AI Data Analyst Agent in Python
Business analysts spend enormous time writing the same types of SQL queries and pandas transformations. An AI data analyst agent changes the workflow: the user asks a question in plain English, the agent writes and executes the appropriate code, interprets the result, and optionally generates a chart — all without the user touching a line of code.
This tutorial builds a data analyst agent that handles CSV files and SQL databases, generates matplotlib charts on demand, explains its reasoning, and handles follow-up questions in a multi-turn conversation. The architecture is designed to be safe: all code runs in a sandboxed executor, and the agent cannot modify data without explicit user confirmation.
What You'll Learn#
- How to give an agent tools to read CSVs and query SQLite databases
- How to generate pandas code dynamically and execute it safely
- How to produce matplotlib charts from natural language chart requests
- How to maintain conversation context for multi-turn data exploration
- How to handle SQL injection risks in agent-generated queries
Prerequisites#
- Python 3.10+
- OpenAI API key
- Familiarity with LangChain agents
- Understanding of what AI agents do
Architecture Overview#
The agent has three types of tools:
- Schema Discovery Tools — inspect CSV column names/types, list database tables, describe table schemas
- Execution Tools — run pandas operations on a CSV, run SQL SELECT queries against a database
- Visualization Tools — generate matplotlib charts and save them as PNG files
The agent is a ReAct loop: it thinks about the question, picks a tool, observes the output, and either returns an answer or picks the next tool.
Step 1: Install Dependencies#
pip install langchain==0.3.0 langchain-openai==0.2.0 langchain-experimental==0.3.0 \
pandas==2.2.0 matplotlib==3.9.0 sqlalchemy==2.0.32 \
python-dotenv==1.0.1 tabulate==0.9.0
Step 2: Data Loading and Schema Discovery#
# data_manager.py
import pandas as pd
import sqlite3
from pathlib import Path
from typing import Union
class DataManager:
"""Manages loaded datasets (CSV and SQLite) for the agent."""
def __init__(self):
self.dataframes: dict[str, pd.DataFrame] = {}
self.db_connections: dict[str, sqlite3.Connection] = {}
def load_csv(self, path: str, name: str = None) -> str:
"""Load a CSV file and register it by name."""
path = Path(path)
dataset_name = name or path.stem
self.dataframes[dataset_name] = pd.read_csv(path)
return dataset_name
def load_sqlite(self, path: str, name: str = None) -> str:
"""Connect to a SQLite database and register it by name."""
db_name = name or Path(path).stem
self.db_connections[db_name] = sqlite3.connect(path)
return db_name
def get_csv_schema(self, name: str) -> str:
"""Return column names, types, and sample values for a CSV dataset."""
df = self.dataframes.get(name)
if df is None:
return f"Dataset '{name}' not found. Available: {list(self.dataframes.keys())}"
lines = [f"Dataset: {name} ({len(df)} rows, {len(df.columns)} columns)"]
for col in df.columns:
dtype = str(df[col].dtype)
sample = df[col].dropna().head(3).tolist()
lines.append(f" - {col} ({dtype}): e.g. {sample}")
return "\n".join(lines)
def get_db_schema(self, db_name: str) -> str:
"""Return all table schemas for a SQLite database."""
conn = self.db_connections.get(db_name)
if conn is None:
return f"Database '{db_name}' not found."
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
tables = [row[0] for row in cursor.fetchall()]
lines = [f"Database: {db_name}"]
for table in tables:
cursor = conn.execute(f"PRAGMA table_info({table})")
cols = cursor.fetchall()
lines.append(f" Table: {table}")
for col in cols:
lines.append(f" - {col[1]} ({col[2]})")
return "\n".join(lines)
Step 3: Execution and Visualization Tools#
# tools.py
import io
import sys
import pandas as pd
import matplotlib
# Must be called BEFORE importing pyplot: selects the non-interactive Agg
# backend so charts render in headless/server environments without a display.
matplotlib.use("Agg")  # Non-interactive backend for server environments
import matplotlib.pyplot as plt
from langchain.tools import tool
from data_manager import DataManager

# Module-level data manager shared by all tools — populated at startup
# (the entry point loads CSVs/databases into it before the agent runs).
dm = DataManager()
@tool
def inspect_csv(dataset_name: str) -> str:
    """Show column names, data types, and sample values for a loaded CSV dataset."""
    # Thin wrapper: the docstring above doubles as the tool description the
    # LLM sees, so keep it phrased as guidance for the model.
    return dm.get_csv_schema(dataset_name)
@tool
def inspect_database(database_name: str) -> str:
    """Show all table names and column definitions for a loaded SQLite database."""
    # Thin wrapper over DataManager; returns a "not found" message (not an
    # exception) so the agent can recover by listing available databases.
    return dm.get_db_schema(database_name)
@tool
def run_pandas_code(dataset_name: str, python_code: str) -> str:
    """
    Execute pandas Python code against a CSV dataset.
    The dataset is available as variable 'df' and pandas as 'pd'.
    Assign the final answer to a variable named 'result'.
    Do NOT use print() or a top-level 'return' — neither produces output here.
    """
    df = dm.dataframes.get(dataset_name)
    if df is None:
        return f"Dataset '{dataset_name}' not loaded."
    # Safety: substring blocklist for obviously dangerous operations.
    # NOTE(review): this is defense-in-depth only — substring checks are
    # bypassable; real isolation must come from the sandboxed executor.
    blocked = ["import os", "import sys", "subprocess", "open(", "__import__",
               "exec(", "eval(", "df.to_csv", "df.to_sql", "df.to_excel"]
    # Collapse whitespace so e.g. "import  os" cannot slip past the check.
    normalized = " ".join(python_code.split())
    for danger in blocked:
        if danger in normalized:
            return f"Blocked: '{danger}' is not permitted in data analysis code."
    # Single shared namespace: with separate globals/locals dicts, functions
    # and comprehensions defined inside the snippet could not see 'df'/'pd'.
    namespace = {"df": df.copy(), "pd": pd, "result": None}
    try:
        exec(python_code, namespace)
        result = namespace.get("result")
        if result is None:
            return "Code executed but no 'result' variable was set."
        if isinstance(result, pd.DataFrame):
            return result.to_markdown(index=False)
        return str(result)
    except Exception as e:
        return f"Execution error: {type(e).__name__}: {e}"
@tool
def run_sql_query(database_name: str, sql_query: str) -> str:
    """
    Run a SQL SELECT query against a loaded SQLite database.
    Only a single SELECT statement is permitted; queries must not modify data.
    """
    conn = dm.db_connections.get(database_name)
    if conn is None:
        return f"Database '{database_name}' not loaded."
    # Normalize: drop surrounding whitespace and one trailing semicolon so a
    # well-formed "SELECT ...;" is not rejected by the multi-statement guard.
    stripped = sql_query.strip()
    if stripped.endswith(";"):
        stripped = stripped[:-1].rstrip()
    # Block non-SELECT statements (DROP/UPDATE/INSERT/...).
    if not stripped.upper().startswith("SELECT"):
        return "Only SELECT queries are permitted. Queries must not modify data."
    # Block stacked statements such as "SELECT 1; DROP TABLE x" outright
    # rather than relying on the driver to reject them.
    if ";" in stripped:
        return "Only a single SELECT statement is permitted."
    try:
        result = pd.read_sql_query(stripped, conn)
        return result.to_markdown(index=False)
    except Exception as e:
        return f"SQL error: {e}"
@tool
def generate_chart(dataset_name: str, chart_spec: str, output_path: str = "/tmp/chart.png") -> str:
    """
    Generate a matplotlib chart from a natural language chart specification.
    chart_spec example: 'bar chart of sales by region, title=Sales by Region'
    Returns the path to the saved PNG file.
    """
    from pathlib import Path  # local import: tools.py does not import pathlib at module level
    df = dm.dataframes.get(dataset_name)
    if df is None:
        return f"Dataset '{dataset_name}' not loaded."
    # Ask a small, cheap LLM to translate chart_spec into matplotlib code.
    from langchain_openai import ChatOpenAI
    from langchain_core.prompts import ChatPromptTemplate
    chart_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    chart_chain = ChatPromptTemplate.from_messages([
        ("system", """Write matplotlib Python code to create this chart.
Columns available: {columns}
Save to: {output_path}
Use plt.savefig('{output_path}', bbox_inches='tight', dpi=150)
Start with 'fig, ax = plt.subplots(figsize=(10, 6))'"""),
        ("human", "Chart request: {spec}"),
    ]) | chart_llm
    code_response = chart_chain.invoke({
        "columns": list(df.columns),
        "output_path": output_path,
        "spec": chart_spec,
    })
    chart_code = code_response.content
    # LLMs routinely wrap code in markdown fences (```python ... ```), which
    # would make exec() raise SyntaxError on the fence lines — strip them.
    trimmed = chart_code.strip()
    if trimmed.startswith("```"):
        fence_lines = trimmed.splitlines()[1:]  # drop opening ``` / ```python
        if fence_lines and fence_lines[-1].strip() == "```":
            fence_lines = fence_lines[:-1]
        chart_code = "\n".join(fence_lines)
    # Single namespace so helper functions defined in the snippet see df/plt.
    namespace = {"df": df.copy(), "pd": pd, "plt": plt}
    try:
        exec(chart_code, namespace)
        plt.close("all")
        # Only claim success if the generated code actually wrote the file.
        if not Path(output_path).exists():
            return "Chart code executed but no file was written; check the generated code."
        return f"Chart saved to {output_path}"
    except Exception as e:
        plt.close("all")
        return f"Chart generation error: {e}"
Step 4: Build the Conversational Agent#
# agent.py
import os
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_tools_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from tools import dm, inspect_csv, inspect_database, run_pandas_code, run_sql_query, generate_chart
# System prompt injected into every agent run: defines the analyst persona and
# forces schema inspection before any query is written, so the model does not
# hallucinate column names.
SYSTEM_PROMPT = """You are an expert data analyst with access to datasets.
Always start by inspecting the schema to understand the data structure.
Write pandas code or SQL queries to answer quantitative questions.
Explain your analysis in plain language after each result.
For chart requests, use the generate_chart tool and describe what the chart shows.
Never guess at column names — always inspect schema first."""
def build_data_analyst_agent() -> AgentExecutor:
    """Assemble the tool-calling data analyst agent with chat-history support."""
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    analyst_tools = [inspect_csv, inspect_database, run_pandas_code, run_sql_query, generate_chart]
    model = ChatOpenAI(model="gpt-4o", temperature=0)
    agent = create_openai_tools_agent(model, analyst_tools, prompt)
    # Cap iterations so a confused agent cannot loop on tool calls forever.
    return AgentExecutor(agent=agent, tools=analyst_tools, verbose=True, max_iterations=10)
def main():
    """Interactive loop: seed a demo dataset, then answer questions until 'quit'."""
    import pandas as pd

    # Create a small demo sales dataset so the agent has something to analyze.
    sample_data = {
        "region": ["North", "South", "East", "West", "North", "South"],
        "product": ["Widget A", "Widget B", "Widget A", "Widget C", "Widget B", "Widget A"],
        "sales": [12500, 8900, 15200, 7300, 11800, 9400],
        "units": [125, 89, 152, 73, 118, 94],
        "quarter": ["Q1", "Q1", "Q1", "Q1", "Q2", "Q2"],
    }
    pd.DataFrame(sample_data).to_csv("/tmp/sales.csv", index=False)
    dm.load_csv("/tmp/sales.csv", "sales")

    executor = build_data_analyst_agent()
    history = []
    print("Data Analyst Agent ready. Type 'quit' to exit.\n")
    while True:
        question = input("You: ").strip()
        if question.lower() in ("quit", "exit", "q"):
            break
        # Pass the accumulated history so follow-up questions keep context.
        turn = executor.invoke({
            "input": question,
            "chat_history": history,
        })
        answer = turn["output"]
        print(f"\nAgent: {answer}\n")
        history.append(HumanMessage(content=question))
        history.append(AIMessage(content=answer))


if __name__ == "__main__":
    main()
Example conversation:
You: What dataset do we have loaded?
Agent: You have 'sales' loaded with 6 rows and 5 columns: region, product, sales, units, quarter.
You: Which region had the highest total sales?
Agent: North had the highest total sales at $24,300 across Q1 and Q2.
You: Make a bar chart of total sales by region
Agent: Chart saved to /tmp/chart.png. The chart shows North leading at $24,300...
Step 5: Connect to a Real Database#
# db_setup.py — example: load a SQLite sales database
import sqlite3
import pandas as pd
from tools import dm

# Seed an example customers table, then register the database with the agent.
customers = pd.DataFrame({
    "customer_id": [1, 2, 3, 4],
    "name": ["Acme Corp", "Globex", "Initech", "Umbrella"],
    "revenue": [450000, 280000, 175000, 990000],
    "segment": ["Enterprise", "Mid-Market", "SMB", "Enterprise"],
})
conn = sqlite3.connect("/tmp/company.db")
try:
    customers.to_sql("customers", conn, if_exists="replace", index=False)
finally:
    conn.close()

dm.load_sqlite("/tmp/company.db", "company")
# Now the agent can answer: "Which enterprise customers have revenue above 400k?"
# Now the agent can answer: "Which enterprise customers have revenue above 400k?"
For production, add Langfuse observability to track which queries the agent runs, measure latency, and evaluate answer quality. Also review AI agent security best practices before exposing this agent to external users.
What's Next#
- Deploy this agent as an API service using the Docker deployment guide
- Add human approval before the agent runs write operations using HITL patterns
- Connect this to a multi-agent system with the LangGraph multi-agent tutorial
- Read the LangChain tutorial to deepen your understanding of tool-using agents
- Explore AI agent use cases for engineering teams for more data workflow ideas