How to Add Browser Capabilities to a LangChain Agent
LangChain agents can reason, plan, and call tools. What they can't do out of the box is see a web page, take a screenshot, or verify that a UI action actually worked.
Here's how to add browser tools to a LangChain agent using the PageBolt API — no Selenium, no Playwright, no browser to manage.
Python: adding tools to a LangChain agent
import os
import requests
import base64
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_openai import ChatOpenAI
from langchain.tools import tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
BASE_URL = "https://pagebolt.dev/api/v1"
@tool
def take_screenshot(url: str) -> str:
    """
    Take a screenshot of a web page. Returns a description of what was captured.
    Use this to visually verify a page, check layouts, or inspect rendered content.
    Input: a full URL (e.g. https://example.com)
    """
    res = requests.post(
        f"{BASE_URL}/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True, "fullPage": True},
        # Full-page renders can be slow; bound the call so a stuck render
        # raises instead of hanging the agent loop forever.
        timeout=60,
    )
    res.raise_for_status()
    # Return base64 for vision-capable models, or a confirmation string.
    # Only the first 100 chars of the payload are echoed back to keep the
    # tool observation small in the agent's context window.
    img_b64 = base64.b64encode(res.content).decode()
    return f"[screenshot captured: {len(res.content)} bytes, base64 encoded]\ndata:image/png;base64,{img_b64[:100]}..."
@tool
def inspect_page(url: str) -> str:
    """
    Inspect a web page and return all interactive elements with their CSS selectors.
    Use this before automating a page — it finds the correct selectors for buttons,
    inputs, links, and forms. Returns a structured list of elements.
    Input: a full URL (e.g. https://example.com/login)
    """
    res = requests.post(
        f"{BASE_URL}/inspect",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url},
        timeout=60,  # bound the call so a slow page can't hang the agent
    )
    res.raise_for_status()
    data = res.json()
    elements = data.get("elements", [])
    lines = []
    for el in elements[:30]:  # Limit to first 30 elements to stay within context
        tag = el.get("tag", "")
        role = el.get("role", "")
        # `or ""` also guards an explicit JSON null in the "text" field,
        # which .get()'s default would not catch and would crash the slice.
        text = (el.get("text") or "")[:80]
        selector = el.get("selector", "")
        lines.append(f"{tag} [{role}] '{text}' → {selector}")
    return f"Found {len(elements)} elements on {url}:\n" + "\n".join(lines)
@tool
def generate_pdf(url: str) -> str:
    """
    Generate a PDF of a web page and save it to ./output.pdf.
    Use this to capture documents, invoices, reports, or any printable page.
    Input: a full URL (e.g. https://example.com/invoice/123)
    """
    res = requests.post(
        f"{BASE_URL}/pdf",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url},
        # PDF rendering of heavy pages can take a while; fail loudly rather
        # than blocking the agent indefinitely.
        timeout=60,
    )
    res.raise_for_status()
    # NOTE(review): writes to the process working directory and overwrites
    # any previous output.pdf — fine for a demo, parameterize for production.
    with open("output.pdf", "wb") as f:
        f.write(res.content)
    return f"PDF saved to output.pdf ({len(res.content)} bytes)"
@tool
def run_browser_sequence(url: str, task_description: str) -> str:
    """
    Navigate to a URL and describe what you want to do. This tool will attempt
    a multi-step browser sequence (click, fill, submit) and return a screenshot
    of the final state.
    Input: url (starting page), task_description (what to do on the page)
    Note: For best results, use inspect_page first to get accurate selectors.
    """
    # This simplified version just screenshots the URL.
    # In production, you'd parse task_description into steps.
    res = requests.post(
        f"{BASE_URL}/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True},
        timeout=60,  # bound the call so a stuck render doesn't stall the agent
    )
    res.raise_for_status()
    return f"Navigated to {url} and captured result ({len(res.content)} bytes). Task attempted: {task_description}"
# Build the agent: register the four browser tools with the LLM.
tools = [take_screenshot, inspect_page, generate_pdf, run_browser_sequence]

# temperature=0 keeps tool selection deterministic across runs.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# The system prompt names each tool and nudges the model to inspect a page
# before automating it.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a web research and automation agent. You have browser tools available:
- take_screenshot: Visually capture any web page
- inspect_page: Get all interactive elements and selectors from a page
- generate_pdf: Save any page as a PDF
- run_browser_sequence: Automate multi-step interactions on a page
Always inspect a page before automating it to get accurate selectors.
When taking screenshots, describe what you observe in detail."""),
    ("human", "{input}"),
    # agent_scratchpad carries the intermediate tool calls and observations.
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

agent = create_openai_tools_agent(llm, tools, prompt)
# verbose=True prints each tool invocation and observation while running.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
Run it
# Visual check — the agent screenshots the page and summarizes what it sees.
result = agent_executor.invoke({
    "input": "Screenshot https://news.ycombinator.com and tell me the top 3 stories"
})
print(result["output"])

# Inspect before automating — returns concrete CSS selectors for the form.
result = agent_executor.invoke({
    "input": "Inspect https://example.com/login and tell me the selectors for the email and password fields"
})
print(result["output"])

# PDF capture — the generate_pdf tool writes ./output.pdf as a side effect.
result = agent_executor.invoke({
    "input": "Generate a PDF of https://pagebolt.dev/docs and save it"
})
print(result["output"])
JavaScript / LangChain.js
import { ChatOpenAI } from "@langchain/openai";
import { AgentExecutor, createOpenAIToolsAgent } from "langchain/agents";
import { DynamicTool } from "@langchain/core/tools";
import { ChatPromptTemplate, MessagesPlaceholder } from "@langchain/core/prompts";
const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY!;
const BASE_URL = "https://pagebolt.dev/api/v1";
/**
 * POST a JSON body to a PageBolt endpoint and return the raw Response.
 * Callers are responsible for checking `res.ok` themselves.
 */
async function pageboltPost(endpoint: string, body: object): Promise<Response> {
  const headers = { "x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json" };
  const payload = JSON.stringify(body);
  return fetch(`${BASE_URL}${endpoint}`, { method: "POST", headers, body: payload });
}
// Tool: capture a full-page screenshot with cookie banners blocked.
const screenshotTool = new DynamicTool({
  name: "take_screenshot",
  description: "Take a screenshot of a web page. Input: a full URL. Returns confirmation of capture.",
  func: async (url: string) => {
    const res = await pageboltPost("/screenshot", { url, blockBanners: true, fullPage: true });
    if (!res.ok) {
      throw new Error(`Screenshot failed: ${res.status}`);
    }
    // Only the byte count is surfaced to the model, not the image itself.
    const image = await res.arrayBuffer();
    return `Screenshot captured: ${image.byteLength} bytes from ${url}`;
  },
});
// Tool: list a page's interactive elements with their CSS selectors.
const inspectTool = new DynamicTool({
  name: "inspect_page",
  description: "Get all interactive elements and CSS selectors from a page. Use before automating. Input: a full URL.",
  func: async (url: string) => {
    const res = await pageboltPost("/inspect", { url });
    if (!res.ok) {
      throw new Error(`Inspect failed: ${res.status}`);
    }
    const data = await res.json();
    // Cap at 25 elements so the tool observation stays within context.
    const lines: string[] = [];
    for (const el of (data.elements || []).slice(0, 25)) {
      const label = (el.text || "").slice(0, 60);
      lines.push(`${el.tag} [${el.role}] '${label}' → ${el.selector}`);
    }
    return `${data.elements?.length ?? 0} elements found:\n${lines.join("\n")}`;
  },
});
// Tool: render a page to PDF and report the generated size.
const pdfTool = new DynamicTool({
  name: "generate_pdf",
  description: "Generate a PDF of a web page. Input: a full URL.",
  func: async (url: string) => {
    const res = await pageboltPost("/pdf", { url });
    if (!res.ok) {
      throw new Error(`PDF failed: ${res.status}`);
    }
    const doc = await res.arrayBuffer();
    return `PDF generated: ${doc.byteLength} bytes from ${url}`;
  },
});
// Register the three tools and build the agent.
const tools = [screenshotTool, inspectTool, pdfTool];
// temperature: 0 keeps tool selection deterministic across runs.
const llm = new ChatOpenAI({ model: "gpt-4o", temperature: 0 });

const prompt = ChatPromptTemplate.fromMessages([
  ["system", `You are a web research and automation agent with browser tools.
Use take_screenshot to visually capture pages.
Use inspect_page to find CSS selectors before automating.
Use generate_pdf to capture documents.`],
  ["human", "{input}"],
  // agent_scratchpad carries the intermediate tool calls and observations.
  new MessagesPlaceholder("agent_scratchpad"),
]);

const agent = await createOpenAIToolsAgent({ llm, tools, prompt });
// verbose: true logs each tool invocation while the agent runs.
const agentExecutor = new AgentExecutor({ agent, tools, verbose: true });
// Use it: ask the agent to inspect a login form and report its selectors.
const result = await agentExecutor.invoke({
  input: "Inspect https://example.com/login — what are the selectors for the form fields?"
});
console.log(result.output);
With vision: letting the agent actually see the screenshot
If you're using a vision-capable model (GPT-4o, Claude), you can pass the screenshot image directly:
import os  # required: os.environ is read below, but was missing in this snippet
import base64

import anthropic
import requests

# Anthropic() reads ANTHROPIC_API_KEY from the environment by default.
client = anthropic.Anthropic()
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
def screenshot_and_analyze(url: str, question: str) -> str:
    """
    Take a screenshot of *url* via PageBolt and ask Claude to analyze it.

    Args:
        url: Full URL of the page to capture.
        question: What to ask Claude about the rendered page.

    Returns:
        Claude's text answer about the screenshot.

    Raises:
        requests.HTTPError: if the screenshot request fails.
    """
    res = requests.post(
        "https://pagebolt.dev/api/v1/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True},
        timeout=60,  # bound the capture; fail instead of hanging the caller
    )
    res.raise_for_status()
    # Claude's vision API takes images as base64-encoded source blocks.
    img_b64 = base64.b64encode(res.content).decode()
    message = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": img_b64,
                    },
                },
                {"type": "text", "text": question},
            ],
        }],
    )
    # message.content is a list of content blocks; the first holds the text.
    return message.content[0].text
# Ask Claude to visually review a page
# (useful as a post-deploy smoke check against a staging environment).
analysis = screenshot_and_analyze(
    "https://staging.yourapp.com/pricing",
    "Does the pricing table look correct on desktop? Any obvious layout issues?"
)
print(analysis)
Real use case: agent-driven visual QA
# Agent checks staging after every deploy.
# The multi-line input gives the agent an explicit numbered checklist,
# which it satisfies by chaining the screenshot and inspect tools.
result = agent_executor.invoke({
    "input": """
I just deployed a new version to https://staging.yourapp.com.
1. Take a screenshot of the homepage
2. Take a screenshot of /pricing
3. Inspect /login and confirm the email, password, and submit button selectors exist
Report any visual issues or missing elements.
"""
})
print(result["output"])
The agent will screenshot both pages, inspect the login form, and return a natural-language QA report — no manual browser session needed.
Why an API instead of a browser package
LangChain has browser tools that wrap Playwright or Selenium. The problem in agent contexts:
- Memory: A headless browser uses 300–600MB. If you're running many agent sessions concurrently, that adds up fast.
- Reliability: Self-managed browser automation breaks on dynamic content, CAPTCHAs, and selector drift.
- Deployment: Adding Playwright to a Lambda function or a container adds complexity. An API call adds nothing.
The PageBolt API handles the browser — your agent just makes HTTP calls.
Try it free — 100 requests/month, no credit card. → Get started in 2 minutes
Top comments (0)