DEV Community

Custodia-Admin
Custodia-Admin

Posted on • Originally published at pagebolt.dev

How to add browser capabilities to a LangChain agent (screenshots, PDFs, page inspection)

How to Add Browser Capabilities to a LangChain Agent

LangChain agents can reason, plan, and call tools. What they can't do out of the box is see a web page, take a screenshot, or verify that a UI action actually worked.

Here's how to add browser tools to a LangChain agent using the PageBolt API — no Selenium, no Playwright, no browser to manage.

Python: adding tools to a LangChain agent

import os
import requests
import base64
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_openai import ChatOpenAI
from langchain.tools import tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
BASE_URL = "https://pagebolt.dev/api/v1"

@tool
def take_screenshot(url: str) -> str:
    """
    Take a screenshot of a web page. Returns a description of what was captured.
    Use this to visually verify a page, check layouts, or inspect rendered content.
    Input: a full URL (e.g. https://example.com)
    """
    res = requests.post(
        f"{BASE_URL}/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True, "fullPage": True},
        # requests has NO default timeout — without one a slow render blocks the
        # agent's tool call indefinitely. Full-page renders can be slow, so 60s.
        timeout=60,
    )
    res.raise_for_status()
    # Return base64 for vision-capable models, or a confirmation string.
    # Only a 100-char prefix of the image data is included so the tool output
    # stays small. NOTE: a text-only agent cannot "see" this — pass the full
    # image to a multimodal model (see the vision section) for real analysis.
    img_b64 = base64.b64encode(res.content).decode()
    return f"[screenshot captured: {len(res.content)} bytes, base64 encoded]\ndata:image/png;base64,{img_b64[:100]}..."


@tool
def inspect_page(url: str) -> str:
    """
    Inspect a web page and return all interactive elements with their CSS selectors.
    Use this before automating a page — it finds the correct selectors for buttons,
    inputs, links, and forms. Returns a structured list of elements.
    Input: a full URL (e.g. https://example.com/login)
    """
    res = requests.post(
        f"{BASE_URL}/inspect",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url},
        timeout=60,  # requests has no default timeout; don't let a slow page hang the agent
    )
    res.raise_for_status()
    data = res.json()
    elements = data.get("elements", [])

    lines = []
    for el in elements[:30]:  # Limit to first 30 elements to stay within context
        tag = el.get("tag", "")
        role = el.get("role", "")
        text = el.get("text", "")[:80]  # truncate long labels to keep lines readable
        selector = el.get("selector", "")
        # " → " separates the label from its selector (same format the JS example
        # uses). Without the separator, text and selector run together, e.g.
        # "button [button] 'Sign in'#signin" — ambiguous for the LLM to parse.
        lines.append(f"{tag} [{role}] '{text}' → {selector}")

    return f"Found {len(elements)} elements on {url}:\n" + "\n".join(lines)


@tool
def generate_pdf(url: str) -> str:
    """
    Generate a PDF of a web page and save it to ./output.pdf.
    Use this to capture documents, invoices, reports, or any printable page.
    Input: a full URL (e.g. https://example.com/invoice/123)
    """
    res = requests.post(
        f"{BASE_URL}/pdf",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url},
        timeout=120,  # PDF rendering can be slow; requests would otherwise wait forever
    )
    res.raise_for_status()
    # NOTE(review): writes to the current working directory and silently
    # overwrites any previous output.pdf — fine for a demo; parameterize the
    # path (or return the bytes) in production.
    with open("output.pdf", "wb") as f:
        f.write(res.content)
    return f"PDF saved to output.pdf ({len(res.content)} bytes)"


@tool
def run_browser_sequence(url: str, task_description: str) -> str:
    """
    Navigate to a URL and describe what you want to do. This tool will attempt
    a multi-step browser sequence (click, fill, submit) and return a screenshot
    of the final state.
    Input: url (starting page), task_description (what to do on the page)
    Note: For best results, use inspect_page first to get accurate selectors.
    """
    # This simplified version just screenshots the URL.
    # In production, you'd parse task_description into steps.
    res = requests.post(
        f"{BASE_URL}/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True},
        timeout=60,  # requests has no default timeout; never hang the agent's tool call
    )
    res.raise_for_status()
    return f"Navigated to {url} and captured result ({len(res.content)} bytes). Task attempted: {task_description}"


# Build the agent: register the four browser tools with an OpenAI tools agent.
tools = [take_screenshot, inspect_page, generate_pdf, run_browser_sequence]

# temperature=0 keeps tool selection deterministic.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a web research and automation agent. You have browser tools available:
- take_screenshot: Visually capture any web page
- inspect_page: Get all interactive elements and selectors from a page
- generate_pdf: Save any page as a PDF
- run_browser_sequence: Automate multi-step interactions on a page

Always inspect a page before automating it to get accurate selectors.
When taking screenshots, describe what you observe in detail."""),
    ("human", "{input}"),
    # agent_scratchpad carries the agent's intermediate tool calls and results.
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

agent = create_openai_tools_agent(llm, tools, prompt)
# verbose=True prints each tool call — handy while developing, noisy in prod.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
Enter fullscreen mode Exit fullscreen mode

Run it

# Visual check — the agent calls take_screenshot, then answers from the result
result = agent_executor.invoke({
    "input": "Screenshot https://news.ycombinator.com and tell me the top 3 stories"
})
print(result["output"])

# Inspect before automating — returns the selectors later automation would need
result = agent_executor.invoke({
    "input": "Inspect https://example.com/login and tell me the selectors for the email and password fields"
})
print(result["output"])

# PDF capture — the generate_pdf tool writes ./output.pdf as a side effect
result = agent_executor.invoke({
    "input": "Generate a PDF of https://pagebolt.dev/docs and save it"
})
print(result["output"])
Enter fullscreen mode Exit fullscreen mode

JavaScript / LangChain.js

import { ChatOpenAI } from "@langchain/openai";
import { AgentExecutor, createOpenAIToolsAgent } from "langchain/agents";
import { DynamicTool } from "@langchain/core/tools";
import { ChatPromptTemplate, MessagesPlaceholder } from "@langchain/core/prompts";

const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY!;
const BASE_URL = "https://pagebolt.dev/api/v1";

// Every PageBolt endpoint takes the same shape of request: a JSON POST with
// the API key in the x-api-key header. Centralize that here.
async function pageboltPost(endpoint: string, body: object): Promise<Response> {
  const headers = {
    "x-api-key": PAGEBOLT_API_KEY,
    "Content-Type": "application/json",
  };
  const payload = JSON.stringify(body);
  return fetch(`${BASE_URL}${endpoint}`, { method: "POST", headers, body: payload });
}

// Screenshot tool: full-page capture with cookie banners blocked; the agent
// gets back a confirmation string with the byte count.
const screenshotTool = new DynamicTool({
  name: "take_screenshot",
  description: "Take a screenshot of a web page. Input: a full URL. Returns confirmation of capture.",
  func: async (url: string) => {
    const response = await pageboltPost("/screenshot", { url, blockBanners: true, fullPage: true });
    if (!response.ok) {
      throw new Error(`Screenshot failed: ${response.status}`);
    }
    const buffer = await response.arrayBuffer();
    return `Screenshot captured: ${buffer.byteLength} bytes from ${url}`;
  },
});

// Inspection tool: lists interactive elements with their CSS selectors so the
// agent can look up accurate selectors before attempting any automation.
const inspectTool = new DynamicTool({
  name: "inspect_page",
  description: "Get all interactive elements and CSS selectors from a page. Use before automating. Input: a full URL.",
  func: async (url: string) => {
    const response = await pageboltPost("/inspect", { url });
    if (!response.ok) {
      throw new Error(`Inspect failed: ${response.status}`);
    }
    const data = await response.json();
    // Cap the listing at 25 entries so the tool output fits in model context.
    const shown: any[] = (data.elements || []).slice(0, 25);
    const lines: string[] = [];
    for (const el of shown) {
      const label = (el.text || "").slice(0, 60);
      lines.push(`${el.tag} [${el.role}] '${label}' → ${el.selector}`);
    }
    return `${data.elements?.length ?? 0} elements found:\n${lines.join("\n")}`;
  },
});

// PDF tool: renders the page server-side; only the size is reported back,
// since a LangChain tool must return a string.
const pdfTool = new DynamicTool({
  name: "generate_pdf",
  description: "Generate a PDF of a web page. Input: a full URL.",
  func: async (url: string) => {
    const response = await pageboltPost("/pdf", { url });
    if (!response.ok) {
      throw new Error(`PDF failed: ${response.status}`);
    }
    const pdfBytes = await response.arrayBuffer();
    return `PDF generated: ${pdfBytes.byteLength} bytes from ${url}`;
  },
});

// Register the three browser tools with the agent.
const tools = [screenshotTool, inspectTool, pdfTool];

// temperature: 0 keeps tool selection deterministic.
const llm = new ChatOpenAI({ model: "gpt-4o", temperature: 0 });

const prompt = ChatPromptTemplate.fromMessages([
  ["system", `You are a web research and automation agent with browser tools.
Use take_screenshot to visually capture pages.
Use inspect_page to find CSS selectors before automating.
Use generate_pdf to capture documents.`],
  ["human", "{input}"],
  // agent_scratchpad carries the agent's intermediate tool calls and results.
  new MessagesPlaceholder("agent_scratchpad"),
]);

// Top-level await: this file must run as an ES module.
const agent = await createOpenAIToolsAgent({ llm, tools, prompt });
const agentExecutor = new AgentExecutor({ agent, tools, verbose: true });

// Use it — the agent decides which tool(s) to call based on the input.
const result = await agentExecutor.invoke({
  input: "Inspect https://example.com/login — what are the selectors for the form fields?"
});
console.log(result.output);
Enter fullscreen mode Exit fullscreen mode

With vision: letting the agent actually see the screenshot

If you're using a vision-capable model (GPT-4o, Claude), you can pass the screenshot image directly:

import os  # was missing from this snippet: os.environ is used below (NameError otherwise)
import base64

import anthropic
import requests

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]

def screenshot_and_analyze(url: str, question: str) -> str:
    """
    Take a screenshot of ``url`` via PageBolt and ask Claude ``question`` about it.

    Args:
        url: Full URL of the page to capture.
        question: Natural-language question about the rendered page.

    Returns:
        Claude's text answer.

    Raises:
        requests.HTTPError: If the screenshot request fails.
    """
    res = requests.post(
        "https://pagebolt.dev/api/v1/screenshot",
        headers={"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"},
        json={"url": url, "blockBanners": True},
        timeout=60,  # requests has no default timeout; don't hang on a slow render
    )
    res.raise_for_status()
    # The Anthropic API takes images as base64 inside a content block.
    img_b64 = base64.b64encode(res.content).decode()

    message = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{
            "role": "user",
            # Image block first, then the text question.
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": img_b64,
                    },
                },
                {"type": "text", "text": question},
            ],
        }],
    )
    # content is a list of blocks; the first is the text answer here.
    return message.content[0].text

# Ask Claude to visually review a page (the screenshot itself is rendered
# server-side by PageBolt; only the image bytes travel to the model)
analysis = screenshot_and_analyze(
    "https://staging.yourapp.com/pricing",
    "Does the pricing table look correct on desktop? Any obvious layout issues?"
)
print(analysis)
Enter fullscreen mode Exit fullscreen mode

Real use case: agent-driven visual QA

# Agent checks staging after every deploy: the multi-step instruction below is
# broken down by the agent into individual tool calls (screenshots + inspect)
result = agent_executor.invoke({
    "input": """
    I just deployed a new version to https://staging.yourapp.com.
    1. Take a screenshot of the homepage
    2. Take a screenshot of /pricing
    3. Inspect /login and confirm the email, password, and submit button selectors exist
    Report any visual issues or missing elements.
    """
})
print(result["output"])
Enter fullscreen mode Exit fullscreen mode

The agent will screenshot both pages, inspect the login form, and return a natural-language QA report — no manual browser session needed.

Why an API instead of a browser package

LangChain has browser tools that wrap Playwright or Selenium. The problem in agent contexts:

  • Memory: A headless browser uses 300–600MB. If you're running many agent sessions concurrently, that adds up fast.
  • Reliability: Browser automation breaks on dynamic content, CAPTCHAs, and selector drift.
  • Deployment: Adding Playwright to a Lambda function or a container adds complexity. An API call adds nothing.

The PageBolt API handles the browser — your agent just makes HTTP calls.


Try it free — 100 requests/month, no credit card. → Get started in 2 minutes

Top comments (0)