Give Your OpenAI Assistant Browser Tools with Function Calling
OpenAI's function calling lets your assistant invoke real tools mid-conversation. Here's how to add browser capabilities — screenshots, page inspection, PDF generation — so your assistant can actually see and interact with the web.
Define the tools
import OpenAI from "openai";
import fetch from "node-fetch";
const client = new OpenAI();
const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY;
// Tool definitions passed to the API.
// Every entry shares the same OpenAI function-tool envelope, so a tiny
// factory keeps the boilerplate in one place.
const makeTool = (name, description, properties, required) => ({
  type: "function",
  function: {
    name,
    description,
    parameters: { type: "object", properties, required },
  },
});

const tools = [
  makeTool(
    "take_screenshot",
    "Take a screenshot of a web page and return it as a base64-encoded image. Use this to visually inspect a page, check a layout, or verify that something rendered correctly.",
    {
      url: {
        type: "string",
        description: "The full URL to screenshot (e.g. https://example.com)",
      },
      fullPage: {
        type: "boolean",
        description: "Whether to capture the full scrollable page (default: false)",
      },
      darkMode: {
        type: "boolean",
        description: "Whether to render in dark mode",
      },
      viewportDevice: {
        type: "string",
        description: "Device to emulate (e.g. iphone_14_pro, ipad_pro, macbook_pro_14). Omit for desktop.",
      },
    },
    ["url"],
  ),
  makeTool(
    "inspect_page",
    "Get all interactive elements on a web page with their CSS selectors. Use this before attempting to automate a page — it returns buttons, inputs, links, and forms with unique selectors.",
    {
      url: {
        type: "string",
        description: "The full URL to inspect",
      },
    },
    ["url"],
  ),
  makeTool(
    "generate_pdf",
    "Generate a PDF of a web page or HTML content.",
    {
      url: {
        type: "string",
        description: "The URL to convert to PDF",
      },
      format: {
        type: "string",
        enum: ["A4", "Letter", "Legal"],
        description: "Paper format (default: A4)",
      },
    },
    ["url"],
  ),
];
Implement the tool handlers
/**
 * Execute one tool call against the PageBolt HTTP API and shape the result
 * for the model.
 *
 * @param {string} name - Tool name, as declared in `tools`.
 * @param {object} args - Parsed JSON arguments from the model's tool call.
 * @returns {Promise<object>} A JSON-serializable result payload.
 * @throws {Error} On an unknown tool name or a non-OK API response.
 */
async function handleToolCall(name, args) {
  const headers = {
    "x-api-key": PAGEBOLT_API_KEY,
    "Content-Type": "application/json",
  };

  // Shared POST helper; `label` prefixes the error message on failure.
  const post = async (path, body, label) => {
    const response = await fetch(`https://pagebolt.dev/api/v1/${path}`, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    });
    if (!response.ok) throw new Error(`${label} failed: ${await response.text()}`);
    return response;
  };

  switch (name) {
    case "take_screenshot": {
      // blockBanners goes first so the model's args can override it.
      const response = await post("screenshot", { blockBanners: true, ...args }, "Screenshot");
      // Return base64 for vision-capable models
      const buffer = Buffer.from(await response.arrayBuffer());
      return {
        type: "image",
        data: buffer.toString("base64"),
        mimeType: "image/png",
        size: buffer.length,
      };
    }
    case "inspect_page": {
      const response = await post("inspect", args, "Inspect");
      const data = await response.json();
      // Summarize for the model: cap at 40 elements, one line each.
      const summary = (data.elements || [])
        .slice(0, 40)
        .map((el) => `${el.tag}[${el.role}] "${(el.text || "").slice(0, 60)}" → ${el.selector}`)
        .join("\n");
      return { elementCount: data.elements?.length, elements: summary };
    }
    case "generate_pdf": {
      const response = await post("pdf", args, "PDF");
      const buffer = Buffer.from(await response.arrayBuffer());
      return { success: true, bytes: buffer.length, message: `PDF generated (${buffer.length} bytes)` };
    }
    default:
      throw new Error(`Unknown tool: ${name}`);
  }
}
Run the agent loop
/**
 * Run the tool-calling agent loop for a single user request.
 *
 * Calls the model repeatedly: whenever it responds with tool calls, they are
 * executed in parallel and their results appended as `tool` messages before
 * the next model call. The loop ends when the model replies with plain text.
 *
 * @param {string} userMessage - The user's request.
 * @param {number} [maxTurns=10] - Safety cap on model round-trips. Without it
 *   a model that keeps requesting tools would loop (and bill) forever.
 * @returns {Promise<string>} The assistant's final text reply.
 * @throws {Error} If no final answer arrives within `maxTurns` round-trips.
 */
async function runAgent(userMessage, maxTurns = 10) {
  const messages = [{ role: "user", content: userMessage }];
  for (let turn = 0; turn < maxTurns; turn++) {
    const response = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
    });
    const choice = response.choices[0];
    messages.push(choice.message);
    // If no tool calls, we're done
    if (!choice.message.tool_calls || choice.message.tool_calls.length === 0) {
      return choice.message.content;
    }
    // Execute tool calls in parallel; each result (or error) becomes a
    // `tool` message keyed by tool_call_id so the model can match them up.
    const toolResults = await Promise.all(
      choice.message.tool_calls.map(async (tc) => {
        const args = JSON.parse(tc.function.arguments);
        console.log(`→ calling ${tc.function.name}(${JSON.stringify(args)})`);
        try {
          const result = await handleToolCall(tc.function.name, args);
          return {
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify(result),
          };
        } catch (err) {
          // Report the failure to the model instead of aborting the loop.
          return {
            role: "tool",
            tool_call_id: tc.id,
            content: JSON.stringify({ error: err.message }),
          };
        }
      })
    );
    messages.push(...toolResults);
  }
  throw new Error(`Agent did not produce a final answer within ${maxTurns} turns`);
}
// Examples — run each prompt sequentially and print the final answer.
const examplePrompts = [
  "Screenshot https://news.ycombinator.com and tell me the top 5 story titles",
  "Inspect https://example.com/signup — what are the CSS selectors for all the form inputs?",
  "Screenshot https://pagebolt.dev on iPhone 14 Pro and describe the mobile layout",
];
for (const prompt of examplePrompts) {
  console.log(await runAgent(prompt));
}
Python version
import os
import json
import base64
import requests
from openai import OpenAI
client = OpenAI()
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
HEADERS = {"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"}
def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap a JSON-schema parameter spec in the OpenAI function-tool envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool definitions passed to the Chat Completions API.
tools = [
    _function_tool(
        "take_screenshot",
        "Take a screenshot of a web page. Returns base64 PNG.",
        {
            "url": {"type": "string"},
            "fullPage": {"type": "boolean"},
            "viewportDevice": {"type": "string"},
            "darkMode": {"type": "boolean"},
        },
        ["url"],
    ),
    _function_tool(
        "inspect_page",
        "Get interactive elements and selectors from a page.",
        {"url": {"type": "string"}},
        ["url"],
    ),
]
def handle_tool(name: str, args: dict) -> str:
    """Execute one tool call against the PageBolt API.

    Args:
        name: Tool name, as declared in ``tools``.
        args: Parsed JSON arguments from the model's tool call.

    Returns:
        A JSON string suitable as ``tool`` message content.

    Raises:
        ValueError: If ``name`` is not a known tool.
        requests.HTTPError: If the API responds with an error status.
    """
    if name == "take_screenshot":
        # blockBanners goes first so the model's args can override it.
        response = requests.post(
            "https://pagebolt.dev/api/v1/screenshot",
            headers=HEADERS,
            json={"blockBanners": True, **args},
        )
        response.raise_for_status()
        encoded = base64.b64encode(response.content).decode()
        return json.dumps({"screenshot_b64": encoded, "bytes": len(response.content)})

    if name == "inspect_page":
        response = requests.post("https://pagebolt.dev/api/v1/inspect", headers=HEADERS, json=args)
        response.raise_for_status()
        payload = response.json()
        # Summarize for the model: cap at 30 elements, one line apiece.
        rows = [
            f"{el['tag']}[{el.get('role','')}] '{(el.get('text') or '')[:60]}' → {el.get('selector','')}"
            for el in payload.get("elements", [])[:30]
        ]
        return json.dumps({"count": len(payload.get("elements", [])), "elements": "\n".join(rows)})

    raise ValueError(f"Unknown tool: {name}")
def run_agent(user_message: str, max_turns: int = 10) -> str:
    """Run the tool-calling agent loop for a single user request.

    Calls the model repeatedly: whenever it responds with tool calls, each one
    is executed and its result appended as a ``tool`` message before the next
    model call. The loop ends when the model replies with plain text.

    Args:
        user_message: The user's request.
        max_turns: Safety cap on model round-trips. Without it, a model that
            keeps requesting tools would loop (and bill) forever.

    Returns:
        The assistant's final text reply.

    Raises:
        RuntimeError: If no final answer arrives within ``max_turns`` turns.
    """
    messages = [{"role": "user", "content": user_message}]
    for _ in range(max_turns):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        choice = response.choices[0]
        messages.append(choice.message)
        # No tool calls means the model produced its final answer.
        if not choice.message.tool_calls:
            return choice.message.content
        for tc in choice.message.tool_calls:
            args = json.loads(tc.function.arguments)
            print(f"→ {tc.function.name}({args})")
            try:
                result = handle_tool(tc.function.name, args)
            except Exception as e:
                # Report the failure to the model instead of crashing the loop.
                result = json.dumps({"error": str(e)})
            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": result,
            })
    raise RuntimeError(f"Agent did not produce a final answer within {max_turns} turns")
# Run it — each prompt is answered sequentially and printed.
for example_prompt in (
    "Screenshot https://example.com and describe what you see",
    "Inspect https://example.com/login — list all the form inputs and their selectors",
):
    print(run_agent(example_prompt))
Streaming with tool calls
/**
 * Streaming variant of the agent loop: assistant text is echoed to stdout as
 * it arrives, and tool-call fragments are reassembled from the stream deltas.
 *
 * Fix vs. the non-streaming version's parity: tool failures are now caught
 * and returned to the model as `{ error }` tool messages (matching runAgent)
 * instead of rejecting the Promise.all and crashing the whole loop.
 *
 * @param {string} userMessage - The user's request.
 * @returns {Promise<string>} The assistant's final text reply.
 */
async function runAgentStreaming(userMessage) {
  const messages = [{ role: "user", content: userMessage }];
  while (true) {
    const stream = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
      stream: true,
    });
    // Rebuild the full assistant message from streamed deltas. Tool-call
    // deltas arrive keyed by `index`; id/name/arguments accumulate piecewise.
    let fullMessage = { role: "assistant", content: "", tool_calls: [] };
    let currentToolCall = null;
    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta;
      if (!delta) continue;
      if (delta.content) {
        process.stdout.write(delta.content);
        fullMessage.content += delta.content;
      }
      if (delta.tool_calls) {
        for (const tcDelta of delta.tool_calls) {
          if (tcDelta.index !== undefined) {
            currentToolCall = fullMessage.tool_calls[tcDelta.index] || {
              id: "", type: "function", function: { name: "", arguments: "" }
            };
            fullMessage.tool_calls[tcDelta.index] = currentToolCall;
          }
          if (tcDelta.id) currentToolCall.id = tcDelta.id;
          if (tcDelta.function?.name) currentToolCall.function.name += tcDelta.function.name;
          if (tcDelta.function?.arguments) currentToolCall.function.arguments += tcDelta.function.arguments;
        }
      }
    }
    messages.push(fullMessage);
    if (fullMessage.tool_calls.length === 0) {
      console.log("\n[done]");
      return fullMessage.content;
    }
    // Execute tools in parallel; errors become { error } results so one
    // failed call doesn't reject the batch and abort the conversation.
    const results = await Promise.all(
      fullMessage.tool_calls.map(async (tc) => {
        const args = JSON.parse(tc.function.arguments);
        try {
          const result = await handleToolCall(tc.function.name, args);
          return { role: "tool", tool_call_id: tc.id, content: JSON.stringify(result) };
        } catch (err) {
          return { role: "tool", tool_call_id: tc.id, content: JSON.stringify({ error: err.message }) };
        }
      })
    );
    messages.push(...results);
  }
}
When to use this pattern
- Agent QA loops — deploy → agent screenshots → agent reports issues
- Research agents — agent finds URLs → screenshots and summarizes content
- Documentation generators — agent screenshots your app and writes docs
- Monitoring — agent checks pages on a schedule and alerts on visual changes
- Form automation — agent inspects page, fills inputs, verifies result
The OpenAI tools pattern keeps the agent loop clean: define what tools exist, implement the handlers, let the model decide when to call them.
Try it free — 100 requests/month, no credit card. → Get started in 2 minutes
Top comments (0)