DEV Community

Custodia-Admin
Custodia-Admin

Posted on • Originally published at pagebolt.dev

Give your OpenAI assistant browser tools with function calling

Give Your OpenAI Assistant Browser Tools with Function Calling

OpenAI's function calling lets your assistant invoke real tools mid-conversation. Here's how to add browser capabilities — screenshots, page inspection, PDF generation — so your assistant can actually see and interact with the web.

Define the tools

import OpenAI from "openai";
import fetch from "node-fetch";

// Shared OpenAI SDK client (constructed with no explicit options); the agent
// loops call `client.chat.completions.create` on it.
const client = new OpenAI();
// PageBolt API key read from the environment. It is sent as the `x-api-key`
// header on PageBolt requests; if the variable is unset this is `undefined`.
const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY;

/**
 * Build one entry for the `tools` request parameter.
 *
 * Every tool in this file takes a single JSON object whose only required
 * field is `url`, so only the name, description and property schemas vary.
 *
 * @param {string} name - Function name the model will call.
 * @param {string} description - Guidance for the model on when to use the tool.
 * @param {object} properties - JSON Schema for each accepted argument.
 * @returns {object} A `{ type: "function", function: {...} }` tool definition.
 */
function defineFunctionTool(name, description, properties) {
  return {
    type: "function",
    function: {
      name,
      description,
      parameters: { type: "object", properties, required: ["url"] },
    },
  };
}

// Tool definitions passed to the API
const tools = [
  defineFunctionTool(
    "take_screenshot",
    "Take a screenshot of a web page and return it as a base64-encoded image. Use this to visually inspect a page, check a layout, or verify that something rendered correctly.",
    {
      url: { type: "string", description: "The full URL to screenshot (e.g. https://example.com)" },
      fullPage: { type: "boolean", description: "Whether to capture the full scrollable page (default: false)" },
      darkMode: { type: "boolean", description: "Whether to render in dark mode" },
      viewportDevice: {
        type: "string",
        description: "Device to emulate (e.g. iphone_14_pro, ipad_pro, macbook_pro_14). Omit for desktop.",
      },
    }
  ),
  defineFunctionTool(
    "inspect_page",
    "Get all interactive elements on a web page with their CSS selectors. Use this before attempting to automate a page — it returns buttons, inputs, links, and forms with unique selectors.",
    {
      url: { type: "string", description: "The full URL to inspect" },
    }
  ),
  defineFunctionTool("generate_pdf", "Generate a PDF of a web page or HTML content.", {
    url: { type: "string", description: "The URL to convert to PDF" },
    format: {
      type: "string",
      enum: ["A4", "Letter", "Legal"],
      description: "Paper format (default: A4)",
    },
  }),
];
Enter fullscreen mode Exit fullscreen mode

Implement the tool handlers

/**
 * POST a JSON payload to a PageBolt v1 endpoint.
 *
 * @param {string} endpoint - Path segment after `/api/v1/` (e.g. "screenshot").
 * @param {object} payload - Request body; serialized with `JSON.stringify`.
 * @param {string} label - Prefix used in the error message on a non-OK response.
 * @returns {Promise<Response>} The fetch response (only when `res.ok` is true).
 * @throws {Error} `"<label> failed: <response text>"` when the response is not OK.
 */
async function postToPageBolt(endpoint, payload, label) {
  const res = await fetch(`https://pagebolt.dev/api/v1/${endpoint}`, {
    method: "POST",
    headers: {
      "x-api-key": PAGEBOLT_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    throw new Error(`${label} failed: ${await res.text()}`);
  }
  return res;
}

/**
 * Execute one tool call requested by the model.
 *
 * @param {string} name - Tool name: "take_screenshot", "inspect_page" or "generate_pdf".
 * @param {object} args - Parsed arguments for the tool; forwarded as the request body.
 * @returns {Promise<object>} A JSON-serializable result:
 *   - take_screenshot: `{ type, data, mimeType, size }` with `data` base64-encoded
 *   - inspect_page: `{ elementCount, elements }` where `elements` is a newline-joined
 *     summary of at most the first 40 elements
 *   - generate_pdf: `{ success, bytes, message }`
 * @throws {Error} When the PageBolt request is not OK, or `name` is not a known tool.
 */
async function handleToolCall(name, args) {
  if (name === "take_screenshot") {
    // `blockBanners` is listed first so a value supplied in `args` overrides the default.
    const res = await postToPageBolt("screenshot", { blockBanners: true, ...args }, "Screenshot");

    // NOTE(review): the agent loops JSON.stringify this result into a tool message,
    // so the whole base64 string reaches the model as text — confirm that is intended.
    const image = Buffer.from(await res.arrayBuffer());
    return {
      type: "image",
      data: image.toString("base64"),
      mimeType: "image/png",
      size: image.length,
    };
  }

  if (name === "inspect_page") {
    const res = await postToPageBolt("inspect", args, "Inspect");
    const data = await res.json();

    // One normalized list feeds both the count and the summary, so a response
    // without `elements` reports 0 rather than an undefined count.
    const allElements = data.elements || [];
    const summary = allElements
      .slice(0, 40)
      .map((el) => `${el.tag}[${el.role}] "${(el.text || "").slice(0, 60)}" → ${el.selector}`)
      .join("\n");
    return { elementCount: allElements.length, elements: summary };
  }

  if (name === "generate_pdf") {
    const res = await postToPageBolt("pdf", args, "PDF");
    // Only the size is reported back; the PDF bytes themselves are discarded here.
    const pdf = Buffer.from(await res.arrayBuffer());
    return { success: true, bytes: pdf.length, message: `PDF generated (${pdf.length} bytes)` };
  }

  throw new Error(`Unknown tool: ${name}`);
}
Enter fullscreen mode Exit fullscreen mode

Run the agent loop

/**
 * Run a tool-calling conversation until the model answers without requesting tools.
 *
 * Each turn sends the accumulated messages plus the tool definitions to the model.
 * If the reply contains tool calls, they are executed concurrently and their
 * results are appended as `role: "tool"` messages before the next turn.
 *
 * @param {string} userMessage - The initial user prompt.
 * @returns {Promise<string|null>} The `content` of the first assistant message
 *   that carries no tool calls.
 */
async function runAgent(userMessage) {
  const messages = [{ role: "user", content: userMessage }];

  while (true) {
    const response = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
    });

    const { message } = response.choices[0];
    messages.push(message);

    // If no tool calls, we're done
    if (!message.tool_calls || message.tool_calls.length === 0) {
      return message.content;
    }

    // Execute tool calls and add results
    const toolResults = await Promise.all(
      message.tool_calls.map(async (tc) => {
        let payload;
        try {
          // Parsing happens inside the try: the arguments string is model-generated
          // and may be invalid JSON. Reporting that as this call's result keeps one
          // bad call from rejecting Promise.all and aborting the whole run.
          const args = JSON.parse(tc.function.arguments);
          console.log(`→ calling ${tc.function.name}(${JSON.stringify(args)})`);
          payload = await handleToolCall(tc.function.name, args);
        } catch (err) {
          payload = { error: err.message };
        }
        return {
          role: "tool",
          tool_call_id: tc.id,
          content: JSON.stringify(payload),
        };
      })
    );

    messages.push(...toolResults);
  }
}

// Examples: each prompt is run to completion, one after another, and the
// model's final answer is printed.
const examplePrompts = [
  "Screenshot https://news.ycombinator.com and tell me the top 5 story titles",
  "Inspect https://example.com/signup — what are the CSS selectors for all the form inputs?",
  "Screenshot https://pagebolt.dev on iPhone 14 Pro and describe the mobile layout",
];

for (const prompt of examplePrompts) {
  const answer = await runAgent(prompt);
  console.log(answer);
}
Enter fullscreen mode Exit fullscreen mode

Python version (screenshot and inspect tools only)

import os
import json
import base64
import requests
from openai import OpenAI

# OpenAI SDK client, constructed with no explicit arguments; run_agent calls
# client.chat.completions.create on it.
client = OpenAI()
# Read with os.environ[...], so a missing PAGEBOLT_API_KEY raises KeyError here
# rather than failing later on the first request.
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]
# Headers passed to every requests.post call in handle_tool.
HEADERS = {"x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json"}

def _function_tool(name, description, properties):
    """Build one function-tool definition whose only required argument is ``url``.

    Args:
        name: Function name the model will call.
        description: Short guidance for the model on what the tool does.
        properties: JSON Schema mapping for the accepted arguments.

    Returns:
        A ``{"type": "function", "function": {...}}`` dict.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": ["url"],
            },
        },
    }


# Tool definitions passed as `tools=` to the chat completions call.
tools = [
    _function_tool(
        "take_screenshot",
        "Take a screenshot of a web page. Returns base64 PNG.",
        {
            "url": {"type": "string"},
            "fullPage": {"type": "boolean"},
            "viewportDevice": {"type": "string"},
            "darkMode": {"type": "boolean"},
        },
    ),
    _function_tool(
        "inspect_page",
        "Get interactive elements and selectors from a page.",
        {"url": {"type": "string"}},
    ),
]


def handle_tool(name: str, args: dict) -> str:
    """Execute one tool call against the PageBolt API and return a JSON string.

    Args:
        name: Tool name, either "take_screenshot" or "inspect_page".
        args: Parsed tool arguments; forwarded as the JSON request body.

    Returns:
        A JSON-encoded string:
        - take_screenshot: {"screenshot_b64": <base64 of the response body>, "bytes": <size>}
        - inspect_page: {"count": <total elements>, "elements": <newline-joined
          summary of at most the first 30 elements>}

    Raises:
        requests.HTTPError: When the API responds with an error status.
        requests.Timeout: When the API does not respond within the timeout.
        ValueError: When ``name`` is not a known tool.
    """
    # Without a timeout, requests can block indefinitely on an unresponsive server.
    timeout_seconds = 60

    if name == "take_screenshot":
        res = requests.post(
            "https://pagebolt.dev/api/v1/screenshot",
            headers=HEADERS,
            # blockBanners comes first so a value supplied in args overrides the default.
            json={"blockBanners": True, **args},
            timeout=timeout_seconds,
        )
        res.raise_for_status()
        b64 = base64.b64encode(res.content).decode()
        return json.dumps({"screenshot_b64": b64, "bytes": len(res.content)})

    if name == "inspect_page":
        res = requests.post(
            "https://pagebolt.dev/api/v1/inspect",
            headers=HEADERS,
            json=args,
            timeout=timeout_seconds,
        )
        res.raise_for_status()
        data = res.json()
        # `or []` also covers an explicit null, which .get(key, []) would pass through.
        all_elements = data.get("elements") or []
        # Every field is read with .get so one sparse element cannot raise KeyError;
        # " → " separates the visible text from the selector.
        lines = [
            f"{el.get('tag', '')}[{el.get('role', '')}] "
            f"'{(el.get('text') or '')[:60]}' → {el.get('selector', '')}"
            for el in all_elements[:30]
        ]
        return json.dumps({"count": len(all_elements), "elements": "\n".join(lines)})

    raise ValueError(f"Unknown tool: {name}")


def run_agent(user_message: str) -> str:
    """Run a tool-calling conversation until the model answers without tools.

    Each turn sends the accumulated messages and the tool definitions to the
    model. When the reply contains tool calls, each is executed in order and
    its result is appended as a ``role: "tool"`` message before the next turn.

    Args:
        user_message: The initial user prompt.

    Returns:
        The ``content`` of the first assistant message with no tool calls.
    """
    messages = [{"role": "user", "content": user_message}]

    while True:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        message = response.choices[0].message
        messages.append(message)

        if not message.tool_calls:
            return message.content

        for tc in message.tool_calls:
            try:
                # Parsing is inside the try: the arguments string is generated by
                # the model and may be invalid JSON. Reporting the failure as this
                # call's result keeps the loop alive and answers the tool call.
                args = json.loads(tc.function.arguments)
                print(f"{tc.function.name}({args})")
                result = handle_tool(tc.function.name, args)
            except Exception as e:
                result = json.dumps({"error": str(e)})

            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": result,
            })


# Run it: each prompt is executed in order and the final answer printed.
for prompt in (
    "Screenshot https://example.com and describe what you see",
    "Inspect https://example.com/login — list all the form inputs and their selectors",
):
    print(run_agent(prompt))
Enter fullscreen mode Exit fullscreen mode

Streaming with tool calls

/**
 * Streaming variant of the agent loop: assistant text is written to stdout as
 * it arrives, and streamed tool-call fragments are reassembled before execution.
 *
 * @param {string} userMessage - The initial user prompt.
 * @returns {Promise<string>} The accumulated text of the first assistant turn
 *   that contains no tool calls.
 */
async function runAgentStreaming(userMessage) {
  const messages = [{ role: "user", content: userMessage }];

  while (true) {
    const stream = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
      stream: true,
    });

    // Rebuilt from deltas so it can be appended to `messages` like a normal reply.
    const fullMessage = { role: "assistant", content: "", tool_calls: [] };
    let currentToolCall = null;

    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta;
      if (!delta) continue;

      if (delta.content) {
        process.stdout.write(delta.content);
        fullMessage.content += delta.content;
      }

      for (const tcDelta of delta.tool_calls || []) {
        // `index` identifies which tool call a fragment belongs to; the slot is
        // created the first time that index is seen.
        if (tcDelta.index !== undefined) {
          currentToolCall = fullMessage.tool_calls[tcDelta.index] || {
            id: "",
            type: "function",
            function: { name: "", arguments: "" },
          };
          fullMessage.tool_calls[tcDelta.index] = currentToolCall;
        }
        if (tcDelta.id) currentToolCall.id = tcDelta.id;
        if (tcDelta.function?.name) currentToolCall.function.name += tcDelta.function.name;
        // The arguments JSON arrives in pieces and is only parseable once complete.
        if (tcDelta.function?.arguments) currentToolCall.function.arguments += tcDelta.function.arguments;
      }
    }

    messages.push(fullMessage);

    if (fullMessage.tool_calls.length === 0) {
      console.log("\n[done]");
      return fullMessage.content;
    }

    // Execute tools. Failures (invalid argument JSON or a throwing handler) are
    // reported back to the model as `{ error }` results, as runAgent does,
    // instead of rejecting the whole run.
    const results = await Promise.all(
      fullMessage.tool_calls.map(async (tc) => {
        let payload;
        try {
          const args = JSON.parse(tc.function.arguments);
          payload = await handleToolCall(tc.function.name, args);
        } catch (err) {
          payload = { error: err.message };
        }
        return { role: "tool", tool_call_id: tc.id, content: JSON.stringify(payload) };
      })
    );
    messages.push(...results);
  }
}
Enter fullscreen mode Exit fullscreen mode

When to use this pattern

  • Agent QA loops — deploy → agent screenshots → agent reports issues
  • Research agents — agent finds URLs → screenshots and summarizes content
  • Documentation generators — agent screenshots your app and writes docs
  • Monitoring — agent checks pages on a schedule and alerts on visual changes
  • Form automation — agent inspects page, fills inputs, verifies result

The OpenAI tools pattern keeps the agent loop clean: define what tools exist, implement the handlers, let the model decide when to call them.


Try it free — 100 requests/month, no credit card. → Get started in 2 minutes

Top comments (0)