Give your OpenAI assistant browser tools with function calling

#ai #openai #python #javascript

Give Your OpenAI Assistant Browser Tools with Function Calling

OpenAI's function calling lets your assistant invoke real tools mid-conversation. Here's how to add browser capabilities — screenshots, page inspection, PDF generation — so your assistant can actually see and interact with the web.

Define the tools

import OpenAI from "openai";
import fetch from "node-fetch";

// OpenAI SDK client, constructed with no explicit options.
const client = new OpenAI();

// PageBolt API key taken from the environment; the tool handlers send it
// as the `x-api-key` request header.
const { PAGEBOLT_API_KEY } = process.env;

// Tool definitions passed to the API.
// Each entry below is a compact spec (name, description, argument properties);
// the trailing `.map` wraps it in the function-calling envelope. Every tool
// takes a JSON object whose only required field is `url`.
const tools = [
  {
    name: "take_screenshot",
    description: "Take a screenshot of a web page and return it as a base64-encoded image. Use this to visually inspect a page, check a layout, or verify that something rendered correctly.",
    properties: {
      url: { type: "string", description: "The full URL to screenshot (e.g. https://example.com)" },
      fullPage: { type: "boolean", description: "Whether to capture the full scrollable page (default: false)" },
      darkMode: { type: "boolean", description: "Whether to render in dark mode" },
      viewportDevice: { type: "string", description: "Device to emulate (e.g. iphone_14_pro, ipad_pro, macbook_pro_14). Omit for desktop." },
    },
  },
  {
    name: "inspect_page",
    description: "Get all interactive elements on a web page with their CSS selectors. Use this before attempting to automate a page — it returns buttons, inputs, links, and forms with unique selectors.",
    properties: {
      url: { type: "string", description: "The full URL to inspect" },
    },
  },
  {
    name: "generate_pdf",
    description: "Generate a PDF of a web page or HTML content.",
    properties: {
      url: { type: "string", description: "The URL to convert to PDF" },
      format: { type: "string", enum: ["A4", "Letter", "Legal"], description: "Paper format (default: A4)" },
    },
  },
].map(({ name, description, properties }) => ({
  type: "function",
  function: {
    name,
    description,
    parameters: {
      type: "object",
      properties,
      required: ["url"],
    },
  },
}));

Implement the tool handlers

/**
 * Execute one browser tool by POSTing to the matching PageBolt endpoint.
 *
 * @param {string} name - Tool name: "take_screenshot", "inspect_page" or "generate_pdf".
 * @param {object} args - Arguments parsed from the model's tool call; sent as the JSON body.
 * @returns {Promise<object>} A JSON-serializable result:
 *   - take_screenshot: `{ type: "image", data: <base64>, mimeType: "image/png", size }`
 *   - inspect_page:    `{ elementCount, elements }` where `elements` is a newline-joined
 *                      summary of at most 40 entries
 *   - generate_pdf:    `{ success, bytes, message }`
 * @throws {Error} If the endpoint answers with a non-2xx status or `name` is unknown.
 */
async function handleToolCall(name, args) {
  const headers = {
    "x-api-key": PAGEBOLT_API_KEY,
    "Content-Type": "application/json",
  };

  // The three endpoints differ only in path, body and the label used in the
  // error message, so they share one POST helper.
  const post = async (path, body, label) => {
    const res = await fetch(`https://pagebolt.dev/api/v1/${path}`, {
      method: "POST",
      headers,
      body: JSON.stringify(body),
    });
    if (!res.ok) throw new Error(`${label} failed: ${await res.text()}`);
    return res;
  };

  if (name === "take_screenshot") {
    // `blockBanners` is only a default: a value supplied in `args` wins.
    const res = await post("screenshot", { blockBanners: true, ...args }, "Screenshot");

    // Return base64 for vision-capable models
    const buffer = Buffer.from(await res.arrayBuffer());
    return {
      type: "image",
      data: buffer.toString("base64"),
      mimeType: "image/png",
      size: buffer.length,
    };
  }

  if (name === "inspect_page") {
    const res = await post("inspect", args, "Inspect");
    const data = await res.json();

    // Tolerate a response without an `elements` array so the count is always
    // a number (an `undefined` count would vanish when the result is stringified).
    const allElements = Array.isArray(data?.elements) ? data.elements : [];

    // Summarize for the model: cap at 40 elements and 60 characters of text.
    // Absent fields render as empty strings rather than the text "undefined".
    const summary = allElements
      .slice(0, 40)
      .map(
        (el) =>
          `${el.tag ?? ""}[${el.role ?? ""}] "${(el.text || "").slice(0, 60)}" → ${el.selector ?? ""}`,
      )
      .join("\n");
    return { elementCount: allElements.length, elements: summary };
  }

  if (name === "generate_pdf") {
    const res = await post("pdf", args, "PDF");
    const buffer = Buffer.from(await res.arrayBuffer());
    return { success: true, bytes: buffer.length, message: `PDF generated (${buffer.length} bytes)` };
  }

  throw new Error(`Unknown tool: ${name}`);
}

Run the agent loop

/**
 * Run a tool-calling conversation until the model produces a final answer.
 *
 * Each round sends the transcript to the model. If the reply contains tool
 * calls they are executed in parallel and their results appended before the
 * next round; otherwise the reply's text is returned.
 *
 * @param {string} userMessage - The user's request.
 * @param {number} [maxIterations=10] - Upper bound on model round-trips, so a
 *   model that keeps requesting tools cannot loop (and bill) forever.
 * @returns {Promise<string|null>} Content of the final assistant message.
 * @throws {Error} If no final answer is produced within `maxIterations` rounds,
 *   or if the OpenAI request itself fails.
 */
async function runAgent(userMessage, maxIterations = 10) {
  const messages = [{ role: "user", content: userMessage }];

  for (let turn = 0; turn < maxIterations; turn += 1) {
    const response = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
    });

    const { message } = response.choices[0];
    messages.push(message);

    // If no tool calls, we're done
    if (!message.tool_calls || message.tool_calls.length === 0) {
      return message.content;
    }

    // Execute tool calls in parallel. Argument parsing happens inside the
    // try block: the arguments are model-generated text, and one malformed
    // call must become an error result instead of rejecting the whole batch.
    const outcomes = await Promise.all(
      message.tool_calls.map(async (tc) => {
        let payload;
        let image = null;
        try {
          const args = JSON.parse(tc.function.arguments);
          console.log(`→ calling ${tc.function.name}(${JSON.stringify(args)})`);

          const result = await handleToolCall(tc.function.name, args);
          if (result?.type === "image" && typeof result.data === "string") {
            // Tool messages carry text only, so base64 placed there is never
            // seen as an image and just floods the context. Keep the metadata
            // here and hand the pixels over as an image part below.
            image = {
              type: "image_url",
              image_url: { url: `data:${result.mimeType};base64,${result.data}` },
            };
            payload = {
              type: "image",
              mimeType: result.mimeType,
              size: result.size,
              note: "Image attached in the following user message.",
            };
          } else {
            payload = result;
          }
        } catch (err) {
          payload = { error: err.message };
        }
        return {
          toolMessage: { role: "tool", tool_call_id: tc.id, content: JSON.stringify(payload) },
          image,
        };
      })
    );

    // Every tool call must be answered before any other message follows.
    messages.push(...outcomes.map((o) => o.toolMessage));

    const images = outcomes.map((o) => o.image).filter(Boolean);
    if (images.length > 0) {
      messages.push({
        role: "user",
        content: [
          { type: "text", text: "Screenshot(s) returned by the tool call(s) above:" },
          ...images,
        ],
      });
    }
  }

  throw new Error(`Agent did not finish within ${maxIterations} iterations`);
}

// Examples: each prompt is run to completion and its answer printed before
// the next one starts.
const examplePrompts = [
  "Screenshot https://news.ycombinator.com and tell me the top 5 story titles",
  "Inspect https://example.com/signup — what are the CSS selectors for all the form inputs?",
  "Screenshot https://pagebolt.dev on iPhone 14 Pro and describe the mobile layout",
];

for (const prompt of examplePrompts) {
  console.log(await runAgent(prompt));
}

Python version (screenshot and inspect tools only; generate_pdf can be added the same way)

import os
import json
import base64
import requests
from openai import OpenAI

# OpenAI SDK client, built with no explicit arguments.
client = OpenAI()

# PageBolt API key; indexing os.environ raises KeyError if the variable is unset.
PAGEBOLT_API_KEY = os.environ["PAGEBOLT_API_KEY"]

# Headers sent with every PageBolt request.
HEADERS = {
    "x-api-key": PAGEBOLT_API_KEY,
    "Content-Type": "application/json",
}

# Function-calling schemas advertised to the model. Each (name, description,
# properties) spec is wrapped in the same envelope; `url` is the only
# required argument of every tool.
tools = [
    {
        "type": "function",
        "function": {
            "name": tool_name,
            "description": tool_description,
            "parameters": {
                "type": "object",
                "properties": tool_properties,
                "required": ["url"],
            },
        },
    }
    for tool_name, tool_description, tool_properties in (
        (
            "take_screenshot",
            "Take a screenshot of a web page. Returns base64 PNG.",
            {
                "url": {"type": "string"},
                "fullPage": {"type": "boolean"},
                "viewportDevice": {"type": "string"},
                "darkMode": {"type": "boolean"},
            },
        ),
        (
            "inspect_page",
            "Get interactive elements and selectors from a page.",
            {"url": {"type": "string"}},
        ),
    )
]


def handle_tool(name: str, args: dict, timeout: float = 30.0) -> str:
    """Execute one browser tool against the PageBolt API and return a JSON string.

    Args:
        name: Tool name, "take_screenshot" or "inspect_page".
        args: Arguments parsed from the model's tool call; sent as the JSON body.
        timeout: Seconds to wait for PageBolt before giving up. Without an
            explicit timeout `requests` waits indefinitely, which would stall
            the whole agent on one stuck request.

    Returns:
        A JSON document as a string:
        - take_screenshot: {"screenshot_b64": <base64>, "bytes": <int>}
        - inspect_page: {"count": <int>, "elements": <newline-joined summary
          of at most 30 elements>}

    Raises:
        ValueError: If `name` is not a known tool.
        Exception: Whatever `requests` raises for a failed request, including
            the error from `raise_for_status()` on a non-2xx response.
    """
    if name == "take_screenshot":
        res = requests.post(
            "https://pagebolt.dev/api/v1/screenshot",
            headers=HEADERS,
            # blockBanners is only a default: a value supplied in args wins.
            json={"blockBanners": True, **args},
            timeout=timeout,
        )
        res.raise_for_status()
        b64 = base64.b64encode(res.content).decode()
        return json.dumps({"screenshot_b64": b64, "bytes": len(res.content)})

    if name == "inspect_page":
        res = requests.post(
            "https://pagebolt.dev/api/v1/inspect",
            headers=HEADERS,
            json=args,
            timeout=timeout,
        )
        res.raise_for_status()
        data = res.json()
        # `or []` also covers an explicit null, which .get's default would not.
        all_elements = data.get("elements") or []
        # Every field is read with .get so one incomplete element cannot
        # abort the whole summary with a KeyError.
        lines = [
            f"{el.get('tag','')}[{el.get('role','')}] '{(el.get('text') or '')[:60]}' → {el.get('selector','')}"
            for el in all_elements[:30]
        ]
        return json.dumps({"count": len(all_elements), "elements": "\n".join(lines)})

    raise ValueError(f"Unknown tool: {name}")


def run_agent(user_message: str, max_iterations: int = 10) -> str:
    """Run a tool-calling conversation until the model gives a final answer.

    Each round sends the transcript to the model. If the reply contains tool
    calls they are executed in order and their results appended before the
    next round; otherwise the reply's text is returned.

    Args:
        user_message: The user's request.
        max_iterations: Upper bound on model round-trips, so a model that
            keeps requesting tools cannot loop (and bill) forever.

    Returns:
        Content of the final assistant message.

    Raises:
        RuntimeError: If no final answer is produced within `max_iterations`.
    """
    messages = [{"role": "user", "content": user_message}]

    for _ in range(max_iterations):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=tools,
            tool_choice="auto",
        )
        message = response.choices[0].message
        messages.append(message)

        if not message.tool_calls:
            return message.content

        image_parts = []
        for tc in message.tool_calls:
            try:
                # Parsed inside the try: the arguments are model-generated
                # text, and malformed JSON must become an error result
                # rather than crash the agent.
                args = json.loads(tc.function.arguments)
                print(f"→ {tc.function.name}({args})")
                result = handle_tool(tc.function.name, args)
            except Exception as e:
                result = json.dumps({"error": str(e)})

            # Tool messages carry text only, so base64 placed there is never
            # seen as an image and just floods the context. Move the image
            # into an image part and leave the metadata behind.
            try:
                payload = json.loads(result)
            except (TypeError, ValueError):
                payload = None
            if isinstance(payload, dict) and isinstance(payload.get("screenshot_b64"), str):
                b64 = payload.pop("screenshot_b64")
                image_parts.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{b64}"},
                })
                payload["note"] = "Image attached in the following user message."
                result = json.dumps(payload)

            messages.append({
                "role": "tool",
                "tool_call_id": tc.id,
                "content": result,
            })

        # Appended only after every tool call has been answered.
        if image_parts:
            messages.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": "Screenshot(s) returned by the tool call(s) above:"},
                    *image_parts,
                ],
            })

    raise RuntimeError(f"Agent did not finish within {max_iterations} iterations")


# Run it: each prompt is answered and printed before the next one starts.
for prompt in (
    "Screenshot https://example.com and describe what you see",
    "Inspect https://example.com/login — list all the form inputs and their selectors",
):
    print(run_agent(prompt))

Streaming with tool calls (JavaScript — reuses client, tools, and handleToolCall from above)

/**
 * Streaming variant of the agent loop: assistant text is written to stdout as
 * it arrives, and tool-call fragments are stitched together from the stream
 * before the tools are executed.
 *
 * @param {string} userMessage - The user's request.
 * @param {number} [maxIterations=10] - Upper bound on model round-trips, so a
 *   model that keeps requesting tools cannot loop (and bill) forever.
 * @returns {Promise<string>} Accumulated text of the final assistant message.
 * @throws {Error} If no final answer is produced within `maxIterations` rounds,
 *   or if the OpenAI request itself fails.
 */
async function runAgentStreaming(userMessage, maxIterations = 10) {
  const messages = [{ role: "user", content: userMessage }];

  for (let turn = 0; turn < maxIterations; turn += 1) {
    const stream = await client.chat.completions.create({
      model: "gpt-4o",
      messages,
      tools,
      tool_choice: "auto",
      stream: true,
    });

    const fullMessage = { role: "assistant", content: "", tool_calls: [] };
    // Slot of the tool call currently being assembled. A fragment without an
    // index continues the most recent slot; starting at 0 means the very
    // first fragment always has a slot to land in.
    let lastIndex = 0;

    for await (const chunk of stream) {
      const delta = chunk.choices[0]?.delta;
      if (!delta) continue;

      if (delta.content) {
        process.stdout.write(delta.content);
        fullMessage.content += delta.content;
      }

      for (const tcDelta of delta.tool_calls ?? []) {
        if (tcDelta.index !== undefined) lastIndex = tcDelta.index;
        const call = fullMessage.tool_calls[lastIndex] ?? {
          id: "",
          type: "function",
          function: { name: "", arguments: "" },
        };
        fullMessage.tool_calls[lastIndex] = call;

        // The id, name and arguments arrive in pieces and are concatenated.
        if (tcDelta.id) call.id = tcDelta.id;
        if (tcDelta.function?.name) call.function.name += tcDelta.function.name;
        if (tcDelta.function?.arguments) call.function.arguments += tcDelta.function.arguments;
      }
    }

    messages.push(fullMessage);

    if (fullMessage.tool_calls.length === 0) {
      console.log("\n[done]");
      return fullMessage.content;
    }

    // Execute tools in parallel. Parsing and execution are both guarded so a
    // single bad call is reported to the model as an error result instead of
    // rejecting the whole run.
    const outcomes = await Promise.all(
      fullMessage.tool_calls.map(async (tc) => {
        let payload;
        let image = null;
        try {
          const args = JSON.parse(tc.function.arguments);
          const result = await handleToolCall(tc.function.name, args);
          if (result?.type === "image" && typeof result.data === "string") {
            // Tool messages carry text only, so base64 placed there is never
            // seen as an image and just floods the context. Keep the metadata
            // here and hand the pixels over as an image part below.
            image = {
              type: "image_url",
              image_url: { url: `data:${result.mimeType};base64,${result.data}` },
            };
            payload = {
              type: "image",
              mimeType: result.mimeType,
              size: result.size,
              note: "Image attached in the following user message.",
            };
          } else {
            payload = result;
          }
        } catch (err) {
          payload = { error: err.message };
        }
        return {
          toolMessage: { role: "tool", tool_call_id: tc.id, content: JSON.stringify(payload) },
          image,
        };
      })
    );

    // Every tool call must be answered before any other message follows.
    messages.push(...outcomes.map((o) => o.toolMessage));

    const images = outcomes.map((o) => o.image).filter(Boolean);
    if (images.length > 0) {
      messages.push({
        role: "user",
        content: [
          { type: "text", text: "Screenshot(s) returned by the tool call(s) above:" },
          ...images,
        ],
      });
    }
  }

  throw new Error(`Agent did not finish within ${maxIterations} iterations`);
}

When to use this pattern

Agent QA loops — deploy → agent screenshots → agent reports issues
Research agents — agent finds URLs → screenshots and summarizes content
Documentation generators — agent screenshots your app and writes docs
Monitoring — agent checks pages on a schedule and alerts on visual changes
Form automation — agent inspects page, fills inputs, verifies result

The OpenAI tools pattern keeps the agent loop clean: define what tools exist, implement the handlers, let the model decide when to call them.

Try it free — 100 requests/month, no credit card. → Get started in 2 minutes

DEV Community