<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Andrea</title>
    <description>The latest articles on DEV Community by Andrea (@andreanotte).</description>
    <link>https://dev.to/andreanotte</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3029602%2F7c345a64-d847-40b5-a463-38670d27d6a3.jpg</url>
      <title>DEV Community: Andrea</title>
      <link>https://dev.to/andreanotte</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/andreanotte"/>
    <language>en</language>
    <item>
      <title>[Boost]</title>
      <dc:creator>Andrea</dc:creator>
      <pubDate>Wed, 09 Jul 2025 17:07:34 +0000</pubDate>
      <link>https://dev.to/andreanotte/-loc</link>
      <guid>https://dev.to/andreanotte/-loc</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/nottelabs/web-agents-that-actually-understand-websites-how-nottes-perception-layer-solves-the-dom-problem-20jb" class="crayons-story__hidden-navigation-link"&gt;Web Agents That Actually Understand Websites: How Notte's Perception Layer Solves the DOM Problem&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/nottelabs" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3050805%2F602adbd6-5254-4002-84f4-5f9b1855030d.jpg" alt="nottelabs profile" class="crayons-avatar__image"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/nottelabs" class="crayons-story__secondary fw-medium m:hidden"&gt;
              nottelabs
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                nottelabs
                
              
              &lt;div id="story-author-preview-content-2620717" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/nottelabs" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3050805%2F602adbd6-5254-4002-84f4-5f9b1855030d.jpg" class="crayons-avatar__image" alt=""&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;nottelabs&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/nottelabs/web-agents-that-actually-understand-websites-how-nottes-perception-layer-solves-the-dom-problem-20jb" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 27 '25&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/nottelabs/web-agents-that-actually-understand-websites-how-nottes-perception-layer-solves-the-dom-problem-20jb" id="article-link-2620717"&gt;
          Web Agents That Actually Understand Websites: How Notte's Perception Layer Solves the DOM Problem
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/productivity"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;productivity&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
          &lt;a href="https://dev.to/nottelabs/web-agents-that-actually-understand-websites-how-nottes-perception-layer-solves-the-dom-problem-20jb" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left"&gt;
            &lt;div class="multiple_reactions_aggregate"&gt;
              &lt;span class="multiple_reactions_icons_container"&gt;
                  &lt;span class="crayons_icon_container"&gt;
                    &lt;img src="https://assets.dev.to/assets/sparkle-heart-5f9bee3767e18deb1bb725290cb151c25234768a0e9a2bd39370c382d02920cf.svg" width="18" height="18"&gt;
                  &lt;/span&gt;
              &lt;/span&gt;
              &lt;span class="aggregate_reactions_counter"&gt;2&lt;span class="hidden s:inline"&gt; reactions&lt;/span&gt;&lt;/span&gt;
            &lt;/div&gt;
          &lt;/a&gt;
            &lt;a href="https://dev.to/nottelabs/web-agents-that-actually-understand-websites-how-nottes-perception-layer-solves-the-dom-problem-20jb#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              Comments


              &lt;span class="hidden s:inline"&gt;Add Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            5 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>webdev</category>
      <category>programming</category>
      <category>ai</category>
      <category>productivity</category>
    </item>
    <item>
      <title>Opensource web agent outclasses Browser-Use</title>
      <dc:creator>Andrea</dc:creator>
      <pubDate>Tue, 08 Apr 2025 17:26:50 +0000</pubDate>
      <link>https://dev.to/andreanotte/opensource-web-agent-outclasses-browser-use-ne1</link>
      <guid>https://dev.to/andreanotte/opensource-web-agent-outclasses-browser-use-ne1</guid>
      <description>&lt;p&gt;Complete post: &lt;a href="https://github.com/nottelabs/open-operator-evals" rel="noopener noreferrer"&gt;https://github.com/nottelabs/open-operator-evals&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The race for open-source web agents is heating up, leading to very bold statements being thrown around. We cut through the noise and bring a fully transparent and reproducible benchmark to get a sense of the current scene. Everything is open, inviting you to see exactly how different systems perform—and perhaps prompting a closer look at others' claims ;)&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Rank&lt;/th&gt;
&lt;th&gt;Provider&lt;/th&gt;
&lt;th&gt;Agent Self-Report&lt;/th&gt;
&lt;th&gt;LLM Evaluation&lt;/th&gt;
&lt;th&gt;Time per Task&lt;/th&gt;
&lt;th&gt;Task Reliability&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;🏆&lt;/td&gt;
&lt;td&gt;&lt;a href="https://github.com/nottelabs/notte" rel="noopener noreferrer"&gt;Notte&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;86.2%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;79.0%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;47s&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;96.6%&lt;/strong&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;2️⃣&lt;/td&gt;
&lt;td&gt;&lt;a href="https://github.com/browser-use/browser-use" rel="noopener noreferrer"&gt;Browser-Use&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;77.3%&lt;/td&gt;
&lt;td&gt;60.2%&lt;/td&gt;
&lt;td&gt;113s&lt;/td&gt;
&lt;td&gt;83.3%&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;3️⃣&lt;/td&gt;
&lt;td&gt;&lt;a href="https://github.com/convergence-ai/proxy-lite" rel="noopener noreferrer"&gt;Convergence&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;38.4%&lt;/td&gt;
&lt;td&gt;31.4%&lt;/td&gt;
&lt;td&gt;83s&lt;/td&gt;
&lt;td&gt;50%&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Results are averaged over tasks and then over 8 separate runs to account for the high variance inherent in web agent systems. In our benchmarks, each provider ran each task 8 times using the same configuration, headless mode, and strict limits: 6 minutes or 20 steps maximum—because no one wants an agent burning 80 steps to find a lasagna recipe. Agents had to handle execution and failures autonomously.&lt;/p&gt;

&lt;h2&gt;
  
  
  Key highlights
&lt;/h2&gt;

&lt;p&gt;You can investigate all replays/logs and reproduce the benchmark yourself 👇🏻&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;a href="https://github.com/nottelabs/notte" rel="noopener noreferrer"&gt;Notte&lt;/a&gt; leads the benchmark by achieving the highest performance with 86.2% self-reported success and 79% LLM-verified completion. It also has the fastest execution time at 47s per task and an impressive 96.6% task reliability—the percentage of tasks an agent successfully completes at least once when given multiple attempts.&lt;/li&gt;
&lt;li&gt;
&lt;a href="https://github.com/browser-use/browser-use" rel="noopener noreferrer"&gt;Browser-Use&lt;/a&gt; demonstrates a notable performance difference compared to their claimed results in &lt;a href="https://browser-use.com/posts/sota-technical-report" rel="noopener noreferrer"&gt;their blog post&lt;/a&gt;, achieving 77.3% self-reported agent performance and 60.2% LLM-verified success versus their stated 89%. The absence of access to their results files prevents us from verifying their reported performance.&lt;/li&gt;
&lt;li&gt;
&lt;a href="https://github.com/convergence-ai/proxy-lite" rel="noopener noreferrer"&gt;Convergence&lt;/a&gt; shows significantly lower performance than competitors with 38.4% agent success and 31.4% evaluation success, primarily due to CAPTCHA and bot detection issues. However, it shows strong self-awareness, achieving near-perfect alignment in some instances, indicating potential for improvement if detection challenges are overcome.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;PS: &lt;a href="https://nottelabs.notion.site/jobs-for-humans" rel="noopener noreferrer"&gt;We are actively hiring software and research engineers&lt;/a&gt; 🪩&lt;/p&gt;

&lt;h2&gt;
  
  
  The metrics
&lt;/h2&gt;

&lt;p&gt;In the main table&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;Agent Self-Report&lt;/code&gt; The success rate reported by the agent itself across all tasks. This reflects the agent's internal confidence in its performance.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;LLM Evaluation&lt;/code&gt; The success rate determined by GPT-4 using WebVoyager's evaluation prompt as a judge evaluator, assessing the agent's actions and outputs. This provides an objective measure of task completion.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;Time per Task&lt;/code&gt; The average execution time in seconds for the agent to attempt and complete a single task. This indicates the efficiency and speed of the agent's operations.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;Task Reliability&lt;/code&gt; The percentage of tasks the agent successfully completed at least once across multiple attempts (8 in this benchmark). This metric highlights the agent's ability to handle a diverse set of tasks given sufficient retries, indicating system robustness.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;In the breakdowns&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;Alignment&lt;/code&gt; Ratio of Agent Self-Report to LLM Evaluation, indicating overestimation (&amp;gt;1.0) or underestimation (&amp;lt;1.0) by the agent. Being close to 1 or &amp;lt;1.0 is typically better.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;Mismatch&lt;/code&gt; Counts the specific instances where the agent claimed success but the evaluator disagreed. This reveals how often the agent incorrectly assessed its own performance.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  The dataset
&lt;/h2&gt;

&lt;p&gt;WebVoyager is a dataset of ~600 tasks for web agents. Example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;task: Book a journey with return option on same day from Edinburg to Manchester for Tomorrow, and show me the lowest price option available
url: https://www.google.com/travel/flights
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;An agent navigates the site and returns a success status and an answer. Relying on the agent’s self-reported success is unreliable, as agents may misjudge task completion. WebVoyager addresses this with an independent LLM evaluator that judges success based on agent actions and screenshots.&lt;/p&gt;

&lt;h3&gt;
  
  
  The challenge of high variance
&lt;/h3&gt;

&lt;p&gt;Beyond known limitations like outdated web content, a key issue is the high variance in agent performance. These systems, powered by non-deterministic LLMs and operating on a constantly changing web, often yield inconsistent results. Reasoning errors, execution failures, and unpredictable network behavior make single-run evaluations unreliable. To counter this, we propose to run each task multiple times for a much more accurate view—averaging results helps smooth out randomness and gives a more statistically sound estimate of performance.&lt;/p&gt;

&lt;h3&gt;
  
  
  WebVoyager30
&lt;/h3&gt;

&lt;p&gt;To reduce variance and improve reproducibility, we sampled &lt;a href="//eval/data/webvoyager/webvoyager_simple.jsonl"&gt;WebVoyager30&lt;/a&gt;—a 30-task subset across 15 diverse websites. It retains the full dataset’s complexity while enabling practical multi-run evaluation, offering a more reliable benchmark for the community.&lt;/p&gt;

&lt;p&gt;Running 30 tasks × 8 times (240 runs total) is far more informative than running 600 tasks once, as it averages out randomness and provides a statistically sounder view of performance. Running all 600 tasks 8× would be ideal but is often impractical due to compute costs and time, making fast and accessible reproduction difficult.&lt;/p&gt;

&lt;p&gt;The selected tasks are neither trivial nor overly complex—they reflect the overall difficulty of the full dataset, making this a reasonable and cost-effective proxy.&lt;/p&gt;

&lt;h2&gt;
  
  
  Breakdowns
&lt;/h2&gt;

&lt;p&gt;Benchmark results breakdown for each provider.&lt;/p&gt;

&lt;h2&gt;
  
  
  Notte
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Provider: notte
Version: [v1.3.3](https://github.com/nottelabs/notte/releases/tag/v1.3.3)
Reasoning: gemini/gemini-2.0-flash
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Notte leads the benchmark with 86.2% self-reported success and 79% LLM-verified completion, along with the fastest execution time at 47s per task and an impressive 96.6% task reliability. It shows consistent performance, with self-assessments slightly overestimating results. Alignment ratios range from 0.960 to 1.183, with low mismatch counts (mostly 3). Task times are really efficient (45-51s), and run 1743001170-7 achieved near-perfect alignment at 0.960.&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Runs&lt;/th&gt;
&lt;th&gt;Agent Self-Report&lt;/th&gt;
&lt;th&gt;LLM Evaluation&lt;/th&gt;
&lt;th&gt;Alignment&lt;/th&gt;
&lt;th&gt;Mismatch&lt;/th&gt;
&lt;th&gt;Time per Task&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-0&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.929&lt;/td&gt;
&lt;td&gt;0.857&lt;/td&gt;
&lt;td&gt;1.084&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;47s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-3&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.867&lt;/td&gt;
&lt;td&gt;0.767&lt;/td&gt;
&lt;td&gt;1.130&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;50s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-4&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.867&lt;/td&gt;
&lt;td&gt;0.800&lt;/td&gt;
&lt;td&gt;1.084&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;51s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-6&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.867&lt;/td&gt;
&lt;td&gt;0.733&lt;/td&gt;
&lt;td&gt;1.183&lt;/td&gt;
&lt;td&gt;4&lt;/td&gt;
&lt;td&gt;45s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-1&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.862&lt;/td&gt;
&lt;td&gt;0.759&lt;/td&gt;
&lt;td&gt;1.136&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;47s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-7&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.857&lt;/td&gt;
&lt;td&gt;0.893&lt;/td&gt;
&lt;td&gt;0.960&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;47s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-2&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.828&lt;/td&gt;
&lt;td&gt;0.759&lt;/td&gt;
&lt;td&gt;1.091&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;45s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Notte/1743001170"&gt;1743001170-5&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.821&lt;/td&gt;
&lt;td&gt;0.750&lt;/td&gt;
&lt;td&gt;1.095&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;49s&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h2&gt;
  
  
  Browser-Use
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight markdown"&gt;&lt;code&gt;Provider: Browser-Use
Version: &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nv"&gt;v0.1.40&lt;/span&gt;&lt;span class="p"&gt;](&lt;/span&gt;&lt;span class="sx"&gt;https://github.com/browser-use/browser-use/releases/tag/0.1.40&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
Reasoning: openai/gpt-4o
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Browser-Use &lt;a href="https://browser-use.com/posts/sota-technical-report" rel="noopener noreferrer"&gt;reported an 89% success rate&lt;/a&gt; on WebVoyager, but we were unable to replicate these results despite our efforts, both on WebVoyager30 with multiple retries and with the full dataset in a single shot. We also tested different configurations of the agent, browser, and lenient interpretations of ambiguous outcomes, but their reported performance was impossible to achieve. Browser-Use shows higher alignment ratios (1.2–1.534), indicating 20–50% overestimation of its abilities. It also has more mismatches (5–8), reflecting a bigger gap between self-assessment and performance.&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Runs&lt;/th&gt;
&lt;th&gt;Agent Self-Report&lt;/th&gt;
&lt;th&gt;LLM Evaluation&lt;/th&gt;
&lt;th&gt;Alignment&lt;/th&gt;
&lt;th&gt;Mismatch&lt;/th&gt;
&lt;th&gt;Time per Task&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-6&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.833&lt;/td&gt;
&lt;td&gt;0.667&lt;/td&gt;
&lt;td&gt;1.249&lt;/td&gt;
&lt;td&gt;7&lt;/td&gt;
&lt;td&gt;98s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-4&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.815&lt;/td&gt;
&lt;td&gt;0.667&lt;/td&gt;
&lt;td&gt;1.222&lt;/td&gt;
&lt;td&gt;5&lt;/td&gt;
&lt;td&gt;119s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-1&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.808&lt;/td&gt;
&lt;td&gt;0.577&lt;/td&gt;
&lt;td&gt;1.400&lt;/td&gt;
&lt;td&gt;7&lt;/td&gt;
&lt;td&gt;127s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-5&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.800&lt;/td&gt;
&lt;td&gt;0.600&lt;/td&gt;
&lt;td&gt;1.333&lt;/td&gt;
&lt;td&gt;6&lt;/td&gt;
&lt;td&gt;95s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-2&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.786&lt;/td&gt;
&lt;td&gt;0.679&lt;/td&gt;
&lt;td&gt;1.158&lt;/td&gt;
&lt;td&gt;5&lt;/td&gt;
&lt;td&gt;132s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-7&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.767&lt;/td&gt;
&lt;td&gt;0.500&lt;/td&gt;
&lt;td&gt;1.534&lt;/td&gt;
&lt;td&gt;8&lt;/td&gt;
&lt;td&gt;105s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-3&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.708&lt;/td&gt;
&lt;td&gt;0.542&lt;/td&gt;
&lt;td&gt;1.306&lt;/td&gt;
&lt;td&gt;5&lt;/td&gt;
&lt;td&gt;113s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/BrowserUse/1743016360"&gt;1743016360-0&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.667&lt;/td&gt;
&lt;td&gt;0.583&lt;/td&gt;
&lt;td&gt;1.144&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;118s&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h2&gt;
  
  
  Convergence
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight markdown"&gt;&lt;code&gt;Provider: Convergence
Version: &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nv"&gt;a4389c5&lt;/span&gt;&lt;span class="p"&gt;](&lt;/span&gt;&lt;span class="sx"&gt;https://github.com/convergence-ai/proxy-lite/commit/a4389c599d5f5f77dc18510c879e2e783434766b&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
Reasoning: Convergence Proxy-lite
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Convergence Proxy-lite performs significantly below competitors at just 38.4% (agent) and 31.4% (evaluation) success rates. However, these results appear heavily impacted by technical issues, as the system frequently triggers Google's CAPTCHA and bot detection services. Despite these limitations, Convergence demonstrates remarkably better alignment between self-assessment and evaluation than Browser-Use, with one run achieving perfect 1.000 alignment with zero mismatches. This suggests that with improved bot detection handling, Convergence would likely outperform Browser-Use due to its superior self-awareness and calibration.&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Runs&lt;/th&gt;
&lt;th&gt;Agent Self-Report&lt;/th&gt;
&lt;th&gt;LLM Evaluation&lt;/th&gt;
&lt;th&gt;Alignment&lt;/th&gt;
&lt;th&gt;Mismatch&lt;/th&gt;
&lt;th&gt;Time per Task&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-6&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.483&lt;/td&gt;
&lt;td&gt;0.345&lt;/td&gt;
&lt;td&gt;1.400&lt;/td&gt;
&lt;td&gt;4&lt;/td&gt;
&lt;td&gt;77s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-0&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.407&lt;/td&gt;
&lt;td&gt;0.333&lt;/td&gt;
&lt;td&gt;1.222&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;85s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-3&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.393&lt;/td&gt;
&lt;td&gt;0.286&lt;/td&gt;
&lt;td&gt;1.374&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;82s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-4&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.379&lt;/td&gt;
&lt;td&gt;0.345&lt;/td&gt;
&lt;td&gt;1.099&lt;/td&gt;
&lt;td&gt;2&lt;/td&gt;
&lt;td&gt;82s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-5&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.379&lt;/td&gt;
&lt;td&gt;0.276&lt;/td&gt;
&lt;td&gt;1.373&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;84s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-7&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.367&lt;/td&gt;
&lt;td&gt;0.333&lt;/td&gt;
&lt;td&gt;1.102&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;84s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-2&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.357&lt;/td&gt;
&lt;td&gt;0.286&lt;/td&gt;
&lt;td&gt;1.248&lt;/td&gt;
&lt;td&gt;3&lt;/td&gt;
&lt;td&gt;86s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;a href="https://dev.toWebVoyager30/Convergence/1743114165"&gt;1743114165-1&lt;/a&gt;&lt;/td&gt;
&lt;td&gt;0.310&lt;/td&gt;
&lt;td&gt;0.310&lt;/td&gt;
&lt;td&gt;1.000&lt;/td&gt;
&lt;td&gt;0&lt;/td&gt;
&lt;td&gt;84s&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;Our open-source agent evaluation reveals notable differences between reported and observed performance. While Notte shows strong capabilities and good self-awareness, other systems exhibit issues with reproducibility and self-assessment. These results underscore the importance of clear, reproducible benchmarks. We encourage collaboration from the research and engineering community to develop improved trusted evaluation standards.&lt;/p&gt;

</description>
      <category>ai</category>
      <category>agents</category>
      <category>cua</category>
      <category>playwright</category>
    </item>
  </channel>
</rss>
