<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Okeke Chukwudubem</title>
    <description>The latest articles on DEV Community by Okeke Chukwudubem (@okeke_chukwudubem_5f3bf49).</description>
    <link>https://dev.to/okeke_chukwudubem_5f3bf49</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg</url>
      <title>DEV Community: Okeke Chukwudubem</title>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/okeke_chukwudubem_5f3bf49"/>
    <language>en</language>
    <item>
      <title>Day 12: My AI agent can control WhatsApp perfectly. But banking apps? Invisible. The difference is accessibility labels. Apps built for blind users are automatable. Apps that skip accessibility lock out both humans and AI.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Tue, 30 Jun 2026 13:49:34 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-12-my-ai-agent-can-control-whatsapp-perfectly-but-banking-apps-invisible-the-difference-is-3ek6</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-12-my-ai-agent-can-control-whatsapp-perfectly-but-banking-apps-invisible-the-difference-is-3ek6</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918" class="crayons-story__hidden-navigation-link"&gt;Project Log #12: The Apps My AI Agent Can't See&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image" width="800" height="735"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-4031792" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt="" width="800" height="735"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 30&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918" id="article-link-4031792"&gt;
          Project Log #12: The Apps My AI Agent Can't See
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/productivity"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;productivity&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            2 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>a11y</category>
      <category>agents</category>
      <category>ai</category>
      <category>automation</category>
    </item>
    <item>
      <title>Project Log #12: The Apps My AI Agent Can't See</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Tue, 30 Jun 2026 13:47:50 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-12-the-apps-my-ai-agent-cant-see-3918</guid>
      <description>&lt;p&gt;Day 12. WhatsApp is fully automatable. Banking apps are invisible. The reason is accessibility.&lt;/p&gt;

&lt;p&gt;For 11 days, I've been building an AI agent that can see. UI trees. OCR. Template matching. I've benchmarked it. Optimised it. Proved it works.&lt;/p&gt;

&lt;p&gt;But today I discovered something that changed how I think about this entire project.&lt;/p&gt;

&lt;p&gt;My agent can control WhatsApp perfectly. Every button has a label. Every icon has a content description. The send button says "Send message." The back button says "Back." The search button says "Search." These are accessibility labels—designed for screen readers used by blind and visually impaired people.&lt;/p&gt;

&lt;p&gt;WhatsApp invested in accessibility. And that investment accidentally makes their app fully automatable by my agent.&lt;/p&gt;

&lt;p&gt;Then I tested a local banking app. And another. And another.&lt;/p&gt;

&lt;p&gt;None of them had accessibility labels. The login button? content-desc="". The password field? content-desc="". The transfer button? Generic class name. Nothing identifiable. My UI tree approach was useless. The agent was blind.&lt;/p&gt;

&lt;p&gt;This is bigger than my project. The same neglect that locks out visually impaired users also locks out AI agents. When developers skip accessibility, both humans and machines suffer.&lt;/p&gt;

&lt;p&gt;For the apps that invested in accessibility, my agent works flawlessly. For the ones that didn't, I'm back to OCR and template matching—slower, less reliable, and device-dependent.&lt;/p&gt;

&lt;p&gt;I'm now building a list of "agent-friendly" apps and "agent-hostile" apps. The pattern is clear: well-funded, global apps (WhatsApp, Google apps, Slack) are accessible. Local apps, especially banking and government apps, are not.&lt;/p&gt;

&lt;p&gt;This project started as an automation tool. It's becoming an accessibility audit.&lt;/p&gt;

&lt;p&gt;Next: Building fallback strategies for unlabeled elements.&lt;/p&gt;

&lt;p&gt;👉 github.com/Dexter2344/phone-agent&lt;/p&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>programming</category>
      <category>productivity</category>
    </item>
    <item>
      <title>Project Log #11: UI Tree vs Screenshots — The Real Performance Test</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Fri, 26 Jun 2026 10:23:29 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-11-ui-tree-vs-screenshots-the-real-performance-test-4jpd</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-11-ui-tree-vs-screenshots-the-real-performance-test-4jpd</guid>
      <description>&lt;p&gt;Day 11. I ran the old vision system and the new one side by side. The results surprised me.&lt;/p&gt;

&lt;p&gt;Yesterday I rewrote the vision system. Screenshots and template matching got demoted. Android's UI hierarchy tree became the primary source.&lt;/p&gt;

&lt;p&gt;Today I ran both systems side by side on the same tasks to see if the rewrite was actually worth it.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Test&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;I set up three identical tasks and ran each one twice—once with the old screenshot-based system, once with the new UI tree system. Same phone. Same apps. Same commands.&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Task&lt;/th&gt;
&lt;th&gt;Old System (Screenshots)&lt;/th&gt;
&lt;th&gt;New System (UI Tree)&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Open WhatsApp&lt;/td&gt;
&lt;td&gt;3.2s&lt;/td&gt;
&lt;td&gt;2.1s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Find "Mom" in contacts&lt;/td&gt;
&lt;td&gt;2.8s (OCR)&lt;/td&gt;
&lt;td&gt;0.7s (UI tree text match)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Tap send button&lt;/td&gt;
&lt;td&gt;4.1s (template matching)&lt;/td&gt;
&lt;td&gt;0.6s (content-desc match)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Total&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;10.1s&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;3.4s&lt;/strong&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;The new system is nearly 3x faster for a simple task. The difference gets bigger as tasks get more complex.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Why the UI Tree Wins&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;The old system had to:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Take a screenshot (1-2 seconds)&lt;/li&gt;
&lt;li&gt;Run OCR or template matching (2-4 seconds)&lt;/li&gt;
&lt;li&gt;Parse the results&lt;/li&gt;
&lt;li&gt;Calculate coordinates&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;The new system:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Runs one ADB command that dumps an XML file&lt;/li&gt;
&lt;li&gt;Searches the XML for a matching element&lt;/li&gt;
&lt;li&gt;Reads the exact coordinates from the element's bounds&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;No image processing. No OCR loading time. No template matching loops. Just a text search on a structured document.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What I Got Wrong Yesterday&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;I said the UI tree takes 0.5-1 second. In practice, it's faster—around 0.3-0.7 seconds for the dump and pull combined. The XML file is small (usually under 100KB). The parsing is instant.&lt;/p&gt;

&lt;p&gt;But there's something I didn't anticipate: the UI tree is &lt;em&gt;too&lt;/em&gt; detailed. A single screen can have 200+ elements. Searching through all of them by content-desc or text takes a few milliseconds of Python. It's negligible, but it reminds me that XML parsing on a phone CPU isn't free.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's Still Hard&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;The UI tree approach has a blind spot: not all apps label their elements well.&lt;/p&gt;

&lt;p&gt;WhatsApp is great. The send button has &lt;code&gt;content-desc="Send message"&lt;/code&gt;. The back button has &lt;code&gt;content-desc="Back"&lt;/code&gt;. The search button has &lt;code&gt;content-desc="Search"&lt;/code&gt;. These are accessibility labels that screen readers use, and WhatsApp's developers actually invested in them.&lt;/p&gt;

&lt;p&gt;But I tested a local banking app. The login button? &lt;code&gt;content-desc=""&lt;/code&gt;. The password field? &lt;code&gt;content-desc=""&lt;/code&gt;. Nothing. No labels. No text. Just generic class names like &lt;code&gt;android.widget.Button&lt;/code&gt; with no identifying information.&lt;/p&gt;

&lt;p&gt;For apps like that, the UI tree is useless. I still need OCR as a fallback. The unified &lt;code&gt;find_target()&lt;/code&gt; function handles this gracefully—it tries UI tree first, and if nothing is found, it falls back to OCR and template matching.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Accessibility Divide&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Here's something I didn't expect to learn from this project: well-funded apps (WhatsApp, Google apps, Slack) have excellent accessibility labels. They invested in making their apps usable by screen readers, and that investment accidentally makes them automatable by my agent.&lt;/p&gt;

&lt;p&gt;Smaller apps, local apps, banking apps—they often have zero accessibility labels. My agent's ability to automate an app depends on whether that app's developers cared about disabled users.&lt;/p&gt;

&lt;p&gt;This is a sobering realization. The same neglect that locks out blind users also locks out AI agents. Accessibility isn't just about inclusion—it's about making software machine-readable. And when developers skip it, both humans and AI suffer.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's Next (Day 12)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Build a fallback mapping for unlabeled elements: if &lt;code&gt;content-desc&lt;/code&gt; is empty, try to identify the element by its class + position + surrounding labeled elements&lt;/li&gt;
&lt;li&gt;Test on more apps beyond WhatsApp—banking apps, settings, Chrome&lt;/li&gt;
&lt;li&gt;Start logging which apps are "agent-friendly" and which aren't&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;The Repo&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;👉 &lt;strong&gt;github.com/Dexter2344/phone-agent&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;code&gt;vision.py&lt;/code&gt; v4 is live. &lt;code&gt;agent.py&lt;/code&gt; v10 is live. The unified target finder handles UI tree, OCR, and template matching in one call.&lt;/p&gt;

&lt;p&gt;This is Day 11. The rewrite was worth it. The agent is faster, more reliable, and no longer tied to one device. But I've discovered a new problem: the agent can only automate apps that were built to be accessible. That's the next wall to climb.&lt;/p&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>programming</category>
      <category>productivity</category>
    </item>
    <item>
      <title>Day 10 of building an AI agent that controls a phone.

I'm ditching screenshots. OCR was slow. Template matching broke on different devices. The answer? Reading the Android UI tree directly. Exact coordinates. No guessing. Under 1 second.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Thu, 25 Jun 2026 18:53:58 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-10-of-building-an-ai-agent-that-controls-a-phone-im-ditching-screenshots-ocr-was-slow-5fdl</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-10-of-building-an-ai-agent-that-controls-a-phone-im-ditching-screenshots-ocr-was-slow-5fdl</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a" class="crayons-story__hidden-navigation-link"&gt;Project Log #10: I'm Ditching Screenshots. Here's Why.&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image" width="800" height="735"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-3991631" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt="" width="800" height="735"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 25&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a" id="article-link-3991631"&gt;
          Project Log #10: I'm Ditching Screenshots. Here's Why.
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/productivity"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;productivity&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            2 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>android</category>
      <category>showdev</category>
    </item>
    <item>
      <title>Project Log #10: I'm Ditching Screenshots. Here's Why.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Thu, 25 Jun 2026 18:53:09 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-10-im-ditching-screenshots-heres-why-3o7a</guid>
      <description>&lt;p&gt;Day 10. OCR and template matching hit their limits. UI hierarchy inspection might be the real answer.&lt;/p&gt;

&lt;p&gt;Nine days ago, I was proud of my screenshot-based vision system. ML Kit for text. Template matching for icons. A clever fallback chain that worked most of the time.&lt;/p&gt;

&lt;p&gt;Today, I'm ripping most of it out.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Breaking Point&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Last week, I tested the agent on a friend's phone. Template matching failed. The same icons I cropped on my device didn't match on his—different screen density, different rendering, different pixel arrangement.&lt;/p&gt;

&lt;p&gt;I explored building a multi-resolution icon library. Crop every icon at 5 different DPIs? That's tedious. I explored AI-based icon detection. Train a model to recognize buttons by shape? That's heavy for a phone CPU.&lt;/p&gt;

&lt;p&gt;Then I remembered something. Android already knows what's on the screen. It has to—it's rendering the UI. And there's a way to read that information directly.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Enter UI Hierarchy Inspection&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;ADB has a command called &lt;code&gt;uiautomator dump&lt;/code&gt;. It spits out an XML file containing every visible UI element on the screen—buttons, text fields, icons, images, everything. Each element has:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;A class name (e.g., &lt;code&gt;android.widget.Button&lt;/code&gt;, &lt;code&gt;android.widget.ImageView&lt;/code&gt;)&lt;/li&gt;
&lt;li&gt;Bounds (exact pixel coordinates of where it sits)&lt;/li&gt;
&lt;li&gt;Text (if it has any)&lt;/li&gt;
&lt;li&gt;A content description (accessibility label, often used for icons)&lt;/li&gt;
&lt;li&gt;Whether it's clickable, scrollable, focused&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is not a screenshot. This is the app's internal blueprint.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Why This Changes Everything&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Screenshot-Based (Old Way)&lt;/th&gt;
&lt;th&gt;UI Tree (New Way)&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Run OCR on a screenshot (1.5–2s)&lt;/td&gt;
&lt;td&gt;Run one ADB command (0.5–1s)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;If text not found, try template matching (2–4s)&lt;/td&gt;
&lt;td&gt;Not needed. Icons have content descriptions.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Accuracy depends on screen resolution and DPI&lt;/td&gt;
&lt;td&gt;Accuracy is 100%—the OS tells you exactly where things are&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Breaks on different devices&lt;/td&gt;
&lt;td&gt;Works across all devices. Same XML structure.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Can't detect icons without reference images&lt;/td&gt;
&lt;td&gt;Icons are in the tree with coordinates&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;The First Experiment&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;I ran &lt;code&gt;adb shell uiautomator dump&lt;/code&gt; on my phone, then pulled the XML file. I searched for "send." Here's a snippet of what I found:&lt;/p&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight xml"&gt;&lt;code&gt;&amp;lt;node
  class="android.widget.ImageButton"
  content-desc="Send message"
  bounds="[924,1656][1020,1752]"
  clickable="true"
  package="com.whatsapp" /&amp;gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>programming</category>
      <category>productivity</category>
    </item>
    <item>
      <title>Day 9 of building an AI agent that controls a phone.

It works perfectly on my phone. But on a friend's phone, template matching failed. Icons rendered differently. The agent couldn't send a message.

Now I'm exploring UI hierarchy inspection.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Sat, 20 Jun 2026 21:23:07 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-9-of-building-an-ai-agent-that-controls-a-phone-it-works-perfectly-on-my-phone-but-on-a-1lgl</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-9-of-building-an-ai-agent-that-controls-a-phone-it-works-perfectly-on-my-phone-but-on-a-1lgl</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng" class="crayons-story__hidden-navigation-link"&gt;Project Log #9: My AI Agent Works on My Phone. But What About Yours?&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-3951189" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt=""&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 20&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng" id="article-link-3951189"&gt;
          Project Log #9: My AI Agent Works on My Phone. But What About Yours?
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/productivity"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;productivity&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left"&gt;
            &lt;div class="multiple_reactions_aggregate"&gt;
              &lt;span class="multiple_reactions_icons_container"&gt;
                  &lt;span class="crayons_icon_container"&gt;
                    &lt;img src="https://assets.dev.to/assets/sparkle-heart-5f9bee3767e18deb1bb725290cb151c25234768a0e9a2bd39370c382d02920cf.svg" width="18" height="18"&gt;
                  &lt;/span&gt;
              &lt;/span&gt;
              &lt;span class="aggregate_reactions_counter"&gt;1&lt;span class="hidden s:inline"&gt;&amp;nbsp;reaction&lt;/span&gt;&lt;/span&gt;
            &lt;/div&gt;
          &lt;/a&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            3 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>buildinpublic</category>
      <category>mobile</category>
    </item>
    <item>
      <title>Project Log #9: My AI Agent Works on My Phone. But What About Yours?</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Sat, 20 Jun 2026 21:22:01 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-9-my-ai-agent-works-on-my-phone-but-what-about-yours-2mng</guid>
      <description>&lt;p&gt;Day 9. Template matching works. But screen sizes, resolutions, and Android versions might break everything.&lt;/p&gt;

&lt;p&gt;Eight days ago, the agent was an idea. Now it can read text, handle interruptions, and find icons on a screen.&lt;/p&gt;

&lt;p&gt;But there's a question I've been avoiding: does it work on any phone other than mine?&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;The Cross-Device Problem&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Every screenshot I've taken, every icon I've cropped, every coordinate I've mapped—it's all on one device. My phone. Same screen size. Same resolution. Same Android version. Same DPI.&lt;/p&gt;

&lt;p&gt;Template matching relies on reference images that look exactly like the target on screen. Change the screen density, change the icon size, change the font scaling, and the match confidence drops. Suddenly "send_button.png" doesn't match anymore, and the agent can't press send.&lt;/p&gt;

&lt;p&gt;This isn't a bug in my code. It's a fundamental challenge in computer vision: reference-based matching breaks when the visual context changes.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Today's Experiment&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;I tested the same agent on a friend's phone—different manufacturer, different Android version, slightly larger screen. The results were humbling.&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Task&lt;/th&gt;
&lt;th&gt;My Phone&lt;/th&gt;
&lt;th&gt;Friend's Phone&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;OCR (text recognition)&lt;/td&gt;
&lt;td&gt;✅ 95% accuracy&lt;/td&gt;
&lt;td&gt;✅ ~90% accuracy&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Find "Mom" in contacts&lt;/td&gt;
&lt;td&gt;✅ Found&lt;/td&gt;
&lt;td&gt;✅ Found&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Template match: send button&lt;/td&gt;
&lt;td&gt;✅ 94% confidence&lt;/td&gt;
&lt;td&gt;❌ 62% confidence&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Template match: back button&lt;/td&gt;
&lt;td&gt;✅ 91% confidence&lt;/td&gt;
&lt;td&gt;❌ 58% confidence&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;OCR held up reasonably well because text is text. Fonts might change slightly, but the characters are the same. But the icons—the send button, the back arrow—were rendered at a different size and with a slightly different pixel arrangement on my friend's device.&lt;/p&gt;

&lt;p&gt;The agent failed to send the message.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Why This Matters&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;An AI agent that only works on one phone isn't an agent. It's a script. If I want this to be useful to anyone else—or even to myself if I change phones—it needs to be device-agnostic.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Possible Solutions I'm Exploring&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Solution&lt;/th&gt;
&lt;th&gt;Pros&lt;/th&gt;
&lt;th&gt;Cons&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Multi-resolution icon library&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Simple. Just crop icons at different DPIs.&lt;/td&gt;
&lt;td&gt;Tedious. How many variants are enough?&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;AI-based icon detection&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Could recognize icons by shape, not pixels.&lt;/td&gt;
&lt;td&gt;Requires training data. Heavy for a phone.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;UI hierarchy inspection&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Instead of "seeing" the screen, read the app's UI tree directly via ADB.&lt;/td&gt;
&lt;td&gt;Requires root or accessibility service. Not universal.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Relative positioning&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Once OCR finds text, calculate icon positions relative to known landmarks.&lt;/td&gt;
&lt;td&gt;Fragile. Different layouts on different devices.&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;None of these are perfect. All of them are more work. But that's the reality of building something that's supposed to work in the wild, not just in a demo.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What I'm Trying First&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;The UI hierarchy approach. ADB has a command called &lt;code&gt;uiautomator dump&lt;/code&gt; that returns an XML tree of every visible element on screen—text, buttons, icons, everything. Each element has bounds, a class name, and a content description.&lt;/p&gt;

&lt;p&gt;If I can parse that XML tree instead of taking screenshots, the agent doesn't need to "see" the screen at all. It just reads the structure. No OCR. No template matching. No resolution issues.&lt;/p&gt;

&lt;p&gt;This is a fundamental architectural shift. But it might be the right one.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's Next (Day 10)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Experiment with &lt;code&gt;uiautomator dump&lt;/code&gt; as a replacement for screenshot-based detection&lt;/li&gt;
&lt;li&gt;Compare speed and accuracy against current OCR + template matching pipeline&lt;/li&gt;
&lt;li&gt;Decide whether to pivot the entire vision system&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;The Repo&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;👉 &lt;strong&gt;github.com/Dexter2344/phone-agent&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;All code from Day 8 is live. The Day 9 experiments are in a new branch called &lt;code&gt;ui-tree-experiment&lt;/code&gt;. I'll merge to main once I have results.&lt;/p&gt;

&lt;p&gt;This is Day 9. The hard problems don't stop coming. But neither do I.&lt;/p&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>programming</category>
      <category>productivity</category>
    </item>
    <item>
      <title>Day 8 of building an AI agent that controls a phone.

Template matching is now wired into the agent. When OCR can't find a target, the agent searches its icon library. The send button is no longer invisible.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Thu, 18 Jun 2026 22:40:34 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-8-of-building-an-ai-agent-that-controls-a-phone-template-matching-is-now-wired-into-the-4enm</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-8-of-building-an-ai-agent-that-controls-a-phone-template-matching-is-now-wired-into-the-4enm</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1" class="crayons-story__hidden-navigation-link"&gt;Project Log #8: The AI Phone Agent Can Now See Icons&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image" width="800" height="735"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-3935637" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt="" width="800" height="735"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 18&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1" id="article-link-3935637"&gt;
          Project Log #8: The AI Phone Agent Can Now See Icons
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/software"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;software&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/buildinpublic"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;buildinpublic&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            2 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>buildinpublic</category>
      <category>mobile</category>
    </item>
    <item>
      <title>Project Log #8: The AI Phone Agent Can Now See Icons</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Thu, 18 Jun 2026 22:40:00 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/project-log-8-the-ai-phone-agent-can-now-see-icons-4ad1</guid>
      <description>&lt;p&gt;Day 8. Template matching is wired into the agent. The send button is no longer invisible.&lt;/p&gt;

&lt;p&gt;Seven days ago, the agent was blind to icons. It could read text, find contacts, and type messages—but it couldn't press send.&lt;/p&gt;

&lt;p&gt;Today, that changed.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What Got Built&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Template matching is now integrated into the agent's decision pipeline. Here's the new flow when the agent needs to tap something:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Try OCR first.&lt;/strong&gt; If the target is text (like a contact name), find it via ML Kit or Tesseract.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;If OCR fails&lt;/strong&gt;, fall back to template matching. The agent searches its icon library for a matching reference image.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;If template matching succeeds&lt;/strong&gt; above the confidence threshold (80%), tap the matched coordinates.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;If both fail&lt;/strong&gt;, report the failure and stop. No guessing. No hardcoded coordinates.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;Today's Progress&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Task&lt;/th&gt;
&lt;th&gt;Status&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Updated &lt;code&gt;agent.py&lt;/code&gt; to call &lt;code&gt;match_template()&lt;/code&gt; when OCR fails&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Tested on WhatsApp send button&lt;/td&gt;
&lt;td&gt;✅ Success&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Tested on WhatsApp back button&lt;/td&gt;
&lt;td&gt;✅ Success&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Full pipeline test: type message → detect send icon → tap → verify sent&lt;/td&gt;
&lt;td&gt;✅ Passed&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Added 3 reference icons to the library&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;The Full Pipeline Test&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;I gave the agent a command: "Send a WhatsApp message to Mom saying I'll call later."&lt;/p&gt;

&lt;p&gt;Here's what happened:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Agent opened WhatsApp via ADB. ✅&lt;/li&gt;
&lt;li&gt;Agent searched for "Mom" using OCR + fuzzy matching. Found her. ✅&lt;/li&gt;
&lt;li&gt;Agent tapped the contact. Chat opened. ✅&lt;/li&gt;
&lt;li&gt;Agent typed "I'll call later" into the message box. ✅&lt;/li&gt;
&lt;li&gt;Agent looked for the send button. OCR didn't find it (no text). ❌&lt;/li&gt;
&lt;li&gt;Agent switched to template matching. ✅&lt;/li&gt;
&lt;li&gt;Agent matched &lt;code&gt;send_button.png&lt;/code&gt; with 94% confidence. ✅&lt;/li&gt;
&lt;li&gt;Agent tapped the coordinates. ✅&lt;/li&gt;
&lt;li&gt;Agent verified the message appeared in the chat. ✅&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;Task complete. No hardcoded coordinates. No guessing. The agent found the icon by seeing it.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's in the Icon Library Now&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Icon&lt;/th&gt;
&lt;th&gt;File&lt;/th&gt;
&lt;th&gt;Status&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Send button (WhatsApp)&lt;/td&gt;
&lt;td&gt;&lt;code&gt;send_button.png&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;✅ Working&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Back button (WhatsApp)&lt;/td&gt;
&lt;td&gt;&lt;code&gt;back_button.png&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;✅ Working&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Search button (WhatsApp)&lt;/td&gt;
&lt;td&gt;&lt;code&gt;search_button.png&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;🔧 Testing&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;What's Still Hard&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Template matching is slower than OCR. Each match takes 2-4 seconds on my device. For a single icon that's fine. For a task that needs to find three different icons, the delays add up.&lt;/p&gt;

&lt;p&gt;The simple NumPy fallback is about 3x slower than OpenCV. On a device without OpenCV installed, template matching becomes the new bottleneck.&lt;/p&gt;

&lt;p&gt;And icons that change appearance based on theme (dark mode vs light mode) need separate reference images. One icon, two variants. The library will grow.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's Next (Day 9)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Add more reference icons: attach, camera, emoji, more options&lt;/li&gt;
&lt;li&gt;Test on a second device to confirm the icon matching works across different screen sizes&lt;/li&gt;
&lt;li&gt;Record a full demo video of the pipeline in action&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;The Repo&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;👉 &lt;strong&gt;github.com/Dexter2344/phone-agent&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;code&gt;agent.py&lt;/code&gt; now calls &lt;code&gt;match_template()&lt;/code&gt; as a fallback when OCR can't find a target. &lt;code&gt;vision.py&lt;/code&gt; handles the matching with OpenCV primary and NumPy fallback. The icon library is growing.&lt;/p&gt;

&lt;p&gt;This is Day 8. The agent can finally see what it's doing.&lt;/p&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>software</category>
      <category>buildinpublic</category>
    </item>
    <item>
      <title>Day 7 of building an AI agent that controls a phone.

OCR is fast now. But the agent still can't see icons: the send button, the camera, the paperclip. I'm adding template matching with OpenCV to give it visual recognition for image-only UI elements.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Wed, 17 Jun 2026 15:46:10 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-7-of-building-an-ai-agent-that-controls-a-phone-ocr-is-fast-now-but-the-agent-still-cant-206o</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-7-of-building-an-ai-agent-that-controls-a-phone-ocr-is-fast-now-but-the-agent-still-cant-206o</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87" class="crayons-story__hidden-navigation-link"&gt;Day 7. Text is solved. But icons and image-only buttons are still invisible. Enter template matching.&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image" width="800" height="735"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-3925291" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt="" width="800" height="735"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 17&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87" id="article-link-3925291"&gt;
          Day 7. Text is solved. But icons and image-only buttons are still invisible. Enter template matching.
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/softwaredevelopment"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;softwaredevelopment&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            3 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>devjournal</category>
      <category>mobile</category>
    </item>
    <item>
      <title>Day 7. Text is solved. But icons and image-only buttons are still invisible. Enter template matching.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Wed, 17 Jun 2026 15:45:23 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-7-text-is-solved-but-icons-and-image-only-buttons-are-still-invisible-enter-template-h87</guid>
      <description>&lt;p&gt;Two days ago, I fixed the OCR bottleneck. The agent can now read text on a screen in under 2 seconds. It handles interruptions. It verifies each step.&lt;/p&gt;

&lt;p&gt;But there's a problem I've been dodging since Day 1, and it's time to face it.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What the Agent Still Can't See&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Every app has buttons without text. The send button in WhatsApp is a paper plane icon. The attach button is a paperclip. The back button is an arrow. The camera button is a camera.&lt;/p&gt;

&lt;p&gt;ML Kit sees text. It doesn't see icons. Tesseract doesn't see icons either. So when the agent needs to tap the send button after typing a message, it has no idea where that button is.&lt;/p&gt;

&lt;p&gt;Right now, my workaround is hardcoded coordinates. I know the send button is roughly at a specific position on my phone. But that breaks on a different device. It breaks if the app updates. It breaks if the screen orientation changes. It's a hack, not a solution.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Template Matching: The Plan&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Template matching is an old-school computer vision technique. You give it a reference image (like a small crop of the send icon) and it scans a larger image (the screenshot) looking for a match. It returns the coordinates of the best match.&lt;/p&gt;

&lt;p&gt;It's not AI. It's not deep learning. It's pixel math. But it works surprisingly well for static UI elements like icons that don't change appearance.&lt;/p&gt;

&lt;p&gt;Here's the approach:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Build an icon library.&lt;/strong&gt; For each app the agent supports (WhatsApp for now), I manually crop reference images of the key icons: send, attach, back, camera, search.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Before each tap action&lt;/strong&gt;, if the target isn't found via OCR, the agent runs template matching against its icon library.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;If a match is found&lt;/strong&gt; above a confidence threshold (80%), the agent taps the center of the matched region.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;If no match is found&lt;/strong&gt;, the agent reports the failure and stops, rather than guessing.&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;Today's Progress&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Task&lt;/th&gt;
&lt;th&gt;Status&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Researched template matching libraries for Python&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Selected OpenCV as primary engine, NumPy/PIL as fallback&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Wrote &lt;code&gt;match_template()&lt;/code&gt; and &lt;code&gt;_simple_template_match()&lt;/code&gt; in vision.py&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Created first icon reference image (WhatsApp send button)&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Uploaded &lt;code&gt;send_button.png&lt;/code&gt; to repo&lt;/td&gt;
&lt;td&gt;✅ Done&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Integrated into agent.py&lt;/td&gt;
&lt;td&gt;⏳ Next&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;The Code&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;The new &lt;code&gt;vision.py&lt;/code&gt; now has two template matching functions:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;match_template()&lt;/code&gt; — Uses OpenCV for accurate, fast matching. If OpenCV isn't installed, it falls back to the simple version.&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;_simple_template_match()&lt;/code&gt; — A lightweight pure-Python implementation using NumPy and PIL. Zero dependencies beyond what the agent already uses.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;The function takes a screenshot and a reference icon image, and returns the center coordinates of the best match above the confidence threshold.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Why Two Versions?&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;OpenCV is heavy. Installing it in Termux requires compiling from source or finding a pre-built arm64 wheel. The simple fallback means the agent can still do template matching even without OpenCV—just slower and slightly less accurate. Progressive enhancement. No hard dependency.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What's Next (Day 8)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Wire template matching into &lt;code&gt;agent.py&lt;/code&gt; so the agent actually uses it when OCR fails&lt;/li&gt;
&lt;li&gt;Test the full pipeline: type message → detect send icon via template matching → tap → verify&lt;/li&gt;
&lt;li&gt;Start building out the icon reference library for WhatsApp&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;The Repo&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;👉 &lt;strong&gt;github.com/Dexter2344/phone-agent&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;code&gt;vision.py&lt;/code&gt; is now at v3 with ML Kit, Tesseract, fuzzy matching, interruption handling, and template matching. &lt;code&gt;send_button.png&lt;/code&gt; is in the repo. Agent integration coming next.&lt;/p&gt;

&lt;p&gt;This is Day 7. The agent is getting eyes for the things that don't have names.&lt;/p&gt;

</description>
      <category>ai</category>
      <category>webdev</category>
      <category>programming</category>
      <category>softwaredevelopment</category>
    </item>
    <item>
      <title>Day 6 of building an AI agent that controls a phone.

Swapped Tesseract for Google ML Kit. Screen scans went from 12 seconds to under 2 seconds. Added an interruption handler that dismisses calls and notifications before they break the agent.</title>
      <dc:creator>Okeke Chukwudubem</dc:creator>
      <pubDate>Tue, 16 Jun 2026 04:09:00 +0000</pubDate>
      <link>https://dev.to/okeke_chukwudubem_5f3bf49/day-6-of-building-an-ai-agent-that-controls-a-phone-swapped-tesseract-for-google-ml-kit-screen-18if</link>
      <guid>https://dev.to/okeke_chukwudubem_5f3bf49/day-6-of-building-an-ai-agent-that-controls-a-phone-swapped-tesseract-for-google-ml-kit-screen-18if</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-6-i-fixed-the-ocr-problem-that-was-killing-my-ai-agent-5bme" class="crayons-story__hidden-navigation-link"&gt;Project Log #6: I Fixed the OCR Problem That Was Killing My AI Agent&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" alt="okeke_chukwudubem_5f3bf49 profile" class="crayons-avatar__image" width="800" height="735"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/okeke_chukwudubem_5f3bf49" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Okeke Chukwudubem
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Okeke Chukwudubem
                
              
              &lt;div id="story-author-preview-content-3911136" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/okeke_chukwudubem_5f3bf49" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3908265%2F586841bb-154e-4de4-a539-b939b3018c48.jpeg" class="crayons-avatar__image" alt="" width="800" height="735"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Okeke Chukwudubem&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-6-i-fixed-the-ocr-problem-that-was-killing-my-ai-agent-5bme" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 16&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-6-i-fixed-the-ocr-problem-that-was-killing-my-ai-agent-5bme" id="article-link-3911136"&gt;
          Project Log #6: I Fixed the OCR Problem That Was Killing My AI Agent
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/webdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;webdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/automation"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;automation&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/programming"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;programming&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/okeke_chukwudubem_5f3bf49/project-log-6-i-fixed-the-ocr-problem-that-was-killing-my-ai-agent-5bme#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            3 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>mobile</category>
      <category>performance</category>
    </item>
  </channel>
</rss>
