<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Vikas Goel</title>
    <description>The latest articles on DEV Community by Vikas Goel (@goel_vikas).</description>
    <link>https://dev.to/goel_vikas</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3826847%2Ff53bb2af-9c7a-496c-85ef-17350cec384b.jpeg</url>
      <title>DEV Community: Vikas Goel</title>
      <link>https://dev.to/goel_vikas</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/goel_vikas"/>
    <language>en</language>
    <item>
      <title>[Boost]</title>
      <dc:creator>Vikas Goel</dc:creator>
      <pubDate>Mon, 16 Mar 2026 17:13:59 +0000</pubDate>
      <link>https://dev.to/goel_vikas/-43l0</link>
      <guid>https://dev.to/goel_vikas/-43l0</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik" class="crayons-story__hidden-navigation-link"&gt;How I Built a Spatial Intelligence Agent That Sees, Thinks, and Speaks — Using Gemini Live API&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/goel_vikas" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3826847%2Ff53bb2af-9c7a-496c-85ef-17350cec384b.jpeg" alt="goel_vikas profile" class="crayons-avatar__image" width="800" height="878"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/goel_vikas" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Vikas Goel
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Vikas Goel
                
              
              &lt;div id="story-author-preview-content-3359559" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/goel_vikas" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3826847%2Ff53bb2af-9c7a-496c-85ef-17350cec384b.jpeg" class="crayons-avatar__image" alt="" width="800" height="878"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Vikas Goel&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Mar 16&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik" id="article-link-3359559"&gt;
          How I Built a Spatial Intelligence Agent That Sees, Thinks, and Speaks — Using Gemini Live API
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/gemini"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;gemini&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/googlecloud"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;googlecloud&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/a11y"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;a11y&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              Comments


              &lt;span class="hidden s:inline"&gt;Add Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            8 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>gemini</category>
      <category>googlecloud</category>
      <category>ai</category>
      <category>a11y</category>
    </item>
    <item>
      <title>How I Built a Spatial Intelligence Agent That Sees, Thinks, and Speaks — Using Gemini Live API</title>
      <dc:creator>Vikas Goel</dc:creator>
      <pubDate>Mon, 16 Mar 2026 15:51:25 +0000</pubDate>
      <link>https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik</link>
      <guid>https://dev.to/goel_vikas/how-i-built-a-spatial-intelligence-agent-that-sees-thinks-and-speaks-using-gemini-live-api-3hik</guid>
      <description>&lt;p&gt;&lt;em&gt;Created for the &lt;a href="https://geminiliveagentchallenge.devpost.com/" rel="noopener noreferrer"&gt;Gemini Live Agent Challenge&lt;/a&gt; #GeminiLiveAgentChallenge&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;What if your phone could be a skilled human guide — one that sees the world through your camera, understands what matters, and tells you only what you need to hear?&lt;/p&gt;

&lt;p&gt;That's &lt;strong&gt;Drishti&lt;/strong&gt; (दृष्टि — Sanskrit for "Vision"). It's a spatial intelligence agent built on Google's Gemini Live API that transforms any smartphone into a real-time navigation companion for visually impaired users. No special hardware. No wearable devices. Just a phone on a chest lanyard and a voice that understands your world.&lt;/p&gt;

&lt;p&gt;In this post, I'll share how I built it across 17 production revisions, the architectural decisions that made it work, and the hardest problem I solved — one that has nothing to do with AI models and everything to do with time.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fo1iij69me3a5avtrvhik.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fo1iij69me3a5avtrvhik.png" alt=" "&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  The Problem: Chatbots Can't Navigate the Physical World
&lt;/h2&gt;

&lt;p&gt;1.3 billion people worldwide live with visual impairment. Current assistive technology falls into two categories: expensive specialized hardware (thousands of dollars) or AI chatbots that describe what they see frame-by-frame.&lt;/p&gt;

&lt;p&gt;Neither works for real-time navigation. Here's why:&lt;/p&gt;

&lt;p&gt;A blind person doesn't need to hear &lt;em&gt;"I see a door, a wall, two chairs, a houseplant, and a rug."&lt;/em&gt; They need to hear &lt;em&gt;"Door ahead, 2 steps"&lt;/em&gt; and then &lt;strong&gt;silence&lt;/strong&gt; until the next thing that matters.&lt;/p&gt;

&lt;p&gt;The difference between a chatbot and a guide is editorial judgment: knowing what to say, when to say it, and — most importantly — when to say nothing. A skilled human guide speaks about 8 times in a 5-minute walk. My first prototype spoke 30 times. Getting from 30 to 8 required more engineering than getting from 0 to 30.&lt;/p&gt;

&lt;h2&gt;
  
  
  The Architecture: Three Geminis, One Conductor
&lt;/h2&gt;

&lt;p&gt;I call it the &lt;strong&gt;Conductor Model&lt;/strong&gt; because the Python backend doesn't see or speak — it &lt;em&gt;conducts&lt;/em&gt; three Gemini instances, each doing what it's best at:&lt;/p&gt;

&lt;h3&gt;
  
  
  Gemini Live API — The Voice
&lt;/h3&gt;

&lt;p&gt;Bidirectional audio streaming with interruption handling. The user can interrupt mid-sentence, switch to Hindi, ask questions. Gemini handles all of this natively with &lt;code&gt;proactive_audio&lt;/code&gt; and &lt;code&gt;affective_dialog&lt;/code&gt; enabled. The voice feels natural, not robotic.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# Gemini Live session config
&lt;/span&gt;&lt;span class="n"&gt;config&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;LiveConnectConfig&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;response_modalities&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;AUDIO&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;
    &lt;span class="n"&gt;speech_config&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;SpeechConfig&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="n"&gt;voice_config&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;VoiceConfig&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;prebuilt_voice_config&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;PrebuiltVoiceConfig&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;voice_name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Kore&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
    &lt;span class="p"&gt;),&lt;/span&gt;
    &lt;span class="n"&gt;system_instruction&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;Content&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;parts&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nc"&gt;Part&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;text&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;system_prompt&lt;/span&gt;&lt;span class="p"&gt;)]),&lt;/span&gt;
    &lt;span class="n"&gt;realtime_input_config&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;RealtimeInputConfig&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="n"&gt;automatic_activity_detection&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="nc"&gt;AutomaticActivityDetection&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;disabled&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;),&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Gemini 2.5 Flash — The Eyes
&lt;/h3&gt;

&lt;p&gt;Event-driven scene analysis via &lt;code&gt;generateContent&lt;/code&gt;. Not called every frame — called only when something changes (body turn, stair entry, significant visual change). Returns structured JSON:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight json"&gt;&lt;code&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"environment"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="nl"&gt;"type"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"indoor_stairs"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="nl"&gt;"description"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"concrete staircase going down"&lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"path_ahead"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="nl"&gt;"clear"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="kc"&gt;false&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="nl"&gt;"blocked_by"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"coffee table"&lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"objects"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="nl"&gt;"what"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"dog"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="nl"&gt;"where"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ahead, 10 o'clock"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="nl"&gt;"distance"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"5 steps"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
     &lt;/span&gt;&lt;span class="nl"&gt;"navigation_relevant"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="kc"&gt;true&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"suggested_speech"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"Dog 5 steps ahead at 10 o'clock."&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The &lt;code&gt;navigation_relevant&lt;/code&gt; field is the key innovation here. In a crowded market, Gemini aggregates 20 people as "not relevant" but flags one person pushing a cart. This single boolean solved the attention budgeting problem that our entire Python tracker couldn't handle.&lt;/p&gt;

&lt;p&gt;Critical config: &lt;code&gt;thinking_budget=0&lt;/code&gt;. Gemini 2.5 Flash is a thinking model — by default it spends up to 8,192 tokens reasoning before answering. For scene perception, this thinking adds latency without improving quality. Setting &lt;code&gt;thinking_budget=0&lt;/code&gt; cut response time from 7 seconds to 2.4 seconds — a 66% improvement.&lt;/p&gt;

&lt;h3&gt;
  
  
  Cloud Vision API — The Safety Tripwire
&lt;/h3&gt;

&lt;p&gt;At 200ms latency, Cloud Vision detects vehicles in the walking path before the 2.4-second cognitive analysis can. It doesn't understand scenes — it just says "bicycle, 15% of frame, center." But for a cyclist approaching at 15 km/h, those 2 seconds matter.&lt;/p&gt;

&lt;h2&gt;
  
  
  The World Model: A Brain That Knows When to Shut Up
&lt;/h2&gt;

&lt;p&gt;The World Model is the conductor. It receives results from all three services (Gemini Live, Gemini 2.5 Flash, and Cloud Vision) and makes one decision: &lt;strong&gt;speak or stay silent&lt;/strong&gt;.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fe5u432zirho9hdzdqxxh.jpeg" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fe5u432zirho9hdzdqxxh.jpeg" alt=" "&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;It maintains four behavioral dimensions inspired by cognitive science models of human spatial reasoning:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Alertness&lt;/strong&gt; — spikes near stairs and crossings, decays in stable environments&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Urgency&lt;/strong&gt; — responds to obstacle proximity, drives cooldown bypass&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Spatial Confidence&lt;/strong&gt; — tracks how fresh our perception is. THIS is the self-correcting mechanism.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Verbosity&lt;/strong&gt; — responds to user commands ("be quiet" / "describe everything") and environment complexity&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;These dimensions drive a &lt;strong&gt;9-priority editorial decision engine&lt;/strong&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;P1: Vehicle emergency (CV, 200ms)
P2: Fast obstacle (CV + temporal validation)
P3: Safety alert (cognitive)
P4: Environment transition (cognitive)
P5: Goal match (speech + cognitive)
P6: Path blocked (cognitive)
P7: New navigation object (cognitive)
P8: Proactive info (cognitive suggested_speech)
P9: Memory augmentation (stored landmarks)
Default: SILENCE
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Every priority level is gated by temporal validation (more on that below). The default is silence. This is what makes Drishti a guide, not a narrator.&lt;/p&gt;

&lt;h2&gt;
  
  
  The Hardest Problem: Time
&lt;/h2&gt;

&lt;p&gt;Here's what nobody tells you about real-time spatial AI: &lt;strong&gt;by the time you process a frame, the user has moved.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Gemini 2.5 Flash takes 2.4 seconds to analyze a frame. At walking speed (1.2 m/s), the user moves 2.9 meters in that time. A "door 1 meter ahead" warning arrives after the user has already walked through the door.&lt;/p&gt;

&lt;p&gt;My first attempt at fixing this was to use Cloud Vision (200ms) as a fast obstacle detector. It created worse problems:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Rev 15 test results:
  7 "Stop!" alerts fired
  Only 2 were correct
  5 were false positives — doors, houseplants, clothing

  User feedback (Hindi, translated):
  "Why report the box so late?"
  "The houseplant info is old"
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;CV has no spatial understanding. "Door, 18% of frame, center" — is the user walking through it or into it? CV doesn't know. Adding more filters created a cascade of new problems.&lt;/p&gt;

&lt;h3&gt;
  
  
  The Real Fix: Temporal Validation
&lt;/h3&gt;

&lt;p&gt;Every camera frame gets &lt;strong&gt;stamped&lt;/strong&gt; with the phone's sensor state at capture time: speed, compass heading, step count. When a perception result arrives seconds later, the system computes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;distance_moved&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;current_step_count&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;snapshot_step_count&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.7&lt;/span&gt;  &lt;span class="c1"&gt;# meters per step
&lt;/span&gt;&lt;span class="n"&gt;heading_change&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;abs&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;current_heading&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;snapshot_heading&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;remaining&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;obstacle_distance&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;distance_moved&lt;/span&gt;

&lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;remaining&lt;/span&gt; &lt;span class="o"&gt;&amp;lt;&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mf"&gt;0.5&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;    &lt;span class="c1"&gt;# User passed it
&lt;/span&gt;    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;STALE&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;       &lt;span class="c1"&gt;# → Drop silently. User already walked through.
&lt;/span&gt;
&lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;remaining&lt;/span&gt; &lt;span class="o"&gt;&amp;lt;&lt;/span&gt; &lt;span class="mf"&gt;1.5&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;     &lt;span class="c1"&gt;# About to hit it
&lt;/span&gt;    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;IMMINENT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;    &lt;span class="c1"&gt;# → Bypass cooldown. Warn NOW.
&lt;/span&gt;
&lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;VALID&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;           &lt;span class="c1"&gt;# → Normal alert with corrected distance
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This doesn't need GPS (too imprecise indoors — ±10-20m). Step count from the accelerometer and compass heading work everywhere, including inside buildings.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F05373ha60sdk69onr7vy.jpeg" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F05373ha60sdk69onr7vy.jpeg" alt=" "&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Results from Rev 16 session:&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Status&lt;/th&gt;
&lt;th&gt;Count&lt;/th&gt;
&lt;th&gt;Example&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Stale (dropped)&lt;/td&gt;
&lt;td&gt;6&lt;/td&gt;
&lt;td&gt;"user turned 105°", "user moved 4.5m past obstacle"&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Imminent (urgent)&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;couch at 0.8m → "Stop! Couch 1 step ahead!"&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Valid (normal)&lt;/td&gt;
&lt;td&gt;1&lt;/td&gt;
&lt;td&gt;houseplant at 1.6m → "houseplant about 2 steps ahead"&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Six false positives were silently dropped. One genuine obstacle was correctly escalated as urgent. Zero false alerts reached the user.&lt;/p&gt;

&lt;h2&gt;
  
  
  The Brain's Self-Correcting Loop
&lt;/h2&gt;

&lt;p&gt;The spatial confidence dimension creates an automatic feedback loop:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Cognitive runs → confidence HIGH (we know what's here)&lt;/li&gt;
&lt;li&gt;User walks 3 meters → confidence DECAYS (we're in unknown territory)&lt;/li&gt;
&lt;li&gt;Confidence drops below 0.2 → alertness SPIKES → triggers a new cognitive call&lt;/li&gt;
&lt;li&gt;New cognitive runs on fresh frame → confidence RECOVERS&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;The system literally knows when it doesn't know, and actively seeks to fix that. When I showed the brain panel to testers, the consistent reaction was &lt;em&gt;"it's thinking"&lt;/em&gt; — exactly the response I wanted.&lt;/p&gt;

&lt;h2&gt;
  
  
  What I Discovered About Gemini
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Function calling from the Live API has a ~16% success rate
&lt;/h3&gt;

&lt;p&gt;I ran 6 controlled experiments sending identical tool declarations through the native audio model. Function calls were mechanically dropped — not a prompting issue, but a platform limitation. This forced the complete architectural separation: Live for voice, generateContent for perception.&lt;/p&gt;

&lt;p&gt;In hindsight, this made the system better. Each Gemini instance does what it's best at.&lt;/p&gt;

&lt;h3&gt;
  
  
  Gemini's visual understanding beats custom CV pipelines
&lt;/h3&gt;

&lt;p&gt;My v3.1 Python pipeline used SORT tracking (Kalman Filter + Hungarian Algorithm) to maintain object identity across frames. It produced 12 false "houseplant approaching" alerts in a 5-minute test. Gemini said &lt;em&gt;"Potted plants line both sides of the path"&lt;/em&gt; — accurate, contextual, mentioned once. I deleted 400 lines of tracker code.&lt;/p&gt;

&lt;h3&gt;
  
  
  Silence requires more engineering than speech
&lt;/h3&gt;

&lt;p&gt;Getting the system to speak was trivial — inject text, Gemini talks. Getting it to stay silent for 2 full minutes during stable walking while remaining ready to warn about obstacles required the entire editorial decision engine, behavioral dimensions, cooldown management, temporal validation, and the self-correcting confidence loop.&lt;/p&gt;

&lt;h2&gt;
  
  
  Phone Sensors: The Underrated Superpower
&lt;/h2&gt;

&lt;p&gt;The accelerometer and compass in your phone — plus a step counter derived from the accelerometer — are incredibly reliable and available to web pages in mobile browsers via the DeviceMotion and DeviceOrientation APIs:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight javascript"&gt;&lt;code&gt;&lt;span class="c1"&gt;// Step detection from accelerometer&lt;/span&gt;
&lt;span class="nb"&gt;window&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;addEventListener&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="dl"&gt;'&lt;/span&gt;&lt;span class="s1"&gt;devicemotion&lt;/span&gt;&lt;span class="dl"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;e&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&amp;gt;&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="k"&gt;if &lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nb"&gt;Math&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;abs&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;e&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;acceleration&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;y&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;&amp;gt;&lt;/span&gt; &lt;span class="mf"&gt;3.5&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
        &lt;span class="kd"&gt;const&lt;/span&gt; &lt;span class="nx"&gt;now&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nb"&gt;Date&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;now&lt;/span&gt;&lt;span class="p"&gt;();&lt;/span&gt;
        &lt;span class="k"&gt;if &lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;now&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="nx"&gt;lastStepTime&lt;/span&gt; &lt;span class="o"&gt;&amp;gt;&lt;/span&gt; &lt;span class="mi"&gt;250&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
            &lt;span class="nx"&gt;stepCount&lt;/span&gt;&lt;span class="o"&gt;++&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
            &lt;span class="nx"&gt;lastStepTime&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nx"&gt;now&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
        &lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="p"&gt;}&lt;/span&gt;
&lt;span class="p"&gt;});&lt;/span&gt;

&lt;span class="c1"&gt;// Compass heading&lt;/span&gt;
&lt;span class="nb"&gt;window&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;addEventListener&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="dl"&gt;'&lt;/span&gt;&lt;span class="s1"&gt;deviceorientation&lt;/span&gt;&lt;span class="dl"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;e&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&amp;gt;&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="nx"&gt;heading&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nx"&gt;e&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nx"&gt;alpha&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt; &lt;span class="c1"&gt;// 0-360 degrees from North&lt;/span&gt;
&lt;span class="p"&gt;});&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;iOS caveat:&lt;/strong&gt; Since iOS 13, &lt;code&gt;DeviceMotionEvent.requestPermission()&lt;/code&gt; must be called inside a user gesture handler (click/tap), before any other async calls. Our sensors silently failed in 3 production revisions before we figured this out.&lt;/p&gt;

&lt;p&gt;With step count and compass heading, we can compute:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Distance moved&lt;/strong&gt;: steps × 0.7m stride length&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Direction changed&lt;/strong&gt;: heading delta since frame capture&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Movement state&lt;/strong&gt;: walking / stationary / stairs (from gravity-axis deviation)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This is all the temporal validator needs. No GPS required.&lt;/p&gt;

&lt;h2&gt;
  
  
  The Goal System: Implicit + Explicit, One Priority List
&lt;/h2&gt;

&lt;p&gt;Goals emerge from two sources and coexist seamlessly:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Implicit goals&lt;/strong&gt; emerge from the environment. When the user enters a staircase, &lt;code&gt;stair_navigation&lt;/code&gt; activates at priority 0.9 — automatically, from sensors detecting vertical acceleration + cognitive confirming stairs. When the user reaches level ground, it expires automatically. No user input needed.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Explicit goals&lt;/strong&gt; come from user speech. When the user says &lt;em&gt;"Machli kahan milegi?"&lt;/em&gt; (Hindi: "Where can I find fish?"), the ConversationInterpreter extracts a goal from Gemini's response (not from the user's garbled speech — Gemini's response is cleaner and more reliable). The goal persists until achieved or expired.&lt;/p&gt;

&lt;p&gt;Both goal types sit in ONE priority-sorted list. The World Model's &lt;code&gt;decide()&lt;/code&gt; method doesn't know or care where a goal came from. When cognitive detects a "FRESH FISH" sign 2 minutes later, the goal matches and fires. The user hears: &lt;em&gt;"Fresh Fish sign on your right!"&lt;/em&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  The Evolution: 17 Revisions
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Rev&lt;/th&gt;
&lt;th&gt;What happened&lt;/th&gt;
&lt;th&gt;What I learned&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;6&lt;/td&gt;
&lt;td&gt;v3.1 deployed, 30 alerts in 5 min&lt;/td&gt;
&lt;td&gt;Python perception pipeline produces too many false positives&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;7&lt;/td&gt;
&lt;td&gt;v4 first deploy, 89/91 cognitive calls failed (429)&lt;/td&gt;
&lt;td&gt;Free tier has a 20 requests-per-day (RPD) limit&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;12&lt;/td&gt;
&lt;td&gt;"Coffee table 2 steps ahead. Clear to your left."&lt;/td&gt;
&lt;td&gt;The conductor model WORKS&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;13&lt;/td&gt;
&lt;td&gt;Too silent — 4 utterances in 209 seconds&lt;/td&gt;
&lt;td&gt;Verbosity gate was blocking suggested_speech&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;14&lt;/td&gt;
&lt;td&gt;Cognitive 7s latency, warnings too late&lt;/td&gt;
&lt;td&gt;thinking_budget=0 cuts latency to 2.4s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;15&lt;/td&gt;
&lt;td&gt;5/7 "Stop!" were false positives&lt;/td&gt;
&lt;td&gt;CV can't make navigation judgments&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;16&lt;/td&gt;
&lt;td&gt;6 stale dropped, 1 imminent correct, 0 false&lt;/td&gt;
&lt;td&gt;Temporal validation solves everything&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Each revision was tested in production — real walking, real obstacles, and a real dog. Structured logs from every session drove the next architectural decision.&lt;/p&gt;

&lt;h2&gt;
  
  
  Try It Yourself
&lt;/h2&gt;

&lt;p&gt;Drishti is live at &lt;strong&gt;&lt;a href="https://drishti-whn43ovjpq-uc.a.run.app" rel="noopener noreferrer"&gt;drishti-whn43ovjpq-uc.a.run.app&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Open on a phone (iOS or Android), grant camera + mic + sensor permissions, and walk around your house. You'll hear Drishti describe obstacles — and more importantly, you'll hear it stay quiet when there's nothing to say.&lt;/p&gt;

&lt;p&gt;Source code: &lt;strong&gt;&lt;a href="https://github.com/vikasjoel/dristi" rel="noopener noreferrer"&gt;github.com/vikasjoel/dristi&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Tech Stack
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Backend:&lt;/strong&gt; Python 3.11, FastAPI, WebSockets on &lt;strong&gt;Google Cloud Run&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;AI:&lt;/strong&gt; &lt;strong&gt;Gemini Live API&lt;/strong&gt; (voice), &lt;strong&gt;Gemini 2.5 Flash&lt;/strong&gt; (perception), &lt;strong&gt;Cloud Vision API&lt;/strong&gt; (safety)&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Frontend:&lt;/strong&gt; HTML5 PWA with Web Sensors API, Geolocation API&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Key configs:&lt;/strong&gt; &lt;code&gt;thinking_budget=0&lt;/code&gt;, &lt;code&gt;proactive_audio&lt;/code&gt;, &lt;code&gt;affective_dialog&lt;/code&gt;
&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  What's Next
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Stationary camera modes&lt;/strong&gt; — baby monitoring, elderly care, security (plugin architecture already supports them)&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Topological mapping&lt;/strong&gt; from scene transitions — building spatial maps without SLAM&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Maps API integration&lt;/strong&gt; for navigation beyond camera view&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;User calibration&lt;/strong&gt; — learning individual stride length and verbosity preferences&lt;/li&gt;
&lt;/ul&gt;




&lt;p&gt;&lt;em&gt;Built with genuine passion for accessibility. If you're building with the Gemini Live API, I'd love to hear what you're creating. The future of AI isn't text boxes — it's spatial intelligence.&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;em&gt;#GeminiLiveAgentChallenge&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;&lt;em&gt;Created for the purposes of entering the &lt;a href="https://geminiliveagentchallenge.devpost.com/" rel="noopener noreferrer"&gt;Gemini Live Agent Challenge&lt;/a&gt; hackathon.&lt;/em&gt;&lt;/p&gt;

</description>
      <category>gemini</category>
      <category>googlecloud</category>
      <category>ai</category>
      <category>a11y</category>
    </item>
  </channel>
</rss>
