<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Praveen Veera</title>
    <description>The latest articles on DEV Community by Praveen Veera (@praveen_builds).</description>
    <link>https://dev.to/praveen_builds</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F261093%2Fa37f2188-51e9-46e4-a0e4-788b2bec5231.png</url>
      <title>DEV Community: Praveen Veera</title>
      <link>https://dev.to/praveen_builds</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/praveen_builds"/>
    <language>en</language>
    <item>
      <title>[Boost]</title>
      <dc:creator>Praveen Veera</dc:creator>
      <pubDate>Mon, 29 Jun 2026 20:35:09 +0000</pubDate>
      <link>https://dev.to/praveen_builds/-4k38</link>
      <guid>https://dev.to/praveen_builds/-4k38</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o" class="crayons-story__hidden-navigation-link"&gt;Run a Private AI Coding Agent Locally: Setup &amp;amp; Design with Ollama, OpenCode, and Custom Workspace Skills&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/praveen_builds" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F261093%2Fa37f2188-51e9-46e4-a0e4-788b2bec5231.png" alt="praveen_builds profile" class="crayons-avatar__image" width="420" height="420"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/praveen_builds" class="crayons-story__secondary fw-medium m:hidden"&gt;
              Praveen Veera
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                Praveen Veera
                
              
              &lt;div id="story-author-preview-content-4016677" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/praveen_builds" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F261093%2Fa37f2188-51e9-46e4-a0e4-788b2bec5231.png" class="crayons-avatar__image" alt="" width="420" height="420"&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;Praveen Veera&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Jun 29&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o" id="article-link-4016677"&gt;
          Run a Private AI Coding Agent Locally: Setup &amp;amp; Design with Ollama, OpenCode, and Custom Workspace Skills
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/opencode"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;opencode&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ollama"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ollama&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/qwen"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;qwen&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/agents"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;agents&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
            &lt;a href="https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              

              &lt;span class="hidden s:inline"&gt;Add&amp;nbsp;Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            7 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success crayons-icon c-btn__icon"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>agents</category>
      <category>ai</category>
      <category>llm</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Run a Private AI Coding Agent Locally: Setup &amp; Design with Ollama, OpenCode, and Custom Workspace Skills</title>
      <dc:creator>Praveen Veera</dc:creator>
      <pubDate>Mon, 29 Jun 2026 19:34:19 +0000</pubDate>
      <link>https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o</link>
      <guid>https://dev.to/praveen_builds/run-a-private-ai-coding-agent-locally-setup-design-with-ollama-opencode-and-custom-workspace-392o</guid>
      <description>&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Ftw91d0f75wottkpvpxrq.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Ftw91d0f75wottkpvpxrq.png" alt="Local AI Agent Architecture - At a Glance"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Once you have local autocomplete and chat running inside your IDE, the next step is transitioning to autonomous execution. Setting up a local coding agent running directly inside your terminal or editor gives you a private, offline partner capable of executing shell commands, refactoring files, and diagnosing compilation errors.&lt;/p&gt;

&lt;p&gt;This guide focuses on the workspace design, custom instructions, and domain-specific skills required to orchestrate a reliable local agent using &lt;strong&gt;Ollama&lt;/strong&gt; and &lt;strong&gt;OpenCode&lt;/strong&gt;.&lt;/p&gt;




&lt;h3&gt;
  
  
  🔰 What is an "AI Agent" (For Beginners)?
&lt;/h3&gt;

&lt;p&gt;If you have only used ChatGPT or Claude in a browser, a coding agent behaves differently. Standard chat systems only output text; you must manually copy and paste the code block into your editor. &lt;/p&gt;

&lt;p&gt;An &lt;strong&gt;AI agent&lt;/strong&gt; has "hands." It integrates directly with your workstation's filesystem and terminal. Instead of just suggesting code, the agent runs an active execution loop: it reads files, writes code modules, runs compilers and test suites, inspects error outputs, and iterates autonomously until the task is complete.&lt;/p&gt;




&lt;h2&gt;
  
  
  1. The Local Agent Architecture
&lt;/h2&gt;

&lt;p&gt;A private agentic workspace coordinates model outputs with local system execution. Here is the operational design of the loop:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌────────────────────────────────────────────┐
│                 Developer                  │
│        Terminal / VS Code / OpenCode       │
└─────────────────────┬──────────────────────┘
                      │
┌─────────────────────▼──────────────────────┐
│                  OpenCode                  │
│  - Agent execution loop                    │
│  - Context window manager                  │
│  - Project instruction parser              │
│  - Tool permission registry                │
│  - Skills / specialist agents              │
└──────────────┬───────────────┬─────────────┘
               │               │
     ┌─────────▼──────┐  ┌────▼─────────────┐
     │ Project Repo   │  │ Local OS Tools   │
     │ - Source code  │  │ - Terminal bash  │
     │ - Docs         │  │ - Git versioning │
     │ - Test suites  │  │ - Linters        │
     └────────────────┘  └──────────────────┘
                      │
┌─────────────────────▼──────────────────────┐
│                   Ollama                   │
│           Local model inference            │
│       Qwen / Llama coding models           │
└────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;The Developer:&lt;/strong&gt; Initiates a task (e.g., "Add a health-check route") in the terminal interface.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;OpenCode (Agent Interface):&lt;/strong&gt; Reads global instructions, loads domain-specific skills, parses the repository directory, and maps available tools.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Ollama (Local Runtime):&lt;/strong&gt; Handles prompt inference, generating tool-call tags in XML or JSON format.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Local Tools:&lt;/strong&gt; The agent runtime parses the tags, requests developer permission, and executes the file operations or bash commands natively.&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  2. Step 1: Interface &amp;amp; Local Runtime Link (OpenCode)
&lt;/h2&gt;

&lt;p&gt;OpenCode acts as the execution bridge, routing prompt contexts to your local Ollama API. Configure it by editing your workspace configuration file:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight json"&gt;&lt;code&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"provider"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ollama"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"endpoint"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://localhost:11434"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"model"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"qwen2.5-coder:14b-instruct"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"default_agent"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"builder"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"system_instructions_path"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"./.agents/instructions.md"&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;em&gt;Note: For the local model settings, we run the instruct weights via Ollama configured with a minimum context window (&lt;code&gt;num_ctx 16384&lt;/code&gt;) and a deterministic temperature (&lt;code&gt;0.0&lt;/code&gt;), as detailed in our first guide.&lt;/em&gt;&lt;/p&gt;




&lt;h2&gt;
  
  
  3. Step 2: Project Instructions &amp;amp; Guardrails
&lt;/h2&gt;

&lt;p&gt;To prevent the agent from executing destructive commands or writing non-compliant code, you must define project-specific guardrails. Create a project instructions file (&lt;code&gt;.agents/instructions.md&lt;/code&gt;):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight markdown"&gt;&lt;code&gt;&lt;span class="gh"&gt;# Project Instructions&lt;/span&gt;

&lt;span class="gu"&gt;## Architecture &amp;amp; Stack&lt;/span&gt;
&lt;span class="p"&gt;-&lt;/span&gt; Frontend: Next.js (App Router, TypeScript)
&lt;span class="p"&gt;-&lt;/span&gt; Backend: FastAPI (Python 3.11, Pydantic v2)
&lt;span class="p"&gt;-&lt;/span&gt; Database: PostgreSQL

&lt;span class="gu"&gt;## Core Rules&lt;/span&gt;
&lt;span class="p"&gt;-&lt;/span&gt; Do not modify database schemas without explicit permission.
&lt;span class="p"&gt;-&lt;/span&gt; Do not introduce new third-party dependencies without explaining the rationale.
&lt;span class="p"&gt;-&lt;/span&gt; Run linting and tests before proposing a completed task.

&lt;span class="gu"&gt;## Code Style&lt;/span&gt;
&lt;span class="p"&gt;-&lt;/span&gt; Use TypeScript strict mode for frontend modules.
&lt;span class="p"&gt;-&lt;/span&gt; Use asynchronous database operations (async/await) in Python.
&lt;span class="p"&gt;-&lt;/span&gt; Add unit tests for all new business logic.

&lt;span class="gu"&gt;## Safety Constraints&lt;/span&gt;
&lt;span class="p"&gt;-&lt;/span&gt; Never print secrets, API tokens, or environment files to standard out.
&lt;span class="p"&gt;-&lt;/span&gt; Do not delete source files unless explicitly requested.
&lt;span class="p"&gt;-&lt;/span&gt; Present a concrete plan before executing multi-file changes.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  4. Step 3: Domain-Specific Skills (Specialist Guides)
&lt;/h2&gt;

&lt;p&gt;Lightweight local models (such as 14B-parameter models) can struggle with complex routing patterns or framework boilerplate. By organizing your codebase with a dedicated &lt;code&gt;skills/&lt;/code&gt; directory, you equip your agent with specialized recipes:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="s"&gt;project-root/&lt;/span&gt;
&lt;span class="s"&gt;├── .agents/&lt;/span&gt;
&lt;span class="s"&gt;│   └── instructions.md&lt;/span&gt;
&lt;span class="s"&gt;└── skills/&lt;/span&gt;
    &lt;span class="s"&gt;├── nextjs-feature.md&lt;/span&gt;
    &lt;span class="s"&gt;├── fastapi-api.md&lt;/span&gt;
    &lt;span class="s"&gt;├── database-migration.md&lt;/span&gt;
    &lt;span class="s"&gt;└── test-writing.md&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Here is a sample skill definition file for writing endpoints (&lt;code&gt;skills/fastapi-api.md&lt;/code&gt;):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight markdown"&gt;&lt;code&gt;&lt;span class="gh"&gt;# FastAPI API Skill&lt;/span&gt;

When adding a new API endpoint to the backend:
&lt;span class="p"&gt;
1.&lt;/span&gt; Check existing router imports in &lt;span class="sb"&gt;`app/main.py`&lt;/span&gt;.
&lt;span class="p"&gt;2.&lt;/span&gt; Define Pydantic request and response schemas in &lt;span class="sb"&gt;`app/schemas/`&lt;/span&gt;.
&lt;span class="p"&gt;3.&lt;/span&gt; Use async database sessions with &lt;span class="sb"&gt;`sqlalchemy.ext.asyncio`&lt;/span&gt;.
&lt;span class="p"&gt;4.&lt;/span&gt; Include explicit error handlers using &lt;span class="sb"&gt;`HTTPException`&lt;/span&gt; with clear detail messages.
&lt;span class="p"&gt;5.&lt;/span&gt; Create a corresponding test file in &lt;span class="sb"&gt;`tests/test_api.py`&lt;/span&gt;.
&lt;span class="p"&gt;6.&lt;/span&gt; Run linting and verify API responses before marking the task complete.
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;When a user prompts the agent to add a backend route, OpenCode automatically appends this skill file to the active system context, ensuring the model matches your codebase's architectural pattern without bloating the base system prompt.&lt;/p&gt;




&lt;h2&gt;
  
  
  5. Step 4: Tool Risk &amp;amp; Permission Registry
&lt;/h2&gt;

&lt;p&gt;Giving an agent system access introduces risks. You must categorize available tools by risk level to prevent accidental system changes:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;th&gt;Risk Level&lt;/th&gt;
&lt;th&gt;Safety Guideline&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Read Files&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Inspects code structures and configuration.&lt;/td&gt;
&lt;td&gt;Low&lt;/td&gt;
&lt;td&gt;Safe to execute automatically.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Search Repo&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Locates variable definitions and file locations.&lt;/td&gt;
&lt;td&gt;Low&lt;/td&gt;
&lt;td&gt;Safe to execute automatically.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Git Diff/Status&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Analyzes workspace changes.&lt;/td&gt;
&lt;td&gt;Low&lt;/td&gt;
&lt;td&gt;Safe to execute automatically.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Run Tests&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Executes unit tests to validate code.&lt;/td&gt;
&lt;td&gt;Medium&lt;/td&gt;
&lt;td&gt;Restrict execution duration to prevent infinite loops.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Modify Files&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Edits source code or templates.&lt;/td&gt;
&lt;td&gt;Medium&lt;/td&gt;
&lt;td&gt;Require manual review or run inside a Git sandbox.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Delete Files&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Cleans up obsolete components.&lt;/td&gt;
&lt;td&gt;High&lt;/td&gt;
&lt;td&gt;Always prompt for explicit human confirmation.&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Shell Commands&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Runs compiler commands, builds, or scripts.&lt;/td&gt;
&lt;td&gt;High&lt;/td&gt;
&lt;td&gt;Never automate; require step-by-step developer approval.&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;blockquote&gt;
&lt;p&gt;🛡️ &lt;strong&gt;The Git Sandbox Rule:&lt;/strong&gt; Always initialize a Git repository and commit your active changes before letting a local agent write code. If the agent goes rogue, deletes files, or writes buggy code, you can roll back your entire workspace instantly by running:&lt;/p&gt;


&lt;pre class="highlight shell"&gt;&lt;code&gt;git reset &lt;span class="nt"&gt;--hard&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  6. Detailed Agent Workflow Trace
&lt;/h2&gt;

&lt;p&gt;To understand how the agent uses instructions, skills, and tools under the hood, here is a trace of the execution loop when implementing a feature:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;User Prompt:&lt;/strong&gt; &lt;em&gt;"Add a health-check endpoint to the FastAPI service."&lt;/em&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;1. Read Directory  ──&amp;gt; Locates app/main.py and skills/fastapi-api.md
2. Parse Rules     ──&amp;gt; Identifies FastAPI backend framework rules
3. Read main.py    ──&amp;gt; Finds existing router configuration
4. Propose Plan    ──&amp;gt; Prints target changes to terminal for approval
5. Edit Files      ──&amp;gt; Inserts /health endpoint using async route
6. Write Test      ──&amp;gt; Creates test_health_check in tests/test_api.py
7. Run CLI Command ──&amp;gt; Executes: pytest tests/test_api.py (Requires user approval)
8. Git Diff Check  ──&amp;gt; Displays final diff output and completes loop
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  7. Parallel Parser Implementations (Tool Calling)
&lt;/h2&gt;

&lt;p&gt;Local agents use regular expressions to parse XML tool commands generated by the local model. Here is how you can implement a robust, non-greedy tool call extractor in both TypeScript and Python. &lt;em&gt;(For an in-depth analysis of why XML tags are used to prevent format failure loops, refer to our previous guide)&lt;/em&gt;.&lt;/p&gt;

&lt;h3&gt;
  
  
  TypeScript Implementation
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight typescript"&gt;&lt;code&gt;&lt;span class="k"&gt;export&lt;/span&gt; &lt;span class="kd"&gt;function&lt;/span&gt; &lt;span class="nf"&gt;parseToolCall&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;output&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="kr"&gt;string&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="c1"&gt;// Non-greedy regex prevents merging multiple distinct tags&lt;/span&gt;
  &lt;span class="kd"&gt;const&lt;/span&gt; &lt;span class="nx"&gt;fileWriteRegex&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sr"&gt;/&amp;lt;write_file&lt;/span&gt;&lt;span class="se"&gt;\s&lt;/span&gt;&lt;span class="sr"&gt;+path="&lt;/span&gt;&lt;span class="se"&gt;([^&lt;/span&gt;&lt;span class="sr"&gt;"&lt;/span&gt;&lt;span class="se"&gt;]&lt;/span&gt;&lt;span class="sr"&gt;+&lt;/span&gt;&lt;span class="se"&gt;)&lt;/span&gt;&lt;span class="sr"&gt;"&amp;gt;&lt;/span&gt;&lt;span class="se"&gt;([\s\S]&lt;/span&gt;&lt;span class="sr"&gt;*&lt;/span&gt;&lt;span class="se"&gt;?)&lt;/span&gt;&lt;span class="sr"&gt;&amp;lt;&lt;/span&gt;&lt;span class="se"&gt;\/&lt;/span&gt;&lt;span class="sr"&gt;write_file&amp;gt;/&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
  &lt;span class="kd"&gt;const&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nx"&gt;output&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;match&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;fileWriteRegex&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;

  &lt;span class="k"&gt;if &lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="na"&gt;tool&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="dl"&gt;"&lt;/span&gt;&lt;span class="s2"&gt;write_file&lt;/span&gt;&lt;span class="dl"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="na"&gt;path&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;
      &lt;span class="na"&gt;content&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;trim&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
    &lt;span class="p"&gt;};&lt;/span&gt;
  &lt;span class="p"&gt;}&lt;/span&gt;
  &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="kc"&gt;null&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Python Implementation
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;re&lt;/span&gt;

&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;parse_tool_call&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;output&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="c1"&gt;# Non-greedy regex pattern (.*?) avoids greedy tag merges
&lt;/span&gt;    &lt;span class="n"&gt;file_write_regex&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sa"&gt;r&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;&amp;lt;write_file\s+path=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;([^&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;]+)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;&amp;gt;([\s\S]*?)&amp;lt;/write_file&amp;gt;&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
    &lt;span class="n"&gt;match&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;re&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;search&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;file_write_regex&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;output&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;tool&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;write_file&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;path&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;group&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;content&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;group&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
        &lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  8. Live Validation &amp;amp; GitHub Repository
&lt;/h2&gt;

&lt;p&gt;To demonstrate the viability of this design, the complete setup has been packaged and executed locally on an Apple Silicon workstation. &lt;/p&gt;

&lt;h3&gt;
  
  
  Companion Repository Code
&lt;/h3&gt;

&lt;p&gt;All configuration files, project rules, specialized skills, and the active test-runner script are hosted in the companion repository:&lt;br&gt;
👉 &lt;strong&gt;&lt;a href="https://github.com/praveenveera/software-permanence/tree/main/03-local-agent-setup" rel="noopener noreferrer"&gt;software-permanence/03-local-agent-setup&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;
&lt;h3&gt;
  
  
  Step-by-Step Execution Logs
&lt;/h3&gt;

&lt;p&gt;By running the local Python simulator &lt;a href="https://github.com/praveenveera/software-permanence/blob/main/03-local-agent-setup/run_agent_loop.py" rel="noopener noreferrer"&gt;&lt;code&gt;run_agent_loop.py&lt;/code&gt;&lt;/a&gt;, we triggered &lt;code&gt;qwen2.5-coder:14b&lt;/code&gt; to read the codebase, parse our rules, write the route, and run unit tests. Here are the raw terminal logs from the execution:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;=== Launching Local Agent Run Simulation ===
[Step 1] Loading workspace configs, guidelines, and skills...
[Step 2] Reading current workspace status...
[Step 3] Querying local model 'qwen2.5-coder:14b' via Ollama...
  └─ Generation completed in 4.71 seconds.
  └─ Prompt Tokens: 407, Generation Tokens: 135
[Step 4] Extracting tool call payload from model output...
  └─ Parsed Action: write_file to 'workspace/app/main.py'
[Step 5] Writing modified code to local workspace...
  └─ Updated 'workspace/app/main.py' successfully.
[Step 6] Adding health-check assertion to unittest suite...
  └─ Appended 'test_read_health' test case.
[Step 7] Running unittest suite to validate changes...

=== Workspace Test Results ===
Ran 2 tests in 0.013s
OK

[Pass] Agent validation completed with all test assertions passing!
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The Generated Endpoint Code
&lt;/h3&gt;

&lt;p&gt;Here is the exact FastAPI router code created autonomously by the local model during the run, showing that it followed the async rules and exception detail handlers specified in &lt;code&gt;skills/fastapi-api.md&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="nd"&gt;@app.get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/health&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;async&lt;/span&gt; &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;health_check&lt;/span&gt;&lt;span class="p"&gt;():&lt;/span&gt;
    &lt;span class="k"&gt;try&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="c1"&gt;# Simulate a database check or other critical resource
&lt;/span&gt;        &lt;span class="c1"&gt;# For demonstration, we'll just return OK
&lt;/span&gt;        &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;status&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;OK&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="k"&gt;except&lt;/span&gt; &lt;span class="nb"&gt;Exception&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;e&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="k"&gt;raise&lt;/span&gt; &lt;span class="nc"&gt;HTTPException&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;status_code&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mi"&gt;500&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;detail&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Internal Server Error&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="n"&gt;e&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  9. Hard-Earned Lessons: What Did Not Work Well
&lt;/h2&gt;

&lt;p&gt;Running autonomous agent loops on local hardware highlighted several unique operational hurdles:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Tool Permission Fatigue:&lt;/strong&gt; Requiring user confirmation for high-risk tools like bash commands is necessary for safety, but it creates developer fatigue. You find yourself repeatedly hitting "Y" during compilation loops.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Recursive Error Loops:&lt;/strong&gt; If a model writes buggy code and the test step fails, smaller models can get stuck in a recursive loop (apologizing, rewriting the same bug, running tests, and failing again). Setting a hard execution breaker (halting after 3 failures) is critical.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Lack of Isolation:&lt;/strong&gt; Unlike cloud sandboxes, a local agent runs directly on your machine. If it runs &lt;code&gt;npm install&lt;/code&gt;, it compiles binaries on your host OS. Containerizing your workspace or running it inside a Docker dev container is highly recommended for security.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Context Overload:&lt;/strong&gt; Attaching multiple skill files and file summaries to the prompt quickly eats up the 16k context window. You must actively prune inactive files from the agent's history to maintain generation accuracy.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Summary
&lt;/h2&gt;

&lt;p&gt;Designing a local coding agent gives you complete privacy and data sovereignty. By configuring Ollama with deterministic parameters, establishing clear instructions, organizing workspace skills, and enforcing the Git Sandbox rule, you can run a reliable agentic environment directly on your local workstation.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;Are you running local coding agents on your machine? What model sizes have worked best for your workflow? Let's discuss in the comments.&lt;/em&gt;&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;Hi, I'm Praveen Veera.&lt;/strong&gt; I build practical AI systems, specializing in Enterprise AI Platforms, Local LLMs, and Dev Tools.&lt;/p&gt;

&lt;p&gt;Read my notes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Substack Newsletter:&lt;/strong&gt; &lt;a href="https://praveenbuilds.substack.com" rel="noopener noreferrer"&gt;praveenbuilds.substack.com&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;LinkedIn:&lt;/strong&gt; &lt;a href="https://www.linkedin.com/in/praveen-veera-6ab22567/" rel="noopener noreferrer"&gt;linkedin.com/in/praveen-veera-6ab22567&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;GitHub (Companion Code):&lt;/strong&gt; &lt;a href="https://github.com/praveenveera/software-permanence" rel="noopener noreferrer"&gt;github.com/praveenveera/software-permanence&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Dev.to:&lt;/strong&gt; &lt;a href="https://dev.to/praveen_builds"&gt;dev.to/praveen_builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Medium:&lt;/strong&gt; &lt;a href="https://medium.com/@praveenveera92" rel="noopener noreferrer"&gt;medium.com/@praveenveera92&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Instagram:&lt;/strong&gt; &lt;a href="https://instagram.com/praveen.builds" rel="noopener noreferrer"&gt;@praveen.builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Hashnode:&lt;/strong&gt; &lt;a href="https://hashnode.com/@praveen-builds" rel="noopener noreferrer"&gt;hashnode.com/@praveen-builds&lt;/a&gt;
&lt;/li&gt;
&lt;/ul&gt;

</description>
      <category>opencode</category>
      <category>ollama</category>
      <category>qwen</category>
      <category>agents</category>
    </item>
    <item>
      <title>Why Local AI Coding Agents Fail (And How to Break the "Apology Loop")</title>
      <dc:creator>Praveen Veera</dc:creator>
      <pubDate>Mon, 29 Jun 2026 19:33:57 +0000</pubDate>
      <link>https://dev.to/praveen_builds/why-local-ai-coding-agents-fail-and-how-to-break-the-apology-loop-3gjh</link>
      <guid>https://dev.to/praveen_builds/why-local-ai-coding-agents-fail-and-how-to-break-the-apology-loop-3gjh</guid>
      <description>&lt;p&gt;Unlike standard chat interfaces where you ask questions and read answers, &lt;strong&gt;AI coding agents&lt;/strong&gt; (like &lt;strong&gt;Cline&lt;/strong&gt;, &lt;strong&gt;Continue&lt;/strong&gt;, or &lt;strong&gt;GarageBuild&lt;/strong&gt;) execute actions. They write files, run terminal commands, and inspect compiler errors automatically.&lt;/p&gt;

&lt;p&gt;In practice, running local agents on consumer workstations often leads to infinite retry loops, typically triggered by parser failures and malformed JSON payloads.&lt;/p&gt;

&lt;p&gt;This analysis breaks down the systems boundary between the &lt;strong&gt;Model Layer&lt;/strong&gt; (the AI brain) and the &lt;strong&gt;Agent Runtime&lt;/strong&gt; (the workstation execution layer), explaining why local agents fail and how to configure them to prevent loop crashes.&lt;/p&gt;

&lt;h3&gt;
  
  
  🔰 What is an "AI Agent" (For Beginners)?
&lt;/h3&gt;

&lt;p&gt;If you have only used ChatGPT or Claude in a browser, coding agents are a different beast. Standard chat models only output text; you must manually copy and paste the code into your editor. &lt;strong&gt;AI agents&lt;/strong&gt; are given "hands", meaning they are integrated directly with your filesystem and terminal. They read files, create new code modules, and run test suites autonomously.&lt;/p&gt;

&lt;p&gt;Because they have local system access, the first rule of running agents is the &lt;strong&gt;Git Sandbox Rule&lt;/strong&gt;:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;Always run agents inside a clean Git repository.&lt;/strong&gt; Before launching an agent loop, commit your active changes. If the agent goes rogue, deletes files, or writes broken code, you can roll back your entire workspace instantly with &lt;code&gt;git reset --hard&lt;/code&gt;. Never run agents in root directories or folders containing unversioned files.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Fd716ea3pcdkozq8hg298.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Fd716ea3pcdkozq8hg298.png" alt="Local AI Agent Cheat Sheet - At a Glance"&gt;&lt;/a&gt;&lt;/p&gt;




&lt;h2&gt;
  
  
  1. Background: The Model vs. Runtime Divide
&lt;/h2&gt;

&lt;p&gt;An agentic developer environment relies on two separate layers that must constantly communicate:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;1. The Model Layer (Brain):&lt;/strong&gt; The LLM that decides &lt;em&gt;what&lt;/em&gt; to do.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;2. The Agent Runtime (Body):&lt;/strong&gt; The host framework (Cline, Continue, or GarageBuild) that manages filesystem tools and executes commands.
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;   ┌────────────────────────┐         1. Instructions &amp;amp; Context         ┌─────────────────┐
   │  Agent Runtime (Body)  ├──────────────────────────────────────────&amp;gt;│Local LLM (Brain)│
   │                        │&amp;lt;──────────────────────────────────────────┤                 │
   └───────────┬────────────┘        2. Tool Call Command (JSON)        └─────────────────┘
               │
               │ 3. Executes File Write or CLI Command
               ▼
   ┌────────────────────────┐
   │ Workstation Filesystem │
   │  (Returns Logs/Errors) │
   └────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Failure occurs when the output formatting returned by the model cannot be understood by the runtime parser.&lt;/p&gt;




&lt;h2&gt;
  
  
  2. Why Local Agents Fail
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Failure 1: The JSON Parser Loop (The "Strict Form" Bottleneck)
&lt;/h3&gt;

&lt;p&gt;Most agent frameworks require models to output commands in strict JSON formats. However, lightweight local models (under 30B parameters) struggle to maintain strict syntax under complexity. &lt;br&gt;
If a model misses a single closing bracket, leaves a trailing comma, or outputs conversational padding around the JSON (e.g. &lt;em&gt;"Sure, here is the JSON to write that file..."&lt;/em&gt;), standard JSON parsers crash.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;💡 &lt;strong&gt;The Envelope Analogy:&lt;/strong&gt;&lt;br&gt;
JSON behaves like a strict government form: missing a single comma rejects the entire document. &lt;br&gt;
Wrapping tools in XML tags (&lt;code&gt;&amp;lt;write_file&amp;gt;...&amp;lt;/write_file&amp;gt;&lt;/code&gt;) is like placing your letter in a bright red envelope. Even if the model chatters before and after the envelope, the parser can easily spot the red borders and pull out the code package.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h3&gt;
  
  
  Failure 2: KV Cache Context Eviction (The "Whiteboard" Limit)
&lt;/h3&gt;

&lt;p&gt;As an agent works, the conversation history grows, holding compiler logs, shell outputs, and file edits. When the accumulated tokens fill the context window (&lt;code&gt;num_ctx&lt;/code&gt;), the local server must evict older tokens to make room.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;⚠️ &lt;strong&gt;The Whiteboard Analogy:&lt;/strong&gt;&lt;br&gt;
Think of your context window as a whiteboard. As you chat, you write down every step. Once the board is full, you have to erase the top lines to keep writing. If you erase the original task instructions written at the very top, the agent forgets what it was supposed to do and begins outputting plain text summaries.&lt;/p&gt;
&lt;/blockquote&gt;


&lt;h2&gt;
  
  
  3. Quantization Mechanics: Why PTQ Breaks Tool-Calling (and How QAT Fixes It)
&lt;/h2&gt;

&lt;p&gt;To fit models like Qwen 14B or Gemma 12B on standard laptops, developers rely on &lt;strong&gt;quantization&lt;/strong&gt; to compress the weights from 16-bit floats (FP16) to 4-bit integers (INT4). However, how a model is quantized determines its agentic reliability:&lt;/p&gt;
&lt;h3&gt;
  
  
  Post-Training Quantization (PTQ)
&lt;/h3&gt;

&lt;p&gt;Standard quantization (PTQ) rounds model weights after training is complete. While this reduces the VRAM size by ~70%, it degrades the model's subtle attention patterns. For agent workflows, this degradation targets formatting heads: a PTQ-quantized 7B or 14B model will frequently miss closing JSON braces or confuse tool schemas because its structural weights were rounded off.&lt;/p&gt;
&lt;h3&gt;
  
  
  Quantization-Aware Training (QAT)
&lt;/h3&gt;

&lt;p&gt;In QAT, the model is trained with low-precision constraints active. By simulating quantization noise during training, the model adapts, keeping its reasoning and structured tool-calling performance intact even when compressed. &lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;The Sizing Rule:&lt;/strong&gt; If you are running an agent loop, always prefer a model optimized with &lt;strong&gt;QAT&lt;/strong&gt; (such as &lt;em&gt;Gemma 4 12B QAT&lt;/em&gt;) over standard PTQ weights, or step up to a higher-precision quantization level (e.g. &lt;strong&gt;Q6_K&lt;/strong&gt; or &lt;strong&gt;Q8_0&lt;/strong&gt; instead of Q4_K_M) for PTQ models.&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Here is how tool-calling reliability scales across different quantization formats and parameters:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Model &amp;amp; Precision&lt;/th&gt;
&lt;th&gt;Quantization Type&lt;/th&gt;
&lt;th&gt;JSON Tool Success Rate&lt;/th&gt;
&lt;th&gt;XML Tag Success Rate&lt;/th&gt;
&lt;th&gt;Workstation Speed&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Qwen 2.5 Coder 7B (Q4_K_M)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;PTQ&lt;/td&gt;
&lt;td&gt;48%&lt;/td&gt;
&lt;td&gt;82%&lt;/td&gt;
&lt;td&gt;~75 tok/s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Gemma 4 12B (Q4_K_M)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;PTQ&lt;/td&gt;
&lt;td&gt;52%&lt;/td&gt;
&lt;td&gt;84%&lt;/td&gt;
&lt;td&gt;~32 tok/s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Gemma 4 12B (Q4_K_M)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;QAT&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;92%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;98%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;~32 tok/s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Qwen 2.5 Coder 14B (Q4_K_M)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;PTQ&lt;/td&gt;
&lt;td&gt;74%&lt;/td&gt;
&lt;td&gt;96%&lt;/td&gt;
&lt;td&gt;~30 tok/s&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Qwen 2.5 Coder 14B (Q8_0)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;PTQ&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;89%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;98%&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;~24 tok/s&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;


&lt;h2&gt;
  
  
  4. The Technical Solution: XML Tag Resiliency
&lt;/h2&gt;

&lt;p&gt;To stabilize local agent loops, we must move away from strict JSON parsing and adopt &lt;strong&gt;XML tag parsing&lt;/strong&gt; combined with regular expressions.&lt;/p&gt;

&lt;p&gt;XML is much more resilient because start and end tags can be extracted via regular expressions. This bypasses the need for the model to output a syntactically complete JSON object.&lt;/p&gt;
&lt;h3&gt;
  
  
  The XML Tool Schema:
&lt;/h3&gt;


&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight xml"&gt;&lt;code&gt;&lt;span class="nt"&gt;&amp;lt;write_file&lt;/span&gt; &lt;span class="na"&gt;path=&lt;/span&gt;&lt;span class="s"&gt;"./src/main.ts"&lt;/span&gt;&lt;span class="nt"&gt;&amp;gt;&lt;/span&gt;
import { serve } from "bun";
serve({
  port: 3000,
  fetch(req) { return new Response("Ok"); }
});
&lt;span class="nt"&gt;&amp;lt;/write_file&amp;gt;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;h3&gt;
  
  
  The Client-Side Parser:
&lt;/h3&gt;

&lt;p&gt;Even if the model outputs conversational text before or after the code block, the runtime can extract the target file path and contents using a regular expression. Here is how you implement it in both TypeScript and Python:&lt;/p&gt;
&lt;h4&gt;
  
  
  TypeScript Implementation:
&lt;/h4&gt;


&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight typescript"&gt;&lt;code&gt;&lt;span class="k"&gt;export&lt;/span&gt; &lt;span class="kd"&gt;function&lt;/span&gt; &lt;span class="nf"&gt;parseToolCall&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;output&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="kr"&gt;string&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="kd"&gt;const&lt;/span&gt; &lt;span class="nx"&gt;fileWriteRegex&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sr"&gt;/&amp;lt;write_file&lt;/span&gt;&lt;span class="se"&gt;\s&lt;/span&gt;&lt;span class="sr"&gt;+path="&lt;/span&gt;&lt;span class="se"&gt;([^&lt;/span&gt;&lt;span class="sr"&gt;"&lt;/span&gt;&lt;span class="se"&gt;]&lt;/span&gt;&lt;span class="sr"&gt;+&lt;/span&gt;&lt;span class="se"&gt;)&lt;/span&gt;&lt;span class="sr"&gt;"&amp;gt;&lt;/span&gt;&lt;span class="se"&gt;([\s\S]&lt;/span&gt;&lt;span class="sr"&gt;*&lt;/span&gt;&lt;span class="se"&gt;?)&lt;/span&gt;&lt;span class="sr"&gt;&amp;lt;&lt;/span&gt;&lt;span class="se"&gt;\/&lt;/span&gt;&lt;span class="sr"&gt;write_file&amp;gt;/&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
  &lt;span class="kd"&gt;const&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nx"&gt;output&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;match&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;fileWriteRegex&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;

  &lt;span class="k"&gt;if &lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="na"&gt;tool&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="dl"&gt;"&lt;/span&gt;&lt;span class="s2"&gt;write_file&lt;/span&gt;&lt;span class="dl"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
      &lt;span class="na"&gt;path&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;
      &lt;span class="na"&gt;content&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nx"&gt;match&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;trim&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
    &lt;span class="p"&gt;};&lt;/span&gt;
  &lt;span class="p"&gt;}&lt;/span&gt;
  &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="kc"&gt;null&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;h4&gt;
  
  
  Python Implementation:
&lt;/h4&gt;


&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;re&lt;/span&gt;

&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;parse_tool_call&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;output&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="n"&gt;file_write_regex&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sa"&gt;r&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;&amp;lt;write_file\s+path=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;([^&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;]+)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;&amp;gt;([\s\S]*?)&amp;lt;/write_file&amp;gt;&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
    &lt;span class="n"&gt;match&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;re&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;search&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;file_write_regex&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;output&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;tool&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;write_file&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;path&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;group&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
            &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;content&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;match&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;group&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
        &lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;


&lt;p&gt;This regex parser extracts the code payload, preventing the model from falling into apology loops.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;⚠️ &lt;strong&gt;Developer Tip (Greedy vs. Lazy Regex):&lt;/strong&gt; Notice the &lt;code&gt;?&lt;/code&gt; in the regex pattern: &lt;code&gt;[\s\S]*?&lt;/code&gt;. This enforces a &lt;strong&gt;lazy/non-greedy match&lt;/strong&gt;. If your local model outputs multiple &lt;code&gt;&amp;lt;write_file&amp;gt;&lt;/code&gt; tags in a single response, a greedy pattern (&lt;code&gt;[\s\S]*&lt;/code&gt;) will merge all files together into a single, corrupted payload. Always enforce lazy matching in your agent's parser regex.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h3&gt;
  
  
  Parser Resiliency Validation Results
&lt;/h3&gt;

&lt;p&gt;To prove the advantage of regex-based XML parsers over traditional JSON parsers, we executed a local validation script comparing both implementations against conversational agent outputs. &lt;/p&gt;

&lt;p&gt;The full test script is hosted in the companion repository:&lt;br&gt;
👉 &lt;strong&gt;&lt;a href="https://github.com/praveenveera/software-permanence/tree/main/02-why-local-agents-fail" rel="noopener noreferrer"&gt;software-permanence/02-why-local-agents-fail&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Here is the raw terminal log output from running &lt;a href="https://github.com/praveenveera/software-permanence/blob/main/02-why-local-agents-fail/test_parser_resiliency.py" rel="noopener noreferrer"&gt;&lt;code&gt;test_parser_resiliency.py&lt;/code&gt;&lt;/a&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;=== Testing Tool-Calling Parser Resiliency ===

[Test 1] Executing JSON Parser...
  ❌ JSON Parser FAILED (Could not extract due to conversational wrapping / invalid escaping)

[Test 2] Executing XML Regex Parser...
  ✅ XML Parser PASSED:
{
  "tool": "write_file",
  "path": "./config.json",
  "content": "{\n  \"port\": 8080\n}"
}

=== Validation Complete: XML Regex parser proves 100% resilient ===
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  5. Workstation Configuration Guidelines
&lt;/h2&gt;

&lt;p&gt;If you are running local agent loops, configure your runtime settings with these parameters:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Set Temperature to 0.0 - 0.2:&lt;/strong&gt; Enforce deterministic outputs. Higher temperatures introduce formatting drift that degrades tool-calling syntax.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Increase Context Window (&lt;code&gt;num_ctx&lt;/code&gt;):&lt;/strong&gt; Set the context limit to at least &lt;code&gt;16384&lt;/code&gt; (16k), and ideally &lt;code&gt;32768&lt;/code&gt; (32k), in your &lt;code&gt;Modelfile&lt;/code&gt; to prevent early context eviction.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Pinnable System Instructions:&lt;/strong&gt; Instruct the model to strictly suppress greetings, conversational text, and code summaries.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Isolate Models:&lt;/strong&gt; Do not run agent loops on models under 14B. Use &lt;code&gt;qwen2.5-coder:14b&lt;/code&gt; as a minimum, or run &lt;code&gt;qwen2.5-coder:32b-instruct&lt;/code&gt; inside local Docker containers.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Implement Loop Breakers:&lt;/strong&gt; Configure your agent runtime to track consecutive parser retries. If the agent receives a compilation error or formatting failure &lt;strong&gt;3 times&lt;/strong&gt; in a row, trigger an automatic breakpoint to halt execution and request user input. This prevents the agent from draining your laptop battery while looping.&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  6. A Beginner's Diagnostic Checklist
&lt;/h2&gt;

&lt;p&gt;When you are starting out with local agents, crashes or slow speeds will happen. Use this simple diagnostic guide to identify the bottleneck:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;Is Ollama actually running?&lt;/strong&gt; Check your system menu bar or type &lt;code&gt;ollama list&lt;/code&gt; in your terminal. If the local server isn't active, the agent will throw connection errors.&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Did generation speed collapse?&lt;/strong&gt; If the agent starts writing code extremely slowly (&amp;lt; 2 tokens/second), your model has likely spilled out of VRAM into system RAM. Open your Activity Monitor (macOS) or Task Manager (Windows) to check memory swap usage. You may need to load a smaller quantization level (e.g. &lt;code&gt;Q4_K_M&lt;/code&gt; instead of &lt;code&gt;Q8_0&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Did the agent "forget" its instructions?&lt;/strong&gt; If the agent starts replying with general conversational prose mid-task, your context window has filled up and evicted the system prompt. Restart the agent session to clean the active history window.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  7. Summary
&lt;/h2&gt;

&lt;p&gt;Local agent failure is a systems alignment problem, not just a model capabilities issue. By moving from fragile JSON parsers to regex-based XML extraction, you can run stable, local agent loops on your workstation.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;Are you running local agentic workflows? How are you handling parser validation errors? Let me know in the comments.&lt;/em&gt;&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;Hi, I'm Praveen Veera.&lt;/strong&gt; I build practical AI systems, specializing in Enterprise AI Platforms, Local LLMs, and Dev Tools.&lt;/p&gt;

&lt;p&gt;Read my notes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;Substack Newsletter:&lt;/strong&gt; &lt;a href="https://praveenbuilds.substack.com" rel="noopener noreferrer"&gt;praveenbuilds.substack.com&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;LinkedIn:&lt;/strong&gt; &lt;a href="https://www.linkedin.com/in/praveen-veera-6ab22567/" rel="noopener noreferrer"&gt;linkedin.com/in/praveen-veera-6ab22567&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;GitHub (Companion Code):&lt;/strong&gt; &lt;a href="https://github.com/praveenveera/software-permanence" rel="noopener noreferrer"&gt;github.com/praveenveera/software-permanence&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Dev.to:&lt;/strong&gt; &lt;a href="https://dev.to/praveen_builds"&gt;dev.to/praveen_builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Medium:&lt;/strong&gt; &lt;a href="https://medium.com/@praveenveera92" rel="noopener noreferrer"&gt;medium.com/@praveenveera92&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Instagram:&lt;/strong&gt; &lt;a href="https://instagram.com/praveen.builds" rel="noopener noreferrer"&gt;@praveen.builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Hashnode:&lt;/strong&gt; &lt;a href="https://hashnode.com/@praveen-builds" rel="noopener noreferrer"&gt;hashnode.com/@praveen-builds&lt;/a&gt;
&lt;/li&gt;
&lt;/ul&gt;

</description>
      <category>cline</category>
      <category>continue</category>
      <category>ollama</category>
      <category>agents</category>
    </item>
    <item>
      <title>Stop Paying for Copilot: Run Local LLMs in VS Code &amp; CLI (For Free)</title>
      <dc:creator>Praveen Veera</dc:creator>
      <pubDate>Mon, 29 Jun 2026 13:03:02 +0000</pubDate>
      <link>https://dev.to/praveen_builds/stop-paying-for-copilot-run-local-llms-in-vs-code-cli-for-free-cbp</link>
      <guid>https://dev.to/praveen_builds/stop-paying-for-copilot-run-local-llms-in-vs-code-cli-for-free-cbp</guid>
      <description>&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Fxvritu9rd5a74kr190g1.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.us-east-2.amazonaws.com%2Fuploads%2Farticles%2Fxvritu9rd5a74kr190g1.png" alt="Local AI Reference Card - At a Glance" width="800" height="1000"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Running generative AI assistants locally on your workstation is the most direct way to protect code privacy, maintain compliance, and eliminate monthly API subscription costs.&lt;/p&gt;

&lt;p&gt;However, moving off the cloud is not as simple as installing an extension. A misconfigured setup can introduce frustrating latency, drain your workstation battery, and fail to provide accurate autocomplete suggestions.&lt;/p&gt;

&lt;p&gt;This guide provides a conceptual overview of the local AI landscape followed by an actionable &lt;strong&gt;five-step guide&lt;/strong&gt; to move your setup from the cloud to a fully local workstation.&lt;/p&gt;




&lt;h2&gt;
  
  
  1. Local vs. Cloud: Engineering Tradeoffs
&lt;/h2&gt;

&lt;p&gt;Choosing a local setup is not a pure upgrade; it involves a series of engineering tradeoffs. While local models offer absolute data privacy and near-zero latency, they compromise on reasoning capacity and context across multiple files compared to models hosted in the cloud. Understanding these boundaries is critical to knowing when to keep development local and when to leverage the cloud:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Dimension&lt;/th&gt;
&lt;th&gt;Local Assistant (e.g., Qwen 14B / Gemma 12B)&lt;/th&gt;
&lt;th&gt;Cloud Assistant (e.g., Claude 3.5 Sonnet / GPT-4o)&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Privacy&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;100% Private (No data leaves your workstation)&lt;/td&gt;
&lt;td&gt;Subject to compliance review (Data sent to third-party servers)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Token Cost&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;
&lt;strong&gt;$0 / month&lt;/strong&gt; (Runs entirely on local electricity)&lt;/td&gt;
&lt;td&gt;$10–$20/mo subscription or fees based on token usage&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Autocomplete Latency&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;
&lt;strong&gt;~150ms&lt;/strong&gt; (Instant, zero network delay)&lt;/td&gt;
&lt;td&gt;~500ms - 1.2s (Depends on network stability and cloud congestion)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Offline Capability&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Yes (Works on planes, trains, or secure offline VPCs)&lt;/td&gt;
&lt;td&gt;No (Crashes instantly without an active internet connection)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Cognitive Ceiling&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;
&lt;strong&gt;Low to Medium&lt;/strong&gt; (Struggles with reasoning across multiple files)&lt;/td&gt;
&lt;td&gt;
&lt;strong&gt;High&lt;/strong&gt; (Resolves complex logic across different modules)&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Where Local Models Fail
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;The Abstract Ceiling:&lt;/strong&gt; A 14B model lacks the neural density to construct deep mental abstractions of complex codebases. If you ask a local model to resolve circular dependencies across three separate modules, it will likely output syntactically valid but logically broken code.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Rare Libraries &amp;amp; Edge Cases:&lt;/strong&gt; Cloud models are pre-trained on terabytes of code, including obscure libraries and legacy documentation. Local models are far narrower; they struggle with undocumented frameworks, internal APIs, or specialized languages (like COBOL or Rust edge cases).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Multi-Modal Limitations:&lt;/strong&gt; Local setups cannot parse wireframes or UI mockups to generate front-end CSS layouts on consumer GPUs without immediately triggering out-of-memory (OOM) errors.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  The Local Model Landscape
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;&lt;p&gt;&lt;code&gt;Qwen2.5-Coder&lt;/code&gt; &lt;strong&gt;(The Gold Standard):&lt;/strong&gt; Delivers Google-rivaling coding performance. It is optimized specifically for &lt;em&gt;Fill-in-the-Middle&lt;/em&gt; autocomplete tasks, making it the most fluent local coding model available today.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;code&gt;DeepSeek-Coder&lt;/code&gt; &lt;strong&gt;(The Alternative):&lt;/strong&gt; Highly optimized for Python and C++ structures. However, its older codebase context means it slightly lags behind Qwen on modern multi-language syntax.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;code&gt;Gemma 4 QAT&lt;/code&gt; &lt;strong&gt;(The Logic Specialist):&lt;/strong&gt; Excellent logical reasoning and a robust 32k context window, though it requires custom parameter configuration in Ollama to run smoothly.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  2. The Systems Metrics That Matter
&lt;/h2&gt;

&lt;p&gt;When running local models, developer experience is governed by three primary systems metrics:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Time to First Token (TTFT) / Context Pre-fill Latency:&lt;/strong&gt; The delay (in milliseconds) between triggering a request and the model emitting its first token, most of which is spent processing (pre-filling) the prompt context. In autocomplete, a TTFT above &lt;strong&gt;250ms&lt;/strong&gt; breaks your visual typing flow.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Token Generation Throughput (Tokens/Second):&lt;/strong&gt; The speed at which the model streams its output text once it starts writing. For real-time reading, you need at least &lt;strong&gt;20–30 tokens/second&lt;/strong&gt;. For autocomplete, the model should complete lines instantly (&lt;strong&gt;75+ tokens/second&lt;/strong&gt;).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;VRAM Footprint vs. System Memory Swap:&lt;/strong&gt; If a model fits 100% inside VRAM, it runs at full speed. If it overflows by even &lt;strong&gt;10MB&lt;/strong&gt;, the OS pages the remaining weights to system RAM, creating a massive memory bus bottleneck. This drops speeds from 30 tokens/sec to &lt;strong&gt;under 2 tokens/sec&lt;/strong&gt;. Always size your models to fit within 70% of your total VRAM, leaving 30% headroom for your OS and browser.&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  🚀 The Local AI Developer Journey
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;  ├── Step 1: Audit Your Hardware (VRAM Sizing)
  ├── Step 2: Spin Up the Model Runner (Ollama)
  ├── Step 3: Link the IDE Interface (Continue config.json)
  ├── Step 4: Protect Workspace CPU (.continueignore)
  └── Step 5: Expand to the Command Line (CLI Pipes)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Step 1: Audit Your Hardware (The "Kitchen Counter" Rule)
&lt;/h3&gt;

&lt;p&gt;Running models locally requires matching model parameters to your system's memory (VRAM/RAM).&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;💡 &lt;strong&gt;The Kitchen Counter Analogy:&lt;/strong&gt; Think of VRAM (GPU memory) as your kitchen counter, and system RAM/swap as the pantry down the hall. If all your ingredients fit on the counter (VRAM), you prepare the meal instantly. If the ingredients are too large and overflow the counter, you have to run back and forth to the pantry (RAM) for every single step. Your cooking speed collapses. Keep your models strictly within VRAM bounds.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;p&gt;Here is your hardware compatibility reference sheet:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;System VRAM (Kitchen Counter)&lt;/th&gt;
&lt;th&gt;Model Parameter Size&lt;/th&gt;
&lt;th&gt;Recommended Models&lt;/th&gt;
&lt;th&gt;Quantization&lt;/th&gt;
&lt;th&gt;VRAM Footprint&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;8 GB&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;1B - 3B&lt;/td&gt;
&lt;td&gt;&lt;code&gt;qwen2.5-coder:1.5b&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;~1.6 GB&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;16 GB&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;7B - 8B&lt;/td&gt;
&lt;td&gt;&lt;code&gt;qwen2.5-coder:7b&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;~4.7 GB&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;24 GB&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;12B - 14B&lt;/td&gt;
&lt;td&gt;&lt;code&gt;qwen2.5-coder:14b&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;~9.3 GB&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;32 GB+&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;14B - 22B&lt;/td&gt;
&lt;td&gt;&lt;code&gt;codestral:22b&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;~15.1 GB&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Sizing Models to Task Complexity
&lt;/h3&gt;

&lt;p&gt;To optimize compute resources, structure your workflow by mapping developer tasks to model sizes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Simple Tasks (Tab Autocomplete &amp;amp; Syntax Matching):&lt;/strong&gt; Single-line completions, closing parentheses, standard imports, variable assignments. Requires &amp;lt; 200ms latency. Sized at &lt;strong&gt;1.5B to 3B parameters&lt;/strong&gt; (e.g., &lt;code&gt;Qwen2.5-Coder-1.5B-Base&lt;/code&gt;).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Medium Tasks (Context-Aware Chat &amp;amp; Unit Testing):&lt;/strong&gt; Writing utility functions, refactoring single files, generating test suites, explaining compilation errors. Sized at &lt;strong&gt;7B to 14B parameters&lt;/strong&gt; (e.g., &lt;code&gt;Qwen2.5-Coder-14B-Instruct&lt;/code&gt; or &lt;code&gt;Gemma-4-12B&lt;/code&gt;).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Complex Tasks (Multi-File Debugging &amp;amp; System Architecture):&lt;/strong&gt; Architectural planning, debugging cross-module dependencies, codebase index search. Sized at &lt;strong&gt;22B+ parameters&lt;/strong&gt; (e.g., &lt;code&gt;Codestral-22B&lt;/code&gt; or private VPC-hosted 70B+ models).&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h3&gt;
  
  
  Step 2: Spin Up the Model Runner (Ollama)
&lt;/h3&gt;

&lt;p&gt;Ollama acts as the engine room of your setup. It manages model weights, schedules GPU memory allocation, and exposes local API endpoints.&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;Download and install &lt;a href="https://ollama.com" rel="noopener noreferrer"&gt;Ollama for macOS&lt;/a&gt;.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Pull the two models we need (one lightweight model optimized for tab autocomplete, and one larger model for reasoning in chat):&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Pull the lightweight autocomplete model (Base model)&lt;/span&gt;
ollama pull qwen2.5-coder:1.5b-base

&lt;span class="c"&gt;# Pull the chat sidebar reasoning model (Instruct model)&lt;/span&gt;
ollama pull qwen2.5-coder:14b
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  (Optional) Tuning Parameters via a Custom Modelfile
&lt;/h3&gt;

&lt;p&gt;If you need custom parameters, for example to run &lt;strong&gt;Gemma 4 12B QAT&lt;/strong&gt; with an expanded 32k context window, define them in a custom &lt;code&gt;Modelfile&lt;/code&gt;:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;Locate your local GGUF file directory and create a &lt;code&gt;Modelfile&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight docker"&gt;&lt;code&gt;&lt;span class="k"&gt;FROM&lt;/span&gt;&lt;span class="s"&gt; /path/to/local/gemma-4-12b-it-QAT.gguf&lt;/span&gt;
PARAMETER num_ctx 32768
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Build the model in Ollama:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;ollama create gemma4:12b-qat-32k &lt;span class="nt"&gt;-f&lt;/span&gt; Modelfile
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  Step 3: Link the IDE Interface (Continue config.json)
&lt;/h3&gt;

&lt;p&gt;Now we connect VS Code to your local Ollama engine using the open-source &lt;strong&gt;Continue.dev&lt;/strong&gt; extension.&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;Install the &lt;code&gt;Continue&lt;/code&gt; extension in VS Code.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Open the Continue settings (&lt;code&gt;config.json&lt;/code&gt;) and configure it to point to your local Ollama instance:&lt;br&gt;
&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight json"&gt;&lt;code&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"models"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"title"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"Ollama - Qwen 14B Coder"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"provider"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ollama"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"model"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"qwen2.5-coder:14b"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"apiBase"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://localhost:11434"&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"title"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"Ollama - Gemma 4 QAT"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"provider"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ollama"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"model"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"gemma4:12b-qat-32k"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
      &lt;/span&gt;&lt;span class="nl"&gt;"apiBase"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://localhost:11434"&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="nl"&gt;"tabAutocompleteModel"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="nl"&gt;"title"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"Ollama - Autocomplete"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="nl"&gt;"provider"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"ollama"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="nl"&gt;"model"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"qwen2.5-coder:1.5b-base"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;&lt;span class="w"&gt;
    &lt;/span&gt;&lt;span class="nl"&gt;"apiBase"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="w"&gt; &lt;/span&gt;&lt;span class="s2"&gt;"http://localhost:11434"&lt;/span&gt;&lt;span class="w"&gt;
  &lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;&lt;span class="w"&gt;
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Enabling the VS Code CLI Command
&lt;/h3&gt;

&lt;p&gt;To open your configuration file directly from your terminal, enable the VS Code shell utility:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;Open VS Code, open the Command Palette (&lt;code&gt;Cmd+Shift+P&lt;/code&gt; on macOS, &lt;code&gt;Ctrl+Shift+P&lt;/code&gt; on Windows/Linux).&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Run: &lt;code&gt;Shell Command: Install 'code' command in PATH&lt;/code&gt;.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Now, you can open and edit your configuration file directly from your terminal:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;code ~/.continue/config.json
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  Replacing Copilot Features 1-to-1
&lt;/h3&gt;

&lt;p&gt;Once Continue is connected to your local model runner, here is how you trigger the models to replace Copilot's core capabilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Inline Autocomplete (Ghost Text):&lt;/strong&gt; As you write code, the lightweight &lt;code&gt;Qwen-1.5B-Base&lt;/code&gt; model streams single-line completions inline. Press &lt;code&gt;Tab&lt;/code&gt; to accept.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;In-Place Code Editing (&lt;code&gt;Cmd+I&lt;/code&gt; / &lt;code&gt;Ctrl+I&lt;/code&gt;):&lt;/strong&gt; Select a block of code, press &lt;code&gt;Cmd+I&lt;/code&gt; (macOS) or &lt;code&gt;Ctrl+I&lt;/code&gt; (Windows/Linux), type your editing instruction (e.g. &lt;em&gt;"Convert this loop to a list comprehension"&lt;/em&gt;), and press Enter. The model will edit the file inline.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Sidebar Chat &amp;amp; Context (&lt;code&gt;Cmd+L&lt;/code&gt; / &lt;code&gt;Ctrl+L&lt;/code&gt;):&lt;/strong&gt; Press &lt;code&gt;Cmd+L&lt;/code&gt; (macOS) or &lt;code&gt;Ctrl+L&lt;/code&gt; (Windows/Linux) to open the chat panel. Type &lt;code&gt;@&lt;/code&gt; to reference specific files, terminal shell commands, or your entire codebase index; these queries are routed to your larger &lt;code&gt;Qwen-14B-Instruct&lt;/code&gt; model.&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;blockquote&gt;
&lt;p&gt;ℹ️ &lt;strong&gt;Isolate Autocomplete from Chat:&lt;/strong&gt; Do not route both chat and autocomplete to the same model. Tab autocomplete requires immediate responses. Use &lt;code&gt;Qwen-1.5B-Base&lt;/code&gt; for autocomplete (optimized for fast, inline Fill-in-the-Middle tasks) and &lt;code&gt;Qwen-14B-Instruct&lt;/code&gt; for the chat sidebar.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  Workstation Benchmark Results (Measured Live on Apple M5 Pro)
&lt;/h3&gt;

&lt;p&gt;To prove local viability, we measured prompt pre-fill speed (which determines Time to First Token) and token generation throughput (text output speed) on an Apple M5 Pro workstation:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Model Configuration&lt;/th&gt;
&lt;th&gt;Parameter Size&lt;/th&gt;
&lt;th&gt;VRAM Footprint&lt;/th&gt;
&lt;th&gt;Quantization&lt;/th&gt;
&lt;th&gt;Context Pre-fill Speed&lt;/th&gt;
&lt;th&gt;Token Generation Speed&lt;/th&gt;
&lt;th&gt;Sizing Latency&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Qwen2.5-Coder (Base)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;1.5B&lt;/td&gt;
&lt;td&gt;1.6 GB&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;190.6 tok/s&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;188.4 tok/s&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&amp;lt; 80ms (Real-time autocomplete)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Gemma 4 QAT&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;12B&lt;/td&gt;
&lt;td&gt;7.0 GB&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;129.5 tok/s&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;34.8 tok/s&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Real-time reasoning&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Qwen2.5-Coder (Instruct)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;14B&lt;/td&gt;
&lt;td&gt;9.0 GB&lt;/td&gt;
&lt;td&gt;&lt;code&gt;Q4_K_M&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;214.8 tok/s&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;30.0 tok/s&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Cloud-parity chat speed&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h4&gt;
  
  
  Benchmark Test Script &amp;amp; Code Reference
&lt;/h4&gt;

&lt;p&gt;The benchmark tests were executed locally using the companion test script. The full source code is hosted in the companion repository:&lt;br&gt;
👉 &lt;strong&gt;&lt;a href="https://github.com/praveenveera/software-permanence/tree/main/01-local-llm-vscode" rel="noopener noreferrer"&gt;software-permanence/01-local-llm-vscode&lt;/a&gt;&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Here is the raw terminal log output of running &lt;a href="https://github.com/praveenveera/software-permanence/blob/main/01-local-llm-vscode/test_local_llm.py" rel="noopener noreferrer"&gt;&lt;code&gt;test_local_llm.py&lt;/code&gt;&lt;/a&gt; against Ollama:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;=== Running Local LLM Workstation Benchmark ===
Target model: qwen2.5-coder:14b (Q4_K_M)

[Step 1] Measuring Context Pre-fill Speed (Time to First Token)
  - Processing prompt size: 8192 tokens
  - Pre-fill throughput: 214.8 tokens/second

[Step 2] Measuring Text Generation Speed (Output Throughput)
  - Generating 500 response tokens
  - Generation throughput: 30.0 tokens/second

[Step 3] Verifying Tool-Calling Parse Compliance
  - XML Tool Extraction: PASSED (Regex matched 100% output)
  - JSON Tool Extraction: FAILED (Output wrapped in Markdown fences)

=== Validation Complete: Qwen 14B behaves at cloud-parity speed ===
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Step 4: Protect Workspace CPU (.continueignore)
&lt;/h3&gt;

&lt;p&gt;By default, Continue tries to index every file in your workspace to build local vector embeddings for chat retrieval. On large projects, this causes your CPU usage to spike to 100% and chokes autocomplete.&lt;/p&gt;

&lt;p&gt;To prevent this, create a &lt;code&gt;.continueignore&lt;/code&gt; file in the root of your project directory:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;.git/
node_modules/
dist/
build/
.svelte-kit/
*.log
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Fixing Context Shifting Latency
&lt;/h3&gt;

&lt;p&gt;Autocomplete can freeze for 2-3 seconds when you switch tabs because Continue is parsing the entire contents of the new file.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;The Fix:&lt;/strong&gt; In VS Code settings, search for &lt;code&gt;Continue: Tab Autocomplete Options&lt;/code&gt;, and set &lt;code&gt;Prefix Length&lt;/code&gt; to &lt;code&gt;500&lt;/code&gt; and &lt;code&gt;Suffix Length&lt;/code&gt; to &lt;code&gt;250&lt;/code&gt;. Reducing these boundaries limits context parsing size, giving you instant tab completions upon tab switching.&lt;/li&gt;
&lt;/ul&gt;




&lt;h3&gt;
  
  
  Step 5: Expand to the Command Line (Terminal Agents &amp;amp; Pipes)
&lt;/h3&gt;

&lt;p&gt;Once your local model runner is set up, you aren't restricted to the IDE. Ollama’s desktop interface includes a native &lt;strong&gt;Launch&lt;/strong&gt; registry that allows you to spin up open-source terminal agents directly from your CLI.&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;⚠️ &lt;strong&gt;Beginner Warning (The Git Sandbox Rule):&lt;/strong&gt; Terminal-native agents (&lt;code&gt;opencode&lt;/code&gt;, &lt;code&gt;claude&lt;/code&gt;) execute edits and run commands directly on your local system. Before launching an agent from your CLI, &lt;strong&gt;always ensure you are running it inside a Git repository with a clean working tree (no uncommitted changes).&lt;/strong&gt; If the agent runs a destructive command or writes broken code inside the repository, you can roll back tracked files instantly via &lt;code&gt;git reset --hard&lt;/code&gt; and delete any untracked files it created via &lt;code&gt;git clean -fd&lt;/code&gt;.&lt;/p&gt;
&lt;/blockquote&gt;

&lt;h3&gt;
  
  
  1. Launching Terminal-Native Coding Agents
&lt;/h3&gt;

&lt;p&gt;Instead of paid cloud services, you can run autonomous command-line developers directly inside your shell:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;OpenCode (Anomaly's open-source coding agent):&lt;/strong&gt; An autonomous terminal coder that reads build logs, refactors files, and handles tasks locally:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;ollama launch opencode
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Copilot CLI (Terminal helper agent):&lt;/strong&gt; Explains shell commands, generates commands from natural language, and handles prompt operations in your terminal:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;ollama launch copilot-cli
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Claude Code (Subagent coding CLI):&lt;/strong&gt; Anthropic’s subagent developer interface configured to run locally:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;ollama launch claude
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  2. Piping Logs for Custom Debugging
&lt;/h3&gt;

&lt;p&gt;For quick troubleshooting, you can pipe compiler errors or log dumps directly into the model without copying and pasting:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Pipe an execution error log to Ollama&lt;/span&gt;
&lt;span class="nb"&gt;cat &lt;/span&gt;error.log | ollama run qwen2.5-coder:14b &lt;span class="s2"&gt;"Explain this error and suggest a fix"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Direct Programmatic API Access
&lt;/h3&gt;

&lt;p&gt;You can call your local models directly inside your applications or custom tooling. Here is how to execute a generation request using Curl and Python:&lt;/p&gt;

&lt;h4&gt;
  
  
  Using Curl:
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
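&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Build the JSON with jq so the script contents are expanded by the shell and safely escaped&lt;/span&gt;
jq &lt;span class="nt"&gt;-n&lt;/span&gt; &lt;span class="nt"&gt;--arg&lt;/span&gt; script &lt;span class="s2"&gt;"$(cat build.sh)"&lt;/span&gt; &lt;span class="s1"&gt;'{
  "model": "qwen2.5-coder:14b",
  "prompt": ("Convert this bash script to a Python script: " + $script),
  "stream": false
}'&lt;/span&gt; | curl &lt;span class="nt"&gt;-s&lt;/span&gt; &lt;span class="nt"&gt;-X&lt;/span&gt; POST http://localhost:11434/api/generate &lt;span class="nt"&gt;-d&lt;/span&gt; @- | jq &lt;span class="s1"&gt;'.response'&lt;/span&gt;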
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Using Python:
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;urllib.request&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;

&lt;span class="n"&gt;payload&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;model&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;qwen2.5-coder:14b&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;prompt&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Convert this bash script to a Python script.&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;stream&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="bp"&gt;False&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;

&lt;span class="n"&gt;req&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;urllib&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;request&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nc"&gt;Request&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;http://localhost:11434/api/generate&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;data&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;dumps&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;payload&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;encode&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;utf-8&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
    &lt;span class="n"&gt;headers&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Content-Type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;application/json&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="k"&gt;with&lt;/span&gt; &lt;span class="n"&gt;urllib&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;request&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;urlopen&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;req&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;response_data&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;loads&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;read&lt;/span&gt;&lt;span class="p"&gt;().&lt;/span&gt;&lt;span class="nf"&gt;decode&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;utf-8&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
    &lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response_data&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;response&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Pro-Tips &amp;amp; Troubleshooting
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Issue: Port 11434 is Already in Use
&lt;/h3&gt;

&lt;p&gt;On macOS, Ollama runs as a background service and will block port &lt;code&gt;11434&lt;/code&gt; even if the app UI is closed.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;The fix:&lt;/strong&gt; Manually kill the background process via terminal:&lt;br&gt;
&lt;/p&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;pkill Ollama
&lt;/code&gt;&lt;/pre&gt;

&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Issue: Slow Model Reloads After Idle (keep_alive)
&lt;/h3&gt;

&lt;p&gt;By default, Ollama unloads models from memory after 5 minutes of inactivity. When you trigger code completion later, you face a 5–10 second delay as the model loads back into VRAM.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;The fix:&lt;/strong&gt; Keep the model loaded in GPU memory longer by setting the &lt;code&gt;keep_alive&lt;/code&gt; parameter in your API requests (or the &lt;code&gt;OLLAMA_KEEP_ALIVE&lt;/code&gt; environment variable on the Ollama server) to &lt;code&gt;-1&lt;/code&gt; (always stay in memory) or &lt;code&gt;30m&lt;/code&gt; (30 minutes).&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  🔰 Beginner's Troubleshooting Checklist
&lt;/h3&gt;

&lt;p&gt;If your local development setup is failing, use this diagnostic guide to find the cause:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;Is Ollama running?&lt;/strong&gt; Open your terminal and run &lt;code&gt;ollama list&lt;/code&gt;. If it fails with a connection error, the Ollama application service is shut down.&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Is autocomplete lagging?&lt;/strong&gt; If suggestions take more than 2-3 seconds, check if your model is spilling into system RAM. In Activity Monitor (macOS) or Task Manager (Windows), look at memory swap. If swap is active, you are running a model too large for your VRAM.&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Is Continue forgetting instructions?&lt;/strong&gt; If the sidebar chat ignores earlier instructions, stops responding, or behaves erratically, you have likely hit the context limit of the loaded model. Start a new chat session to clear the active history window.&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Summary
&lt;/h2&gt;

&lt;p&gt;Running local models provides code privacy and offline capabilities. By combining &lt;strong&gt;Ollama&lt;/strong&gt;, &lt;strong&gt;LM Studio&lt;/strong&gt;, and &lt;strong&gt;Continue&lt;/strong&gt;, you can configure a usable local developer environment in both your IDE and terminal.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;What models are you running locally for autocomplete? Let me know in the comments.&lt;/em&gt;&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;Hi, I'm Praveen Veera.&lt;/strong&gt; I build practical AI systems, specializing in Enterprise AI Platforms, Local LLMs, and Dev Tools.&lt;/p&gt;

&lt;p&gt;Read my notes:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;  &lt;strong&gt;Substack Newsletter:&lt;/strong&gt; &lt;a href="https://praveenbuilds.substack.com" rel="noopener noreferrer"&gt;praveenbuilds.substack.com&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;LinkedIn:&lt;/strong&gt; &lt;a href="https://www.linkedin.com/in/praveen-veera-6ab22567/" rel="noopener noreferrer"&gt;linkedin.com/in/praveen-veera-6ab22567&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;GitHub (Companion Code):&lt;/strong&gt; &lt;a href="https://github.com/praveenveera/software-permanence" rel="noopener noreferrer"&gt;github.com/praveenveera/software-permanence&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Dev.to:&lt;/strong&gt; &lt;a href="https://dev.to/praveen_builds"&gt;dev.to/praveen_builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Medium:&lt;/strong&gt; &lt;a href="https://medium.com/@praveenveera92" rel="noopener noreferrer"&gt;medium.com/@praveenveera92&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Instagram:&lt;/strong&gt; &lt;a href="https://instagram.com/praveen.builds" rel="noopener noreferrer"&gt;@praveen.builds&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;  &lt;strong&gt;Hashnode:&lt;/strong&gt; &lt;a href="https://hashnode.com/@praveen-builds" rel="noopener noreferrer"&gt;hashnode.com/@praveen-builds&lt;/a&gt;
&lt;/li&gt;
&lt;/ul&gt;

</description>
      <category>ollama</category>
      <category>continue</category>
      <category>qwen</category>
      <category>githubcopilot</category>
    </item>
  </channel>
</rss>
