<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: VedX Group</title>
    <description>The latest articles on DEV Community by VedX Group (@vedx_group_134578fd77aad4).</description>
    <link>https://dev.to/vedx_group_134578fd77aad4</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3948419%2F17f8efba-8886-41a6-acf4-6774bda244b2.png</url>
      <title>DEV Community: VedX Group</title>
      <link>https://dev.to/vedx_group_134578fd77aad4</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/vedx_group_134578fd77aad4"/>
    <language>en</language>
    <item>
      <title>check this out</title>
      <dc:creator>VedX Group</dc:creator>
      <pubDate>Sun, 24 May 2026 14:42:32 +0000</pubDate>
      <link>https://dev.to/vedx_group_134578fd77aad4/check-this-out-2md6</link>
      <guid>https://dev.to/vedx_group_134578fd77aad4/check-this-out-2md6</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg" class="crayons-story__hidden-navigation-link"&gt;How We Built a Production RAG Chatbot for a Client in 72 Hours (Full Stack Breakdown)&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/vedx_group_134578fd77aad4" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3948419%2F17f8efba-8886-41a6-acf4-6774bda244b2.png" alt="vedx_group_134578fd77aad4 profile" class="crayons-avatar__image"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/vedx_group_134578fd77aad4" class="crayons-story__secondary fw-medium m:hidden"&gt;
              VedX Group
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                VedX Group
                
              
              &lt;div id="story-author-preview-content-3736776" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/vedx_group_134578fd77aad4" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3948419%2F17f8efba-8886-41a6-acf4-6774bda244b2.png" class="crayons-avatar__image" alt=""&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;VedX Group&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;May 24&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg" id="article-link-3736776"&gt;
          How We Built a Production RAG Chatbot for a Client in 72 Hours (Full Stack Breakdown)
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag crayons-tag--filled  " href="/t/showdev"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;showdev&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/ai"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;ai&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/architecture"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;architecture&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/rag"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;rag&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
          &lt;a href="https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left"&gt;
            &lt;div class="multiple_reactions_aggregate"&gt;
              &lt;span class="multiple_reactions_icons_container"&gt;
                  &lt;span class="crayons_icon_container"&gt;
                    &lt;img src="https://assets.dev.to/assets/sparkle-heart-5f9bee3767e18deb1bb725290cb151c25234768a0e9a2bd39370c382d02920cf.svg" width="18" height="18"&gt;
                  &lt;/span&gt;
              &lt;/span&gt;
              &lt;span class="aggregate_reactions_counter"&gt;1&lt;span class="hidden s:inline"&gt; reaction&lt;/span&gt;&lt;/span&gt;
            &lt;/div&gt;
          &lt;/a&gt;
            &lt;a href="https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              Comments


              1&lt;span class="hidden s:inline"&gt; comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            6 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
    </item>
    <item>
      <title>How We Built a Production RAG Chatbot for a Client in 72 Hours (Full Stack Breakdown)</title>
      <dc:creator>VedX Group</dc:creator>
      <pubDate>Sun, 24 May 2026 02:03:30 +0000</pubDate>
      <link>https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg</link>
      <guid>https://dev.to/vedx_group_134578fd77aad4/how-we-built-a-production-rag-chatbot-for-a-client-in-72-hours-full-stack-breakdown-2mg</guid>
      <description>&lt;p&gt;A client messaged us on a Tuesday night. By Friday afternoon, their customer support chatbot was live — handling real queries, citing their actual documentation, and escalating edge cases to humans. No hallucinations. No generic GPT responses. No guesswork.&lt;/p&gt;

&lt;p&gt;This is the breakdown of how we did it: the architecture, the stack, the mistakes, and the two decisions that saved the timeline.&lt;/p&gt;

&lt;p&gt;We're a small two-person AI development team at &lt;a href="https://vedx.space" rel="noopener noreferrer"&gt;VedX&lt;/a&gt; — we specialize in building RAG systems, voicebots, and full-stack AI products for businesses. We're allowed to share the technical details of this particular project, so here's the full walkthrough.&lt;/p&gt;




&lt;h2&gt;
  
  
  What the Client Actually Needed
&lt;/h2&gt;

&lt;p&gt;The client ran a SaaS product in the legal compliance space. They had 200+ pages of documentation, an FAQ library, and a Notion workspace full of internal SOPs. Their support team was spending 60% of its time answering questions whose answers already existed in their docs — just not anywhere easy to find.&lt;/p&gt;

&lt;p&gt;They did not want a generic ChatGPT wrapper. They wanted a bot that:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Only answered from &lt;em&gt;their&lt;/em&gt; documents (no hallucinations about things not in scope)&lt;/li&gt;
&lt;li&gt;Cited the exact source document when answering&lt;/li&gt;
&lt;li&gt;Escalated to a human when confidence was below a threshold&lt;/li&gt;
&lt;li&gt;Integrated with their existing React frontend via a simple chat widget&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Classic RAG problem. Solvable in 72 hours with the right stack.&lt;/p&gt;




&lt;h2&gt;
  
  
  The Architecture (The Part That Matters)
&lt;/h2&gt;

&lt;p&gt;Before writing a single line of code, we spent 3 hours on architecture. This is the part most freelancers skip — and it's exactly why most RAG prototypes fail in production.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;User query
    ↓
Query preprocessing (intent classification + query expansion)
    ↓
Embedding model (text-embedding-3-small)
    ↓
Vector search → ChromaDB (cosine similarity, top-k=5)
    ↓
Reranker (cross-encoder) → filters to top-k=2
    ↓
LLM (GPT-4o-mini) with retrieved context + source metadata
    ↓
Response + citation + confidence score
    ↓
If confidence &amp;lt; 0.7 → human escalation flag
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Two decisions in this architecture made all the difference:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;1. Query expansion before embedding&lt;/strong&gt;&lt;br&gt;&lt;br&gt;
Raw user queries are often short, ambiguous, and poorly formed. "What's the penalty?" gives a terrible embedding. We added an LLM step that rewrites the query into 3 variations before embedding — dramatically improving retrieval recall.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;2. Cross-encoder reranking after vector search&lt;/strong&gt;&lt;br&gt;&lt;br&gt;
Vector similarity is fast but imprecise. A cross-encoder (we used &lt;code&gt;cross-encoder/ms-marco-MiniLM-L-6-v2&lt;/code&gt; from Hugging Face) reads each retrieved chunk alongside the original query and scores actual relevance. This step alone reduced hallucinations by ~40% in our testing.&lt;/p&gt;


&lt;h2&gt;
  
  
  The Stack
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Component&lt;/th&gt;
&lt;th&gt;Tool&lt;/th&gt;
&lt;th&gt;Why&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Backend API&lt;/td&gt;
&lt;td&gt;FastAPI (Python)&lt;/td&gt;
&lt;td&gt;Async, fast, easy to deploy&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Vector DB&lt;/td&gt;
&lt;td&gt;ChromaDB (local)&lt;/td&gt;
&lt;td&gt;Zero infra for MVP, portable to cloud&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Embeddings&lt;/td&gt;
&lt;td&gt;OpenAI text-embedding-3-small&lt;/td&gt;
&lt;td&gt;Best price/performance ratio&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Reranker&lt;/td&gt;
&lt;td&gt;HuggingFace cross-encoder&lt;/td&gt;
&lt;td&gt;Free, runs locally, &amp;lt; 200ms&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;LLM&lt;/td&gt;
&lt;td&gt;GPT-4o-mini&lt;/td&gt;
&lt;td&gt;4x cheaper than GPT-4, sufficient for Q&amp;amp;A&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Document parsing&lt;/td&gt;
&lt;td&gt;LangChain loaders + custom chunker&lt;/td&gt;
&lt;td&gt;Handles PDF, Notion export, DOCX&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Frontend widget&lt;/td&gt;
&lt;td&gt;React + Tailwind&lt;/td&gt;
&lt;td&gt;Embedded via iframe in client's app&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Hosting&lt;/td&gt;
&lt;td&gt;Railway (API) + Vercel (widget)&lt;/td&gt;
&lt;td&gt;Ships in 10 minutes&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Total inference cost:&lt;/strong&gt; ~$0.003 per query at production load.&lt;/p&gt;


&lt;h2&gt;
  
  
  Day 1: Document Ingestion Pipeline (0–24 hours)
&lt;/h2&gt;

&lt;p&gt;The first bottleneck was always going to be document quality. 200 pages of legal documentation is not a clean dataset — it had tables, footnotes, embedded PDFs, inconsistent heading structures, and duplicate content across Notion pages.&lt;/p&gt;

&lt;p&gt;We built a custom chunking strategy rather than using LangChain's default &lt;code&gt;RecursiveCharacterTextSplitter&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;smart_chunk&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;document&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;max_tokens&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;int&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;400&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="nb"&gt;list&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;]:&lt;/span&gt;
    &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;
    Split by semantic boundaries: headings &amp;gt; paragraphs &amp;gt; sentences.
    Preserve heading context in each chunk via a sliding header stack.
    &lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
    &lt;span class="n"&gt;chunks&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[]&lt;/span&gt;
    &lt;span class="n"&gt;header_stack&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[]&lt;/span&gt;

    &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;block&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="nf"&gt;parse_blocks&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;document&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
        &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nb"&gt;type&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;heading&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
            &lt;span class="c1"&gt;# Update header context for subsequent chunks
&lt;/span&gt;            &lt;span class="n"&gt;level&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;heading_level&lt;/span&gt;
            &lt;span class="n"&gt;header_stack&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;header_stack&lt;/span&gt;&lt;span class="p"&gt;[:&lt;/span&gt;&lt;span class="n"&gt;level&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;text&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
        &lt;span class="k"&gt;elif&lt;/span&gt; &lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nb"&gt;type&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;paragraph&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
            &lt;span class="n"&gt;context_prefix&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt; &amp;gt; &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;header_stack&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="se"&gt;\n\n&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
            &lt;span class="n"&gt;chunk_text&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;context_prefix&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;text&lt;/span&gt;

            &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="nf"&gt;token_count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;chunk_text&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;&amp;lt;=&lt;/span&gt; &lt;span class="n"&gt;max_tokens&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
                &lt;span class="n"&gt;chunks&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;append&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;chunk_text&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
            &lt;span class="k"&gt;else&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
                &lt;span class="c1"&gt;# Sentence-level split with context preserved
&lt;/span&gt;                &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;sentence_chunk&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="nf"&gt;split_by_sentences&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;block&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;text&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;max_tokens&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;context_prefix&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
                    &lt;span class="n"&gt;chunks&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;append&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;sentence_chunk&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;chunks&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The key insight: &lt;strong&gt;prepend the heading path to every chunk&lt;/strong&gt;. When the user asks "what's the penalty for late filing?" and the answer is buried under Section 4 &amp;gt; Subsection 2 &amp;gt; Penalties, the chunk now contains "Section 4 &amp;gt; Subsection 2 &amp;gt; Penalties" in its text — making vector similarity much more accurate.&lt;/p&gt;

&lt;p&gt;This took most of Day 1. We ingested ~4,200 chunks total.&lt;/p&gt;




&lt;h2&gt;
  
  
  Day 2: Retrieval + Reranking + LLM Prompt (24–48 hours)
&lt;/h2&gt;

&lt;p&gt;Retrieval was straightforward once the chunks were clean. The reranking step is where we spent most of Day 2 tuning.&lt;/p&gt;

&lt;p&gt;The prompt we landed on for the final LLM call:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;SYSTEM_PROMPT&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;
You are a precise assistant for {company_name}. 
Answer ONLY using the provided context documents.
If the answer is not clearly in the context, say: 
&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;I don&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;t have enough information to answer that — I&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;ll connect you with the team.&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;

Rules:
- Cite your source document in every response: [Source: {doc_name}]
- Never speculate or add information not in the context
- Keep answers under 150 words unless the question requires more detail
- If multiple documents disagree, surface both answers and note the discrepancy
&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The confidence scoring was the trickiest part. We built a heuristic:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;confidence_score&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;retrieved_chunks&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;list&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="nb"&gt;float&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="c1"&gt;# 1. Did the reranker score the top chunk above threshold?
&lt;/span&gt;    &lt;span class="n"&gt;reranker_score&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;retrieved_chunks&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="n"&gt;rerank_score&lt;/span&gt;  &lt;span class="c1"&gt;# 0–1
&lt;/span&gt;
    &lt;span class="c1"&gt;# 2. Did the LLM response include a source citation?
&lt;/span&gt;    &lt;span class="n"&gt;has_citation&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;[Source:&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;

    &lt;span class="c1"&gt;# 3. Is the top chunk semantically close to the response?
&lt;/span&gt;    &lt;span class="n"&gt;response_embedding&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;embed&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="n"&gt;top_chunk_embedding&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;retrieved_chunks&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="n"&gt;embedding&lt;/span&gt;
    &lt;span class="n"&gt;semantic_overlap&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;cosine_similarity&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response_embedding&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;top_chunk_embedding&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="n"&gt;raw_score&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;reranker_score&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.5&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;semantic_overlap&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.4&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mf"&gt;0.1&lt;/span&gt; &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;has_citation&lt;/span&gt; &lt;span class="k"&gt;else&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;raw_score&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Anything below 0.70 triggered the "I'll connect you with the team" path. In production testing, this caught every hallucination we induced deliberately.&lt;/p&gt;




&lt;h2&gt;
  
  
  Day 3: Frontend, Integration, Deploy (48–72 hours)
&lt;/h2&gt;

&lt;p&gt;The React chat widget was 3 hours of work. The actual time sink was the client's CSP (Content Security Policy) blocking our iframe — a problem we've hit before and now check for in every project kickoff call.&lt;/p&gt;

&lt;p&gt;We shipped to Railway in 22 minutes using a single Dockerfile:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight docker"&gt;&lt;code&gt;&lt;span class="k"&gt;FROM&lt;/span&gt;&lt;span class="s"&gt; python:3.11-slim&lt;/span&gt;
&lt;span class="k"&gt;WORKDIR&lt;/span&gt;&lt;span class="s"&gt; /app&lt;/span&gt;
&lt;span class="k"&gt;COPY&lt;/span&gt;&lt;span class="s"&gt; requirements.txt .&lt;/span&gt;
&lt;span class="k"&gt;RUN &lt;/span&gt;pip &lt;span class="nb"&gt;install&lt;/span&gt; &lt;span class="nt"&gt;--no-cache-dir&lt;/span&gt; &lt;span class="nt"&gt;-r&lt;/span&gt; requirements.txt
&lt;span class="k"&gt;COPY&lt;/span&gt;&lt;span class="s"&gt; . .&lt;/span&gt;
&lt;span class="k"&gt;CMD&lt;/span&gt;&lt;span class="s"&gt; ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;ChromaDB data was persisted to a Railway volume. Total infrastructure cost: $5/month.&lt;/p&gt;




&lt;h2&gt;
  
  
  What We Got Wrong (and Fixed Fast)
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Mistake 1: Chunk size too large initially.&lt;/strong&gt;&lt;br&gt;&lt;br&gt;
We started with 800-token chunks because we wanted more context per chunk. Retrieval precision dropped significantly — the cross-encoder had too much noise to rank. Dropping to 400 tokens and increasing top-k from 3 to 5 fixed it.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Mistake 2: No query preprocessing on Day 1.&lt;/strong&gt;&lt;br&gt;&lt;br&gt;
The first version of the pipeline embedded the raw user query directly. Short queries like "late filing?" returned garbage. We added the query expansion step on Day 2 and recall improved dramatically.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Mistake 3: Overlooking document deduplication.&lt;/strong&gt;&lt;br&gt;&lt;br&gt;
The Notion export had significant duplicate content across pages. Without deduplication, the same chunk appeared multiple times in top-k results — wasting context window and making citations redundant. A simple hash-based dedup before ingestion fixed it.&lt;/p&gt;




&lt;h2&gt;
  
  
  Results After 2 Weeks in Production
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;68% of queries&lt;/strong&gt; resolved by the bot without human escalation&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Average response time:&lt;/strong&gt; 1.2 seconds end-to-end&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;0 hallucinations&lt;/strong&gt; detected in human review of 500 sampled responses&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Support ticket volume:&lt;/strong&gt; down 41% in the first two weeks&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Inference cost:&lt;/strong&gt; $23 for 7,600 queries (first two weeks)&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  What We'd Change for a Larger Scale Version
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Switch ChromaDB → Qdrant or Pinecone&lt;/strong&gt; for multi-tenant isolation and horizontal scaling&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Add a caching layer&lt;/strong&gt; (Redis) for frequently asked questions — same query, instant response&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Fine-tune the embedding model&lt;/strong&gt; on domain-specific vocabulary for legal/compliance terminology&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Stream responses&lt;/strong&gt; via SSE — the 1.2s wait feels long when you're used to ChatGPT's streaming UX&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  The Actual Takeaway
&lt;/h2&gt;

&lt;p&gt;RAG is not a new idea. The reason most RAG implementations fail in production is not the LLM — it's the retrieval pipeline. Clean chunking, query expansion, and cross-encoder reranking are the unglamorous work that determines whether your chatbot answers correctly or confidently makes things up.&lt;/p&gt;

&lt;p&gt;If you're building something similar, the 72-hour timeline is realistic for an MVP with clean documents. For enterprise-grade accuracy on messy document sets, budget 2–3 weeks for the ingestion pipeline alone.&lt;/p&gt;

&lt;p&gt;We build these systems at &lt;a href="https://vedx.space" rel="noopener noreferrer"&gt;VedX&lt;/a&gt; — if you're working on a RAG project and want to talk through the architecture, feel free to reach out.&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;Tags:&lt;/strong&gt; &lt;code&gt;#RAG&lt;/code&gt; &lt;code&gt;#LLM&lt;/code&gt; &lt;code&gt;#AI&lt;/code&gt; &lt;code&gt;#Python&lt;/code&gt; &lt;code&gt;#FastAPI&lt;/code&gt; &lt;code&gt;#freelancing&lt;/code&gt; &lt;code&gt;#webdev&lt;/code&gt; &lt;code&gt;#chatbot&lt;/code&gt;&lt;/p&gt;




&lt;p&gt;&lt;strong&gt;About the author:&lt;/strong&gt;&lt;/p&gt;

&lt;blockquote&gt;
&lt;p&gt;Divyanshu Purohit — Co-founder at &lt;a href="https://vedx.space" rel="noopener noreferrer"&gt;VedX&lt;/a&gt;, a freelance AI and full-stack development agency. We build RAG systems, voicebots, and animated web products for startups and SMEs. Based in Jaipur, India. Building things that actually work in production.&lt;/p&gt;
&lt;/blockquote&gt;

</description>
      <category>ai</category>
      <category>architecture</category>
      <category>rag</category>
      <category>showdev</category>
    </item>
  </channel>
</rss>
