<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DEV Community: Abdelrahman Adnan</title>
    <description>The latest articles on DEV Community by Abdelrahman Adnan (@abdelrahman_adnan).</description>
    <link>https://dev.to/abdelrahman_adnan</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3198006%2F5e4cd24f-14d6-40e8-82a3-bdf5e6cb60cd.png</url>
      <title>DEV Community: Abdelrahman Adnan</title>
      <link>https://dev.to/abdelrahman_adnan</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://dev.to/feed/abdelrahman_adnan"/>
    <language>en</language>
    <item>
      <title>Part 3: Testing, Documentation &amp; Deployment 🚀</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 16 Feb 2026 22:46:01 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/part-3-testing-documentation-deployment-1bek</link>
      <guid>https://dev.to/abdelrahman_adnan/part-3-testing-documentation-deployment-1bek</guid>
      <description>&lt;h1&gt;
  
  
  #DataEngineeringZoomcamp #dbt #AnalyticsEngineering #DataModeling
&lt;/h1&gt;

&lt;h3&gt;
  
  
  Macros - Reusable SQL Functions 🔧
&lt;/h3&gt;

&lt;p&gt;Macros are like functions in Python - write once, use everywhere.&lt;/p&gt;

&lt;h3&gt;
  
  
  Why Use Macros?
&lt;/h3&gt;

&lt;p&gt;Without macros, you repeat code:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- ❌ Repeated everywhere&lt;/span&gt;
&lt;span class="k"&gt;CASE&lt;/span&gt; 
    &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Credit card'&lt;/span&gt;
    &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;2&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Cash'&lt;/span&gt;
    &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;3&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'No charge'&lt;/span&gt;
    &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;4&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Dispute'&lt;/span&gt;
    &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;5&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Unknown'&lt;/span&gt;
    &lt;span class="k"&gt;ELSE&lt;/span&gt; &lt;span class="s1"&gt;'Unknown'&lt;/span&gt;
&lt;span class="k"&gt;END&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;payment_type_description&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;With macros, write it once:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- macros/get_payment_type_description.sql&lt;/span&gt;
&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="o"&gt;%&lt;/span&gt; &lt;span class="n"&gt;macro&lt;/span&gt; &lt;span class="n"&gt;get_payment_type_description&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;payment_type&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;%&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;
    &lt;span class="k"&gt;CASE&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
        &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Credit card'&lt;/span&gt;
        &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="mi"&gt;2&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Cash'&lt;/span&gt;
        &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="mi"&gt;3&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'No charge'&lt;/span&gt;
        &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="mi"&gt;4&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Dispute'&lt;/span&gt;
        &lt;span class="k"&gt;WHEN&lt;/span&gt; &lt;span class="mi"&gt;5&lt;/span&gt; &lt;span class="k"&gt;THEN&lt;/span&gt; &lt;span class="s1"&gt;'Unknown'&lt;/span&gt;
        &lt;span class="k"&gt;ELSE&lt;/span&gt; &lt;span class="s1"&gt;'Unknown'&lt;/span&gt;
    &lt;span class="k"&gt;END&lt;/span&gt;
&lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="o"&gt;%&lt;/span&gt; &lt;span class="n"&gt;endmacro&lt;/span&gt; &lt;span class="o"&gt;%&lt;/span&gt;&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Use it in any model:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/staging/stg_green_tripdata.sql&lt;/span&gt;
&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="n"&gt;payment_type&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;get_payment_type_description&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'payment_type'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;payment_type_description&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;source&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'staging'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'green_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Jinja Templating
&lt;/h3&gt;

&lt;p&gt;dbt uses &lt;strong&gt;Jinja&lt;/strong&gt; - a Python templating language. You'll recognize it by &lt;code&gt;{{ }}&lt;/code&gt; and &lt;code&gt;{% %}&lt;/code&gt;:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Syntax&lt;/th&gt;
&lt;th&gt;Purpose&lt;/th&gt;
&lt;th&gt;Example&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;{{ }}&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Output expression&lt;/td&gt;
&lt;td&gt;&lt;code&gt;{{ ref('my_model') }}&lt;/code&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;{% %}&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Logic/control flow&lt;/td&gt;
&lt;td&gt;&lt;code&gt;{% if is_incremental() %}&lt;/code&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;{# #}&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Comments&lt;/td&gt;
&lt;td&gt;&lt;code&gt;{# This is a comment #}&lt;/code&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h3&gt;
  
  
  dbt Packages - Community Libraries 📦
&lt;/h3&gt;

&lt;p&gt;Packages let you use macros and models built by others.&lt;/p&gt;

&lt;h3&gt;
  
  
  Popular Packages
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Package&lt;/th&gt;
&lt;th&gt;What it Does&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;dbt_utils&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Common SQL helpers (surrogate keys, pivot, etc.)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;dbt_codegen&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Auto-generate YAML and SQL&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;dbt_expectations&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Great Expectations-style tests&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;dbt_audit_helper&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Compare model outputs when refactoring&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Installing Packages
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Create &lt;code&gt;packages.yml&lt;/code&gt;:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;packages&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;package&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;dbt-labs/dbt_utils&lt;/span&gt;
    &lt;span class="na"&gt;version&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;1.1.1&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Run &lt;code&gt;dbt deps&lt;/code&gt;:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;dbt deps
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Use the macros:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Using dbt_utils to generate surrogate keys&lt;/span&gt;
&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;dbt_utils&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;generate_surrogate_key&lt;/span&gt;&lt;span class="p"&gt;([&lt;/span&gt;&lt;span class="s1"&gt;'vendorid'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'pickup_datetime'&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;trip_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="o"&gt;*&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;source&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'staging'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'green_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Testing in dbt 🧪
&lt;/h3&gt;

&lt;p&gt;Tests ensure your data meets expectations. dbt has several test types:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;1. Generic Tests (Most Common)&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Built-in tests you apply in YAML:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="c1"&gt;# models/staging/schema.yml&lt;/span&gt;
&lt;span class="na"&gt;version&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="m"&gt;2&lt;/span&gt;

&lt;span class="na"&gt;models&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;stg_green_tripdata&lt;/span&gt;
    &lt;span class="na"&gt;columns&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;trip_id&lt;/span&gt;
        &lt;span class="na"&gt;tests&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
          &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s"&gt;unique&lt;/span&gt;       &lt;span class="c1"&gt;# No duplicate values&lt;/span&gt;
          &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s"&gt;not_null&lt;/span&gt;     &lt;span class="c1"&gt;# No null values&lt;/span&gt;

      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;payment_type&lt;/span&gt;
        &lt;span class="na"&gt;tests&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
          &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;accepted_values&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
              &lt;span class="na"&gt;values&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="pi"&gt;[&lt;/span&gt;&lt;span class="nv"&gt;1&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;2&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;3&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;4&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;5&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;6&lt;/span&gt;&lt;span class="pi"&gt;]&lt;/span&gt;  &lt;span class="c1"&gt;# Only these values allowed&lt;/span&gt;

      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;pickup_location_id&lt;/span&gt;
        &lt;span class="na"&gt;tests&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
          &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;relationships&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;  &lt;span class="c1"&gt;# Referential integrity&lt;/span&gt;
              &lt;span class="na"&gt;to&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;ref('dim_zones')&lt;/span&gt;
              &lt;span class="na"&gt;field&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;location_id&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;The four built-in tests:&lt;/strong&gt;&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Test&lt;/th&gt;
&lt;th&gt;What it Checks&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;unique&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;No duplicate values in column&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;not_null&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;No NULL values in column&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;accepted_values&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Values must be in specified list&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;relationships&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Values must exist in another table&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;2. Singular Tests&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Custom SQL tests in the &lt;code&gt;tests/&lt;/code&gt; folder:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- tests/assert_positive_fare_amount.sql&lt;/span&gt;
&lt;span class="c1"&gt;-- Test FAILS if any rows are returned&lt;/span&gt;

&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="n"&gt;trip_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;fare_amount&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'fct_trips'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;span class="k"&gt;where&lt;/span&gt; &lt;span class="n"&gt;fare_amount&lt;/span&gt; &lt;span class="o"&gt;&amp;lt;&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;  &lt;span class="c1"&gt;-- Find negative fares (bad data!)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;3. Source Freshness Tests&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Check if your source data is up to date:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;sources&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;staging&lt;/span&gt;
    &lt;span class="na"&gt;tables&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;green_tripdata&lt;/span&gt;
        &lt;span class="na"&gt;freshness&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
          &lt;span class="na"&gt;warn_after&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="pi"&gt;{&lt;/span&gt;&lt;span class="nv"&gt;count&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="nv"&gt;24&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;period&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="nv"&gt;hour&lt;/span&gt;&lt;span class="pi"&gt;}&lt;/span&gt;
          &lt;span class="na"&gt;error_after&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="pi"&gt;{&lt;/span&gt;&lt;span class="nv"&gt;count&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="nv"&gt;48&lt;/span&gt;&lt;span class="pi"&gt;,&lt;/span&gt; &lt;span class="nv"&gt;period&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="nv"&gt;hour&lt;/span&gt;&lt;span class="pi"&gt;}&lt;/span&gt;
        &lt;span class="na"&gt;loaded_at_field&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;pickup_datetime&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Running Tests
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Run all tests&lt;/span&gt;
dbt &lt;span class="nb"&gt;test&lt;/span&gt;

&lt;span class="c"&gt;# Run tests for specific model&lt;/span&gt;
dbt &lt;span class="nb"&gt;test&lt;/span&gt; &lt;span class="nt"&gt;--select&lt;/span&gt; stg_green_tripdata

&lt;span class="c"&gt;# Run tests and models together&lt;/span&gt;
dbt build
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Documentation 📝
&lt;/h3&gt;

&lt;p&gt;dbt generates beautiful documentation automatically!&lt;/p&gt;

&lt;h3&gt;
  
  
  Adding Descriptions
&lt;/h3&gt;

&lt;p&gt;In your schema YAML:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;version&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="m"&gt;2&lt;/span&gt;

&lt;span class="na"&gt;models&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;fct_trips&lt;/span&gt;
    &lt;span class="na"&gt;description&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="pi"&gt;&amp;gt;&lt;/span&gt;
      &lt;span class="s"&gt;Fact table containing all taxi trips (yellow and green).&lt;/span&gt;
      &lt;span class="s"&gt;One row per trip with fare details and zone information.&lt;/span&gt;

    &lt;span class="na"&gt;columns&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;trip_id&lt;/span&gt;
        &lt;span class="na"&gt;description&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;Unique identifier for each trip (surrogate key)&lt;/span&gt;

      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;service_type&lt;/span&gt;
        &lt;span class="na"&gt;description&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;Type of taxi service - 'Yellow' or 'Green'&lt;/span&gt;

      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;total_amount&lt;/span&gt;
        &lt;span class="na"&gt;description&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;Total trip cost including fare, tips, taxes, and fees&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Generating Docs
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Generate documentation&lt;/span&gt;
dbt docs generate

&lt;span class="c"&gt;# Serve locally (opens browser)&lt;/span&gt;
dbt docs serve
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This creates an interactive website with:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Model descriptions&lt;/li&gt;
&lt;li&gt;Column definitions&lt;/li&gt;
&lt;li&gt;Dependency graph (visual DAG)&lt;/li&gt;
&lt;li&gt;Source information&lt;/li&gt;
&lt;/ul&gt;




&lt;h3&gt;
  
  
  Essential dbt Commands 💻
&lt;/h3&gt;

&lt;h3&gt;
  
  
  The Big Four
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Command&lt;/th&gt;
&lt;th&gt;What it Does&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;dbt run&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Build all models (create views/tables)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;dbt test&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Run all tests&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;dbt build&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Run + test together (recommended!)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;dbt compile&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Generate SQL without executing&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Other Useful Commands
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Check connection&lt;/span&gt;
dbt debug

&lt;span class="c"&gt;# Load seed files&lt;/span&gt;
dbt seed

&lt;span class="c"&gt;# Install packages&lt;/span&gt;
dbt deps

&lt;span class="c"&gt;# Generate docs&lt;/span&gt;
dbt docs generate

&lt;span class="c"&gt;# Retry failed models&lt;/span&gt;
dbt retry
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Selecting Specific Models
&lt;/h3&gt;

&lt;p&gt;Use &lt;code&gt;--select&lt;/code&gt; (or &lt;code&gt;-s&lt;/code&gt;) to run specific models:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Single model&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; stg_green_tripdata

&lt;span class="c"&gt;# Model and all upstream dependencies&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; +fct_trips

&lt;span class="c"&gt;# Model and all downstream models&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; stg_green_tripdata+

&lt;span class="c"&gt;# Both directions&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; +fct_trips+

&lt;span class="c"&gt;# All models in a folder&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; staging.&lt;span class="k"&gt;*&lt;/span&gt;

&lt;span class="c"&gt;# Multiple models&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--select&lt;/span&gt; stg_green_tripdata stg_yellow_tripdata
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Target Environments
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Development (default)&lt;/span&gt;
dbt run

&lt;span class="c"&gt;# Production&lt;/span&gt;
dbt run &lt;span class="nt"&gt;--target&lt;/span&gt; prod
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Materializations - Views vs Tables 📊
&lt;/h3&gt;

&lt;p&gt;Materialization controls how dbt persists your models in the warehouse.&lt;/p&gt;

&lt;h3&gt;
  
  
  Types of Materializations
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;What it Creates&lt;/th&gt;
&lt;th&gt;Use Case&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;view&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;SQL view (query stored, runs on access)&lt;/td&gt;
&lt;td&gt;Staging models, frequently changing logic&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;table&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Physical table (data stored)&lt;/td&gt;
&lt;td&gt;Final marts, large datasets, performance&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;incremental&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Appends new data only&lt;/td&gt;
&lt;td&gt;Very large tables, event data&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;ephemeral&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Not created (CTE in downstream)&lt;/td&gt;
&lt;td&gt;Helper models, intermediate steps&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Setting Materializations
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;In the model file:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;config&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;materialized&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s1"&gt;'table'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;

&lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'stg_trips'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;In dbt_project.yml (project-wide):&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;models&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;my_project&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
    &lt;span class="na"&gt;staging&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;materialized&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;view&lt;/span&gt;
    &lt;span class="na"&gt;marts&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;materialized&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;table&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  View vs Table Decision
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────────────────────────────────────────────────────┐
│                 Should I use view or table?                  │
└─────────────────────────────────────────────────────────────┘
                            │
                            ▼
              ┌──────────────────────────┐
              │ Is the query expensive?  │
              └──────────────────────────┘
                     │            │
                    Yes          No
                     │            │
                     ▼            ▼
               ┌─────────┐  ┌─────────┐
               │  TABLE  │  │  VIEW   │
               └─────────┘  └─────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Use VIEW when:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Staging models (simple transformations)&lt;/li&gt;
&lt;li&gt;Logic changes frequently&lt;/li&gt;
&lt;li&gt;Storage cost is a concern&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Use TABLE when:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Final marts queried often&lt;/li&gt;
&lt;li&gt;Complex joins/aggregations&lt;/li&gt;
&lt;li&gt;Query performance matters&lt;/li&gt;
&lt;/ul&gt;




&lt;h3&gt;
  
  
  Putting It All Together - The NYC Taxi Project 🚕
&lt;/h3&gt;

&lt;p&gt;In this module, we build a complete dbt project for NYC taxi data:&lt;/p&gt;

&lt;h3&gt;
  
  
  What We Build
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌──────────────────────────────────────────────────────────────┐
│                      RAW DATA                                 │
│  green_tripdata (GCS/BigQuery) │ yellow_tripdata (GCS/BigQuery)│
└───────────────────┬─────────────────────┬────────────────────┘
                    │                     │
                    ▼                     ▼
┌──────────────────────────────────────────────────────────────┐
│                    STAGING LAYER                              │
│      stg_green_tripdata    │    stg_yellow_tripdata          │
│      (cleaned, renamed)    │    (cleaned, renamed)           │
└───────────────────┬─────────────────────┬────────────────────┘
                    │                     │
                    └──────────┬──────────┘
                               │
                               ▼
┌──────────────────────────────────────────────────────────────┐
│                  INTERMEDIATE LAYER                           │
│                   int_trips_unioned                           │
│            (green + yellow combined)                          │
└───────────────────────────────┬──────────────────────────────┘
                                │
                                ▼
┌──────────────────────────────────────────────────────────────┐
│                      MARTS LAYER                              │
│  ┌─────────────┐  ┌───────────────┐  ┌─────────────────────┐ │
│  │ dim_zones   │  │   fct_trips   │  │fct_monthly_zone_rev │ │
│  │ (dimension) │  │    (fact)     │  │     (report)        │ │
│  └─────────────┘  └───────────────┘  └─────────────────────┘ │
└──────────────────────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The Models We Create
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Model&lt;/th&gt;
&lt;th&gt;Type&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;stg_green_tripdata&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Staging&lt;/td&gt;
&lt;td&gt;Cleaned green taxi data&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;stg_yellow_tripdata&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Staging&lt;/td&gt;
&lt;td&gt;Cleaned yellow taxi data&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;int_trips_unioned&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Intermediate&lt;/td&gt;
&lt;td&gt;Combined yellow + green trips&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;dim_zones&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Dimension&lt;/td&gt;
&lt;td&gt;Zone lookup table&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;fct_trips&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Fact&lt;/td&gt;
&lt;td&gt;One row per trip&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;fct_monthly_zone_revenue&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Report&lt;/td&gt;
&lt;td&gt;Monthly revenue by zone&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h3&gt;
  
  
  Setup Options 🔧
&lt;/h3&gt;

&lt;h3&gt;
  
  
  Option 1: Local Setup (DuckDB + dbt Core)
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;Pros:&lt;/strong&gt; Free, no cloud account needed&lt;br&gt;
&lt;strong&gt;Cons:&lt;/strong&gt; Limited to your machine's power&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# 1. Install dbt with DuckDB adapter&lt;/span&gt;
pip &lt;span class="nb"&gt;install &lt;/span&gt;dbt-duckdb

&lt;span class="c"&gt;# 2. Clone the project&lt;/span&gt;
git clone https://github.com/DataTalksClub/data-engineering-zoomcamp
&lt;span class="nb"&gt;cd &lt;/span&gt;data-engineering-zoomcamp/04-analytics-engineering/taxi_rides_ny

&lt;span class="c"&gt;# 3. Create profiles.yml in ~/.dbt/&lt;/span&gt;
&lt;span class="c"&gt;# 4. Run dbt debug to test connection&lt;/span&gt;
dbt debug

&lt;span class="c"&gt;# 5. Build the project&lt;/span&gt;
dbt build &lt;span class="nt"&gt;--target&lt;/span&gt; prod
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Option 2: Cloud Setup (BigQuery + dbt Cloud)
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;Pros:&lt;/strong&gt; Powerful, team collaboration, scheduler&lt;br&gt;
&lt;strong&gt;Cons:&lt;/strong&gt; Requires GCP account (free tier available)&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Create dbt Cloud account (free)&lt;/li&gt;
&lt;li&gt;Connect to your BigQuery project&lt;/li&gt;
&lt;li&gt;Clone the repo in dbt Cloud IDE&lt;/li&gt;
&lt;li&gt;Run &lt;code&gt;dbt build --target prod&lt;/code&gt;
&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  Troubleshooting Common Issues 🔍
&lt;/h3&gt;

&lt;h3&gt;
  
  
  "Profile not found"
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Check &lt;code&gt;dbt_project.yml&lt;/code&gt; profile name matches &lt;code&gt;profiles.yml&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Ensure &lt;code&gt;profiles.yml&lt;/code&gt; is in &lt;code&gt;~/.dbt/&lt;/code&gt;
&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  "Source not found"
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Verify database/schema names in &lt;code&gt;sources.yml&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Check that your data is actually loaded into the warehouse&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  "Model depends on model that was not found"
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Check for typos in &lt;code&gt;ref()&lt;/code&gt; calls&lt;/li&gt;
&lt;li&gt;Ensure referenced model exists&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  DuckDB Out of Memory
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Add memory settings to &lt;code&gt;profiles.yml&lt;/code&gt;:
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;settings&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;memory_limit&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s1"&gt;'&lt;/span&gt;&lt;span class="s"&gt;2GB'&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Key Takeaways 🎓
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Analytics Engineering&lt;/strong&gt; bridges data engineering and data analysis&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;dbt&lt;/strong&gt; brings software engineering best practices to SQL transformations&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Dimensional modeling&lt;/strong&gt; organizes data into facts (events) and dimensions (attributes)&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Three layers&lt;/strong&gt; - staging (raw copy), intermediate (transformations), marts (final)&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;&lt;code&gt;ref()&lt;/code&gt; and &lt;code&gt;source()&lt;/code&gt;&lt;/strong&gt; are your main functions for building dependencies&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Testing&lt;/strong&gt; ensures data quality - use unique, not_null, accepted_values, relationships&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Documentation&lt;/strong&gt; is auto-generated from YAML descriptions&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;&lt;code&gt;dbt build&lt;/code&gt;&lt;/strong&gt; runs and tests everything in dependency order&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;




&lt;h3&gt;
  
  
  Additional Resources 📚
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;&lt;a href="https://docs.getdbt.com/" rel="noopener noreferrer"&gt;dbt Documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;
&lt;a href="https://learn.getdbt.com/courses/dbt-fundamentals" rel="noopener noreferrer"&gt;dbt Fundamentals Course&lt;/a&gt; (free)&lt;/li&gt;
&lt;li&gt;&lt;a href="https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/04-analytics-engineering/refreshers/SQL.md" rel="noopener noreferrer"&gt;SQL Refresher for Window Functions&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="https://community.getdbt.com/" rel="noopener noreferrer"&gt;dbt Community Slack&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;




</description>
      <category>analytics</category>
      <category>dataengineering</category>
      <category>sql</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Part 2: dbt Project Structure &amp; Building Models 📁</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 16 Feb 2026 22:45:11 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/part-2-dbt-project-structure-building-models-5g3g</link>
      <guid>https://dev.to/abdelrahman_adnan/part-2-dbt-project-structure-building-models-5g3g</guid>
      <description>&lt;h1&gt;
  
  
  DataEngineeringZoomcamp #dbt #AnalyticsEngineering #DataModeling
&lt;/h1&gt;

&lt;h3&gt;
  
  
  Why Model Data? 📐
&lt;/h3&gt;

&lt;p&gt;Raw data is messy and hard to query. Dimensional modeling organizes data into a structure that's:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Easy to understand&lt;/li&gt;
&lt;li&gt;Fast to query&lt;/li&gt;
&lt;li&gt;Flexible for different analyses&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Fact Tables vs Dimension Tables
&lt;/h3&gt;

&lt;p&gt;This is the core of dimensional modeling (also called "star schema"):&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Fact Tables (&lt;code&gt;fct_&lt;/code&gt;)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Contain &lt;strong&gt;measurements&lt;/strong&gt; or &lt;strong&gt;events&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;One row per thing that happened&lt;/li&gt;
&lt;li&gt;Usually have many rows (millions/billions)&lt;/li&gt;
&lt;li&gt;Contain numeric values you want to analyze&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Examples:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;fct_trips&lt;/code&gt; - one row per taxi trip&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;fct_sales&lt;/code&gt; - one row per sale&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;fct_orders&lt;/code&gt; - one row per order
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Example fact table&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="n"&gt;fct_trips&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt;
    &lt;span class="n"&gt;trip_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;           &lt;span class="c1"&gt;-- unique identifier&lt;/span&gt;
    &lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;   &lt;span class="c1"&gt;-- when it happened&lt;/span&gt;
    &lt;span class="n"&gt;dropoff_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;pickup_zone_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;    &lt;span class="c1"&gt;-- foreign keys to dimensions&lt;/span&gt;
    &lt;span class="n"&gt;dropoff_zone_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;       &lt;span class="c1"&gt;-- numeric measures&lt;/span&gt;
    &lt;span class="n"&gt;tip_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;total_amount&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;transformed_trips&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Dimension Tables (&lt;code&gt;dim_&lt;/code&gt;)&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Contain &lt;strong&gt;attributes&lt;/strong&gt; or &lt;strong&gt;descriptive information&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;One row per entity&lt;/li&gt;
&lt;li&gt;Usually fewer rows&lt;/li&gt;
&lt;li&gt;Provide context for fact tables&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Examples:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;dim_zones&lt;/code&gt; - one row per taxi zone&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;dim_customers&lt;/code&gt; - one row per customer&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;dim_products&lt;/code&gt; - one row per product
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Example dimension table&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="n"&gt;dim_zones&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt;
    &lt;span class="n"&gt;location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;       &lt;span class="c1"&gt;-- primary key&lt;/span&gt;
    &lt;span class="n"&gt;borough&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;           &lt;span class="c1"&gt;-- descriptive attributes&lt;/span&gt;
    &lt;span class="n"&gt;zone_name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;service_zone&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;zone_lookup&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The Star Schema ⭐
&lt;/h3&gt;

&lt;p&gt;When you join facts and dimensions, you get a star shape:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;                    ┌──────────────┐
                    │  dim_zones   │
                    │  (pickup)    │
                    └───────┬──────┘
                            │
┌──────────────┐    ┌───────┴──────┐    ┌──────────────┐
│  dim_vendors │────│  fct_trips   │────│  dim_zones   │
│              │    │  (center)    │    │  (dropoff)   │
└──────────────┘    └───────┬──────┘    └──────────────┘
                            │
                    ┌───────┴──────┐
                    │ dim_payment  │
                    │    types     │
                    └──────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Why it's powerful:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Easy to answer business questions!&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
    &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;borough&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;trip_count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;SUM&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;total_revenue&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;fct_trips&lt;/span&gt; &lt;span class="n"&gt;f&lt;/span&gt;
&lt;span class="k"&gt;JOIN&lt;/span&gt; &lt;span class="n"&gt;dim_zones&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt; &lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;pickup_zone_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;location_id&lt;/span&gt;
&lt;span class="k"&gt;GROUP&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;borough&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;total_revenue&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  dbt Project Structure
&lt;/h3&gt;

&lt;p&gt;A dbt project has a specific folder structure. Understanding this helps you navigate any project:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;taxi_rides_ny/
├── dbt_project.yml      # Project configuration (most important!)
├── profiles.yml         # Database connection (often in ~/.dbt/)
├── packages.yml         # External packages to install
│
├── models/              # ⭐ YOUR SQL MODELS LIVE HERE
│   ├── staging/         # Raw data, minimally cleaned
│   ├── intermediate/    # Complex transformations
│   └── marts/           # Final, business-ready tables
│
├── seeds/               # CSV files to load as tables
├── macros/              # Reusable SQL functions
├── tests/               # Custom test files
├── snapshots/           # Track data changes over time
└── analyses/            # Ad-hoc queries (not built)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The &lt;code&gt;dbt_project.yml&lt;/code&gt; File
&lt;/h3&gt;

&lt;p&gt;This is the &lt;strong&gt;most important file&lt;/strong&gt; - dbt looks for it first:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s1"&gt;'&lt;/span&gt;&lt;span class="s"&gt;taxi_rides_ny'&lt;/span&gt;
&lt;span class="na"&gt;version&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s1"&gt;'&lt;/span&gt;&lt;span class="s"&gt;1.0.0'&lt;/span&gt;
&lt;span class="na"&gt;profile&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s1"&gt;'&lt;/span&gt;&lt;span class="s"&gt;taxi_rides_ny'&lt;/span&gt;  &lt;span class="c1"&gt;# Must match profiles.yml!&lt;/span&gt;

&lt;span class="c1"&gt;# Default configurations&lt;/span&gt;
&lt;span class="na"&gt;models&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;taxi_rides_ny&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
    &lt;span class="na"&gt;staging&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;materialized&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;view&lt;/span&gt;  &lt;span class="c1"&gt;# Staging models become views&lt;/span&gt;
    &lt;span class="na"&gt;marts&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;materialized&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;table&lt;/span&gt; &lt;span class="c1"&gt;# Mart models become tables&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The Three Model Layers
&lt;/h3&gt;

&lt;p&gt;dbt recommends organizing models into three layers:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;1. Staging Layer (&lt;code&gt;staging/&lt;/code&gt;)&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Purpose:&lt;/strong&gt; Clean copy of raw data with minimal transformations&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What happens here:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Rename columns (snake_case, clear names)&lt;/li&gt;
&lt;li&gt;Cast data types&lt;/li&gt;
&lt;li&gt;Filter obviously bad data&lt;/li&gt;
&lt;li&gt;Keep 1:1 with source (same rows, similar columns)
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/staging/stg_green_tripdata.sql&lt;/span&gt;
&lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;config&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;materialized&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s1"&gt;'view'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;

&lt;span class="k"&gt;with&lt;/span&gt; &lt;span class="n"&gt;tripdata&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; 
    &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;source&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'staging'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'green_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
    &lt;span class="k"&gt;where&lt;/span&gt; &lt;span class="n"&gt;vendorid&lt;/span&gt; &lt;span class="k"&gt;is&lt;/span&gt; &lt;span class="k"&gt;not&lt;/span&gt; &lt;span class="k"&gt;null&lt;/span&gt;  &lt;span class="c1"&gt;-- filter bad data&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="c1"&gt;-- Rename and cast columns&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;vendorid&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;integer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;lpep_pickup_datetime&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;timestamp&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;lpep_dropoff_datetime&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;timestamp&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;dropoff_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pulocationid&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;integer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pickup_location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;dolocationid&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;integer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;dropoff_location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;passenger_count&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;integer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;passenger_count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;trip_distance&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;numeric&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;trip_distance&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;numeric&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="nb"&gt;numeric&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;total_amount&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="n"&gt;tripdata&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;2. Intermediate Layer (&lt;code&gt;intermediate/&lt;/code&gt;)&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Purpose:&lt;/strong&gt; Complex transformations, joins, business logic&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What happens here:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Combine multiple staging models&lt;/li&gt;
&lt;li&gt;Apply business rules&lt;/li&gt;
&lt;li&gt;Heavy data manipulation&lt;/li&gt;
&lt;li&gt;NOT exposed to end users
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/intermediate/int_trips_unioned.sql&lt;/span&gt;
&lt;span class="k"&gt;with&lt;/span&gt; &lt;span class="n"&gt;green_trips&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'Green'&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;service_type&lt;/span&gt;
    &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'stg_green_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;span class="p"&gt;),&lt;/span&gt;

&lt;span class="n"&gt;yellow_trips&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'Yellow'&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;service_type&lt;/span&gt;
    &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'stg_yellow_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="n"&gt;green_trips&lt;/span&gt;
&lt;span class="k"&gt;union&lt;/span&gt; &lt;span class="k"&gt;all&lt;/span&gt;
&lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="n"&gt;yellow_trips&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;3. Marts Layer (&lt;code&gt;marts/&lt;/code&gt;)&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Purpose:&lt;/strong&gt; Final, business-ready tables for end users&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;What happens here:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Final fact and dimension tables&lt;/li&gt;
&lt;li&gt;Ready for dashboards and reports&lt;/li&gt;
&lt;li&gt;Only these should be exposed to BI tools!
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/marts/fct_trips.sql&lt;/span&gt;
&lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="n"&gt;config&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;materialized&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s1"&gt;'table'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;

&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;trip_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;service_type&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;dropoff_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;pickup_location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;dropoff_location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;z_pickup&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="k"&gt;zone&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pickup_zone&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;z_dropoff&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="k"&gt;zone&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;dropoff_zone&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;passenger_count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;trip_distance&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'int_trips_unioned'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;
&lt;span class="k"&gt;left&lt;/span&gt; &lt;span class="k"&gt;join&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'dim_zones'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt; &lt;span class="n"&gt;z_pickup&lt;/span&gt; 
    &lt;span class="k"&gt;on&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;pickup_location_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;z_pickup&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;location_id&lt;/span&gt;
&lt;span class="k"&gt;left&lt;/span&gt; &lt;span class="k"&gt;join&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'dim_zones'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt; &lt;span class="n"&gt;z_dropoff&lt;/span&gt; 
    &lt;span class="k"&gt;on&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;dropoff_location_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;z_dropoff&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;location_id&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  Sources and the &lt;code&gt;source()&lt;/code&gt; Function 📥
&lt;/h3&gt;

&lt;h3&gt;
  
  
  What are Sources?
&lt;/h3&gt;

&lt;p&gt;Sources tell dbt where your raw data lives in the warehouse. They're defined in YAML files:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="c1"&gt;# models/staging/sources.yml&lt;/span&gt;
&lt;span class="na"&gt;version&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="m"&gt;2&lt;/span&gt;

&lt;span class="na"&gt;sources&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;staging&lt;/span&gt;           &lt;span class="c1"&gt;# Logical name (you choose)&lt;/span&gt;
    &lt;span class="na"&gt;database&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;my_project&lt;/span&gt;    &lt;span class="c1"&gt;# Your GCP project or database&lt;/span&gt;
    &lt;span class="na"&gt;schema&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;nytaxi&lt;/span&gt;          &lt;span class="c1"&gt;# BigQuery dataset or schema&lt;/span&gt;
    &lt;span class="na"&gt;tables&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;green_tripdata&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="na"&gt;name&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;yellow_tripdata&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Using the &lt;code&gt;source()&lt;/code&gt; Function
&lt;/h3&gt;

&lt;p&gt;Instead of hardcoding table names, use &lt;code&gt;source()&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- ❌ Bad - hardcoded path&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;my_project&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;nytaxi&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;green_tripdata&lt;/span&gt;

&lt;span class="c1"&gt;-- ✅ Good - using source()&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;source&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'staging'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="s1"&gt;'green_tripdata'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Benefits:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Change database/schema in one place (YAML file)&lt;/li&gt;
&lt;li&gt;dbt tracks dependencies automatically&lt;/li&gt;
&lt;li&gt;Can add freshness tests on sources&lt;/li&gt;
&lt;/ul&gt;




&lt;h3&gt;
  
  
  The &lt;code&gt;ref()&lt;/code&gt; Function - Building Dependencies 🔗
&lt;/h3&gt;

&lt;p&gt;This is &lt;strong&gt;the most important dbt function!&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  &lt;code&gt;source()&lt;/code&gt; vs &lt;code&gt;ref()&lt;/code&gt;
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Function&lt;/th&gt;
&lt;th&gt;Use When&lt;/th&gt;
&lt;th&gt;Example&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;source()&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Reading raw/external data&lt;/td&gt;
&lt;td&gt;&lt;code&gt;{{ source('staging', 'green_tripdata') }}&lt;/code&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;code&gt;ref()&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Reading another dbt model&lt;/td&gt;
&lt;td&gt;&lt;code&gt;{{ ref('stg_green_tripdata') }}&lt;/code&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  How &lt;code&gt;ref()&lt;/code&gt; Works
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/marts/fct_trips.sql&lt;/span&gt;
&lt;span class="k"&gt;select&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'int_trips_unioned'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;  &lt;span class="c1"&gt;-- References the int_trips_unioned model&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;What &lt;code&gt;ref()&lt;/code&gt; does:&lt;/strong&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;✅ Resolves to the correct schema/table name&lt;/li&gt;
&lt;li&gt;✅ Builds the dependency graph automatically&lt;/li&gt;
&lt;li&gt;✅ Ensures models run in the correct order&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  The DAG (Directed Acyclic Graph)
&lt;/h3&gt;

&lt;p&gt;dbt builds a &lt;strong&gt;dependency graph&lt;/strong&gt; from your &lt;code&gt;ref()&lt;/code&gt; calls:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌──────────────────┐     ┌──────────────────┐
│ stg_green_trips  │     │ stg_yellow_trips │
└────────┬─────────┘     └────────┬─────────┘
         │                        │
         └──────────┬─────────────┘
                    │
                    ▼
         ┌──────────────────┐
         │ int_trips_unioned│
         └────────┬─────────┘
                  │
                  ▼
         ┌──────────────────┐
         │    fct_trips     │
         └──────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;When you run &lt;code&gt;dbt build&lt;/code&gt;, models run in dependency order automatically!&lt;/p&gt;




&lt;h3&gt;
  
  
  Seeds - Loading CSV Files 🌱
&lt;/h3&gt;

&lt;p&gt;Seeds let you load small CSV files into your warehouse as tables.&lt;/p&gt;

&lt;h3&gt;
  
  
  When to Use Seeds
&lt;/h3&gt;

&lt;p&gt;✅ &lt;strong&gt;Good use cases:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Lookup tables (zone names, country codes)&lt;/li&gt;
&lt;li&gt;Static mappings (vendor ID → vendor name)&lt;/li&gt;
&lt;li&gt;Small reference data that rarely changes&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;❌ &lt;strong&gt;Not good for:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Large datasets (use proper data loading)&lt;/li&gt;
&lt;li&gt;Frequently changing data&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  How to Use Seeds
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Put CSV files in the &lt;code&gt;seeds/&lt;/code&gt; folder:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;seeds/
└── taxi_zone_lookup.csv
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;locationid,borough,zone,service_zone
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
...
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Run &lt;code&gt;dbt seed&lt;/code&gt;:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;dbt seed
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Reference in models using &lt;code&gt;ref()&lt;/code&gt;:&lt;/strong&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- models/marts/dim_zones.sql&lt;/span&gt;
&lt;span class="k"&gt;select&lt;/span&gt;
    &lt;span class="n"&gt;locationid&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;location_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;borough&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;zone&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;service_zone&lt;/span&gt;
&lt;span class="k"&gt;from&lt;/span&gt; &lt;span class="p"&gt;{{&lt;/span&gt; &lt;span class="k"&gt;ref&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="s1"&gt;'taxi_zone_lookup'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="p"&gt;}}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






</description>
      <category>data</category>
      <category>dataengineering</category>
      <category>sql</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Module 4 Summary - Analytics Engineering with dbt</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 16 Feb 2026 22:44:26 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/-module-4-summary-analytics-engineering-with-dbt-1p8g</link>
      <guid>https://dev.to/abdelrahman_adnan/-module-4-summary-analytics-engineering-with-dbt-1p8g</guid>
      <description>&lt;h1&gt;
  
  
  #DataEngineeringZoomcamp #dbt #AnalyticsEngineering #DataModeling
&lt;/h1&gt;




&lt;h2&gt;
  
  
  Part 1: Introduction to Analytics Engineering &amp;amp; dbt Fundamentals 🎯
&lt;/h2&gt;

&lt;h3&gt;
  
  
  What is Analytics Engineering?
&lt;/h3&gt;

&lt;h3&gt;
  
  
  The Evolution of Data Roles
&lt;/h3&gt;

&lt;p&gt;Traditionally, there were two main roles in data:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Role&lt;/th&gt;
&lt;th&gt;Focus&lt;/th&gt;
&lt;th&gt;Skills&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Engineer&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Building pipelines, infrastructure, data movement&lt;/td&gt;
&lt;td&gt;Python, Spark, Airflow, cloud services&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Analyst&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Creating reports, dashboards, insights&lt;/td&gt;
&lt;td&gt;SQL, Excel, BI tools&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;But there was a gap! Who transforms the raw data into clean, analysis-ready tables? Enter the &lt;strong&gt;Analytics Engineer&lt;/strong&gt;.&lt;/p&gt;

&lt;h3&gt;
  
  
  What Does an Analytics Engineer Do?
&lt;/h3&gt;

&lt;p&gt;An Analytics Engineer sits between Data Engineering and Data Analytics:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────────┐     ┌──────────────────────┐     ┌─────────────────┐
│  Data Engineer  │ ──► │  Analytics Engineer  │ ──► │   Data Analyst  │
│                 │     │                      │     │                 │
│  • Pipelines    │     │  • Transform data    │     │  • Dashboards   │
│  • Infrastructure│    │  • Data modeling     │     │  • Reports      │
│  • Data movement│     │  • Quality tests     │     │  • Insights     │
└─────────────────┘     │  • Documentation     │     └─────────────────┘
                        └──────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Key responsibilities:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;📊 Transform raw data into clean, modeled datasets&lt;/li&gt;
&lt;li&gt;🧪 Write tests to ensure data quality&lt;/li&gt;
&lt;li&gt;📝 Document everything so others can understand&lt;/li&gt;
&lt;li&gt;🔗 Build the "T" in ELT (Extract, Load, Transform)&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  The Kitchen Analogy 🍳
&lt;/h3&gt;

&lt;p&gt;Think of a data warehouse like a restaurant:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Restaurant&lt;/th&gt;
&lt;th&gt;Data Warehouse&lt;/th&gt;
&lt;th&gt;Who accesses it&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;
&lt;strong&gt;Pantry&lt;/strong&gt; (raw ingredients)&lt;/td&gt;
&lt;td&gt;Staging area (raw data)&lt;/td&gt;
&lt;td&gt;Data Engineers&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;
&lt;strong&gt;Kitchen&lt;/strong&gt; (cooking happens)&lt;/td&gt;
&lt;td&gt;Processing area (transformations)&lt;/td&gt;
&lt;td&gt;Analytics Engineers&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;
&lt;strong&gt;Dining Hall&lt;/strong&gt; (served dishes)&lt;/td&gt;
&lt;td&gt;Presentation area (final tables)&lt;/td&gt;
&lt;td&gt;Business users, Analysts&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;Raw ingredients (data) come in, get processed (transformed), and are served as polished dishes (analytics-ready tables).&lt;/p&gt;




&lt;h2&gt;
  
  
  What is dbt? 🛠️
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;dbt&lt;/strong&gt; stands for &lt;strong&gt;data build tool&lt;/strong&gt;. It's the most popular tool for analytics engineering.&lt;/p&gt;

&lt;h3&gt;
  
  
  The Problems dbt Solves
&lt;/h3&gt;

&lt;p&gt;Before dbt, data transformation was messy:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;❌ SQL scripts scattered everywhere with no organization&lt;/li&gt;
&lt;li&gt;❌ No version control (changes got lost)&lt;/li&gt;
&lt;li&gt;❌ No testing (errors discovered too late)&lt;/li&gt;
&lt;li&gt;❌ No documentation (nobody knew what anything meant)&lt;/li&gt;
&lt;li&gt;❌ No environments (changes went straight to production!)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;dbt brings software engineering best practices to analytics:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;✅ &lt;strong&gt;Version control&lt;/strong&gt; - Your SQL lives in Git&lt;/li&gt;
&lt;li&gt;✅ &lt;strong&gt;Modularity&lt;/strong&gt; - Reusable pieces instead of copy-paste&lt;/li&gt;
&lt;li&gt;✅ &lt;strong&gt;Testing&lt;/strong&gt; - Automated data quality checks&lt;/li&gt;
&lt;li&gt;✅ &lt;strong&gt;Documentation&lt;/strong&gt; - Generated from your code&lt;/li&gt;
&lt;li&gt;✅ &lt;strong&gt;Environments&lt;/strong&gt; - Separate dev and prod&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  How dbt Works
&lt;/h3&gt;

&lt;p&gt;dbt follows a simple principle: &lt;strong&gt;write SQL, dbt handles the rest&lt;/strong&gt;.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────────────────────────────────────────────────────┐
│                     Your dbt Project                        │
│                                                             │
│   ┌───────────────┐    ┌───────────────┐    ┌────────────┐ │
│   │  models/*.sql │───►│   dbt compile │───►│ SQL Queries│ │
│   │  (your logic) │    │   dbt run     │    │ (executed) │ │
│   └───────────────┘    └───────────────┘    └────────────┘ │
│                              │                              │
│                              ▼                              │
│                    ┌──────────────────┐                     │
│                    │  Data Warehouse  │                     │
│                    │  (views/tables)  │                     │
│                    └──────────────────┘                     │
└─────────────────────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol&gt;
&lt;li&gt;You write SQL files (called "models")&lt;/li&gt;
&lt;li&gt;dbt compiles them (adds warehouse-specific syntax)&lt;/li&gt;
&lt;li&gt;dbt runs them against your data warehouse&lt;/li&gt;
&lt;li&gt;Views/tables are created automatically!&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  dbt Core vs dbt Cloud
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Feature&lt;/th&gt;
&lt;th&gt;dbt Core&lt;/th&gt;
&lt;th&gt;dbt Cloud&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Cost&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Free (open source)&lt;/td&gt;
&lt;td&gt;Free tier + paid plans&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Where it runs&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Your machine/server&lt;/td&gt;
&lt;td&gt;Cloud-hosted&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Setup&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Manual installation&lt;/td&gt;
&lt;td&gt;Browser-based IDE&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Scheduling&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Need external tool&lt;/td&gt;
&lt;td&gt;Built-in scheduler&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Best for&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Local development, cost savings&lt;/td&gt;
&lt;td&gt;Teams, ease of use&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;💡 &lt;strong&gt;For this course:&lt;/strong&gt; You can use either! Local setup uses DuckDB + dbt Core (free). Cloud setup uses BigQuery + dbt Cloud.&lt;/p&gt;




</description>
      <category>analytics</category>
      <category>dataengineering</category>
      <category>sql</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Part 3: Partitioning &amp; Clustering for Performance 🚀</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 09 Feb 2026 23:09:31 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/part-3-partitioning-clustering-for-performance-37if</link>
      <guid>https://dev.to/abdelrahman_adnan/part-3-partitioning-clustering-for-performance-37if</guid>
      <description>&lt;p&gt;This is where BigQuery optimization gets really powerful! These two techniques can reduce your query costs by 90% or more.&lt;/p&gt;

&lt;h3&gt;
  
  
  The Problem We're Solving 🤔
&lt;/h3&gt;

&lt;p&gt;Imagine you have a table with 5 years of taxi trip data - about 500 million rows. Every time you query:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;taxi_trips&lt;/span&gt; &lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="n"&gt;pickup_date&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'2024-01-15'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Without optimization, BigQuery scans ALL 500 million rows just to find trips from one day. That's:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Slow (lots of data to read)&lt;/li&gt;
&lt;li&gt;Expensive (you pay for all data scanned)&lt;/li&gt;
&lt;li&gt;Wasteful (you only needed 0.05% of the data!)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Partitioning and clustering solve this problem!&lt;/strong&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Partitioning: Dividing Your Table into Sections 📁
&lt;/h3&gt;

&lt;p&gt;Think of partitioning like organizing a filing cabinet. Instead of one giant drawer with all documents, you have:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Drawer for January&lt;/li&gt;
&lt;li&gt;Drawer for February&lt;/li&gt;
&lt;li&gt;Drawer for March&lt;/li&gt;
&lt;li&gt;...and so on&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;When you need something from March, you ONLY open the March drawer!&lt;/p&gt;

&lt;h4&gt;
  
  
  How Partitioning Works in BigQuery
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Create a table partitioned by date&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_partitioned`&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Now your table looks like this internally:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;taxi_partitioned/
├── 2024-01-01/    (all trips from Jan 1)
├── 2024-01-02/    (all trips from Jan 2)
├── 2024-01-03/    (all trips from Jan 3)
│   ... 
├── 2024-06-30/    (all trips from Jun 30)
└── [metadata]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;When you query with a date filter:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;taxi_partitioned&lt;/span&gt; 
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'2024-03-15'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;BigQuery ONLY reads the 2024-03-15 partition! The other 180+ partitions are never touched.&lt;/p&gt;

&lt;h4&gt;
  
  
  Types of Partitioning
&lt;/h4&gt;

&lt;p&gt;&lt;strong&gt;1. Time-based partitioning (most common)&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Partition by day (default)&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;-- Partition by month (for less granular data)&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;DATE_TRUNC&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="k"&gt;MONTH&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;-- Partition by year&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;DATE_TRUNC&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nb"&gt;YEAR&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;2. Integer range partitioning&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Partition by customer ID ranges&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;RANGE_BUCKET&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;customer_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;GENERATE_ARRAY&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;1000000&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;10000&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;3. Ingestion time partitioning&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Partition by when data was loaded&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;_PARTITIONDATE&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Partitioning Rules to Remember ⚠️
&lt;/h4&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Rule&lt;/th&gt;
&lt;th&gt;Details&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Max partitions&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;4,000 per table&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Min partition size&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Aim for at least 1GB per partition&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;One column only&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Can only partition on ONE column&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Column types&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;DATE, TIMESTAMP, DATETIME, or INTEGER&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;💡 &lt;strong&gt;When NOT to use partitioning:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;If you'd have &amp;lt; 1GB per partition (use clustering instead)&lt;/li&gt;
&lt;li&gt;If you'd exceed 4,000 partitions&lt;/li&gt;
&lt;li&gt;If you rarely filter on the partition column&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Clustering: Organizing Data Within Partitions 🗂️
&lt;/h3&gt;

&lt;p&gt;If partitioning is like having separate drawers in a filing cabinet, clustering is like organizing the folders WITHIN each drawer alphabetically.&lt;/p&gt;

&lt;h4&gt;
  
  
  How Clustering Works
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Create table with partitioning AND clustering&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_optimized`&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;CLUSTER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;payment_type&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Now within each date partition, data is sorted by vendor_id, then by payment_type:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;taxi_optimized/
├── 2024-01-15/
│   ├── vendor_id=1, payment_type=1, ...
│   ├── vendor_id=1, payment_type=2, ...
│   ├── vendor_id=2, payment_type=1, ...
│   └── vendor_id=2, payment_type=2, ...
├── 2024-01-16/
│   └── (similarly organized)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;When you query:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;taxi_optimized&lt;/span&gt; 
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'2024-01-15'&lt;/span&gt;
  &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;BigQuery:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Goes directly to the 2024-01-15 partition (thanks to partitioning)&lt;/li&gt;
&lt;li&gt;Reads only the vendor_id=1 blocks (thanks to clustering)&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;Even more data skipped = even faster and cheaper!&lt;/strong&gt;&lt;/p&gt;

&lt;h4&gt;
  
  
  Clustering Rules 📏
&lt;/h4&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Rule&lt;/th&gt;
&lt;th&gt;Details&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Max columns&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Up to 4 clustering columns&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Order matters&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Put most filtered column first&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;No cost for re-clustering&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;BigQuery automatically re-clusters as data is added&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Works with partitioning&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Best used together!&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Minimum table size&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Most effective for tables &amp;gt; 1GB&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Good clustering column candidates:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Columns you frequently filter on (WHERE clause)&lt;/li&gt;
&lt;li&gt;Columns you frequently group by (GROUP BY clause)&lt;/li&gt;
&lt;li&gt;High-cardinality columns (many distinct values)&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Partitioning vs Clustering: When to Use What? 🤷
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Scenario&lt;/th&gt;
&lt;th&gt;Recommendation&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Always filter by date&lt;/td&gt;
&lt;td&gt;Partition by date&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Filter by date AND other columns&lt;/td&gt;
&lt;td&gt;Partition by date, cluster by other columns&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Filter by multiple non-date columns&lt;/td&gt;
&lt;td&gt;Cluster by those columns&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Need to know query cost upfront&lt;/td&gt;
&lt;td&gt;Must use partitioning (clustering doesn't show estimates)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Less than 1GB per potential partition&lt;/td&gt;
&lt;td&gt;Use clustering instead&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Would have &amp;gt; 4,000 partitions&lt;/td&gt;
&lt;td&gt;Use clustering instead&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Data is rarely filtered&lt;/td&gt;
&lt;td&gt;Maybe neither - analyze your query patterns first&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Real-World Performance Comparison 📊
&lt;/h3&gt;

&lt;p&gt;I ran tests on the NYC taxi dataset (about 20 million rows). Here are the results:&lt;/p&gt;

&lt;h4&gt;
  
  
  Test 1: Filtering by Date Range
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;DISTINCT&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt; 
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="k"&gt;table&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; 
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;BETWEEN&lt;/span&gt; &lt;span class="s1"&gt;'2024-03-01'&lt;/span&gt; &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="s1"&gt;'2024-03-15'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Table Type&lt;/th&gt;
&lt;th&gt;Data Scanned&lt;/th&gt;
&lt;th&gt;Cost&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Non-partitioned&lt;/td&gt;
&lt;td&gt;310 MB&lt;/td&gt;
&lt;td&gt;$0.00155&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Partitioned by date&lt;/td&gt;
&lt;td&gt;27 MB&lt;/td&gt;
&lt;td&gt;$0.000135&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Savings&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;91% less!&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;91% cheaper!&lt;/strong&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h4&gt;
  
  
  Test 2: Filtering by Date AND Vendor
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; 
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="k"&gt;table&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; 
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;BETWEEN&lt;/span&gt; &lt;span class="s1"&gt;'2024-06-01'&lt;/span&gt; &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="s1"&gt;'2024-06-30'&lt;/span&gt;
  &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Table Type&lt;/th&gt;
&lt;th&gt;Data Scanned&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Partitioned only&lt;/td&gt;
&lt;td&gt;1.1 GB&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Partitioned + Clustered&lt;/td&gt;
&lt;td&gt;865 MB&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Additional savings&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;strong&gt;21% less!&lt;/strong&gt;&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Combined savings: Over 90% reduction in costs!&lt;/strong&gt; 💰&lt;/p&gt;

&lt;h3&gt;
  
  
  Step-by-Step: Creating an Optimized Table
&lt;/h3&gt;

&lt;p&gt;Here's the full workflow:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Step 1: Create external table pointing to your data in GCS&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;EXTERNAL&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`my-project.dataset.taxi_external`&lt;/span&gt;
&lt;span class="k"&gt;OPTIONS&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
  &lt;span class="n"&gt;format&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'PARQUET'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="n"&gt;uris&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="s1"&gt;'gs://my-bucket/yellow_taxi_2024/*.parquet'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="p"&gt;);&lt;/span&gt;

&lt;span class="c1"&gt;-- Step 2: Check how many records we have&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`my-project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="c1"&gt;-- Result: 20,332,093 records&lt;/span&gt;

&lt;span class="c1"&gt;-- Step 3: Create optimized table with partitioning and clustering&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`my-project.dataset.taxi_optimized`&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;tpep_dropoff_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;CLUSTER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;VendorID&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`my-project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Step 4: Verify partitions were created&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
  &lt;span class="k"&gt;table_name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
  &lt;span class="n"&gt;partition_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
  &lt;span class="n"&gt;total_rows&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`dataset.INFORMATION_SCHEMA.PARTITIONS`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="k"&gt;table_name&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'taxi_optimized'&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;partition_id&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Best Practices Cheat Sheet ✅
&lt;/h3&gt;

&lt;h4&gt;
  
  
  For Reducing Costs 💵
&lt;/h4&gt;

&lt;ul&gt;
&lt;li&gt;❌ Never use &lt;code&gt;SELECT *&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;✅ Only query columns you need&lt;/li&gt;
&lt;li&gt;✅ Use partitioned tables&lt;/li&gt;
&lt;li&gt;✅ Add clustering for frequently filtered columns&lt;/li&gt;
&lt;li&gt;✅ Check estimated bytes before running&lt;/li&gt;
&lt;li&gt;✅ Use table previews in the console instead of running &lt;code&gt;SELECT *&lt;/code&gt; for quick looks&lt;/li&gt;
&lt;/ul&gt;

&lt;h4&gt;
  
  
  For Better Performance ⚡
&lt;/h4&gt;

&lt;ul&gt;
&lt;li&gt;✅ Filter early - apply WHERE before JOINs&lt;/li&gt;
&lt;li&gt;✅ Put largest table first in JOINs&lt;/li&gt;
&lt;li&gt;✅ Use ORDER BY at the end of query&lt;/li&gt;
&lt;li&gt;✅ Consider approximate functions (APPROX_COUNT_DISTINCT) when exact precision isn't needed&lt;/li&gt;
&lt;li&gt;✅ Avoid JavaScript UDFs when possible&lt;/li&gt;
&lt;li&gt;✅ Don't over-partition (keep partitions &amp;gt; 1GB)&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Quick Reference: Common SQL Patterns for Beginners 📝
&lt;/h2&gt;

&lt;p&gt;Here are the most useful BigQuery SQL commands you'll need:&lt;/p&gt;

&lt;h3&gt;
  
  
  Basic Queries
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Count all records in a table&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.table`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Count distinct values in a column&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="k"&gt;DISTINCT&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Get first 10 rows (but remember - this still scans the whole table!)&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt; &lt;span class="k"&gt;LIMIT&lt;/span&gt; &lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Better way to preview - use table preview in BigQuery console instead!&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Filtering Data
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Filter by exact value&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Filter by date range&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;BETWEEN&lt;/span&gt; &lt;span class="s1"&gt;'2024-01-01'&lt;/span&gt; &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="s1"&gt;'2024-01-31'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Filter with multiple conditions&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt; 
  &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="n"&gt;fare_amount&lt;/span&gt; &lt;span class="o"&gt;&amp;gt;&lt;/span&gt; &lt;span class="mi"&gt;10&lt;/span&gt;
  &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'2024-03-15'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Aggregations
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Sum, average, min, max&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
  &lt;span class="k"&gt;SUM&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;total_fares&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;AVG&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;avg_fare&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;MIN&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;min_fare&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;MAX&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;max_fare&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;trip_count&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Group by&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
  &lt;span class="n"&gt;vendor_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;trips&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="k"&gt;AVG&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;fare_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;avg_fare&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi`&lt;/span&gt;
&lt;span class="k"&gt;GROUP&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Creating Tables
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Create external table from GCS&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;EXTERNAL&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_external`&lt;/span&gt;
&lt;span class="k"&gt;OPTIONS&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
  &lt;span class="n"&gt;format&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'PARQUET'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="n"&gt;uris&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="s1"&gt;'gs://bucket-name/folder/*.parquet'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="p"&gt;);&lt;/span&gt;

&lt;span class="c1"&gt;-- Create native table from external&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_native`&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Create partitioned + clustered table&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_optimized`&lt;/span&gt;
&lt;span class="k"&gt;PARTITION&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;pickup_datetime&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;CLUSTER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;vendor_id&lt;/span&gt;
&lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`project.dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Checking Table Info
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- View partition information&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
  &lt;span class="k"&gt;table_name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
  &lt;span class="n"&gt;partition_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
  &lt;span class="n"&gt;total_rows&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`dataset.INFORMATION_SCHEMA.PARTITIONS`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="k"&gt;table_name&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'your_table_name'&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;partition_id&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Check table schema&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;column_name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;data_type&lt;/span&gt; 
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`dataset.INFORMATION_SCHEMA.COLUMNS`&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="k"&gt;table_name&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'your_table_name'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Glossary for Beginners 📚
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Term&lt;/th&gt;
&lt;th&gt;Simple Explanation&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Warehouse&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;A big database designed for analyzing historical data, not for running apps&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;OLTP&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Databases for running applications (fast, small transactions)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;OLAP&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Databases for analysis (complex queries, lots of data)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;BigQuery&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Google's cloud data warehouse service&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;GCS&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Google Cloud Storage - where you store files in the cloud&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;External Table&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;A table that reads data from GCS without copying it&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Native Table&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;A table with data stored in BigQuery itself&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Partitioning&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Splitting a table into smaller pieces by date or number&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Clustering&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Sorting data within partitions by specific columns&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Columnar Storage&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Storing data by column instead of row (faster for analytics)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Slot&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;A unit of compute power in BigQuery&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Scanned&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;How much data BigQuery reads to answer your query (you pay for this!)&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;




&lt;h2&gt;
  
  
  Common Mistakes to Avoid ⚠️
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Using &lt;code&gt;SELECT *&lt;/code&gt; everywhere&lt;/strong&gt; - Always specify columns you need!&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Thinking LIMIT reduces cost&lt;/strong&gt; - It doesn't! BigQuery scans first, limits after.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Not using partitions&lt;/strong&gt; - Always partition time-series data by date.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Wrong partition column&lt;/strong&gt; - Partition by columns you ALWAYS filter on.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Too many partitions&lt;/strong&gt; - Keep it under 4,000, aim for &amp;gt;1GB per partition.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Ignoring the query validator&lt;/strong&gt; - Always check estimated bytes before running!&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Not using clustering with partitioning&lt;/strong&gt; - They work best together!&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  Resources for Learning More 📖
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;📊 &lt;a href="https://cloud.google.com/bigquery/docs" rel="noopener noreferrer"&gt;BigQuery Official Documentation&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;🎥 &lt;a href="https://youtu.be/jrHljAoD6nM" rel="noopener noreferrer"&gt;DE Zoomcamp Video: Data Warehouse and BigQuery&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;🎥 &lt;a href="https://youtu.be/-CqXf7vhhDs" rel="noopener noreferrer"&gt;DE Zoomcamp Video: Partitioning vs Clustering&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;🎥 &lt;a href="https://youtu.be/k81mLJVX08w" rel="noopener noreferrer"&gt;DE Zoomcamp Video: Best Practices&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;🎥 &lt;a href="https://youtu.be/eduHi1inM4s" rel="noopener noreferrer"&gt;DE Zoomcamp Video: Internals of BigQuery&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;📝 &lt;a href="https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/03-data-warehouse/big_query.sql" rel="noopener noreferrer"&gt;Course SQL Examples&lt;/a&gt;
&lt;/li&gt;
&lt;li&gt;📑 &lt;a href="https://docs.google.com/presentation/d/1a3ZoBAXFk8-EhUsd7rAZd-5p_HpltkzSeujjRGB2TAI/edit" rel="noopener noreferrer"&gt;Course Slides&lt;/a&gt;
&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Summary: Key Takeaways 🎯
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Data warehouses&lt;/strong&gt; are for analysis, not running apps - that's why they exist!&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;BigQuery&lt;/strong&gt; is serverless - no servers to manage, just write SQL.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Columnar storage&lt;/strong&gt; = only reads columns you request = faster + cheaper.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;External tables&lt;/strong&gt; = data in GCS, slower but flexible.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Native tables&lt;/strong&gt; = data in BigQuery, faster but costs more storage.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Partitioning&lt;/strong&gt; = split table by date, only scan relevant dates.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Clustering&lt;/strong&gt; = sort data within partitions, skip irrelevant blocks.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Always check estimated bytes&lt;/strong&gt; before running queries!&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Never use &lt;code&gt;SELECT *&lt;/code&gt;&lt;/strong&gt; - specify only the columns you need.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Combine partitioning + clustering&lt;/strong&gt; for maximum optimization!&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;




&lt;h1&gt;
  
  
  DataEngineeringZoomcamp #BigQuery #DataWarehouse #GCP #SQL #CloudComputing
&lt;/h1&gt;

</description>
      <category>dataengineering</category>
      <category>googlecloud</category>
      <category>performance</category>
      <category>sql</category>
    </item>
    <item>
      <title>Part 2: BigQuery Deep Dive 🔍</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 09 Feb 2026 23:08:43 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/part-2-bigquery-deep-dive-3m29</link>
      <guid>https://dev.to/abdelrahman_adnan/part-2-bigquery-deep-dive-3m29</guid>
      <description>&lt;h3&gt;
  
  
  What is BigQuery?
&lt;/h3&gt;

&lt;p&gt;BigQuery is Google's &lt;strong&gt;data warehouse in the cloud&lt;/strong&gt;. It's one of the most popular choices for storing and analyzing large amounts of data because it's:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Serverless&lt;/strong&gt; - You don't manage any servers. No installing software, no worrying about disk space, no maintenance. Google handles everything.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Fully managed&lt;/strong&gt; - Google takes care of security, backups, scaling, and updates.&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Petabyte-scale&lt;/strong&gt; - Can handle absolutely massive datasets (1 petabyte = 1,000 terabytes = 1,000,000 gigabytes!)&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;SQL-based&lt;/strong&gt; - You just write SQL queries. No need to learn a new programming language!&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  Why BigQuery is Great for Beginners 🌟
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;☁️ &lt;strong&gt;No setup headaches&lt;/strong&gt; - Create a project, load data, start querying. That's it!&lt;/li&gt;
&lt;li&gt;💰 &lt;strong&gt;Free tier&lt;/strong&gt; - 1TB of queries and 10GB storage free per month&lt;/li&gt;
&lt;li&gt;📊 &lt;strong&gt;Familiar SQL&lt;/strong&gt; - If you know basic SQL, you can use BigQuery&lt;/li&gt;
&lt;li&gt;🔗 &lt;strong&gt;Works with everything&lt;/strong&gt; - Google Sheets, Data Studio, Python, R, etc.&lt;/li&gt;
&lt;li&gt;🤖 &lt;strong&gt;Built-in ML&lt;/strong&gt; - Train machine learning models using just SQL!&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  How BigQuery Works Under the Hood 🔧
&lt;/h3&gt;

&lt;p&gt;Understanding the architecture helps you write better queries and save money. Don't worry, I'll keep it simple!&lt;/p&gt;

&lt;h4&gt;
  
  
  The Secret: Separation of Storage and Compute
&lt;/h4&gt;

&lt;p&gt;Traditional databases store data and process queries on the same machine. BigQuery does something clever - it separates them:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────────────────────────────────────────────────┐
│                    YOUR SQL QUERY                        │
└─────────────────────────┬───────────────────────────────┘
                          │
                          ▼
┌─────────────────────────────────────────────────────────┐
│                DREMEL (Compute Engine)                   │
│                                                         │
│   Your query gets broken into tiny pieces and           │
│   thousands of workers process them in parallel         │
└─────────────────────────┬───────────────────────────────┘
                          │
                          │  Jupiter Network (super fast!)
                          │  1 Terabyte per second
                          │
                          ▼
┌─────────────────────────────────────────────────────────┐
│                 COLOSSUS (Storage)                       │
│                                                         │
│   Your data lives here in COLUMNAR format               │
│   (organized by columns, not rows)                      │
└─────────────────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  What Does "Columnar Storage" Mean? 📋
&lt;/h4&gt;

&lt;p&gt;This is SUPER important for understanding BigQuery performance!&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Traditional databases (row-oriented):&lt;/strong&gt;&lt;br&gt;
Stores data like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Row 1: [John, 25, New York, $50000]
Row 2: [Jane, 30, Chicago, $60000]
Row 3: [Bob, 35, Miami, $55000]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;To find all salaries, it reads EVERY row, even though you only need one column.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;BigQuery (column-oriented):&lt;/strong&gt;&lt;br&gt;
Stores data like this:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;Names column:    [John, Jane, Bob]
Ages column:     [25, 30, 35]
Cities column:   [New York, Chicago, Miami]
Salaries column: [$50000, $60000, $55000]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;To find all salaries, it ONLY reads the salary column! Much faster and cheaper!&lt;/p&gt;

&lt;p&gt;💡 &lt;strong&gt;This is why &lt;code&gt;SELECT *&lt;/code&gt; is expensive in BigQuery&lt;/strong&gt; - it has to read EVERY column. Always specify only the columns you need!&lt;/p&gt;

&lt;h4&gt;
  
  
  The Dremel Execution Engine 🚀
&lt;/h4&gt;

&lt;p&gt;When you run a query, here's what happens:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Root Server&lt;/strong&gt; receives your query&lt;/li&gt;
&lt;li&gt;Query is broken into smaller pieces&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Mixers&lt;/strong&gt; distribute work to thousands of &lt;strong&gt;Leaf Nodes&lt;/strong&gt;
&lt;/li&gt;
&lt;li&gt;Each Leaf Node processes a small chunk of data in parallel&lt;/li&gt;
&lt;li&gt;Results flow back up through Mixers to Root&lt;/li&gt;
&lt;li&gt;You get your final result!
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;                    ┌──────────┐
                    │   ROOT   │  ← Your query comes here
                    └────┬─────┘
                         │
           ┌─────────────┼─────────────┐
           ▼             ▼             ▼
      ┌────────┐    ┌────────┐    ┌────────┐
      │ MIXER  │    │ MIXER  │    │ MIXER  │
      └───┬────┘    └───┬────┘    └───┬────┘
          │             │             │
    ┌─────┼─────┐ ┌─────┼─────┐ ┌─────┼─────┐
    ▼     ▼     ▼ ▼     ▼     ▼ ▼     ▼     ▼
   [L]   [L]   [L][L]   [L]   [L][L]   [L]   [L]

   L = Leaf nodes (thousands of them!)
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Why this matters:&lt;/strong&gt; A query that would take hours on your laptop can run in seconds because thousands of machines work on it simultaneously!&lt;/p&gt;

&lt;h3&gt;
  
  
  External Tables vs Native Tables 📦
&lt;/h3&gt;

&lt;p&gt;You have two ways to work with data in BigQuery:&lt;/p&gt;

&lt;h4&gt;
  
  
  Option 1: External Tables (Data stays in GCS)
&lt;/h4&gt;

&lt;p&gt;Your data remains in Google Cloud Storage; BigQuery just reads it when you query.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Create external table pointing to files in GCS bucket&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;EXTERNAL&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`my-project.my_dataset.taxi_external`&lt;/span&gt;
&lt;span class="k"&gt;OPTIONS&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
  &lt;span class="n"&gt;format&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'PARQUET'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
  &lt;span class="n"&gt;uris&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="s1"&gt;'gs://my-bucket/taxi_data/*.parquet'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="p"&gt;);&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;When to use External Tables:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;✅ You want to save on storage costs (GCS is cheaper than BigQuery storage)&lt;/li&gt;
&lt;li&gt;✅ One-time or occasional analysis&lt;/li&gt;
&lt;li&gt;✅ Data is updated frequently in source system&lt;/li&gt;
&lt;li&gt;✅ Quick exploration before committing to load&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Downsides:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;❌ Slower queries (data needs to be read from GCS each time)&lt;/li&gt;
&lt;li&gt;❌ No cost estimation before running queries&lt;/li&gt;
&lt;li&gt;❌ Can't partition or cluster (limited optimization)&lt;/li&gt;
&lt;/ul&gt;

&lt;h4&gt;
  
  
  Option 2: Native Tables (Data loaded into BigQuery)
&lt;/h4&gt;

&lt;p&gt;Data is copied into BigQuery's own storage (Colossus).&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Create native table from external table&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;OR&lt;/span&gt; &lt;span class="k"&gt;REPLACE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="nv"&gt;`my-project.my_dataset.taxi_native`&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="nv"&gt;`my-project.my_dataset.taxi_external`&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;When to use Native Tables:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;✅ Frequently queried data&lt;/li&gt;
&lt;li&gt;✅ Need best query performance&lt;/li&gt;
&lt;li&gt;✅ Want to use partitioning and clustering&lt;/li&gt;
&lt;li&gt;✅ Need accurate cost estimates before running queries&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Downsides:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;❌ Higher storage costs&lt;/li&gt;
&lt;li&gt;❌ Data duplication (exists in both GCS and BigQuery)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;💡 &lt;strong&gt;Pro tip:&lt;/strong&gt; Start with external tables for exploration, then load into native tables once you know what data you actually need!&lt;/p&gt;

&lt;h3&gt;
  
  
  Understanding BigQuery Costs 💰
&lt;/h3&gt;

&lt;p&gt;BigQuery has two main pricing models:&lt;/p&gt;

&lt;h4&gt;
  
  
  On-Demand Pricing (Pay per query)
&lt;/h4&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;$5 per TB&lt;/strong&gt; of data scanned&lt;/li&gt;
&lt;li&gt;Good for: Occasional users, unpredictable workloads&lt;/li&gt;
&lt;li&gt;You pay for how much data your queries read&lt;/li&gt;
&lt;/ul&gt;

&lt;h4&gt;
  
  
  Flat-Rate Pricing (Monthly commitment)
&lt;/h4&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;~$2,000/month&lt;/strong&gt; for 100 "slots" (compute units)&lt;/li&gt;
&lt;li&gt;Good for: Heavy users, predictable workloads&lt;/li&gt;
&lt;li&gt;Unlimited queries within your slot capacity&lt;/li&gt;
&lt;/ul&gt;

&lt;h4&gt;
  
  
  How to Estimate Query Cost 🧮
&lt;/h4&gt;

&lt;p&gt;Before running a query, BigQuery shows you how much data it will scan:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌────────────────────────────────────────────────┐
│  Query Editor                                  │
│  ─────────────────────────────────────────────│
│  SELECT * FROM my_table WHERE date = '2024-01'│
│                                                │
│  [This query will process 2.5 GB when run]    │ ← Check this!
└────────────────────────────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Cost calculation:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;2.5 GB = 0.0025 TB&lt;/li&gt;
&lt;li&gt;0.0025 TB × $5 = $0.0125 (about 1 cent)&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;But if you run that query 100 times a day... costs add up!&lt;/p&gt;

&lt;h3&gt;
  
  
  Cost Optimization Tips 💡
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;NEVER use &lt;code&gt;SELECT *&lt;/code&gt;&lt;/strong&gt; unless you absolutely need every column
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;   &lt;span class="c1"&gt;-- ❌ Bad - reads ALL columns&lt;/span&gt;
   &lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;taxi_data&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

   &lt;span class="c1"&gt;-- ✅ Good - reads only what you need&lt;/span&gt;
   &lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;pickup_time&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;dropoff_time&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;fare_amount&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;taxi_data&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol start="2"&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Use partitioned tables&lt;/strong&gt; (covered in Part 3)&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Preview before running&lt;/strong&gt; - Always check the estimated bytes&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Use LIMIT wisely&lt;/strong&gt; - It doesn't reduce data scanned! The filtering happens AFTER reading.&lt;br&gt;
&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;   &lt;span class="c1"&gt;-- ❌ Still scans the whole table!&lt;/span&gt;
   &lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;huge_table&lt;/span&gt; &lt;span class="k"&gt;LIMIT&lt;/span&gt; &lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

   &lt;span class="c1"&gt;-- ✅ Better - add a WHERE clause first&lt;/span&gt;
   &lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;huge_table&lt;/span&gt; &lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nb"&gt;date&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="k"&gt;CURRENT_DATE&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; &lt;span class="k"&gt;LIMIT&lt;/span&gt; &lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol start="5"&gt;
&lt;li&gt;
&lt;strong&gt;Cache results&lt;/strong&gt; - BigQuery caches query results for 24 hours (free!)&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  BigQuery Caching 🗄️
&lt;/h3&gt;

&lt;p&gt;When you run the same query twice:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;First run: Scans data, costs money&lt;/li&gt;
&lt;li&gt;Second run: Returns cached result, FREE!&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Cache is invalidated when:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Underlying table data changes&lt;/li&gt;
&lt;li&gt;24 hours pass&lt;/li&gt;
&lt;li&gt;You disable caching in query settings&lt;/li&gt;
&lt;/ul&gt;




&lt;h1&gt;
  
  
  DataEngineeringZoomcamp #BigQuery #DataWarehouse #GCP #SQL #CloudComputing
&lt;/h1&gt;

</description>
      <category>beginners</category>
      <category>database</category>
      <category>googlecloud</category>
      <category>sql</category>
    </item>
    <item>
      <title>Module 3 Summary - Data Warehousing &amp; BigQuery</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Mon, 09 Feb 2026 23:07:36 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/-module-3-summary-data-warehousing-bigquery-2m26</link>
      <guid>https://dev.to/abdelrahman_adnan/-module-3-summary-data-warehousing-bigquery-2m26</guid>
      <description>&lt;h1&gt;
  
  
  DataEngineeringZoomcamp #BigQuery #DataWarehouse #GCP
&lt;/h1&gt;




&lt;h2&gt;
  
  
  Part 1: Understanding Data Warehouses &amp;amp; OLAP vs OLTP 🏢
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Why Do We Need Data Warehouses? 🤔
&lt;/h3&gt;

&lt;p&gt;Imagine you run an online store. Your website has a database that handles:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Customer sign-ups&lt;/li&gt;
&lt;li&gt;Product orders&lt;/li&gt;
&lt;li&gt;Payment processing&lt;/li&gt;
&lt;li&gt;Inventory updates&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This database needs to be FAST because customers are waiting. Every millisecond counts!&lt;/p&gt;

&lt;p&gt;Now, your boss asks: "What were our top-selling products last year by region, and how did that compare to the year before?"&lt;/p&gt;

&lt;p&gt;Running that query on your production database would:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Slow down your website (bad for customers!)&lt;/li&gt;
&lt;li&gt;Take forever because the database isn't designed for such complex analysis&lt;/li&gt;
&lt;li&gt;Potentially crash things if the query is too heavy&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;This is exactly why data warehouses exist!&lt;/strong&gt; They're a separate place to store your data, specifically designed for answering complex analytical questions without affecting your live applications.&lt;/p&gt;

&lt;h3&gt;
  
  
  OLTP vs OLAP - The Two Worlds of Databases
&lt;/h3&gt;

&lt;p&gt;These acronyms sound scary, but they're simple concepts:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;OLTP = Online Transaction Processing&lt;/strong&gt; (Your everyday app databases)&lt;br&gt;
&lt;strong&gt;OLAP = Online Analytical Processing&lt;/strong&gt; (Data warehouses for analysis)&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Aspect&lt;/th&gt;
&lt;th&gt;OLTP (Transactional)&lt;/th&gt;
&lt;th&gt;OLAP (Analytical)&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;What it's for&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Running your app - orders, logins, updates&lt;/td&gt;
&lt;td&gt;Answering business questions - reports, dashboards&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Type of queries&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Simple: "Get user #123's info"&lt;/td&gt;
&lt;td&gt;Complex: "Show sales trends by region for 5 years"&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Speed&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Super fast for small operations&lt;/td&gt;
&lt;td&gt;Can take minutes for huge analyses&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data freshness&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Real-time, always up-to-date&lt;/td&gt;
&lt;td&gt;Usually updated daily/hourly (batch)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;How data is organized&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Normalized (split into many tables, no duplicates)&lt;/td&gt;
&lt;td&gt;Denormalized (fewer tables, some duplication OK)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data size&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Gigabytes (current data)&lt;/td&gt;
&lt;td&gt;Terabytes/Petabytes (years of history)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Who uses it&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Your application, customers&lt;/td&gt;
&lt;td&gt;Data analysts, managers, executives&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Examples&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;MySQL for your website, PostgreSQL for your app&lt;/td&gt;
&lt;td&gt;BigQuery, Snowflake, Amazon Redshift&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;
&lt;h4&gt;
  
  
  Real-World Example 🛒
&lt;/h4&gt;

&lt;p&gt;&lt;strong&gt;OLTP scenario (your app database):&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- A customer places an order - needs to be FAST&lt;/span&gt;
&lt;span class="k"&gt;INSERT&lt;/span&gt; &lt;span class="k"&gt;INTO&lt;/span&gt; &lt;span class="n"&gt;orders&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;customer_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;product_id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;quantity&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;price&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; 
&lt;span class="k"&gt;VALUES&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;123&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;456&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;29&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="mi"&gt;99&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;OLAP scenario (data warehouse):&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Your CEO wants to know Q4 performance - can take a minute, that's fine&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; 
    &lt;span class="n"&gt;region&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;product_category&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;SUM&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;revenue&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;total_revenue&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="k"&gt;DISTINCT&lt;/span&gt; &lt;span class="n"&gt;customer_id&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;unique_customers&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;sales_data&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="n"&gt;order_date&lt;/span&gt; &lt;span class="k"&gt;BETWEEN&lt;/span&gt; &lt;span class="s1"&gt;'2023-10-01'&lt;/span&gt; &lt;span class="k"&gt;AND&lt;/span&gt; &lt;span class="s1"&gt;'2023-12-31'&lt;/span&gt;
&lt;span class="k"&gt;GROUP&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;region&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;product_category&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;total_revenue&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;💡 &lt;strong&gt;Key insight:&lt;/strong&gt; OLTP is like a cashier at a store - fast, handles one customer at a time. OLAP is like the accounting department - takes time to analyze all the receipts and produce reports.&lt;/p&gt;

&lt;h3&gt;
  
  
  What Exactly is a Data Warehouse? 🏗️
&lt;/h3&gt;

&lt;p&gt;A data warehouse is a &lt;strong&gt;centralized repository&lt;/strong&gt; where you collect data from ALL your different systems and store it in a way that's optimized for analysis.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Think of it like this:&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;Imagine a company with multiple departments:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Sales team uses Salesforce&lt;/li&gt;
&lt;li&gt;Marketing uses HubSpot
&lt;/li&gt;
&lt;li&gt;Website runs on PostgreSQL&lt;/li&gt;
&lt;li&gt;Inventory managed in SAP&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Each system has its own database. But your CEO wants a report combining data from ALL of them. This is where a data warehouse comes in!&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;┌─────────────┐   ┌─────────────┐   ┌─────────────┐
│  Salesforce │   │   HubSpot   │   │  PostgreSQL │
│   (Sales)   │   │ (Marketing) │   │  (Website)  │
└──────┬──────┘   └──────┬──────┘   └──────┬──────┘
       │                 │                 │
       │    ETL/ELT      │                 │
       │   (Extract,     │                 │
       │   Transform,    │                 │
       │    Load)        │                 │
       ▼                 ▼                 ▼
┌──────────────────────────────────────────────────┐
│              DATA WAREHOUSE (BigQuery)            │
│                                                  │
│   All your data, cleaned, organized, ready       │
│   for analysis!                                  │
└──────────────────────────────────────────────────┘
                        │
                        ▼
         ┌──────────────────────────┐
         │  Reports, Dashboards,    │
         │  Machine Learning, etc.  │
         └──────────────────────────┘
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Key characteristics of a data warehouse:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;📊 &lt;strong&gt;Subject-oriented&lt;/strong&gt; - Organized by business topics (sales, customers, products)&lt;/li&gt;
&lt;li&gt;🔗 &lt;strong&gt;Integrated&lt;/strong&gt; - Data from multiple sources combined together&lt;/li&gt;
&lt;li&gt;📅 &lt;strong&gt;Time-variant&lt;/strong&gt; - Keeps historical data (years worth!)&lt;/li&gt;
&lt;li&gt;🔒 &lt;strong&gt;Non-volatile&lt;/strong&gt; - Data doesn't change once loaded (it's a historical record)&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Modern Cloud Data Warehouses ☁️
&lt;/h3&gt;

&lt;p&gt;Traditional data warehouses (like Oracle, Teradata) required:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Buying expensive hardware&lt;/li&gt;
&lt;li&gt;Hiring DBAs to manage servers&lt;/li&gt;
&lt;li&gt;Months of setup time&lt;/li&gt;
&lt;li&gt;Huge upfront costs&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Modern cloud data warehouses&lt;/strong&gt; (BigQuery, Snowflake, Redshift) changed everything:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;✅ No servers to manage (serverless)&lt;/li&gt;
&lt;li&gt;✅ Pay only for what you use&lt;/li&gt;
&lt;li&gt;✅ Scales automatically&lt;/li&gt;
&lt;li&gt;✅ Set up in minutes&lt;/li&gt;
&lt;li&gt;✅ Access from anywhere&lt;/li&gt;
&lt;/ul&gt;

</description>
      <category>analytics</category>
      <category>database</category>
      <category>dataengineering</category>
      <category>googlecloud</category>
    </item>
    <item>
      <title>Module 2 Summary - Workflow Orchestration with Kestra Part 3</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Tue, 03 Feb 2026 01:02:10 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-3-4nn8</link>
      <guid>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-3-4nn8</guid>
      <description>&lt;h2&gt;
  
  
  Part 3: AI Integration &amp;amp; Best Practices
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Using AI for Data Engineering
&lt;/h3&gt;

&lt;p&gt;AI tools help data engineers by:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Generating workflows faster&lt;/strong&gt; - Describe tasks in natural language&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Avoiding errors&lt;/strong&gt; - Get syntax-correct code following best practices&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Key Insight:&lt;/strong&gt; AI is only as good as the context you provide.&lt;/p&gt;

&lt;h3&gt;
  
  
  Context Engineering with LLMs
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;Problem:&lt;/strong&gt; Generic AI assistants (like ChatGPT without context) may produce:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Outdated plugin syntax&lt;/li&gt;
&lt;li&gt;Incorrect property names&lt;/li&gt;
&lt;li&gt;Hallucinated features that don't exist&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Why?&lt;/strong&gt; LLMs are trained on data up to a knowledge cutoff date and don't know about software updates.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Solution:&lt;/strong&gt; Provide proper context to AI!&lt;/p&gt;

&lt;h3&gt;
  
  
  Kestra AI Copilot
&lt;/h3&gt;

&lt;p&gt;Kestra's built-in AI Copilot is designed specifically for generating Kestra flows with:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Full context about latest plugins&lt;/li&gt;
&lt;li&gt;Correct workflow syntax&lt;/li&gt;
&lt;li&gt;Current best practices&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Setup Requirements:&lt;/strong&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Get Gemini API key from Google AI Studio&lt;/li&gt;
&lt;li&gt;Configure in docker-compose.yml with &lt;code&gt;GEMINI_API_KEY&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Access via sparkle icon (✨) in Kestra UI&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  Retrieval Augmented Generation (RAG)
&lt;/h3&gt;

&lt;p&gt;RAG is a technique that:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Retrieves&lt;/strong&gt; relevant information from data sources&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Augments&lt;/strong&gt; the AI prompt with this context&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Generates&lt;/strong&gt; responses grounded in real data&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;RAG Process in Kestra:&lt;/strong&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;Ingest documents (documentation, release notes)&lt;/li&gt;
&lt;li&gt;Create embeddings (vector representations)&lt;/li&gt;
&lt;li&gt;Store embeddings in KV Store or vector database&lt;/li&gt;
&lt;li&gt;Query with context at runtime&lt;/li&gt;
&lt;li&gt;Generate accurate, context-aware responses&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;RAG Best Practices:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Keep documents updated regularly&lt;/li&gt;
&lt;li&gt;Chunk large documents appropriately&lt;/li&gt;
&lt;li&gt;Test retrieval quality&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Deployment &amp;amp; Production
&lt;/h3&gt;

&lt;p&gt;For production deployment:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Deploy Kestra on Google Cloud&lt;/li&gt;
&lt;li&gt;Sync workflows from Git repository&lt;/li&gt;
&lt;li&gt;Use &lt;strong&gt;Secrets&lt;/strong&gt; and &lt;strong&gt;KV Store&lt;/strong&gt; for sensitive data&lt;/li&gt;
&lt;li&gt;Never commit API keys to Git&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Troubleshooting Tips
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Issue&lt;/th&gt;
&lt;th&gt;Solution&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;Port conflict with pgAdmin&lt;/td&gt;
&lt;td&gt;Change Kestra port to 18080&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;CSV column mismatch in BigQuery&lt;/td&gt;
&lt;td&gt;Rerun entire execution including re-download&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;Container issues&lt;/td&gt;
&lt;td&gt;Stop, remove, and restart containers&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Recommended Docker Images:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;kestra/kestra:v1.1&lt;/code&gt; (stable version)&lt;/li&gt;
&lt;li&gt;&lt;code&gt;postgres:18&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Additional Resources
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;&lt;a href="https://kestra.io/docs" rel="noopener noreferrer"&gt;Kestra Documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;
&lt;a href="https://kestra.io/blueprints" rel="noopener noreferrer"&gt;Blueprints Library&lt;/a&gt; - Pre-built workflow examples&lt;/li&gt;
&lt;li&gt;&lt;a href="https://kestra.io/plugins" rel="noopener noreferrer"&gt;600+ Plugins&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="http://kestra.io/slack" rel="noopener noreferrer"&gt;Kestra Slack Community&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Key Takeaways
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Workflow orchestration&lt;/strong&gt; is essential for managing complex data pipelines&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Kestra&lt;/strong&gt; provides a flexible, scalable solution with YAML-based flows&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;ETL&lt;/strong&gt; is ideal for local processing; &lt;strong&gt;ELT&lt;/strong&gt; leverages cloud computing power&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Scheduling and backfills&lt;/strong&gt; enable automated and historical data processing&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;AI Copilot&lt;/strong&gt; accelerates workflow development with proper context&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;RAG&lt;/strong&gt; eliminates AI hallucinations by grounding responses in real data
#dezoomcamp&lt;/li&gt;
&lt;/ol&gt;

</description>
      <category>ai</category>
      <category>automation</category>
      <category>dataengineering</category>
      <category>llm</category>
    </item>
    <item>
      <title>Module 2 Summary - Workflow Orchestration with Kestra Part 2</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Tue, 03 Feb 2026 01:01:18 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-2-3l07</link>
      <guid>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-2-3l07</guid>
      <description>&lt;h2&gt;
  
  
  Part 2: Building ETL &amp;amp; ELT Data Pipelines
&lt;/h2&gt;

&lt;h3&gt;
  
  
  ETL Pipeline (Local Postgres)
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;ETL = Extract → Transform → Load&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;The local pipeline workflow:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Extract&lt;/strong&gt; CSV data from GitHub (partitioned by year and month)&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Transform&lt;/strong&gt; data using Python&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Load&lt;/strong&gt; data into PostgreSQL database&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;Key steps in the flow:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Create tables&lt;/li&gt;
&lt;li&gt;Load data to monthly staging table&lt;/li&gt;
&lt;li&gt;Merge data to final destination table&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;&lt;strong&gt;Dataset Source:&lt;/strong&gt; NYC Taxi and Limousine Commission (TLC) Trip Record Data available in CSV format from the DataTalksClub GitHub repository.&lt;/p&gt;

&lt;h3&gt;
  
  
  Scheduling and Backfills
&lt;/h3&gt;

&lt;p&gt;Kestra provides powerful scheduling capabilities:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Schedule Trigger&lt;/strong&gt; - Run pipelines at specific times (e.g., daily at 9 AM UTC)&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Backfill&lt;/strong&gt; - Process historical data by running workflows for past dates&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Example: Backfill green taxi data for year 2019.&lt;/p&gt;

&lt;h3&gt;
  
  
  ELT Pipeline (Google Cloud Platform)
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;ELT = Extract → Load → Transform&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;When working with large datasets in the cloud, ELT is often preferred:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Step&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Extract&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Get dataset from source (GitHub)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Load&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Upload to data lake (Google Cloud Storage)&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Transform&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Create tables in data warehouse (BigQuery) using data from GCS&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Advantage:&lt;/strong&gt; Leverage cloud's performance for transforming large datasets much faster than local machines.&lt;/p&gt;

&lt;h3&gt;
  
  
  GCP Setup for Kestra
&lt;/h3&gt;

&lt;p&gt;Required KV Store values:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;GCP_PROJECT_ID&lt;/code&gt; - Your Google Cloud project&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;GCP_LOCATION&lt;/code&gt; - Region for resources&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;GCP_BUCKET_NAME&lt;/code&gt; - GCS bucket name&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;GCP_DATASET&lt;/code&gt; - BigQuery dataset name&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;GCP_CREDS&lt;/code&gt; - Service account credentials (keep secure!)&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  GCP Pipeline Flow
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;Extract CSV from GitHub&lt;/li&gt;
&lt;li&gt;Upload to Google Cloud Storage (data lake)&lt;/li&gt;
&lt;li&gt;Create external table in BigQuery from GCS&lt;/li&gt;
&lt;li&gt;Create partitioned table in BigQuery&lt;/li&gt;
&lt;li&gt;Schedule with timezone support (e.g., &lt;code&gt;America/New_York&lt;/code&gt;)&lt;/li&gt;
&lt;/ol&gt;




&lt;h1&gt;
  
  
  dezoomcamp
&lt;/h1&gt;

</description>
      <category>automation</category>
      <category>dataengineering</category>
      <category>postgres</category>
      <category>python</category>
    </item>
    <item>
      <title>Module 2 Summary - Workflow Orchestration with Kestra Part 1</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Tue, 03 Feb 2026 01:00:26 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-1-4ah0</link>
      <guid>https://dev.to/abdelrahman_adnan/module-2-summary-workflow-orchestration-with-kestra-part-1-4ah0</guid>
      <description>&lt;h2&gt;
  
  
  Part 1: Introduction to Workflow Orchestration &amp;amp; Kestra Fundamentals
&lt;/h2&gt;

&lt;h3&gt;
  
  
  What is Workflow Orchestration?
&lt;/h3&gt;

&lt;p&gt;Think of a music orchestra with various instruments that need to work together. The conductor helps them play in harmony. Similarly, a &lt;strong&gt;workflow orchestrator&lt;/strong&gt; coordinates multiple tools and platforms to work together.&lt;/p&gt;

&lt;p&gt;A workflow orchestrator typically:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Runs workflows&lt;/strong&gt; containing predefined steps&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Monitors and logs errors&lt;/strong&gt; with additional handling when they occur&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Automatically triggers workflows&lt;/strong&gt; based on schedules or events&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;In data engineering, we often need to move data from one place to another with modifications. The orchestrator manages these steps while providing visibility into the process.&lt;/p&gt;

&lt;h3&gt;
  
  
  What is Kestra?
&lt;/h3&gt;

&lt;p&gt;&lt;strong&gt;Kestra&lt;/strong&gt; is an open-source, event-driven, infinitely-scalable orchestration platform. Key features include:&lt;/p&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Feature&lt;/th&gt;
&lt;th&gt;Description&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Flow Code (YAML)&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Build workflows with code, no-code, or AI Copilot&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;1000+ Plugins&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Integrate with virtually any tool&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Multi-language Support&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Use Python, SQL, or any programming language&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Flexible Triggers&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;Schedule-based or event-based execution&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Core Kestra Concepts
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Flow&lt;/strong&gt; - A container for tasks and orchestration logic (defined in YAML)&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Tasks&lt;/strong&gt; - Individual steps within a flow&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Inputs&lt;/strong&gt; - Dynamic values passed at runtime&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Outputs&lt;/strong&gt; - Data passed between tasks and flows&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Triggers&lt;/strong&gt; - Mechanisms that automatically start flow execution&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Execution&lt;/strong&gt; - A single run of a flow with a specific state&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Variables&lt;/strong&gt; - Key-value pairs for reusable values across tasks&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Plugin Defaults&lt;/strong&gt; - Default values applied to tasks of a given type&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Concurrency&lt;/strong&gt; - Control how many executions can run simultaneously&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  Installing Kestra
&lt;/h3&gt;

&lt;p&gt;Kestra runs via Docker Compose with two main services:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Kestra server container&lt;/li&gt;
&lt;li&gt;PostgreSQL database container
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;cd &lt;/span&gt;02-workflow-orchestration
docker compose up &lt;span class="nt"&gt;-d&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Access the UI at: &lt;code&gt;http://localhost:8080&lt;/code&gt;&lt;/p&gt;

&lt;h3&gt;
  
  
  Running Python Code in Kestra
&lt;/h3&gt;

&lt;p&gt;Kestra can execute Python code either:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;From a dedicated file&lt;/li&gt;
&lt;li&gt;Written directly inside the workflow YAML&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;This allows you to pick the right tools for your pipelines without limitations.&lt;/p&gt;




&lt;h1&gt;
  
  
  dezoomcamp
&lt;/h1&gt;

</description>
      <category>automation</category>
      <category>dataengineering</category>
      <category>devops</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Data Engineering ZoomCamp Module 1 Notes Part 1</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Tue, 27 Jan 2026 01:22:08 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/data-engineering-zoomcamp-module-1-notes-part-1-12h3</link>
      <guid>https://dev.to/abdelrahman_adnan/data-engineering-zoomcamp-module-1-notes-part-1-12h3</guid>
      <description>&lt;h1&gt;
  
  
  Module 1: Docker, SQL &amp;amp; Terraform
&lt;/h1&gt;

&lt;p&gt;This is my notes and walkthrough for Module 1 of the Data Engineering Zoomcamp. If you're new to data engineering, this should help you understand the basics.&lt;/p&gt;

&lt;h2&gt;
  
  
  What is Data Engineering?
&lt;/h2&gt;

&lt;p&gt;Data Engineering is basically about building systems that collect, store, and analyze data at scale. Think of it as the plumbing that makes data flow from point A to point B so analysts and data scientists can do their thing.&lt;/p&gt;

&lt;p&gt;A &lt;strong&gt;data pipeline&lt;/strong&gt; is just a service that takes data in, does something with it, and outputs more data. Simple example: read a CSV file, clean it up, store it in a database.&lt;/p&gt;




&lt;h2&gt;
  
  
  Part 1: Docker Basics
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Why Docker?
&lt;/h3&gt;

&lt;p&gt;Docker lets you package your application and all its dependencies into a "container". This solves the classic "it works on my machine" problem.&lt;/p&gt;

&lt;p&gt;Main benefits:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Reproducibility&lt;/strong&gt; - same environment everywhere&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Isolation&lt;/strong&gt; - apps run independently, won't mess with your system&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Portability&lt;/strong&gt; - works on any machine with Docker installed&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Containers are different from virtual machines - they're much lighter because they share the host OS kernel.&lt;/p&gt;

&lt;h3&gt;
  
  
  Getting Started with Docker
&lt;/h3&gt;

&lt;p&gt;First, check if Docker is installed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker &lt;span class="nt"&gt;--version&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Run your first container:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run hello-world
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Try running Ubuntu:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; ubuntu
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The &lt;code&gt;-it&lt;/code&gt; flag means interactive mode with a terminal. Without it, the container just starts and exits.&lt;/p&gt;

&lt;h3&gt;
  
  
  Important: Containers are Stateless
&lt;/h3&gt;

&lt;p&gt;This tripped me up at first. Any changes you make inside a container are &lt;strong&gt;not carried over&lt;/strong&gt; to the next run — each &lt;code&gt;docker run&lt;/code&gt; starts a brand-new container from the image. For example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; ubuntu
apt update &lt;span class="o"&gt;&amp;amp;&amp;amp;&lt;/span&gt; apt &lt;span class="nb"&gt;install &lt;/span&gt;python3
&lt;span class="nb"&gt;exit&lt;/span&gt;
&lt;span class="c"&gt;# Run it again&lt;/span&gt;
docker run &lt;span class="nt"&gt;-it&lt;/span&gt; ubuntu
python3  &lt;span class="c"&gt;# Error! Python is not installed&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;This is actually a feature, not a bug. It means you can always start fresh.&lt;/p&gt;

&lt;h3&gt;
  
  
  Managing Containers
&lt;/h3&gt;

&lt;p&gt;See all containers (including stopped ones):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker ps &lt;span class="nt"&gt;-a&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Clean up old containers:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker &lt;span class="nb"&gt;rm&lt;/span&gt; &lt;span class="si"&gt;$(&lt;/span&gt;docker ps &lt;span class="nt"&gt;-aq&lt;/span&gt;&lt;span class="si"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Better approach - use &lt;code&gt;--rm&lt;/code&gt; to auto-delete when container stops:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;--rm&lt;/span&gt; ubuntu
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Using Different Base Images
&lt;/h3&gt;

&lt;p&gt;You can use pre-built images with software already installed:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Python image - starts Python interpreter&lt;/span&gt;
docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;--rm&lt;/span&gt; python:3.13

&lt;span class="c"&gt;# If you want bash instead of Python:&lt;/span&gt;
docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;--rm&lt;/span&gt; &lt;span class="nt"&gt;--entrypoint&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;bash python:3.13-slim
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Volumes - Persisting Data
&lt;/h3&gt;

&lt;p&gt;Since containers are stateless, we need volumes to save data. There are two types:&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Named volumes&lt;/strong&gt; (Docker manages them):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;-v&lt;/span&gt; my_data:/app/data ubuntu
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;strong&gt;Bind mounts&lt;/strong&gt; (map to a folder on your computer):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;-v&lt;/span&gt; &lt;span class="si"&gt;$(&lt;/span&gt;&lt;span class="nb"&gt;pwd&lt;/span&gt;&lt;span class="si"&gt;)&lt;/span&gt;/my_folder:/app/data ubuntu
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Part 2: Creating a Dockerfile
&lt;/h2&gt;

&lt;p&gt;A Dockerfile is a recipe for building your own Docker image.&lt;/p&gt;

&lt;h3&gt;
  
  
  Simple Example
&lt;/h3&gt;

&lt;p&gt;Create a file called &lt;code&gt;pipeline.py&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;sys&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;pandas&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;

&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;sys&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;argv&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;day&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;sys&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;argv&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Job finished for day = &lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;day&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Create a &lt;code&gt;Dockerfile&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight docker"&gt;&lt;code&gt;&lt;span class="k"&gt;FROM&lt;/span&gt;&lt;span class="s"&gt; python:3.13-slim&lt;/span&gt;

&lt;span class="k"&gt;RUN &lt;/span&gt;pip &lt;span class="nb"&gt;install &lt;/span&gt;pandas pyarrow

&lt;span class="k"&gt;WORKDIR&lt;/span&gt;&lt;span class="s"&gt; /app&lt;/span&gt;
&lt;span class="k"&gt;COPY&lt;/span&gt;&lt;span class="s"&gt; pipeline.py pipeline.py&lt;/span&gt;

&lt;span class="k"&gt;ENTRYPOINT&lt;/span&gt;&lt;span class="s"&gt; ["python", "pipeline.py"]&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;What each line does:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;FROM&lt;/code&gt; - base image to build on&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;RUN&lt;/code&gt; - execute commands during build&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;WORKDIR&lt;/code&gt; - set the working directory&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;COPY&lt;/code&gt; - copy files from your machine to the image&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;ENTRYPOINT&lt;/code&gt; - the command that runs when container starts&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Build and run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker build &lt;span class="nt"&gt;-t&lt;/span&gt; &lt;span class="nb"&gt;test&lt;/span&gt;:pandas &lt;span class="nb"&gt;.&lt;/span&gt;
docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nb"&gt;test&lt;/span&gt;:pandas some_argument
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Part 3: Running PostgreSQL with Docker
&lt;/h2&gt;

&lt;p&gt;Now let's do some real data engineering. We'll run Postgres in a container.&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker run &lt;span class="nt"&gt;-it&lt;/span&gt; &lt;span class="nt"&gt;--rm&lt;/span&gt; &lt;span class="se"&gt;\&lt;/span&gt;
  &lt;span class="nt"&gt;-e&lt;/span&gt; &lt;span class="nv"&gt;POSTGRES_USER&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s2"&gt;"root"&lt;/span&gt; &lt;span class="se"&gt;\&lt;/span&gt;
  &lt;span class="nt"&gt;-e&lt;/span&gt; &lt;span class="nv"&gt;POSTGRES_PASSWORD&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s2"&gt;"root"&lt;/span&gt; &lt;span class="se"&gt;\&lt;/span&gt;
  &lt;span class="nt"&gt;-e&lt;/span&gt; &lt;span class="nv"&gt;POSTGRES_DB&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s2"&gt;"ny_taxi"&lt;/span&gt; &lt;span class="se"&gt;\&lt;/span&gt;
  &lt;span class="nt"&gt;-v&lt;/span&gt; ny_taxi_postgres_data:/var/lib/postgresql/data &lt;span class="se"&gt;\&lt;/span&gt;
  &lt;span class="nt"&gt;-p&lt;/span&gt; 5432:5432 &lt;span class="se"&gt;\&lt;/span&gt;
  postgres:17
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Breaking this down:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;-e&lt;/code&gt; sets environment variables (username, password, database name)&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;-v&lt;/code&gt; creates a named volume so data persists&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;-p 5432:5432&lt;/code&gt; maps the container port to your machine&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Connecting to Postgres
&lt;/h3&gt;

&lt;p&gt;Install pgcli (a nice command-line client):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;pip &lt;span class="nb"&gt;install &lt;/span&gt;pgcli
&lt;span class="c"&gt;# or with uv:&lt;/span&gt;
uv add &lt;span class="nt"&gt;--dev&lt;/span&gt; pgcli
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Connect:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;pgcli &lt;span class="nt"&gt;-h&lt;/span&gt; localhost &lt;span class="nt"&gt;-p&lt;/span&gt; 5432 &lt;span class="nt"&gt;-u&lt;/span&gt; root &lt;span class="nt"&gt;-d&lt;/span&gt; ny_taxi
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Try some SQL:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="err"&gt;\&lt;/span&gt;&lt;span class="n"&gt;dt&lt;/span&gt;                              &lt;span class="c1"&gt;-- list tables&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;TABLE&lt;/span&gt; &lt;span class="n"&gt;test&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;id&lt;/span&gt; &lt;span class="nb"&gt;INTEGER&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;
&lt;span class="k"&gt;INSERT&lt;/span&gt; &lt;span class="k"&gt;INTO&lt;/span&gt; &lt;span class="n"&gt;test&lt;/span&gt; &lt;span class="k"&gt;VALUES&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;test&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="err"&gt;\&lt;/span&gt;&lt;span class="n"&gt;q&lt;/span&gt;                               &lt;span class="c1"&gt;-- quit&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
      <category>beginners</category>
      <category>dataengineering</category>
      <category>docker</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Data Engineering ZoomCamp Module 1 Notes Part 2</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Tue, 27 Jan 2026 01:20:18 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/data-engineering-zoomcamp-module-1-notes-part-2-5871</link>
      <guid>https://dev.to/abdelrahman_adnan/data-engineering-zoomcamp-module-1-notes-part-2-5871</guid>
      <description>&lt;h2&gt;
  
  
  Part 4: Data Ingestion with Python
&lt;/h2&gt;

&lt;p&gt;We're going to load the NYC Taxi dataset into Postgres.&lt;/p&gt;

&lt;h3&gt;
  
  
  Setting Up
&lt;/h3&gt;

&lt;p&gt;Install dependencies:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;pip &lt;span class="nb"&gt;install &lt;/span&gt;pandas sqlalchemy psycopg2-binary jupyter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Or with uv:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;uv add pandas sqlalchemy psycopg2-binary
uv add &lt;span class="nt"&gt;--dev&lt;/span&gt; jupyter
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  The Dataset
&lt;/h3&gt;

&lt;p&gt;We use the NYC Taxi trip data. Download it:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-01.csv.gz
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Loading Data into Postgres
&lt;/h3&gt;

&lt;p&gt;Here's the basic approach:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;pandas&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;sqlalchemy&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;create_engine&lt;/span&gt;

&lt;span class="c1"&gt;# Create connection
&lt;/span&gt;&lt;span class="n"&gt;engine&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;create_engine&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;postgresql://root:root@localhost:5432/ny_taxi&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Read CSV in chunks (it's a big file)
&lt;/span&gt;&lt;span class="n"&gt;df_iter&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;read_csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;yellow_tripdata_2021-01.csv.gz&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
                       &lt;span class="n"&gt;iterator&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; 
                       &lt;span class="n"&gt;chunksize&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mi"&gt;100000&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Create table from first chunk
&lt;/span&gt;&lt;span class="n"&gt;first_chunk&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;next&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df_iter&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;first_chunk&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;head&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;to_sql&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;yellow_taxi_data&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;con&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;engine&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;if_exists&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;replace&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Insert first chunk
&lt;/span&gt;&lt;span class="n"&gt;first_chunk&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;to_sql&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;yellow_taxi_data&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;con&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;engine&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;if_exists&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;append&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Insert remaining chunks
&lt;/span&gt;&lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;chunk&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;df_iter&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;chunk&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;to_sql&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;yellow_taxi_data&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;con&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;engine&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;if_exists&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;append&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Inserted &lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="nf"&gt;len&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;chunk&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt; rows&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The key things here:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;chunksize&lt;/code&gt; prevents loading the whole file into memory&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;if_exists='replace'&lt;/code&gt; creates the table (first time)&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;if_exists='append'&lt;/code&gt; adds rows (subsequent chunks)&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Part 5: Docker Compose
&lt;/h2&gt;

&lt;p&gt;Running multiple &lt;code&gt;docker run&lt;/code&gt; commands is annoying. Docker Compose lets you define everything in one file.&lt;/p&gt;

&lt;p&gt;Create &lt;code&gt;docker-compose.yaml&lt;/code&gt;:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight yaml"&gt;&lt;code&gt;&lt;span class="na"&gt;services&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;pgdatabase&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
    &lt;span class="na"&gt;image&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;postgres:17&lt;/span&gt;
    &lt;span class="na"&gt;environment&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;POSTGRES_USER&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;root"&lt;/span&gt;
      &lt;span class="na"&gt;POSTGRES_PASSWORD&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;root"&lt;/span&gt;
      &lt;span class="na"&gt;POSTGRES_DB&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;ny_taxi"&lt;/span&gt;
    &lt;span class="na"&gt;volumes&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;ny_taxi_postgres_data:/var/lib/postgresql/data"&lt;/span&gt;
    &lt;span class="na"&gt;ports&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;5432:5432"&lt;/span&gt;

  &lt;span class="na"&gt;pgadmin&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
    &lt;span class="na"&gt;image&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s"&gt;dpage/pgadmin4&lt;/span&gt;
    &lt;span class="na"&gt;environment&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="na"&gt;PGADMIN_DEFAULT_EMAIL&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;admin@admin.com"&lt;/span&gt;
      &lt;span class="na"&gt;PGADMIN_DEFAULT_PASSWORD&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;root"&lt;/span&gt;
    &lt;span class="na"&gt;volumes&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;pgadmin_data:/var/lib/pgadmin"&lt;/span&gt;
    &lt;span class="na"&gt;ports&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
      &lt;span class="pi"&gt;-&lt;/span&gt; &lt;span class="s2"&gt;"&lt;/span&gt;&lt;span class="s"&gt;8080:80"&lt;/span&gt;

&lt;span class="na"&gt;volumes&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;ny_taxi_postgres_data&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
  &lt;span class="na"&gt;pgadmin_data&lt;/span&gt;&lt;span class="pi"&gt;:&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Now just run:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker-compose up      &lt;span class="c"&gt;# start everything&lt;/span&gt;
docker-compose up &lt;span class="nt"&gt;-d&lt;/span&gt;   &lt;span class="c"&gt;# start in background&lt;/span&gt;
docker-compose down    &lt;span class="c"&gt;# stop everything&lt;/span&gt;
docker-compose down &lt;span class="nt"&gt;-v&lt;/span&gt; &lt;span class="c"&gt;# stop and remove volumes&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Docker Compose automatically creates a network so containers can talk to each other using their service names (e.g., &lt;code&gt;pgdatabase&lt;/code&gt; instead of &lt;code&gt;localhost&lt;/code&gt;).&lt;/p&gt;

&lt;h3&gt;
  
  
  Connecting to Postgres from pgAdmin
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;Open &lt;code&gt;http://localhost:8080&lt;/code&gt; in browser&lt;/li&gt;
&lt;li&gt;Login with the email/password from docker-compose&lt;/li&gt;
&lt;li&gt;Right-click Servers &amp;gt; Create &amp;gt; Server&lt;/li&gt;
&lt;li&gt;Name it whatever you want&lt;/li&gt;
&lt;li&gt;Under Connection tab:

&lt;ul&gt;
&lt;li&gt;Host: &lt;code&gt;pgdatabase&lt;/code&gt; (the service name, not localhost!)&lt;/li&gt;
&lt;li&gt;Port: &lt;code&gt;5432&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Username: &lt;code&gt;root&lt;/code&gt;
&lt;/li&gt;
&lt;li&gt;Password: &lt;code&gt;root&lt;/code&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ol&gt;




&lt;h2&gt;
  
  
  Part 6: SQL Refresher
&lt;/h2&gt;

&lt;p&gt;Quick review of SQL queries we'll use a lot.&lt;/p&gt;

&lt;h3&gt;
  
  
  JOINs
&lt;/h3&gt;

&lt;p&gt;There are two ways to write an INNER JOIN:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Implicit join (old style)&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"Zone"&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;zones&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"PULocationID"&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"LocationID"&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;

&lt;span class="c1"&gt;-- Explicit join (preferred)&lt;/span&gt;
&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"Zone"&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;
&lt;span class="k"&gt;JOIN&lt;/span&gt; &lt;span class="n"&gt;zones&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt; &lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"PULocationID"&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;z&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"LocationID"&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;For multiple joins:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; 
    &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;zpu&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"Zone"&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;pickup_zone&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="n"&gt;zdo&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"Zone"&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;dropoff_zone&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;
&lt;span class="k"&gt;JOIN&lt;/span&gt; &lt;span class="n"&gt;zones&lt;/span&gt; &lt;span class="n"&gt;zpu&lt;/span&gt; &lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"PULocationID"&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;zpu&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"LocationID"&lt;/span&gt;
&lt;span class="k"&gt;JOIN&lt;/span&gt; &lt;span class="n"&gt;zones&lt;/span&gt; &lt;span class="n"&gt;zdo&lt;/span&gt; &lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;t&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"DOLocationID"&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;zdo&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nv"&gt;"LocationID"&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  GROUP BY and Aggregations
&lt;/h3&gt;

&lt;p&gt;Count trips per day:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; 
    &lt;span class="k"&gt;CAST&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;tpep_dropoff_datetime&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="k"&gt;day&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;trip_count&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt;
&lt;span class="k"&gt;GROUP&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="k"&gt;CAST&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;tpep_dropoff_datetime&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="k"&gt;day&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Multiple aggregations:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; 
    &lt;span class="k"&gt;CAST&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;tpep_dropoff_datetime&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="nb"&gt;DATE&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="k"&gt;day&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;trips&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;MAX&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;max_amount&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="k"&gt;SUM&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;total_amount&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;AS&lt;/span&gt; &lt;span class="n"&gt;total_revenue&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt;
&lt;span class="k"&gt;GROUP&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;trips&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Data Quality Checks
&lt;/h3&gt;

&lt;p&gt;Find NULL values:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="k"&gt;COUNT&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="o"&gt;*&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nv"&gt;"PULocationID"&lt;/span&gt; &lt;span class="k"&gt;IS&lt;/span&gt; &lt;span class="k"&gt;NULL&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Find values not in lookup table:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;yellow_taxi_data&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="nv"&gt;"PULocationID"&lt;/span&gt; &lt;span class="k"&gt;NOT&lt;/span&gt; &lt;span class="k"&gt;IN&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="nv"&gt;"LocationID"&lt;/span&gt; &lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;zones&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Part 7: Terraform &amp;amp; GCP
&lt;/h2&gt;

&lt;p&gt;Terraform is Infrastructure as Code (IaC). Instead of clicking around in a cloud console, you write config files describing what you want, and Terraform creates it.&lt;/p&gt;

&lt;h3&gt;
  
  
  Why Terraform?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Version control your infrastructure&lt;/li&gt;
&lt;li&gt;Reproducible environments&lt;/li&gt;
&lt;li&gt;Easy to replicate across dev/staging/production&lt;/li&gt;
&lt;li&gt;Works with AWS, GCP, Azure, and many more&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  GCP Setup
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;Create a Google Cloud account (free tier gives you $300 credits)&lt;/li&gt;
&lt;li&gt;Create a new project&lt;/li&gt;
&lt;li&gt;Create a service account:

&lt;ul&gt;
&lt;li&gt;Go to IAM &amp;amp; Admin &amp;gt; Service Accounts&lt;/li&gt;
&lt;li&gt;Create new service account&lt;/li&gt;
&lt;li&gt;Give it these roles: Storage Admin, BigQuery Admin&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;Download the JSON key file&lt;/li&gt;
&lt;li&gt;Set the environment variable:
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="nb"&gt;export &lt;/span&gt;&lt;span class="nv"&gt;GOOGLE_APPLICATION_CREDENTIALS&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="s2"&gt;"/path/to/your/key.json"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Terraform Basics
&lt;/h3&gt;

&lt;p&gt;Main files:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;main.tf&lt;/code&gt; - main configuration&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;variables.tf&lt;/code&gt; - variable definitions&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Basic &lt;code&gt;main.tf&lt;/code&gt; example:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight hcl"&gt;&lt;code&gt;&lt;span class="nx"&gt;terraform&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;required_providers&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="nx"&gt;google&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
      &lt;span class="nx"&gt;source&lt;/span&gt;  &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"hashicorp/google"&lt;/span&gt;
      &lt;span class="nx"&gt;version&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"5.6.0"&lt;/span&gt;
    &lt;span class="p"&gt;}&lt;/span&gt;
  &lt;span class="p"&gt;}&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;

&lt;span class="nx"&gt;provider&lt;/span&gt; &lt;span class="s2"&gt;"google"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;project&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"your-project-id"&lt;/span&gt;
  &lt;span class="nx"&gt;region&lt;/span&gt;  &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"us-central1"&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;

&lt;span class="nx"&gt;resource&lt;/span&gt; &lt;span class="s2"&gt;"google_storage_bucket"&lt;/span&gt; &lt;span class="s2"&gt;"data_lake"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;name&lt;/span&gt;          &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"your-unique-bucket-name"&lt;/span&gt;
  &lt;span class="nx"&gt;location&lt;/span&gt;      &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"US"&lt;/span&gt;
  &lt;span class="nx"&gt;force_destroy&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="kc"&gt;true&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;

&lt;span class="nx"&gt;resource&lt;/span&gt; &lt;span class="s2"&gt;"google_bigquery_dataset"&lt;/span&gt; &lt;span class="s2"&gt;"dataset"&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
  &lt;span class="nx"&gt;dataset_id&lt;/span&gt; &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"trips_data"&lt;/span&gt;
  &lt;span class="nx"&gt;location&lt;/span&gt;   &lt;span class="p"&gt;=&lt;/span&gt; &lt;span class="s2"&gt;"US"&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Terraform Commands
&lt;/h3&gt;

&lt;p&gt;The workflow is always:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# 1. Initialize (download providers)&lt;/span&gt;
terraform init

&lt;span class="c"&gt;# 2. Preview changes&lt;/span&gt;
terraform plan

&lt;span class="c"&gt;# 3. Apply changes&lt;/span&gt;
terraform apply

&lt;span class="c"&gt;# 4. When you're done, destroy resources&lt;/span&gt;
terraform destroy
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;To auto-approve (skip the confirmation prompt):&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;terraform apply &lt;span class="nt"&gt;-auto-approve&lt;/span&gt;
terraform destroy &lt;span class="nt"&gt;-auto-approve&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Common Terraform Flags
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;code&gt;-auto-approve&lt;/code&gt; - don't ask for confirmation&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;-var="name=value"&lt;/code&gt; - pass variables&lt;/li&gt;
&lt;li&gt;
&lt;code&gt;-var-file="file.tfvars"&lt;/code&gt; - use a variables file&lt;/li&gt;
&lt;/ul&gt;




&lt;h2&gt;
  
  
  Useful Tips
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Docker Cleanup Commands
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Remove all stopped containers&lt;/span&gt;
docker container prune

&lt;span class="c"&gt;# Remove unused images&lt;/span&gt;
docker image prune

&lt;span class="c"&gt;# Remove unused volumes&lt;/span&gt;
docker volume prune

&lt;span class="c"&gt;# Nuclear option - remove everything unused&lt;/span&gt;
docker system prune &lt;span class="nt"&gt;-a&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Checking Ports
&lt;/h3&gt;

&lt;p&gt;If a port is already in use:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;&lt;span class="c"&gt;# Find what's using port 5432&lt;/span&gt;
lsof &lt;span class="nt"&gt;-i&lt;/span&gt; :5432
&lt;span class="c"&gt;# or&lt;/span&gt;
netstat &lt;span class="nt"&gt;-tulpn&lt;/span&gt; | &lt;span class="nb"&gt;grep &lt;/span&gt;5432
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Docker Networking
&lt;/h3&gt;

&lt;p&gt;When containers need to talk to each other:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;In Docker Compose: use service names as hostnames&lt;/li&gt;
&lt;li&gt;Manual setup: create a network with &lt;code&gt;docker network create&lt;/code&gt;
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight shell"&gt;&lt;code&gt;docker network create my_network
docker run &lt;span class="nt"&gt;--network&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;my_network &lt;span class="nt"&gt;--name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;container1 ...
docker run &lt;span class="nt"&gt;--network&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;my_network &lt;span class="nt"&gt;--name&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;container2 ...
&lt;span class="c"&gt;# container2 can reach container1 using hostname "container1"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  Summary
&lt;/h2&gt;

&lt;p&gt;What we covered:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Docker&lt;/strong&gt; - containerization for reproducible environments&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;PostgreSQL&lt;/strong&gt; - relational database running in Docker&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Data Ingestion&lt;/strong&gt; - loading data with Python/pandas/SQLAlchemy&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Docker Compose&lt;/strong&gt; - orchestrating multiple containers&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;SQL&lt;/strong&gt; - querying and aggregating data&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Terraform&lt;/strong&gt; - infrastructure as code for GCP&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;The main takeaway: these tools help you build reproducible, scalable data pipelines. Docker ensures your code runs the same everywhere, and Terraform ensures your infrastructure is consistent and version-controlled.&lt;/p&gt;




&lt;h2&gt;
  
  
  Resources
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;&lt;a href="https://docs.docker.com/" rel="noopener noreferrer"&gt;Docker Documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="https://www.postgresql.org/docs/" rel="noopener noreferrer"&gt;PostgreSQL Documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="https://www.terraform.io/docs" rel="noopener noreferrer"&gt;Terraform Documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a href="https://github.com/DataTalksClub/data-engineering-zoomcamp" rel="noopener noreferrer"&gt;Data Engineering Zoomcamp GitHub&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;

</description>
      <category>dataengineering</category>
      <category>postgres</category>
      <category>python</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Medical RAG Architecture Overview #llmszoomcamp</title>
      <dc:creator>Abdelrahman Adnan</dc:creator>
      <pubDate>Sat, 04 Oct 2025 18:26:50 +0000</pubDate>
      <link>https://dev.to/abdelrahman_adnan/-medical-rag-architecture-overview-llmszoomcamp-7h</link>
      <guid>https://dev.to/abdelrahman_adnan/-medical-rag-architecture-overview-llmszoomcamp-7h</guid>
      <description>&lt;p&gt;This document provides a comprehensive explanation of the Retrieval-Augmented Generation (RAG) system architecture, breaking down each component and showing how they work together to deliver accurate medical information.&lt;/p&gt;

&lt;h2&gt;
  
  
  1. What is RAG?
&lt;/h2&gt;

&lt;p&gt;RAG combines the power of:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Information Retrieval&lt;/strong&gt;: Finding relevant documents from a knowledge base&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Language Generation&lt;/strong&gt;: Using LLMs to synthesize coherent answers&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Context Grounding&lt;/strong&gt;: Ensuring answers are based on retrieved evidence&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  2. High-Level Architecture Flow
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;[User Question] 
    ↓
[Hybrid Search: Vector + BM25]
    ↓
[Context Assembly &amp;amp; Prompt Building]
    ↓
[LLM Generation (GPT-4o-mini/GPT-4o)]
    ↓
[Answer Evaluation &amp;amp; Quality Assessment]
    ↓
[Metrics Calculation &amp;amp; Response Packaging]
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  3. Detailed Processing Pipeline
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Step 1: Query Processing
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Clean and normalize the input medical question&lt;/li&gt;
&lt;li&gt;Prepare query for both semantic and lexical search&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Step 2: Hybrid Retrieval
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Vector Search&lt;/strong&gt;: Semantic similarity using 384-dimensional embeddings&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;BM25 Search&lt;/strong&gt;: Keyword-based exact matching&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;RRF Fusion&lt;/strong&gt;: Combines both approaches using Reciprocal Rank Fusion&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Step 3: Context Assembly
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Select top-k most relevant medical cases&lt;/li&gt;
&lt;li&gt;Format retrieved documents into structured context&lt;/li&gt;
&lt;li&gt;Apply medical domain-specific scoring enhancements&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Step 4: Answer Generation
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Build specialized medical prompt with retrieved context&lt;/li&gt;
&lt;li&gt;Generate response using OpenAI models with controlled parameters&lt;/li&gt;
&lt;li&gt;Apply medical safety guidelines&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Step 5: Quality Assurance
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Evaluate answer relevance using LLM-as-a-judge&lt;/li&gt;
&lt;li&gt;Calculate confidence scores and metadata&lt;/li&gt;
&lt;li&gt;Track performance metrics and costs&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  4. Core System Components
&lt;/h2&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Component&lt;/th&gt;
&lt;th&gt;File Location&lt;/th&gt;
&lt;th&gt;Primary Responsibility&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;RAG Orchestrator&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;src/core/rag.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Main pipeline coordination&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Vector Database&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;src/database/vector_db.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Hybrid search + RRF fusion&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Data Ingestion&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;scripts/ingest.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Document processing &amp;amp; indexing&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;API Layer&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;src/api/main_api.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;REST endpoints &amp;amp; async processing&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Web Interface&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;src/api/web_interface.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Interactive Streamlit UI&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;&lt;strong&gt;Monitoring&lt;/strong&gt;&lt;/td&gt;
&lt;td&gt;&lt;code&gt;src/services/s3_service.py&lt;/code&gt;&lt;/td&gt;
&lt;td&gt;Logging &amp;amp; metrics collection&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h2&gt;
  
  
  5. Advanced Search Mechanism
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Hybrid Search Strategy
&lt;/h3&gt;

&lt;p&gt;Our system implements a sophisticated hybrid approach that combines:&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;strong&gt;Semantic Vector Search&lt;/strong&gt; (Cosine Similarity)
&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;   &lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;   &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;search&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;top_k&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;int&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;5&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;List&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;]:&lt;/span&gt;
       &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;Search medical knowledge base using hybrid search&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="nf"&gt;hybrid_query_rrf&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;top_k&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;top_k&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ol start="2"&gt;
&lt;li&gt;
&lt;strong&gt;BM25 Keyword Search&lt;/strong&gt; (Exact Token Matching)

&lt;ul&gt;
&lt;li&gt;Handles medical terminology and acronyms&lt;/li&gt;
&lt;li&gt;Captures exact drug names and dosages&lt;/li&gt;
&lt;li&gt;Preserves clinical precision&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ol&gt;

&lt;h3&gt;
  
  
  Reciprocal Rank Fusion (RRF) Algorithm
&lt;/h3&gt;

&lt;p&gt;RRF combines multiple ranking approaches using the formula:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight plaintext"&gt;&lt;code&gt;RRF_score = Σ(1 / (k + rank_i))
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Where &lt;code&gt;k=60&lt;/code&gt; (tuning parameter) and &lt;code&gt;rank_i&lt;/code&gt; is the position in each ranking list.&lt;/p&gt;

&lt;h3&gt;
  
  
  Medical Domain Scoring Enhancements
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Severity Weighting&lt;/strong&gt;: Life-threatening conditions get priority&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Department Relevance&lt;/strong&gt;: Matches medical specialties&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Symptom Alignment&lt;/strong&gt;: Boosts exact symptom matches&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Treatment Precision&lt;/strong&gt;: Enhances therapeutic recommendations&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  6. Medical Prompt Engineering
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Structured Prompt Architecture
&lt;/h3&gt;

&lt;p&gt;Our prompts are carefully designed for medical accuracy and safety:&lt;/p&gt;

&lt;h4&gt;
  
  
  System Instruction Design
&lt;/h4&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Role Definition&lt;/strong&gt;: "You are a knowledgeable medical assistant"&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Evidence Constraint&lt;/strong&gt;: "Answer based solely on provided CONTEXT"&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Factual Grounding&lt;/strong&gt;: "Use only facts from the CONTEXT"&lt;/li&gt;
&lt;/ul&gt;

&lt;h4&gt;
  
  
  Context Formatting Strategy
&lt;/h4&gt;

&lt;p&gt;Each retrieved medical case follows a structured template:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="n"&gt;PROMPT_TEMPLATE&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;You are a knowledgeable medical assistant. Answer the QUESTION based solely on the information provided in the CONTEXT from the medical database.

Use only the facts from the CONTEXT when formulating your answer.

QUESTION: {question}

CONTEXT:
{context}&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="n"&gt;ENTRY_TEMPLATE&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;Medical Case:
Question: {question}
Answer: {answer}
Relevance Score: {score:.3f}&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;build_prompt&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;search_results&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;List&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;context&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;""&lt;/span&gt;
    &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;doc&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;search_results&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="n"&gt;context&lt;/span&gt; &lt;span class="o"&gt;+=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
            &lt;span class="n"&gt;ENTRY_TEMPLATE&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;format&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
                &lt;span class="n"&gt;question&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;doc&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;question&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;N/A&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
                &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;doc&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;answer&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;N/A&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
                &lt;span class="n"&gt;score&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;doc&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;score&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mf"&gt;0.0&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
            &lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="se"&gt;\n\n&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
        &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;PROMPT_TEMPLATE&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;format&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;question&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;context&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;context&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  8. Language Model Integration
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Model Selection Strategy
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;llm&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;prompt&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o-mini&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;Tuple&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;]:&lt;/span&gt;
    &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;Generate response using OpenAI LLM with medical-optimized parameters&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
    &lt;span class="n"&gt;response&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;client&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;chat&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;completions&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;create&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="n"&gt;messages&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="p"&gt;[{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;role&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;user&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;content&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;prompt&lt;/span&gt;&lt;span class="p"&gt;}],&lt;/span&gt;
        &lt;span class="n"&gt;max_tokens&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mi"&gt;1000&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;        &lt;span class="c1"&gt;# Sufficient for comprehensive medical answers
&lt;/span&gt;        &lt;span class="n"&gt;temperature&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mf"&gt;0.1&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;        &lt;span class="c1"&gt;# Low temperature for consistency and accuracy
&lt;/span&gt;    &lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Parameter Optimization for Medical Use
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Temperature (0.1)&lt;/strong&gt;: Ensures deterministic, conservative responses&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Max Tokens (1000)&lt;/strong&gt;: Balances comprehensiveness with cost&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Model Choice&lt;/strong&gt;: GPT-4o-mini provides 91.11% relevance vs GPT-4o's 64.75%&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Cost-Performance Analysis
&lt;/h3&gt;

&lt;div class="table-wrapper-paragraph"&gt;&lt;table&gt;
&lt;thead&gt;
&lt;tr&gt;
&lt;th&gt;Model&lt;/th&gt;
&lt;th&gt;Relevance Rate&lt;/th&gt;
&lt;th&gt;Cost per 1K tokens&lt;/th&gt;
&lt;th&gt;Use Case&lt;/th&gt;
&lt;/tr&gt;
&lt;/thead&gt;
&lt;tbody&gt;
&lt;tr&gt;
&lt;td&gt;GPT-4o-mini&lt;/td&gt;
&lt;td&gt;91.11%&lt;/td&gt;
&lt;td&gt;$0.00015 (input)&lt;/td&gt;
&lt;td&gt;Primary model&lt;/td&gt;
&lt;/tr&gt;
&lt;tr&gt;
&lt;td&gt;GPT-4o&lt;/td&gt;
&lt;td&gt;64.75%&lt;/td&gt;
&lt;td&gt;$0.03 (input)&lt;/td&gt;
&lt;td&gt;Complex cases only&lt;/td&gt;
&lt;/tr&gt;
&lt;/tbody&gt;
&lt;/table&gt;&lt;/div&gt;

&lt;h3&gt;
  
  
  Response Processing
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;token_stats&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;prompt_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;usage&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;prompt_tokens&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;completion_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;usage&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;completion_tokens&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
    &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;total_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;usage&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;total_tokens&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="p"&gt;}&lt;/span&gt;
&lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;token_stats&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  9. Comprehensive Answer Evaluation
&lt;/h2&gt;

&lt;h3&gt;
  
  
  LLM-as-a-Judge Methodology
&lt;/h3&gt;

&lt;p&gt;We implement automated quality assessment using a specialized evaluation prompt:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="n"&gt;EVALUATION_PROMPT_TEMPLATE&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;You are an expert medical reviewer evaluating the quality and relevance of AI-generated medical responses.

You will be given a medical question and a generated answer. Based on the relevance of the generated answer, you will classify it as &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;NON_RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;, &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PARTLY_RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;, or &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;.

Question: {question}
Generated Answer: {answer}

Provide evaluation in JSON format:
{{
  &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Relevance&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;: &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;NON_RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt; | &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PARTLY_RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt; | &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;RELEVANT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;,
  &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Explanation&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;: &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;[Brief explanation for evaluation]&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;
}}&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Quality Assessment Metrics
&lt;/h3&gt;

&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Relevance Categories&lt;/strong&gt;:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;RELEVANT&lt;/strong&gt;: Direct, accurate medical information&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;PARTLY_RELEVANT&lt;/strong&gt;: Partially helpful but incomplete&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;NON_RELEVANT&lt;/strong&gt;: Off-topic or potentially harmful&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;p&gt;&lt;strong&gt;Evaluation Processing&lt;/strong&gt;:&lt;br&gt;
&lt;/p&gt;&lt;/li&gt;
&lt;/ol&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;   &lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;evaluate_relevance&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;question&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;Tuple&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;]:&lt;/span&gt;
       &lt;span class="n"&gt;prompt&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;EVALUATION_PROMPT_TEMPLATE&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;format&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;question&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;question&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="n"&gt;evaluation&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;tokens&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;llm&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;prompt&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o-mini&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="k"&gt;try&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
           &lt;span class="n"&gt;json_eval&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;loads&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;evaluation&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
           &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;json_eval&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;tokens&lt;/span&gt;
       &lt;span class="k"&gt;except&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;JSONDecodeError&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
           &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Relevance&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;UNKNOWN&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Explanation&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Parse failed&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;},&lt;/span&gt; &lt;span class="n"&gt;tokens&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Medical Safety Considerations
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Conservative Evaluation&lt;/strong&gt;: Strict relevance criteria&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Explanation Tracking&lt;/strong&gt;: Maintains audit trail for quality decisions&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Error Handling&lt;/strong&gt;: Graceful degradation for parsing failures&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Dual Model Use&lt;/strong&gt;: Separate evaluation model reduces bias&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  10. Cost Optimization &amp;amp; Monitoring
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Transparent Cost Calculation
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;calculate_openai_cost&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;tokens&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="nb"&gt;float&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;Calculate OpenAI API cost with model-specific pricing&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
    &lt;span class="n"&gt;cost&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mf"&gt;0.0&lt;/span&gt;
    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o-mini&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="n"&gt;cost&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
            &lt;span class="n"&gt;tokens&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;prompt_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.00015&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt;      &lt;span class="c1"&gt;# Input cost
&lt;/span&gt;            &lt;span class="n"&gt;tokens&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;completion_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.0006&lt;/span&gt;     &lt;span class="c1"&gt;# Output cost
&lt;/span&gt;        &lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;/&lt;/span&gt; &lt;span class="mi"&gt;1000&lt;/span&gt;
    &lt;span class="k"&gt;elif&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
        &lt;span class="n"&gt;cost&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
            &lt;span class="n"&gt;tokens&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;prompt_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.03&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt;         &lt;span class="c1"&gt;# Higher input cost
&lt;/span&gt;            &lt;span class="n"&gt;tokens&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;completion_tokens&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.06&lt;/span&gt;       &lt;span class="c1"&gt;# Higher output cost
&lt;/span&gt;        &lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;/&lt;/span&gt; &lt;span class="mi"&gt;1000&lt;/span&gt;
    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;cost&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Cost Performance Metrics
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Average Cost per Query&lt;/strong&gt;: $0.003&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Average Response Time&lt;/strong&gt;: ~23 seconds per query&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Cost Breakdown&lt;/strong&gt;: RAG generation + evaluation costs tracked separately&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  11. Complete Pipeline Integration
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Master RAG Function
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# src/core/rag.py
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;rag&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nb"&gt;str&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o-mini&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;-&amp;gt;&lt;/span&gt; &lt;span class="n"&gt;Dict&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="sh"&gt;"""&lt;/span&gt;&lt;span class="s"&gt;Complete RAG pipeline with comprehensive response data&lt;/span&gt;&lt;span class="sh"&gt;"""&lt;/span&gt;
    &lt;span class="n"&gt;t0&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;time&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;  &lt;span class="c1"&gt;# Start timing
&lt;/span&gt;
    &lt;span class="c1"&gt;# Step 1: Hybrid search
&lt;/span&gt;    &lt;span class="n"&gt;search_results&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;search&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Step 2: Prompt assembly
&lt;/span&gt;    &lt;span class="n"&gt;prompt&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;build_prompt&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;search_results&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Step 3: LLM generation
&lt;/span&gt;    &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;token_stats&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;llm&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;prompt&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Step 4: Quality evaluation
&lt;/span&gt;    &lt;span class="n"&gt;relevance&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;rel_token_stats&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;evaluate_relevance&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;query&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Step 5: Metrics calculation
&lt;/span&gt;    &lt;span class="n"&gt;took&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;time&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;t0&lt;/span&gt;
    &lt;span class="n"&gt;total_cost&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;calculate_openai_cost&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;token_stats&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; \
                &lt;span class="nf"&gt;calculate_openai_cost&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;gpt-4o-mini&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;rel_token_stats&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

    &lt;span class="c1"&gt;# Step 6: Response packaging
&lt;/span&gt;    &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;answer&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;answer&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;model_used&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;model&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;response_time&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;took&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;relevance&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;relevance&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Relevance&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;UNKNOWN&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;relevance_explanation&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;relevance&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Explanation&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;None&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;total_cost&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;total_cost&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;token_stats&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;{...},&lt;/span&gt;  &lt;span class="c1"&gt;# Comprehensive token tracking
&lt;/span&gt;        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;search_results_count&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="nf"&gt;len&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;search_results&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
        &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;search_results&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;search_results&lt;/span&gt;&lt;span class="p"&gt;[:&lt;/span&gt;&lt;span class="mi"&gt;5&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;  &lt;span class="c1"&gt;# Top results for audit
&lt;/span&gt;    &lt;span class="p"&gt;}&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  12. Architecture Design Principles
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Medical Safety First
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Evidence-Based&lt;/strong&gt;: All answers must cite retrieved medical literature&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Conservative Generation&lt;/strong&gt;: Low temperature prevents hallucination&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Quality Gates&lt;/strong&gt;: Multi-step evaluation ensures reliability&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Audit Trails&lt;/strong&gt;: Complete logging for medical compliance&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Performance Optimization
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Hybrid Retrieval&lt;/strong&gt;: Combines semantic understanding with exact matching&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Model Selection&lt;/strong&gt;: Cost-effective model choice based on performance data&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Caching Ready&lt;/strong&gt;: Architecture supports future caching implementations&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Scalable Design&lt;/strong&gt;: Async processing and background tasks&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Production Considerations
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Error Handling&lt;/strong&gt;: Graceful degradation at each step&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Monitoring Integration&lt;/strong&gt;: Built-in metrics and logging hooks&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Cost Control&lt;/strong&gt;: Transparent pricing with usage tracking&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Extensibility&lt;/strong&gt;: Modular design supports feature additions&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  13. Future Enhancement Opportunities
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Immediate Improvements
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Citation Generation&lt;/strong&gt;: Add source document references in answers&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Query Preprocessing&lt;/strong&gt;: Medical entity recognition and normalization&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Response Caching&lt;/strong&gt;: Cache frequent queries to reduce costs&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Batch Processing&lt;/strong&gt;: Optimize multiple simultaneous queries&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Advanced Features
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Multi-Modal Support&lt;/strong&gt;: Integrate medical images and charts&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Specialized Models&lt;/strong&gt;: Fine-tuned models for specific medical domains&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Real-Time Learning&lt;/strong&gt;: Incorporate user feedback into model updates&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Clinical Integration&lt;/strong&gt;: EMR system compatibility and FHIR support&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Research Directions
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Retrieval Enhancement&lt;/strong&gt;: Advanced embedding models for medical text&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Generation Improvement&lt;/strong&gt;: Medical-specific language model fine-tuning&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Evaluation Evolution&lt;/strong&gt;: Automated medical accuracy assessment&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Safety Advancement&lt;/strong&gt;: Enhanced harm detection and prevention&lt;/li&gt;
&lt;/ul&gt;




</description>
      <category>rag</category>
      <category>ai</category>
      <category>architecture</category>
      <category>llm</category>
    </item>
  </channel>
</rss>
