-- Source: DEV Community
-- Author: Armaan Khan
-- Posted on (date missing from scraped page)
-- USER
-- =============================================
-- OPTIMIZED SNOWFLAKE QUERY PERFORMANCE ANALYZER
-- =============================================

-- Step 1: Create materialized table for refreshable data
-- Snapshot table populated by REFRESH_QUERY_PERFORMANCE_ANALYSIS.
-- NOTE(review): column order must stay in sync with the SELECT list of the
-- refresh procedure's INSERT — do not reorder or insert columns mid-table.
CREATE OR REPLACE TABLE QUERY_PERFORMANCE_ANALYSIS (
    -- Identity and per-user volume/cost aggregates
    USER_NAME VARCHAR(255),
    TOTAL_QUERIES NUMBER,
    WAREHOUSES_USED NUMBER,
    DATABASES_ACCESSED NUMBER,
    TOTAL_CREDITS NUMBER(10,2),           -- cloud-services credits only (see procedure)
    AVG_EXECUTION_TIME_MS NUMBER(10,2),
    AVG_BYTES_PER_ROW NUMBER(15,2),
    TOTAL_DATA_SCANNED_GB NUMBER(15,2),
    FAILURE_CANCELLATION_RATE_PCT NUMBER(5,2),
    -- Raw anti-pattern counters: number of this user's queries matching each heuristic
    SPILLED_QUERIES NUMBER,
    OVER_PROVISIONED_QUERIES NUMBER,
    PEAK_HOUR_LONG_RUNNING_QUERIES NUMBER,
    SELECT_STAR_QUERIES NUMBER,
    UNPARTITIONED_SCAN_QUERIES NUMBER,
    REPEATED_QUERIES NUMBER,
    COMPLEX_JOIN_QUERIES NUMBER,
    ZERO_RESULT_QUERIES NUMBER,
    HIGH_COMPILE_QUERIES NUMBER,
    UNTAGGED_QUERIES NUMBER,
    UNLIMITED_ORDER_BY_QUERIES NUMBER,
    SLOW_QUERIES NUMBER,
    EXPENSIVE_DISTINCT_QUERIES NUMBER,
    INEFFICIENT_LIKE_QUERIES NUMBER,
    NO_RESULTS_WITH_SCAN_QUERIES NUMBER,
    HIGH_COMPILE_RATIO_QUERIES NUMBER,
    -- Normalized scores (1-100): each counter scaled against the worst user's count
    SPILLED_SCORE NUMBER(5,2),
    OVER_PROVISIONED_SCORE NUMBER(5,2),
    PEAK_HOUR_SCORE NUMBER(5,2),
    SELECT_STAR_SCORE NUMBER(5,2),
    UNPARTITIONED_SCORE NUMBER(5,2),
    REPEATED_SCORE NUMBER(5,2),
    COMPLEX_JOIN_SCORE NUMBER(5,2),
    ZERO_RESULT_SCORE NUMBER(5,2),
    HIGH_COMPILE_SCORE NUMBER(5,2),
    SLOW_QUERY_SCORE NUMBER(5,2),
    EXPENSIVE_DISTINCT_SCORE NUMBER(5,2),
    INEFFICIENT_LIKE_SCORE NUMBER(5,2),
    NO_RESULTS_SCAN_SCORE NUMBER(5,2),
    HIGH_COMPILE_RATIO_SCORE NUMBER(5,2),
    OVERALL_OPTIMIZATION_SCORE NUMBER(5,2), -- 1 = optimized, 100 = most unoptimized
    COST_STATUS VARCHAR(50),                -- 'High Cost' above the p90 credit spend, else 'Normal'
    RECOMMENDATIONS ARRAY,                  -- human-readable tuning suggestions
    QUERY_SAMPLES OBJECT,                   -- per-category sample queries (full text)
    ANALYSIS_DATE TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);

-- Step 2: Create procedure to refresh the analysis
-- Rebuilds the QUERY_PERFORMANCE_ANALYSIS snapshot for a date window.
-- Parameters:
--   START_DATE    'YYYY-MM-DD'; inclusive lower bound on query start_time.
--   END_DATE      'YYYY-MM-DD'; inclusive upper bound (extended to 23:59:59).
--   OBJECT_FILTER reserved for future object-level filtering; currently unused
--                 but kept so existing call sites remain valid.
-- Returns a status string; results land in QUERY_PERFORMANCE_ANALYSIS.
CREATE OR REPLACE PROCEDURE REFRESH_QUERY_PERFORMANCE_ANALYSIS(
    START_DATE STRING,
    END_DATE STRING,
    OBJECT_FILTER STRING DEFAULT ''
)
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
    -- Fail fast on malformed dates. TRY_TO_TIMESTAMP_NTZ returns NULL on bad
    -- input, which would make the range predicates UNKNOWN and silently
    -- produce an empty refresh.
    IF (TRY_TO_TIMESTAMP_NTZ(START_DATE) IS NULL OR TRY_TO_TIMESTAMP_NTZ(END_DATE) IS NULL) THEN
        RETURN 'Invalid date(s): expected YYYY-MM-DD, got ' || START_DATE || ' / ' || END_DATE;
    END IF;

    -- Full refresh: drop the previous snapshot before inserting the new one.
    DELETE FROM QUERY_PERFORMANCE_ANALYSIS;

    -- Explicit column list so the INSERT keeps working if columns are ever
    -- appended to the table.
    INSERT INTO QUERY_PERFORMANCE_ANALYSIS (
        USER_NAME, TOTAL_QUERIES, WAREHOUSES_USED, DATABASES_ACCESSED,
        TOTAL_CREDITS, AVG_EXECUTION_TIME_MS, AVG_BYTES_PER_ROW,
        TOTAL_DATA_SCANNED_GB, FAILURE_CANCELLATION_RATE_PCT,
        SPILLED_QUERIES, OVER_PROVISIONED_QUERIES, PEAK_HOUR_LONG_RUNNING_QUERIES,
        SELECT_STAR_QUERIES, UNPARTITIONED_SCAN_QUERIES, REPEATED_QUERIES,
        COMPLEX_JOIN_QUERIES, ZERO_RESULT_QUERIES, HIGH_COMPILE_QUERIES,
        UNTAGGED_QUERIES, UNLIMITED_ORDER_BY_QUERIES, SLOW_QUERIES,
        EXPENSIVE_DISTINCT_QUERIES, INEFFICIENT_LIKE_QUERIES,
        NO_RESULTS_WITH_SCAN_QUERIES, HIGH_COMPILE_RATIO_QUERIES,
        SPILLED_SCORE, OVER_PROVISIONED_SCORE, PEAK_HOUR_SCORE, SELECT_STAR_SCORE,
        UNPARTITIONED_SCORE, REPEATED_SCORE, COMPLEX_JOIN_SCORE, ZERO_RESULT_SCORE,
        HIGH_COMPILE_SCORE, SLOW_QUERY_SCORE, EXPENSIVE_DISTINCT_SCORE,
        INEFFICIENT_LIKE_SCORE, NO_RESULTS_SCAN_SCORE, HIGH_COMPILE_RATIO_SCORE,
        OVERALL_OPTIMIZATION_SCORE, COST_STATUS, RECOMMENDATIONS, QUERY_SAMPLES,
        ANALYSIS_DATE
    )
    -- Per-warehouse-size baselines: p10 of bytes scanned (over-provisioning
    -- threshold) and p90 of elapsed time (slow-query threshold).
    WITH percentile_reference AS (
        SELECT
            COALESCE(warehouse_size, 'UNKNOWN') AS warehouse_size,
            PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY COALESCE(bytes_scanned, 0)) AS bytes_scanned_p10,
            PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY total_elapsed_time) AS execution_time_p90
        FROM snowflake.account_usage.query_history
        WHERE start_time >= TRY_TO_TIMESTAMP_NTZ(START_DATE)
          AND start_time <= TRY_TO_TIMESTAMP_NTZ(END_DATE || ' 23:59:59')
          AND user_name IS NOT NULL
          AND query_type NOT IN ('DESCRIBE', 'SHOW', 'USE')
          AND total_elapsed_time >= 2000  -- ignore sub-2-second queries as noise
        GROUP BY COALESCE(warehouse_size, 'UNKNOWN')
    ),
    -- One row per query with 0/1 anti-pattern flags. Text-based heuristics
    -- (LIKE on query_text) are approximate by design.
    query_flags AS (
        SELECT
            qh.query_id,
            qh.warehouse_size,
            qh.bytes_scanned,
            qh.total_elapsed_time AS execution_time_ms,
            qh.compilation_time,
            qh.start_time,
            qh.query_text, -- full query text, no truncation
            qh.query_hash,
            qh.query_tag,
            qh.error_code,
            qh.execution_status,
            qh.partitions_scanned,
            qh.partitions_total,
            qh.bytes_spilled_to_local_storage,
            qh.bytes_spilled_to_remote_storage,
            qh.user_name,
            qh.database_name,
            qh.schema_name,
            qh.warehouse_name,
            COALESCE(qh.credits_used_cloud_services, 0) AS credits_used_cloud_services,
            qh.rows_produced,
            -- Big warehouse scanning less than the p10 of its size class.
            CASE
                WHEN qh.warehouse_size IN ('MEDIUM', 'LARGE', 'X-LARGE', '2X-LARGE', '3X-LARGE', '4X-LARGE')
                 AND qh.bytes_scanned < pr.bytes_scanned_p10 THEN 1 ELSE 0
            END AS over_provisioned,
            -- > 5 minutes during 09:00-17:59 (server-local hours; assumes business hours).
            CASE
                WHEN EXTRACT(HOUR FROM qh.start_time) BETWEEN 9 AND 17
                 AND qh.total_elapsed_time > 300000 THEN 1 ELSE 0
            END AS peak_hour_long_running,
            CASE
                WHEN UPPER(qh.query_text) LIKE 'SELECT *%' THEN 1 ELSE 0
            END AS select_star,
            -- No partition pruning at all: every partition scanned.
            CASE
                WHEN qh.partitions_total > 0 AND qh.partitions_scanned = qh.partitions_total THEN 1 ELSE 0
            END AS unpartitioned_scan,
            -- FIX: the original tested the same LIKE '%JOIN%' predicate twice,
            -- so "multiple joins" was never actually detected. REGEXP_COUNT
            -- counts JOIN occurrences case-insensitively.
            CASE
                WHEN REGEXP_COUNT(qh.query_text, 'JOIN', 1, 'i') >= 2
                   OR UPPER(qh.query_text) LIKE '%WINDOW%' THEN 1 ELSE 0
            END AS complex_query,
            CASE
                WHEN qh.error_code IS NOT NULL OR qh.execution_status IN ('FAILED', 'CANCELLED') THEN 1 ELSE 0
            END AS failed_cancelled,
            CASE
                WHEN qh.bytes_spilled_to_local_storage > 0 OR qh.bytes_spilled_to_remote_storage > 0 THEN 1 ELSE 0
            END AS spilled,
            CASE
                WHEN qh.rows_produced = 0 AND qh.bytes_scanned > 1000000 THEN 1 ELSE 0
            END AS zero_result_query,
            CASE
                WHEN qh.compilation_time > 5000 THEN 1 ELSE 0
            END AS high_compile_time,
            CASE
                WHEN qh.query_tag IS NULL THEN 1 ELSE 0
            END AS untagged_query,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%ORDER BY%' AND UPPER(qh.query_text) NOT LIKE '%LIMIT%' THEN 1 ELSE 0
            END AS unlimited_order_by,
            CASE
                WHEN qh.total_elapsed_time > pr.execution_time_p90 THEN 1 ELSE 0
            END AS slow_query,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%DISTINCT%' AND qh.bytes_scanned > 100000000 THEN 1 ELSE 0
            END AS expensive_distinct,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%LIKE%' AND UPPER(qh.query_text) NOT LIKE '%INDEX%' THEN 1 ELSE 0
            END AS inefficient_like,
            CASE
                WHEN qh.bytes_scanned > 0 AND qh.rows_produced = 0 THEN 1 ELSE 0
            END AS no_results_with_scan,
            -- Compilation dominated the runtime (guard avoids division by zero).
            CASE
                WHEN qh.total_elapsed_time > 0 AND qh.compilation_time / qh.total_elapsed_time > 0.5 THEN 1 ELSE 0
            END AS high_compile_ratio
            -- NOTE: the original also emitted a "repeated_query" flag that was 1
            -- for every query with a non-NULL hash and was never referenced
            -- downstream; it has been removed as dead code. REPEATED_QUERIES is
            -- derived from query_hash directly in user_metrics.
        FROM snowflake.account_usage.query_history qh
        LEFT JOIN percentile_reference pr ON COALESCE(qh.warehouse_size, 'UNKNOWN') = pr.warehouse_size
        WHERE
            qh.start_time >= TRY_TO_TIMESTAMP_NTZ(START_DATE)
            AND qh.start_time <= TRY_TO_TIMESTAMP_NTZ(END_DATE || ' 23:59:59')
            AND qh.query_type NOT IN ('DESCRIBE', 'SHOW', 'USE')
            AND qh.user_name IS NOT NULL
            AND qh.total_elapsed_time >= 2000  -- minimum 2 seconds
    ),
    -- Per-user sample queries per anti-pattern category. Each list is capped
    -- at 5 entries so the OBJECT stays well under Snowflake's 16MB VARIANT
    -- limit even for users with thousands of flagged queries.
    sample_queries AS (
        SELECT
            user_name,
            OBJECT_CONSTRUCT(
                'over_provisioned', ARRAY_SLICE(ARRAY_AGG(
                    CASE WHEN over_provisioned = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text, -- full query text
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY start_time DESC), 0, 5),
                'peak_hour_long_running', ARRAY_SLICE(ARRAY_AGG(
                    CASE WHEN peak_hour_long_running = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY execution_time_ms DESC), 0, 5),
                'select_star', ARRAY_SLICE(ARRAY_AGG(
                    CASE WHEN select_star = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY bytes_scanned DESC), 0, 5),
                'spilled', ARRAY_SLICE(ARRAY_AGG(
                    CASE WHEN spilled = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_spilled_to_local_storage', bytes_spilled_to_local_storage,
                            'bytes_spilled_to_remote_storage', bytes_spilled_to_remote_storage,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY (bytes_spilled_to_local_storage + bytes_spilled_to_remote_storage) DESC), 0, 5),
                'slow_query', ARRAY_SLICE(ARRAY_AGG(
                    CASE WHEN slow_query = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY execution_time_ms DESC), 0, 5)
            ) AS query_samples
        FROM query_flags
        GROUP BY user_name
    ),
    -- Per-user counters and aggregates over the flagged queries.
    user_metrics AS (
        SELECT
            user_name,
            COUNT(DISTINCT query_id) AS total_queries,
            COUNT(DISTINCT warehouse_name) AS warehouses_used,
            COUNT(DISTINCT database_name) AS databases_accessed,
            ROUND(SUM(credits_used_cloud_services), 2) AS total_credits,
            ROUND(AVG(execution_time_ms), 2) AS avg_execution_time_ms,
            ROUND(AVG(NULLIF(bytes_scanned, 0) / NULLIF(rows_produced, 0)), 2) AS avg_bytes_per_row,
            ROUND(SUM(bytes_scanned) / POWER(1024, 3), 2) AS total_data_scanned_gb,
            ROUND(SUM(failed_cancelled) * 100.0 / NULLIF(COUNT(*), 0), 2) AS failure_cancellation_rate_pct,
            SUM(spilled) AS spilled_queries,
            SUM(over_provisioned) AS over_provisioned_queries,
            SUM(peak_hour_long_running) AS peak_hour_long_running_queries,
            SUM(select_star) AS select_star_queries,
            SUM(unpartitioned_scan) AS unpartitioned_scan_queries,
            -- Queries sharing a hash with an earlier query count as repeats.
            COUNT(*) - COUNT(DISTINCT query_hash) AS repeated_queries,
            SUM(complex_query) AS complex_join_queries,
            SUM(zero_result_query) AS zero_result_queries,
            SUM(high_compile_time) AS high_compile_queries,
            SUM(untagged_query) AS untagged_queries,
            SUM(unlimited_order_by) AS unlimited_order_by_queries,
            SUM(slow_query) AS slow_queries,
            SUM(expensive_distinct) AS expensive_distinct_queries,
            SUM(inefficient_like) AS inefficient_like_queries,
            SUM(no_results_with_scan) AS no_results_with_scan_queries,
            SUM(high_compile_ratio) AS high_compile_ratio_queries
        FROM query_flags
        GROUP BY user_name
    ),
    -- Cross-user maxima used to scale each counter onto 1-100, plus the p90
    -- credit spend used for COST_STATUS.
    normalization_stats AS (
        SELECT
            MAX(spilled_queries) AS max_spilled,
            MAX(over_provisioned_queries) AS max_over_provisioned,
            MAX(peak_hour_long_running_queries) AS max_peak_hour,
            MAX(select_star_queries) AS max_select_star,
            MAX(unpartitioned_scan_queries) AS max_unpartitioned,
            MAX(repeated_queries) AS max_repeated,
            MAX(complex_join_queries) AS max_complex,
            MAX(zero_result_queries) AS max_zero_result,
            MAX(high_compile_queries) AS max_high_compile,
            MAX(slow_queries) AS max_slow,
            MAX(expensive_distinct_queries) AS max_expensive_distinct,
            MAX(inefficient_like_queries) AS max_inefficient_like,
            MAX(no_results_with_scan_queries) AS max_no_results_scan,
            MAX(high_compile_ratio_queries) AS max_high_compile_ratio,
            PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY total_credits) AS cost_p90
        FROM user_metrics
    )
    SELECT
        um.user_name,
        um.total_queries,
        um.warehouses_used,
        um.databases_accessed,
        um.total_credits,
        um.avg_execution_time_ms,
        um.avg_bytes_per_row,
        um.total_data_scanned_gb,
        um.failure_cancellation_rate_pct,
        um.spilled_queries,
        um.over_provisioned_queries,
        um.peak_hour_long_running_queries,
        um.select_star_queries,
        um.unpartitioned_scan_queries,
        um.repeated_queries,
        um.complex_join_queries,
        um.zero_result_queries,
        um.high_compile_queries,
        um.untagged_queries,
        um.unlimited_order_by_queries,
        um.slow_queries,
        um.expensive_distinct_queries,
        um.inefficient_like_queries,
        um.no_results_with_scan_queries,
        um.high_compile_ratio_queries,
        -- Normalized scores (1-100); NULLIF guards division when a max is 0,
        -- yielding NULL for that score rather than an error.
        GREATEST(1, LEAST(100, ROUND((um.spilled_queries * 100.0) / NULLIF(ns.max_spilled, 0), 2))) AS spilled_score,
        GREATEST(1, LEAST(100, ROUND((um.over_provisioned_queries * 100.0) / NULLIF(ns.max_over_provisioned, 0), 2))) AS over_provisioned_score,
        GREATEST(1, LEAST(100, ROUND((um.peak_hour_long_running_queries * 100.0) / NULLIF(ns.max_peak_hour, 0), 2))) AS peak_hour_score,
        GREATEST(1, LEAST(100, ROUND((um.select_star_queries * 100.0) / NULLIF(ns.max_select_star, 0), 2))) AS select_star_score,
        GREATEST(1, LEAST(100, ROUND((um.unpartitioned_scan_queries * 100.0) / NULLIF(ns.max_unpartitioned, 0), 2))) AS unpartitioned_score,
        GREATEST(1, LEAST(100, ROUND((um.repeated_queries * 100.0) / NULLIF(ns.max_repeated, 0), 2))) AS repeated_score,
        GREATEST(1, LEAST(100, ROUND((um.complex_join_queries * 100.0) / NULLIF(ns.max_complex, 0), 2))) AS complex_join_score,
        GREATEST(1, LEAST(100, ROUND((um.zero_result_queries * 100.0) / NULLIF(ns.max_zero_result, 0), 2))) AS zero_result_score,
        GREATEST(1, LEAST(100, ROUND((um.high_compile_queries * 100.0) / NULLIF(ns.max_high_compile, 0), 2))) AS high_compile_score,
        GREATEST(1, LEAST(100, ROUND((um.slow_queries * 100.0) / NULLIF(ns.max_slow, 0), 2))) AS slow_query_score,
        GREATEST(1, LEAST(100, ROUND((um.expensive_distinct_queries * 100.0) / NULLIF(ns.max_expensive_distinct, 0), 2))) AS expensive_distinct_score,
        GREATEST(1, LEAST(100, ROUND((um.inefficient_like_queries * 100.0) / NULLIF(ns.max_inefficient_like, 0), 2))) AS inefficient_like_score,
        GREATEST(1, LEAST(100, ROUND((um.no_results_with_scan_queries * 100.0) / NULLIF(ns.max_no_results_scan, 0), 2))) AS no_results_scan_score,
        GREATEST(1, LEAST(100, ROUND((um.high_compile_ratio_queries * 100.0) / NULLIF(ns.max_high_compile_ratio, 0), 2))) AS high_compile_ratio_score,
        -- Overall score: weighted blend of the eight highest-impact signals
        -- (weights sum to 1.0).
        GREATEST(1, LEAST(100, ROUND(
            (GREATEST(1, LEAST(100, (um.spilled_queries * 100.0) / NULLIF(ns.max_spilled, 0))) * 0.15 +
             GREATEST(1, LEAST(100, (um.over_provisioned_queries * 100.0) / NULLIF(ns.max_over_provisioned, 0))) * 0.12 +
             GREATEST(1, LEAST(100, (um.peak_hour_long_running_queries * 100.0) / NULLIF(ns.max_peak_hour, 0))) * 0.12 +
             GREATEST(1, LEAST(100, (um.select_star_queries * 100.0) / NULLIF(ns.max_select_star, 0))) * 0.10 +
             GREATEST(1, LEAST(100, (um.unpartitioned_scan_queries * 100.0) / NULLIF(ns.max_unpartitioned, 0))) * 0.12 +
             GREATEST(1, LEAST(100, (um.repeated_queries * 100.0) / NULLIF(ns.max_repeated, 0))) * 0.08 +
             GREATEST(1, LEAST(100, (um.zero_result_queries * 100.0) / NULLIF(ns.max_zero_result, 0))) * 0.15 +
             GREATEST(1, LEAST(100, (um.slow_queries * 100.0) / NULLIF(ns.max_slow, 0))) * 0.16), 2
        ))) AS overall_optimization_score,
        CASE
            WHEN um.total_credits > ns.cost_p90 THEN 'High Cost'
            ELSE 'Normal'
        END AS cost_status,
        -- ARRAY_CONSTRUCT_COMPACT drops the NULLs from non-matching CASEs.
        ARRAY_CONSTRUCT_COMPACT(
            CASE WHEN um.spilled_queries > 5 THEN 'Optimize memory usage or increase warehouse size.' END,
            CASE WHEN um.over_provisioned_queries > 5 THEN 'Use smaller warehouses for simple queries.' END,
            CASE WHEN um.peak_hour_long_running_queries > 5 THEN 'Schedule long queries off-peak.' END,
            CASE WHEN um.select_star_queries > 5 THEN 'Specify columns instead of SELECT *.' END,
            CASE WHEN um.unpartitioned_scan_queries > 5 THEN 'Implement partitioning or clustering.' END,
            CASE WHEN um.repeated_queries > 10 THEN 'Review frequently executed queries.' END,
            CASE WHEN um.zero_result_queries > 3 THEN 'Avoid queries that return no data.' END,
            CASE WHEN um.slow_queries > 3 THEN 'Optimize slow-running queries.' END,
            CASE WHEN um.expensive_distinct_queries > 3 THEN 'Replace DISTINCT with GROUP BY where possible.' END,
            CASE WHEN um.no_results_with_scan_queries > 3 THEN 'Add WHERE clauses to prevent unnecessary scans.' END
        ) AS recommendations,
        COALESCE(sq.query_samples, OBJECT_CONSTRUCT()) AS query_samples,
        CURRENT_TIMESTAMP() AS analysis_date
    FROM user_metrics um
    CROSS JOIN normalization_stats ns
    LEFT JOIN sample_queries sq ON um.user_name = sq.user_name;
    -- (The original ended with ORDER BY inside the INSERT ... SELECT, which has
    -- no effect on stored rows and only adds a sort; removed.)

    RETURN 'Analysis refreshed successfully for ' || START_DATE || ' to ' || END_DATE;
END;
$$;

-- Step 3: Query to refresh and view results
-- Usage example:
-- CALL REFRESH_QUERY_PERFORMANCE_ANALYSIS('2024-01-01', '2024-01-31');

-- Step 4: Debug query to check data availability
-- Run this first to see if there's any data in your date range
-- Sanity-check view: run before the analyzer to confirm query_history has
-- usable rows in the last 30 days.
CREATE OR REPLACE VIEW DEBUG_QUERY_DATA AS
SELECT
    COUNT(*) AS total_records,
    COUNT(DISTINCT user_name) AS unique_users,
    MIN(start_time) AS earliest_query,
    MAX(start_time) AS latest_query,
    COUNT(CASE WHEN total_elapsed_time >= 2000 THEN 1 END) AS queries_over_2sec,
    COUNT(CASE WHEN query_type NOT IN ('DESCRIBE', 'SHOW', 'USE') THEN 1 END) AS non_metadata_queries,
    -- Deterministic ordering added so repeated runs compare cleanly.
    ARRAY_AGG(DISTINCT query_type) WITHIN GROUP (ORDER BY query_type) AS query_types_found,
    -- FIX: the original aggregated EVERY user into "sample_users"; cap at 10
    -- so it is actually a sample (and bounded on large accounts).
    ARRAY_SLICE(ARRAY_AGG(DISTINCT user_name) WITHIN GROUP (ORDER BY user_name), 0, 10) AS sample_users
FROM snowflake.account_usage.query_history
WHERE start_time >= CURRENT_DATE - 30  -- last 30 days
  AND user_name IS NOT NULL;

-- Step 5: Simple test query to verify data
-- Quick data-availability probe for a date window; returns a summary OBJECT.
-- FIXES vs. original: body was delimited with single '$' (syntax error) instead
-- of '$$'; LIMIT inside WITHIN GROUP (ORDER BY ...) is not valid Snowflake
-- syntax (replaced with ARRAY_SLICE); SELECT ... INTO uses the scripting bind
-- form ':result'.
CREATE OR REPLACE PROCEDURE TEST_QUERY_DATA(
    START_DATE STRING,
    END_DATE STRING
)
RETURNS VARIANT
LANGUAGE SQL
AS
$$
BEGIN
    LET result VARIANT;

    SELECT
        OBJECT_CONSTRUCT(
            'total_queries', COUNT(*),
            'unique_users', COUNT(DISTINCT user_name),
            'date_range_start', MIN(start_time),
            'date_range_end', MAX(start_time),
            'queries_over_2sec', COUNT(CASE WHEN total_elapsed_time >= 2000 THEN 1 END),
            -- Cap the user list at 10 entries via ARRAY_SLICE.
            'sample_users', ARRAY_SLICE(ARRAY_AGG(DISTINCT user_name) WITHIN GROUP (ORDER BY user_name), 0, 10),
            'query_types', ARRAY_AGG(DISTINCT query_type) WITHIN GROUP (ORDER BY query_type)
        )
    INTO :result
    FROM snowflake.account_usage.query_history
    WHERE start_time >= TRY_TO_TIMESTAMP_NTZ(START_DATE)
      AND start_time <= TRY_TO_TIMESTAMP_NTZ(END_DATE || ' 23:59:59')
      AND user_name IS NOT NULL;

    RETURN result;
END;
$$;

-- Step 6: View the results
-- SELECT * FROM QUERY_PERFORMANCE_ANALYSIS ORDER BY OVERALL_OPTIMIZATION_SCORE DESC;
-- (end of first listing; the article repeats the same script below)
-- =============================================
-- OPTIMIZED SNOWFLAKE QUERY PERFORMANCE ANALYZER
-- =============================================

-- Step 1: Create materialized table for refreshable data
-- Snapshot table populated by REFRESH_QUERY_PERFORMANCE_ANALYSIS.
-- (Duplicate listing of the same DDL from the article body.)
-- NOTE(review): column order must stay in sync with the SELECT list of the
-- refresh procedure's INSERT — do not reorder or insert columns mid-table.
CREATE OR REPLACE TABLE QUERY_PERFORMANCE_ANALYSIS (
    -- Identity and per-user volume/cost aggregates
    USER_NAME VARCHAR(255),
    TOTAL_QUERIES NUMBER,
    WAREHOUSES_USED NUMBER,
    DATABASES_ACCESSED NUMBER,
    TOTAL_CREDITS NUMBER(10,2),           -- cloud-services credits only (see procedure)
    AVG_EXECUTION_TIME_MS NUMBER(10,2),
    AVG_BYTES_PER_ROW NUMBER(15,2),
    TOTAL_DATA_SCANNED_GB NUMBER(15,2),
    FAILURE_CANCELLATION_RATE_PCT NUMBER(5,2),
    -- Raw anti-pattern counters: number of this user's queries matching each heuristic
    SPILLED_QUERIES NUMBER,
    OVER_PROVISIONED_QUERIES NUMBER,
    PEAK_HOUR_LONG_RUNNING_QUERIES NUMBER,
    SELECT_STAR_QUERIES NUMBER,
    UNPARTITIONED_SCAN_QUERIES NUMBER,
    REPEATED_QUERIES NUMBER,
    COMPLEX_JOIN_QUERIES NUMBER,
    ZERO_RESULT_QUERIES NUMBER,
    HIGH_COMPILE_QUERIES NUMBER,
    UNTAGGED_QUERIES NUMBER,
    UNLIMITED_ORDER_BY_QUERIES NUMBER,
    SLOW_QUERIES NUMBER,
    EXPENSIVE_DISTINCT_QUERIES NUMBER,
    INEFFICIENT_LIKE_QUERIES NUMBER,
    NO_RESULTS_WITH_SCAN_QUERIES NUMBER,
    HIGH_COMPILE_RATIO_QUERIES NUMBER,
    -- Normalized scores (1-100): each counter scaled against the worst user's count
    SPILLED_SCORE NUMBER(5,2),
    OVER_PROVISIONED_SCORE NUMBER(5,2),
    PEAK_HOUR_SCORE NUMBER(5,2),
    SELECT_STAR_SCORE NUMBER(5,2),
    UNPARTITIONED_SCORE NUMBER(5,2),
    REPEATED_SCORE NUMBER(5,2),
    COMPLEX_JOIN_SCORE NUMBER(5,2),
    ZERO_RESULT_SCORE NUMBER(5,2),
    HIGH_COMPILE_SCORE NUMBER(5,2),
    SLOW_QUERY_SCORE NUMBER(5,2),
    EXPENSIVE_DISTINCT_SCORE NUMBER(5,2),
    INEFFICIENT_LIKE_SCORE NUMBER(5,2),
    NO_RESULTS_SCAN_SCORE NUMBER(5,2),
    HIGH_COMPILE_RATIO_SCORE NUMBER(5,2),
    OVERALL_OPTIMIZATION_SCORE NUMBER(5,2), -- 1 = optimized, 100 = most unoptimized
    COST_STATUS VARCHAR(50),                -- 'High Cost' above the p90 credit spend, else 'Normal'
    RECOMMENDATIONS ARRAY,                  -- human-readable tuning suggestions
    QUERY_SAMPLES OBJECT,                   -- per-category sample queries (full text)
    ANALYSIS_DATE TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP()
);

-- Step 2: Create procedure to refresh the analysis
CREATE OR REPLACE PROCEDURE REFRESH_QUERY_PERFORMANCE_ANALYSIS(
    START_DATE STRING,
    END_DATE STRING,
    OBJECT_FILTER STRING DEFAULT ''
)
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
    -- Clear existing data
    DELETE FROM QUERY_PERFORMANCE_ANALYSIS;

    -- Insert new analysis
    INSERT INTO QUERY_PERFORMANCE_ANALYSIS
    WITH percentile_reference AS (
        SELECT
            warehouse_size,
            PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY bytes_scanned) AS bytes_scanned_p10,
            PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY total_elapsed_time) AS execution_time_p90
        FROM snowflake.account_usage.query_history
        WHERE start_time >= TRY_TO_TIMESTAMP_NTZ(START_DATE)
          AND start_time <= TRY_TO_TIMESTAMP_NTZ(END_DATE || ' 23:59:59')
          AND OBJECT_FILTER = '' OR OBJECT_FILTER != ''  -- Dynamic filter placeholder
          AND user_name IS NOT NULL
          AND query_type NOT IN ('DESCRIBE', 'SHOW', 'USE', 'CREATE', 'DROP', 'ALTER', 'GRANT', 'REVOKE')
          AND total_elapsed_time >= 2000  -- Minimum 2 seconds
        GROUP BY warehouse_size
    ),
    query_flags AS (
        SELECT
            qh.query_id,
            qh.warehouse_size,
            qh.bytes_scanned,
            qh.total_elapsed_time AS execution_time_ms,
            qh.compilation_time,
            qh.start_time,
            qh.query_text, -- Full query text, no truncation
            qh.query_hash,
            qh.query_tag,
            qh.error_code,
            qh.execution_status,
            qh.partitions_scanned,
            qh.partitions_total,
            qh.bytes_spilled_to_local_storage,
            qh.bytes_spilled_to_remote_storage,
            qh.user_name,
            qh.database_name,
            qh.schema_name,
            qh.warehouse_name,
            COALESCE(qh.credits_used_cloud_services, 0) AS credits_used_cloud_services,
            qh.rows_produced,
            -- Optimized flag calculations
            CASE
                WHEN qh.warehouse_size IN ('MEDIUM', 'LARGE', 'X-LARGE', '2X-LARGE', '3X-LARGE', '4X-LARGE')
                 AND qh.bytes_scanned < pr.bytes_scanned_p10 THEN 1 ELSE 0
            END AS over_provisioned,
            CASE
                WHEN EXTRACT(HOUR FROM qh.start_time) BETWEEN 9 AND 17
                 AND qh.total_elapsed_time > 300000 THEN 1 ELSE 0
            END AS peak_hour_long_running,
            CASE
                WHEN UPPER(qh.query_text) LIKE 'SELECT *%' THEN 1 ELSE 0
            END AS select_star,
            CASE
                WHEN qh.partitions_total > 0 AND qh.partitions_scanned = qh.partitions_total THEN 1 ELSE 0
            END AS unpartitioned_scan,
            CASE
                WHEN qh.query_hash IS NOT NULL THEN 1 ELSE 0
            END AS repeated_query,
            CASE
                WHEN (UPPER(qh.query_text) LIKE '%JOIN%' AND UPPER(qh.query_text) LIKE '%JOIN%')
                   OR UPPER(qh.query_text) LIKE '%WINDOW%' THEN 1 ELSE 0
            END AS complex_query,
            CASE
                WHEN qh.error_code IS NOT NULL OR qh.execution_status IN ('FAILED', 'CANCELLED') THEN 1 ELSE 0
            END AS failed_cancelled,
            CASE
                WHEN qh.bytes_spilled_to_local_storage > 0 OR qh.bytes_spilled_to_remote_storage > 0 THEN 1 ELSE 0
            END AS spilled,
            CASE
                WHEN qh.rows_produced = 0 AND qh.bytes_scanned > 1000000 THEN 1 ELSE 0
            END AS zero_result_query,
            CASE
                WHEN qh.compilation_time > 5000 THEN 1 ELSE 0
            END AS high_compile_time,
            CASE
                WHEN qh.query_tag IS NULL THEN 1 ELSE 0
            END AS untagged_query,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%ORDER BY%' AND UPPER(qh.query_text) NOT LIKE '%LIMIT%' THEN 1 ELSE 0
            END AS unlimited_order_by,
            CASE
                WHEN qh.total_elapsed_time > pr.execution_time_p90 THEN 1 ELSE 0
            END AS slow_query,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%DISTINCT%' AND qh.bytes_scanned > 100000000 THEN 1 ELSE 0
            END AS expensive_distinct,
            CASE
                WHEN UPPER(qh.query_text) LIKE '%LIKE%' AND UPPER(qh.query_text) NOT LIKE '%INDEX%' THEN 1 ELSE 0
            END AS inefficient_like,
            CASE
                WHEN qh.bytes_scanned > 0 AND qh.rows_produced = 0 THEN 1 ELSE 0
            END AS no_results_with_scan,
            CASE
                WHEN qh.total_elapsed_time > 0 AND qh.compilation_time / qh.total_elapsed_time > 0.5 THEN 1 ELSE 0
            END AS high_compile_ratio
        FROM snowflake.account_usage.query_history qh
        LEFT JOIN percentile_reference pr ON qh.warehouse_size = pr.warehouse_size
        WHERE
            qh.start_time >= TRY_TO_TIMESTAMP_NTZ(START_DATE)
            AND qh.start_time <= TRY_TO_TIMESTAMP_NTZ(END_DATE || ' 23:59:59')
            AND OBJECT_FILTER = '' OR OBJECT_FILTER != ''  -- Dynamic filter placeholder
            AND qh.query_type NOT IN ('DESCRIBE', 'SHOW', 'USE', 'CREATE', 'DROP', 'ALTER', 'GRANT', 'REVOKE')
            AND qh.user_name IS NOT NULL
            AND qh.total_elapsed_time >= 2000  -- Minimum 2 seconds
    ),
    -- Optimized sample queries collection
    sample_queries AS (
        SELECT
            user_name,
            OBJECT_CONSTRUCT(
                'over_provisioned', ARRAY_AGG(
                    CASE WHEN over_provisioned = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text, -- Full query text
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        ) 
                    END
                ) WITHIN GROUP (ORDER BY start_time DESC),
                'peak_hour_long_running', ARRAY_AGG(
                    CASE WHEN peak_hour_long_running = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY execution_time_ms DESC),
                'select_star', ARRAY_AGG(
                    CASE WHEN select_star = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY bytes_scanned DESC),
                'spilled', ARRAY_AGG(
                    CASE WHEN spilled = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_spilled_to_local_storage', bytes_spilled_to_local_storage,
                            'bytes_spilled_to_remote_storage', bytes_spilled_to_remote_storage,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY (bytes_spilled_to_local_storage + bytes_spilled_to_remote_storage) DESC),
                'slow_query', ARRAY_AGG(
                    CASE WHEN slow_query = 1 THEN
                        OBJECT_CONSTRUCT(
                            'query_id', query_id,
                            'query_text', query_text,
                            'execution_time_ms', execution_time_ms,
                            'bytes_scanned', bytes_scanned,
                            'warehouse_size', warehouse_size,
                            'start_time', start_time
                        )
                    END
                ) WITHIN GROUP (ORDER BY execution_time_ms DESC)
            ) AS query_samples
        FROM query_flags
        GROUP BY user_name
    ),
    -- Per-user roll-up of activity metrics and anti-pattern flag counts.
    -- Each SUM(<flag>) assumes the flag columns in query_flags are 0/1
    -- indicators -- TODO confirm against the upstream flag definitions.
    user_metrics AS (
        SELECT
            user_name,
            COUNT(DISTINCT query_id) AS total_queries,
            COUNT(DISTINCT warehouse_name) AS warehouses_used,
            COUNT(DISTINCT database_name) AS databases_accessed,
            -- NOTE(review): cloud-services credits only; warehouse compute
            -- credits are not included in this "total" despite the name.
            ROUND(SUM(credits_used_cloud_services), 2) AS total_credits,
            ROUND(AVG(execution_time_ms), 2) AS avg_execution_time_ms,
            -- NULLIF on the divisor guards divide-by-zero; NULLIF on
            -- bytes_scanned also turns zero-byte scans into NULL, which AVG
            -- then ignores -- presumably intentional, but worth confirming.
            ROUND(AVG(NULLIF(bytes_scanned, 0) / NULLIF(rows_produced, 0)), 2) AS avg_bytes_per_row,
            -- POWER(1024, 3) converts bytes to GiB.
            ROUND(SUM(bytes_scanned) / POWER(1024, 3), 2) AS total_data_scanned_gb,
            -- NOTE(review): denominator is the raw row count (COUNT(*)),
            -- which can differ from total_queries (COUNT(DISTINCT query_id))
            -- if query_flags carries more than one row per query.
            ROUND(SUM(failed_cancelled) * 100.0 / NULLIF(COUNT(*), 0), 2) AS failure_cancellation_rate_pct,
            SUM(spilled) AS spilled_queries,
            SUM(over_provisioned) AS over_provisioned_queries,
            SUM(peak_hour_long_running) AS peak_hour_long_running_queries,
            SUM(select_star) AS select_star_queries,
            SUM(unpartitioned_scan) AS unpartitioned_scan_queries,
            -- Rows whose query_hash repeats: total rows minus distinct hashes.
            COUNT(*) - COUNT(DISTINCT query_hash) AS repeated_queries,
            SUM(complex_query) AS complex_join_queries,
            SUM(zero_result_query) AS zero_result_queries,
            SUM(high_compile_time) AS high_compile_queries,
            SUM(untagged_query) AS untagged_queries,
            SUM(unlimited_order_by) AS unlimited_order_by_queries,
            SUM(slow_query) AS slow_queries,
            SUM(expensive_distinct) AS expensive_distinct_queries,
            SUM(inefficient_like) AS inefficient_like_queries,
            SUM(no_results_with_scan) AS no_results_with_scan_queries,
            SUM(high_compile_ratio) AS high_compile_ratio_queries
        FROM query_flags
        GROUP BY user_name
    ),
    -- One-row CTE of cohort-wide normalization anchors: the per-flag MAX
    -- across all users becomes the 100-point mark for each normalized score,
    -- and the 90th percentile of credits drives the 'High Cost' label.
    normalization_stats AS (
        SELECT
            MAX(spilled_queries) AS max_spilled,
            MAX(over_provisioned_queries) AS max_over_provisioned,
            MAX(peak_hour_long_running_queries) AS max_peak_hour,
            MAX(select_star_queries) AS max_select_star,
            MAX(unpartitioned_scan_queries) AS max_unpartitioned,
            MAX(repeated_queries) AS max_repeated,
            MAX(complex_join_queries) AS max_complex,
            MAX(zero_result_queries) AS max_zero_result,
            MAX(high_compile_queries) AS max_high_compile,
            MAX(slow_queries) AS max_slow,
            MAX(expensive_distinct_queries) AS max_expensive_distinct,
            MAX(inefficient_like_queries) AS max_inefficient_like,
            MAX(no_results_with_scan_queries) AS max_no_results_scan,
            MAX(high_compile_ratio_queries) AS max_high_compile_ratio,
            -- Continuous (interpolated) 90th percentile of per-user credits.
            PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY total_credits) AS cost_p90
        FROM user_metrics
    )
    -- Final projection: one row per user combining raw metrics, normalized
    -- scores, an overall weighted score, cost status, recommendations, and
    -- sample queries. Column order is expected to match the
    -- QUERY_PERFORMANCE_ANALYSIS target table -- do not reorder.
    SELECT
        um.user_name,
        um.total_queries,
        um.warehouses_used,
        um.databases_accessed,
        um.total_credits,
        um.avg_execution_time_ms,
        um.avg_bytes_per_row,
        um.total_data_scanned_gb,
        um.failure_cancellation_rate_pct,
        um.spilled_queries,
        um.over_provisioned_queries,
        um.peak_hour_long_running_queries,
        um.select_star_queries,
        um.unpartitioned_scan_queries,
        um.repeated_queries,
        um.complex_join_queries,
        um.zero_result_queries,
        um.high_compile_queries,
        um.untagged_queries,
        um.unlimited_order_by_queries,
        um.slow_queries,
        um.expensive_distinct_queries,
        um.inefficient_like_queries,
        um.no_results_with_scan_queries,
        um.high_compile_ratio_queries,
        -- Normalized scores (1-100 scale): each user's flag count relative to
        -- the cohort's worst offender (MAX). COALESCE(..., 0) fixes the
        -- max = 0 case: NULLIF makes the divisor NULL, and Snowflake's
        -- GREATEST/LEAST return NULL when any argument is NULL, so without
        -- the guard the score came out NULL instead of the optimal 1.
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.spilled_queries * 100.0) / NULLIF(ns.max_spilled, 0), 0), 2))) AS spilled_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.over_provisioned_queries * 100.0) / NULLIF(ns.max_over_provisioned, 0), 0), 2))) AS over_provisioned_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.peak_hour_long_running_queries * 100.0) / NULLIF(ns.max_peak_hour, 0), 0), 2))) AS peak_hour_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.select_star_queries * 100.0) / NULLIF(ns.max_select_star, 0), 0), 2))) AS select_star_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.unpartitioned_scan_queries * 100.0) / NULLIF(ns.max_unpartitioned, 0), 0), 2))) AS unpartitioned_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.repeated_queries * 100.0) / NULLIF(ns.max_repeated, 0), 0), 2))) AS repeated_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.complex_join_queries * 100.0) / NULLIF(ns.max_complex, 0), 0), 2))) AS complex_join_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.zero_result_queries * 100.0) / NULLIF(ns.max_zero_result, 0), 0), 2))) AS zero_result_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.high_compile_queries * 100.0) / NULLIF(ns.max_high_compile, 0), 0), 2))) AS high_compile_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.slow_queries * 100.0) / NULLIF(ns.max_slow, 0), 0), 2))) AS slow_query_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.expensive_distinct_queries * 100.0) / NULLIF(ns.max_expensive_distinct, 0), 0), 2))) AS expensive_distinct_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.inefficient_like_queries * 100.0) / NULLIF(ns.max_inefficient_like, 0), 0), 2))) AS inefficient_like_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.no_results_with_scan_queries * 100.0) / NULLIF(ns.max_no_results_scan, 0), 0), 2))) AS no_results_scan_score,
        GREATEST(1, LEAST(100, ROUND(COALESCE((um.high_compile_ratio_queries * 100.0) / NULLIF(ns.max_high_compile_ratio, 0), 0), 2))) AS high_compile_ratio_score,
        -- Overall optimization score: weighted blend of eight signals
        -- (weights sum to 1.00). 1 = optimized, 100 = most unoptimized.
        -- Each term is clamped and NULL-guarded like the individual scores
        -- so a single max = 0 signal cannot NULL the whole result.
        GREATEST(1, LEAST(100, ROUND(
            (GREATEST(1, LEAST(100, COALESCE((um.spilled_queries * 100.0) / NULLIF(ns.max_spilled, 0), 0))) * 0.15 +
             GREATEST(1, LEAST(100, COALESCE((um.over_provisioned_queries * 100.0) / NULLIF(ns.max_over_provisioned, 0), 0))) * 0.12 +
             GREATEST(1, LEAST(100, COALESCE((um.peak_hour_long_running_queries * 100.0) / NULLIF(ns.max_peak_hour, 0), 0))) * 0.12 +
             GREATEST(1, LEAST(100, COALESCE((um.select_star_queries * 100.0) / NULLIF(ns.max_select_star, 0), 0))) * 0.10 +
             GREATEST(1, LEAST(100, COALESCE((um.unpartitioned_scan_queries * 100.0) / NULLIF(ns.max_unpartitioned, 0), 0))) * 0.12 +
             GREATEST(1, LEAST(100, COALESCE((um.repeated_queries * 100.0) / NULLIF(ns.max_repeated, 0), 0))) * 0.08 +
             GREATEST(1, LEAST(100, COALESCE((um.zero_result_queries * 100.0) / NULLIF(ns.max_zero_result, 0), 0))) * 0.15 +
             GREATEST(1, LEAST(100, COALESCE((um.slow_queries * 100.0) / NULLIF(ns.max_slow, 0), 0))) * 0.16), 2
        ))) AS overall_optimization_score,
        -- Flag users whose cloud-services credits exceed the cohort P90.
        CASE
            WHEN um.total_credits > ns.cost_p90 THEN 'High Cost'
            ELSE 'Normal'
        END AS cost_status,
        -- ARRAY_CONSTRUCT_COMPACT drops the NULLs produced by CASE branches
        -- that do not fire, so only triggered recommendations remain.
        ARRAY_CONSTRUCT_COMPACT(
            CASE WHEN um.spilled_queries > 5 THEN 'Optimize memory usage or increase warehouse size.' END,
            CASE WHEN um.over_provisioned_queries > 5 THEN 'Use smaller warehouses for simple queries.' END,
            CASE WHEN um.peak_hour_long_running_queries > 5 THEN 'Schedule long queries off-peak.' END,
            CASE WHEN um.select_star_queries > 5 THEN 'Specify columns instead of SELECT *.' END,
            CASE WHEN um.unpartitioned_scan_queries > 5 THEN 'Implement partitioning or clustering.' END,
            CASE WHEN um.repeated_queries > 10 THEN 'Review frequently executed queries.' END,
            CASE WHEN um.zero_result_queries > 3 THEN 'Avoid queries that return no data.' END,
            CASE WHEN um.slow_queries > 3 THEN 'Optimize slow-running queries.' END,
            CASE WHEN um.expensive_distinct_queries > 3 THEN 'Replace DISTINCT with GROUP BY where possible.' END,
            CASE WHEN um.no_results_with_scan_queries > 3 THEN 'Add WHERE clauses to prevent unnecessary scans.' END
        ) AS recommendations,
        -- Users with no flagged samples still get a non-NULL (empty) OBJECT.
        COALESCE(sq.query_samples, OBJECT_CONSTRUCT()) AS query_samples,
        CURRENT_TIMESTAMP() AS analysis_date
    FROM user_metrics um
    -- normalization_stats is a one-row aggregate, so the cross join is safe.
    CROSS JOIN normalization_stats ns
    LEFT JOIN sample_queries sq ON um.user_name = sq.user_name
    -- NOTE(review): if this SELECT feeds an INSERT, the ORDER BY has no
    -- effect on stored rows; it only matters when the query is run ad hoc.
    ORDER BY overall_optimization_score DESC;

    RETURN 'Analysis refreshed successfully for ' || START_DATE || ' to ' || END_DATE;
END;
$$;

-- Step 3: Query to refresh and view results
-- Usage example:
-- CALL REFRESH_QUERY_PERFORMANCE_ANALYSIS('2024-01-01', '2024-01-31');

-- Step 4: View the results
-- SELECT * FROM QUERY_PERFORMANCE_ANALYSIS ORDER BY OVERALL_OPTIMIZATION_SCORE DESC;