DEV Community

Alain Airom
Alain Airom

Posted on

Testing qwen3-vl… quite impressive!

Rapid test using qwen3 vision language

Image from https://ollama.com/library/qwen3-vl

Introduction

Vision Language Models — TL;DR

Vision-Language Models (VLMs) represent a significant advancement in artificial intelligence, successfully bridging the gap between textual and visual understanding by allowing models to process and reason across both modalities simultaneously. These multimodal capabilities are crucial for tasks ranging from detailed image captioning and visual question answering to complex document analysis.


My idea to dive into this analysis came from an email from the “Ollama blog” highlighting the capabilities of the qwen3-vl model. Eager to test this state-of-the-art model myself, I enhanced the sample code from Ollama’s site to test it on my own.

Test and Results

  • First, prepare the environment 🔪
python3 -m venv venv
source venv/bin/activate

pip install --upgrade pip
pip install ollama

Enter fullscreen mode Exit fullscreen mode
  • Pull the model from Ollama;
ollama run qwen3-vl:235b-cloud
Enter fullscreen mode Exit fullscreen mode
  • I downloaded the 3 images from the blog post (2 provided here) 🖼️

  • Then I made two versions of the code: one that processes all the images recursively, and another that processes any given image.
# image_analyzer_auto.py
import os
import time
from pathlib import Path
from typing import List
import re
import hashlib
import sys

try:
    from ollama import chat, ChatResponse
except ImportError:
    print("Error: The 'ollama' Python library is not installed.", file=sys.stderr)
    print("Please install it using: pip install ollama", file=sys.stderr)
    sys.exit(1)

# --- Configuration ---
# Root folder that is searched recursively for images to analyze.
INPUT_DIR = Path("./input")
# Folder that receives one Markdown report per analyzed image.
OUTPUT_DIR = Path("./output")
# Ollama model tag passed to chat() for every request.
MODEL_NAME = 'qwen3-vl:235b-cloud'
# Prompt sent together with each image; also echoed into the report.
GENERIC_PROMPT = 'Describe this image in detail and summarize its key content.'
# File suffixes (compared lower-cased) that qualify a file as an image.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}


def ensure_output_directory():
    """Create OUTPUT_DIR when it is missing and report what was done."""
    print(f"Checking for output directory: {OUTPUT_DIR}")

    # Guard clause: nothing to create when the path is already there.
    if OUTPUT_DIR.exists():
        print("Output directory already exists.")
        return

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Created output directory: {OUTPUT_DIR.resolve()}")


def find_all_image_files() -> List[Path]:
    """
    Find all image files recursively under the input directory.

    Returns:
        A sorted list of paths below INPUT_DIR that are regular files and
        whose lower-cased suffix is in IMAGE_EXTENSIONS. The list is empty
        when INPUT_DIR does not exist or contains no matching file.
    """
    print(f"Recursively searching for images in: {INPUT_DIR.resolve()}")

    if not INPUT_DIR.exists():
        print(f"Input directory not found: {INPUT_DIR.resolve()}")
        return []

    # Materialized (rather than streamed) so the total entry count can be
    # reported before filtering.
    all_items = list(INPUT_DIR.rglob('*'))
    print(f"DEBUG: rglob found {len(all_items)} total items (files/folders) to examine.")

    # rglob yields entries in arbitrary order; sorting makes the batch order
    # (and therefore the "Image i/N" numbering) reproducible between runs.
    image_paths: List[Path] = sorted(
        path
        for path in all_items
        if path.is_file() and path.suffix.lower() in IMAGE_EXTENSIONS
    )

    print(f"Files found and selected for processing: {len(image_paths)}")
    for selected in image_paths:
        print(f"DEBUG: Selected file path for processing: {selected.resolve()}")

    return image_paths


def _extract_answer(response) -> str:
    """Return the answer text carried by an Ollama chat response."""
    try:
        return response.message.content
    except AttributeError:
        # Fallback for responses that only support mapping-style access.
        return response.get('message', {}).get(
            'content', 'Error: Could not extract content from Ollama response.'
        )


def _build_output_filename(input_image_path: Path, relative_path_str: str) -> str:
    """Return a flat, filesystem-safe report filename for one image."""
    # Fold sub-directories into the name so same-named images from different
    # folders get distinct reports inside the single flat output directory.
    sanitized_name = relative_path_str.replace(os.sep, '__')
    sanitized_name = re.sub(r'[^\w\.\-]', '_', sanitized_name)
    output_filename = f"{sanitized_name}_analysis.md"

    # Overlong names fall back to "<stem>_<short hash>"; the hash of the full
    # relative path keeps the name unique. (255 is presumably chosen to match
    # the usual per-name limit of common filesystems.)
    if len(output_filename) > 255:
        hash_id = hashlib.sha1(relative_path_str.encode('utf-8')).hexdigest()[:10]
        output_filename = f"{input_image_path.stem}_{hash_id}_analysis.md"

    return output_filename


def _render_report(relative_path_str: str, current_timestamp: str,
                   elapsed_time: float, answer_content: str) -> str:
    """Render the Markdown report for a single analyzed image."""
    return f"""# Image Analysis Report

**Model Used:** `{MODEL_NAME}`
**Input File (Path):** `{relative_path_str}`
**User Prompt:** > {GENERIC_PROMPT}

---
## Analysis Metadata
**Analysis Start Time (Local):** `{current_timestamp}`
**Individual Analysis Time:** `{elapsed_time:.2f} seconds`
---

## Generated Answer

{answer_content}

---

*Analysis completed by Ollama.*
"""


def analyze_images_in_batch():
    """
    Analyze every image found under INPUT_DIR and write one report per image.

    For each image the generic prompt is sent to the configured model and the
    answer is saved as an individual Markdown file in OUTPUT_DIR. A failure
    on one image is reported on stderr and does not stop the batch.
    """
    batch_start_time = time.time()

    ensure_output_directory()
    input_image_paths = find_all_image_files()

    if not input_image_paths:
        print("\n--- WARNING ---")
        print(f"No image files found in or under the {INPUT_DIR.resolve()} directory.")
        print("Please place one or more images in the 'input' folder and try again.")
        print("-------------")
        return

    total_images = len(input_image_paths)
    print(f"Found {total_images} image(s) to process.")

    for position, input_image_path in enumerate(input_image_paths, start=1):
        relative_path = input_image_path.relative_to(INPUT_DIR)
        print(f"\n--- Processing Image {position}/{total_images}: {relative_path} ---")

        analysis_start_time = time.time()
        current_timestamp = time.strftime(
            "%Y-%m-%d %H:%M:%S %Z", time.localtime(analysis_start_time)
        )

        messages = [
            {
                'role': 'user',
                'content': GENERIC_PROMPT,
                'images': [str(input_image_path.resolve())]  # Use absolute path for robustness
            },
        ]

        try:
            response: ChatResponse = chat(model=MODEL_NAME, messages=messages)
            answer_content = _extract_answer(response)

            elapsed_time = time.time() - analysis_start_time

            relative_path_str = str(relative_path)
            output_filepath = OUTPUT_DIR / _build_output_filename(
                input_image_path, relative_path_str
            )
            markdown_content = _render_report(
                relative_path_str, current_timestamp, elapsed_time, answer_content
            )

            with open(output_filepath, 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            print(f"  -> Analysis successful. Saved to: {output_filepath.name} ({elapsed_time:.2f}s)")

        except Exception as e:
            # Per-image boundary: report the failure and carry on with the
            # remaining images instead of aborting the whole batch.
            print(f"  -> CRITICAL ERROR: Could not process or save analysis for {input_image_path.name}: {e}", file=sys.stderr)

    total_batch_time = time.time() - batch_start_time
    print("\n=======================================================")
    print("Batch processing complete.")
    print(f"Total time elapsed for processing {total_images} images: {total_batch_time:.2f} seconds")
    print("=======================================================")


if __name__ == "__main__":
    # Create the input folder up front so a first run leaves the user with
    # an obvious place to drop images.
    input_dir_missing = not INPUT_DIR.exists()
    if input_dir_missing:
        INPUT_DIR.mkdir(parents=True, exist_ok=True)
        print(f"NOTE: Created required input directory: {INPUT_DIR.resolve()}")

    analyze_images_in_batch()
Enter fullscreen mode Exit fullscreen mode
  • The output(s) 👇
# Image Analysis Report

**Model Used:** `qwen3-vl:235b-cloud`
**Input File (Path):** `flower-min.png`
**User Prompt:** > Describe this image in detail and summarize its key content.

---
## Analysis Metadata
**Analysis Start Time (Local):** `2025-10-17 11:12:18 CEST`
**Individual Analysis Time:** `9.97 seconds`
---

## Generated Answer

This vibrant, sun-drenched image captures a striking tropical scene centered on a hanging heliconia flower, also known as a “lobster claw” or “parrot’s beak” due to its distinctive shape.

**Detailed Description:**

- **Main Subject**: The focal point is a large, pendulous inflorescence of a heliconia plant. It hangs vertically, composed of a series of overlapping, boat-shaped bracts arranged in a zigzag pattern along a central stem. Each bract is vividly colored — deep, glossy red on the upper surface and bright chartreuse or yellow-green along the edges and underside. The bracts are slightly open, revealing small, inconspicuous true flowers nestled inside.

- **Background**: The heliconia is surrounded by a lush, dense backdrop of tropical foliage. Various shades of green leaves — broad, lanceolate, and variegated — create a rich, textured environment. Sunlight filters through the canopy, casting dappled highlights and creating a soft, glowing effect, especially on the leaves to the right and in the background.

- **Additional Elements**: In the blurred background, other heliconia plants can be seen, some with similar red-and-yellow bracts, adding depth and reinforcing the tropical setting. The overall lighting is bright and natural, suggesting a sunny day in a garden, conservatory, or rainforest.

- **Composition**: The heliconia is positioned slightly off-center, drawing the eye downward along its elegant curve. The shallow depth of field keeps the main flower in sharp focus while softly blurring the background, emphasizing its form and color.

**Summary of Key Content:**

This image showcases a vibrant red-and-yellow heliconia flower hanging amidst lush tropical greenery under bright sunlight. The composition highlights the exotic beauty and striking color contrast of the flower against its verdant, sunlit backdrop, evoking a sense of tropical abundance and natural elegance.

---

*Analysis completed by Ollama.*
# Image Analysis Report

**Model Used:** `qwen3-vl:235b-cloud`
**Input File (Path):** `flower-min.png`
**User Prompt:** > Describe this image in detail and summarize its key content.

---
## Analysis Metadata
**Analysis Start Time (Local):** `2025-10-17 11:22:55 CEST`
**Individual Analysis Time:** `7.84 seconds`
---

## Generated Answer

This vibrant, sun-drenched image captures a close-up view of a tropical plant in full bloom, set against a lush, green backdrop.

**Detailed Description:**

- **Main Subject**: The focal point is a striking, pendulous inflorescence — likely a species of *Heliconia*, commonly known as a “lobster claw” or “hanging lobster claw.” It hangs vertically, composed of a series of overlapping, boat-shaped bracts (modified leaves) arranged in a zigzag or alternating pattern along a central stem.

- **Coloration**: The bracts are vividly colored — deep, glossy red on the upper surface and bright, chartreuse yellow-green along the edges and undersides. This creates a dramatic, two-toned effect that stands out against the foliage.

- **Structure**: Each bract curves gently downward, resembling a claw or beak. The arrangement gives the flower a sculptural, almost architectural quality. Tiny, true flowers may be nestled within the bracts, though they are not clearly visible.

- **Background**: The background is a dense, sunlit jungle or garden setting filled with various tropical plants. You can see broad, lance-shaped leaves, some with variegated patterns, and hints of other red *Heliconia* blooms in the distance, softly blurred due to shallow depth of field. Sunlight filters through the canopy, creating dappled highlights and a warm, glowing ambiance.

- **Lighting & Atmosphere**: Bright, natural sunlight illuminates the scene, enhancing the saturation of the reds and greens. The overall mood is tropical, vibrant, and serene — evoking a sense of being in a lush rainforest or botanical garden.

**Summary of Key Content:**

This image features a striking red-and-yellow *Heliconia* flower hanging prominently in the foreground, surrounded by dense, sunlit tropical foliage. The composition highlights the exotic beauty and vivid coloration of the plant, capturing the lush, vibrant essence of a tropical environment. The focus on the unique, claw-like structure of the bracts makes it a visually captivating botanical portrait.

---

*Analysis completed by Ollama.*
Enter fullscreen mode Exit fullscreen mode
# Image Analysis Report

**Model Used:** `qwen3-vl:235b-cloud`
**Input File (Path):** `flower-min.png`
**User Prompt:** > Describe this image in detail and summarize its key content.

---
## Analysis Metadata
**Analysis Start Time (Local):** `2025-10-17 11:22:55 CEST`
**Individual Analysis Time:** `7.84 seconds`
---

## Generated Answer

This vibrant, sun-drenched image captures a close-up view of a tropical plant in full bloom, set against a lush, green backdrop.

**Detailed Description:**

- **Main Subject**: The focal point is a striking, pendulous inflorescence — likely a species of *Heliconia*, commonly known as a “lobster claw” or “hanging lobster claw.” It hangs vertically, composed of a series of overlapping, boat-shaped bracts (modified leaves) arranged in a zigzag or alternating pattern along a central stem.

- **Coloration**: The bracts are vividly colored — deep, glossy red on the upper surface and bright, chartreuse yellow-green along the edges and undersides. This creates a dramatic, two-toned effect that stands out against the foliage.

- **Structure**: Each bract curves gently downward, resembling a claw or beak. The arrangement gives the flower a sculptural, almost architectural quality. Tiny, true flowers may be nestled within the bracts, though they are not clearly visible.

- **Background**: The background is a dense, sunlit jungle or garden setting filled with various tropical plants. You can see broad, lance-shaped leaves, some with variegated patterns, and hints of other red *Heliconia* blooms in the distance, softly blurred due to shallow depth of field. Sunlight filters through the canopy, creating dappled highlights and a warm, glowing ambiance.

- **Lighting & Atmosphere**: Bright, natural sunlight illuminates the scene, enhancing the saturation of the reds and greens. The overall mood is tropical, vibrant, and serene — evoking a sense of being in a lush rainforest or botanical garden.

**Summary of Key Content:**

This image features a striking red-and-yellow *Heliconia* flower hanging prominently in the foreground, surrounded by dense, sunlit tropical foliage. The composition highlights the exotic beauty and vivid coloration of the plant, capturing the lush, vibrant essence of a tropical environment. The focus on the unique, claw-like structure of the bracts makes it a visually captivating botanical portrait.

---

*Analysis completed by Ollama.*
Enter fullscreen mode Exit fullscreen mode
# Image Analysis Report

**Model Used:** `qwen3-vl:235b-cloud`
**Input File (Path):** `math-min.jpg`
**User Prompt:** > Describe this image in detail and summarize its key content.

---
## Analysis Metadata
**Analysis Start Time (Local):** `2025-10-17 11:23:17 CEST`
**Individual Analysis Time:** `15.08 seconds`
---

## Generated Answer

This image shows a handwritten mathematical problem on a light-colored, possibly off-white, sheet of paper. The handwriting is in dark blue or black ink and appears to be written with a pen.

---

### **Detailed Description:**

- **Problem Number:** The problem is labeled with a circled number “①” at the beginning.

- **Text Content:**
  - The problem states:  
    > “If 3 and 15 are the two eigen values of the matrix A = [matrix], find |A| without expanding.”

- **Matrix A:**
  - A 3×3 matrix is written as:
    ```


    A = (  8   -6    2  )
        ( -6    7   -4  )
        (  2   -4    3  )


    ```
  - The matrix is enclosed in large parentheses drawn by hand. The entries are clearly legible:
    - Row 1: 8, -6, 2
    - Row 2: -6, 7, -4
    - Row 3: 2, -4, 3

- **Task:** The problem asks to find the determinant of matrix A — denoted |A| — **without expanding** (i.e., without computing the determinant via cofactor expansion or row reduction).

---

### **Key Mathematical Insight (Implied by the Problem):**

The problem gives two eigenvalues (3 and 15) of a 3×3 matrix and asks for the determinant. Since the determinant of a matrix equals the **product of its eigenvalues**, and a 3×3 matrix has three eigenvalues (counting multiplicity), we can deduce the third eigenvalue using the trace (sum of eigenvalues = trace of matrix).

- **Trace of A** = 8 + 7 + 3 = **18**
- Sum of known eigenvalues = 3 + 15 = **18**
- Therefore, the third eigenvalue = 18 - 18 = **0**

→ So, eigenvalues are 3, 15, and 0.

→ Determinant |A| = 3 × 15 × 0 = **0**

---

### **Summary:**

This is a linear algebra problem asking to compute the determinant of a 3×3 symmetric matrix **without direct expansion**, by using the given eigenvalues (3 and 15) and properties of eigenvalues (trace = sum of eigenvalues, determinant = product of eigenvalues). The solution hinges on recognizing that the third eigenvalue must be 0, making the determinant 0.

The problem is well-posed and tests conceptual understanding rather than computational skill.

---

*Analysis completed by Ollama.*
Enter fullscreen mode Exit fullscreen mode

Quite impressive!

  • The second version of the application prompts for an image path interactively (a Streamlit interface would be best… but anyway 🦥)
# image_analyzer_input.py
import os
import time
from pathlib import Path
from typing import List
import re
import hashlib

try:
    from ollama import chat, ChatResponse
except ImportError:
    # Imported here because this script does not import `sys` at module level.
    import sys

    # Report on stderr and exit with a non-zero status so shells and callers
    # can tell the script failed; the `exit()` helper used before is injected
    # by the `site` module and exits with status 0 when called without args.
    print("Error: The 'ollama' Python library is not installed.", file=sys.stderr)
    print("Please install it using: pip install ollama", file=sys.stderr)
    sys.exit(1)

# --- Configuration ---
# Folder that holds the combined analysis log.
OUTPUT_DIR = Path("./output")
# Single Markdown file to which every analysis of a session is appended.
BATCH_OUTPUT_FILE = OUTPUT_DIR / "batch_analysis.md"
# Ollama model tag passed to chat() for every request.
MODEL_NAME = 'qwen3-vl:235b-cloud'
# Prompt sent together with each image; also echoed into the log.
GENERIC_PROMPT = 'Describe this image in detail and summarize its key content.'
# File suffixes (compared lower-cased) accepted as image input.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}

def ensure_output_directory():
    """Create OUTPUT_DIR (including parents) unless the path already exists."""
    if OUTPUT_DIR.exists():
        return
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def generate_markdown_block(input_path: Path, answer_content: str, elapsed_time: float) -> str:
    """Build the Markdown section that records one image analysis.

    Args:
        input_path: Image that was analyzed; its name heads the section and
            its resolved path is recorded.
        answer_content: Text returned by the model.
        elapsed_time: Duration of the analysis in seconds.

    Returns:
        The section text, beginning with a blank line and ending with a
        horizontal rule followed by a newline.
    """
    # The timestamp is taken when the block is generated, not when the
    # analysis started.
    stamp = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime())

    section_lines = [
        "",
        f"## Analysis for: {input_path.name}",
        "",
        f"**Model Used:** `{MODEL_NAME}`",
        f"**Input File (Full Path):** `{input_path.resolve()}`",
        f"**User Prompt:** > {GENERIC_PROMPT}",
        "",
        "---",
        "### Analysis Metadata",
        f"**Timestamp (Local):** `{stamp}`",
        f"**Individual Analysis Time:** `{elapsed_time:.2f} seconds`",
        "---",
        "",
        "### Generated Answer",
        "",
        f"{answer_content}",
        "",
        "---",
        "",
    ]
    return "\n".join(section_lines)

def interactive_analyze():
    """
    Run the interactive loop: prompt for image paths, analyze each image and
    append the result to a single output file.

    The loop ends when the user types 'quit' or 'exit', or when the prompt
    receives end-of-input or Ctrl-C. A session summary is printed afterwards.
    A failure while analyzing one image is reported and the loop continues.
    """
    print("\n--- Ollama Interactive Image Analyzer ---")
    print(f"Results will be appended to: {BATCH_OUTPUT_FILE.resolve()}")
    print(f"Current model: {MODEL_NAME}")

    ensure_output_directory()
    total_batch_time = 0.0
    image_count = 0

    while True:
        try:
            input_file_str = input("\nEnter image path (or type 'quit'/'exit'): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session like 'quit', so
            # the summary below is still printed instead of a traceback.
            print()
            break

        if input_file_str.lower() in ('quit', 'exit'):
            break

        if not input_file_str:
            continue

        # expanduser() lets the user type paths such as ~/Pictures/cat.png.
        input_path = Path(input_file_str).expanduser()

        if not input_path.exists():
            print(f"Error: File not found at '{input_file_str}'")
            continue

        if not input_path.is_file() or input_path.suffix.lower() not in IMAGE_EXTENSIONS:
            # Sorted so the message is stable; set iteration order is not.
            print(f"Error: '{input_file_str}' is not a valid image file. Accepted extensions: {', '.join(sorted(IMAGE_EXTENSIONS))}.")
            continue

        print(f"Processing '{input_path.name}'...")
        analysis_start_time = time.time()

        messages = [
            {
                'role': 'user',
                'content': GENERIC_PROMPT,
                'images': [str(input_path.resolve())]  # Use absolute path for robustness
            },
        ]

        try:
            response: ChatResponse = chat(model=MODEL_NAME, messages=messages)

            try:
                answer_content = response.message.content
            except AttributeError:
                # Fallback for responses that only support mapping-style access.
                answer_content = response.get('message', {}).get('content', 'Error: Could not extract content from Ollama response.')

            elapsed_time = time.time() - analysis_start_time
            total_batch_time += elapsed_time
            image_count += 1

            markdown_block = generate_markdown_block(input_path, answer_content, elapsed_time)

            with open(BATCH_OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.write(markdown_block)

            print(f"-> Analysis successful. Result appended to {BATCH_OUTPUT_FILE.name} ({elapsed_time:.2f}s)")

        except Exception as e:
            # Per-image boundary: report and keep the session alive.
            print(f"CRITICAL ERROR during analysis of {input_path.name}: {e}")
            print("Skipping this file and continuing.")

    print("\n=======================================================")
    print("Interactive batch processing finished.")
    if image_count > 0:
        print(f"Total files analyzed: {image_count}")
        print(f"Total cumulative time elapsed: {total_batch_time:.2f} seconds")
    else:
        print("No images were successfully analyzed in this session.")
    print(f"Final log saved to: {BATCH_OUTPUT_FILE.resolve()}")
    print("=======================================================")


# Start the interactive session only when executed as a script, not on import.
if __name__ == "__main__":
    interactive_analyze()
Enter fullscreen mode Exit fullscreen mode

Et voilà ⚡

Conclusion

Using the sample images provided, which were quite different from one another, the results are great. I would like to run more tests, but the model already seems really promising.

Thanks for reading 🥂

Links

Top comments (0)