The ability to run large language models (LLMs) directly in the browser has opened new possibilities for privacy-preserving, client-side AI applications. In this blog post, we’ll explore how to run DeepSeek Janus-Pro-1B, a multimodal model capable of both text-to-image generation and image understanding, entirely in the browser using WebGPU and Hugging Face’s Transformers.js library.
Why Browser-Based Inference?
- Privacy: Data never leaves the user’s device.
- Cost Efficiency: No server infrastructure required.
- Accessibility: Runs on any device with a modern browser and WebGPU support.
DeepSeek Janus-Pro-1B, designed for multimodal tasks like text-to-image generation, is now accessible via browser-based inference thanks to optimizations in Transformers.js and WebGPU acceleration.
Key Tools & Libraries
- Transformers.js: A JavaScript port of Hugging Face’s Transformers library, optimized for browser execution.
- WebGPU: A modern browser API for GPU acceleration and the successor to WebGL, with significantly better performance for ML workloads.
- ONNX Runtime: The execution engine Transformers.js uses under the hood (onnxruntime-web in the browser) to run models as optimized computation graphs.
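Before walking through the full demo, here is a minimal, hedged sketch of what these pieces look like together: a Transformers.js pipeline running on WebGPU. The model ID and prompt are placeholders for illustration only; the Janus demo itself uses the lower-level AutoProcessor and MultiModalityCausalLM APIs shown in the next section.
```javascript
// Minimal sketch (illustrative, not from the demo): run a small ONNX model
// on the GPU via WebGPU using the high-level pipeline API.
import { pipeline } from "@huggingface/transformers";

// Example model ID; any Transformers.js-compatible ONNX model from the Hub works.
const generator = await pipeline(
  "text-generation",
  "onnx-community/Qwen2.5-0.5B-Instruct",
  { device: "webgpu" },
);

const result = await generator("Explain WebGPU in one sentence.", {
  max_new_tokens: 64,
});
console.log(result[0].generated_text);
```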
Demo Code Walkthrough
The following example demonstrates how to load and run DeepSeek Janus-Pro-1B in a Web Worker for non-blocking inference. The full code is available in the GitHub repository.
```javascript
import {
AutoProcessor,
MultiModalityCausalLM,
BaseStreamer,
TextStreamer,
InterruptableStoppingCriteria,
} from "@huggingface/transformers";
// Define constants
const IMAGE_GENERATION_COMMAND_PREFIX = "/imagine ";
const MAX_NEW_TEXT_TOKENS = 1024;
/**
* Helper function to perform WebGPU feature detection
*/
let fp16_supported = false;
async function check() {
try {
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
throw new Error("WebGPU is not supported (no adapter found)");
}
fp16_supported = adapter.features.has("shader-f16");
self.postMessage({
status: "success",
data: fp16_supported,
});
} catch (e) {
self.postMessage({
status: "error",
data: e.toString(),
});
}
}
/**
* This class uses the Singleton pattern to enable lazy-loading of the pipeline
*/
class ImageGenerationPipeline {
static model_id = "onnx-community/Janus-Pro-1B-ONNX";
static async getInstance(progress_callback = null) {
this.processor ??= AutoProcessor.from_pretrained(this.model_id, {
progress_callback,
});
this.model ??= MultiModalityCausalLM.from_pretrained(this.model_id, {
dtype: fp16_supported
? {
prepare_inputs_embeds: "q4",
language_model: "q4f16",
lm_head: "fp16",
gen_head: "fp16",
gen_img_embeds: "fp16",
image_decode: "fp32",
}
: {
prepare_inputs_embeds: "fp32",
language_model: "q4",
lm_head: "fp32",
gen_head: "fp32",
gen_img_embeds: "fp32",
image_decode: "fp32",
},
device: {
prepare_inputs_embeds: "wasm", // TODO use "webgpu" when bug is fixed
language_model: "webgpu",
lm_head: "webgpu",
gen_head: "webgpu",
gen_img_embeds: "webgpu",
image_decode: "webgpu",
},
progress_callback,
});
return Promise.all([this.processor, this.model]);
}
}
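// Streamer that counts generated image tokens and reports progress (count,
// fraction complete, and elapsed time) through the provided callback.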
class ProgressStreamer extends BaseStreamer {
constructor(total, on_progress) {
super();
this.total = total;
this.on_progress = on_progress;
this.count = null;
this.start_time = null;
}
put(value) {
if (this.count === null) {
// Ignore the first batch of tokens (prompt)
this.count = 0;
this.start_time = performance.now();
return;
}
const progress = ++this.count / this.total;
this.on_progress({
count: this.count,
total: this.total,
progress,
time: performance.now() - this.start_time,
});
}
end() {
/* do nothing */
}
}
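// Shared stopping criteria so the main thread can interrupt an in-flight generation.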
const stopping_criteria = new InterruptableStoppingCriteria();
async function generate(messages) {
// For this demo, we only respond to the last message
const message = messages.at(-1);
// Tell the main thread we are starting
self.postMessage({ status: "start" });
// Load the pipeline
const [processor, model] = await ImageGenerationPipeline.getInstance();
// Determine if the user wants to generate an image or text
if (message.content.startsWith(IMAGE_GENERATION_COMMAND_PREFIX)) {
const text = message.content.replace(IMAGE_GENERATION_COMMAND_PREFIX, "");
const conversation = [
{
role: "<|User|>", // uses title case
content: text,
},
];
const inputs = await processor(conversation, {
chat_template: "text_to_image",
});
const callback_function = (output) => {
self.postMessage({
status: "image-update",
...output,
});
};
const num_image_tokens = processor.num_image_tokens;
const streamer = new ProgressStreamer(num_image_tokens, callback_function);
const outputs = await model.generate_images({
...inputs,
min_new_tokens: num_image_tokens,
max_new_tokens: num_image_tokens,
do_sample: true,
streamer,
});
const blob = await outputs[0].toBlob();
// Send the output back to the main thread
self.postMessage({
status: "image-update",
blob,
});
} else {
const inputs = await processor(
message.image
? [
{
role: "<|User|>",
content: "<image_placeholder>\n" + message.content,
images: [message.image],
},
]
: [
{
role: "<|System|>",
content:
"You are a helpful assistant. Answer the user's questions in a concise manner.",
},
{
role: "<|User|>",
content: message.content,
},
],
);
let startTime;
let numTokens = 0;
let tps;
const token_callback_function = () => {
startTime ??= performance.now();
if (numTokens++ > 0) {
tps = (numTokens / (performance.now() - startTime)) * 1000;
}
};
const callback_function = (output) => {
self.postMessage({
status: "text-update",
output,
tps,
numTokens,
});
};
const streamer = new TextStreamer(processor.tokenizer, {
skip_prompt: true,
skip_special_tokens: true,
callback_function,
token_callback_function,
});
// Generate response
const outputs = await model.generate({
...inputs,
max_new_tokens: MAX_NEW_TEXT_TOKENS,
do_sample: false,
streamer,
stopping_criteria,
});
}
// Tell the main thread we are done
self.postMessage({
status: "complete",
});
}
async function load() {
self.postMessage({
status: "loading",
data: "Loading model...",
});
// Load the pipeline and save it for future use.
const [processor, model] = await ImageGenerationPipeline.getInstance((x) => {
// We also add a progress callback to the pipeline so that we can
// track model loading.
self.postMessage(x);
});
self.postMessage({ status: "ready" });
}
// Listen for messages from the main thread
self.addEventListener("message", async (e) => {
const { type, data } = e.data;
switch (type) {
case "check":
check();
break;
case "load":
load();
break;
case "generate":
stopping_criteria.reset();
generate(data);
break;
case "interrupt":
stopping_criteria.interrupt();
break;
case "reset":
stopping_criteria.reset();
break;
}
});
```
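The worker above only reacts to messages; the main thread is responsible for creating it and wiring its responses into the UI. Below is a hedged sketch of that side, assuming a worker.js file and an `<img id="result">` element (both illustrative; the demo's real UI code lives in the GitHub repository):
```javascript
// Illustrative main-thread sketch; the demo's actual UI code is in the repository.
const worker = new Worker(new URL("./worker.js", import.meta.url), {
  type: "module",
});

worker.addEventListener("message", (e) => {
  const msg = e.data;
  switch (msg.status) {
    case "success":
      // WebGPU check passed (msg.data indicates fp16 support); start loading the model.
      worker.postMessage({ type: "load" });
      break;
    case "ready":
      // Model loaded; send a generation request (array of messages, the last one is used).
      worker.postMessage({
        type: "generate",
        data: [{ role: "user", content: "/imagine a watercolor fox in a forest" }],
      });
      break;
    case "text-update":
      // Streamed text output plus tokens-per-second stats.
      console.log(msg.output, msg.tps);
      break;
    case "image-update":
      // Progress updates carry no blob; the final image arrives as a Blob.
      if (msg.blob) {
        // Assumes an <img id="result"> element exists in the page.
        document.querySelector("#result").src = URL.createObjectURL(msg.blob);
      }
      break;
    case "error":
      console.error(msg.data);
      break;
    default:
      console.log(msg.status, msg.data ?? "");
  }
});

// Kick off WebGPU feature detection; the handler above drives the rest.
worker.postMessage({ type: "check" });
```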
Running the Demo
Check out the live demo here: DeepSeek Janus-Pro-1B Browser Demo.
Key Features of the Demo:
- Real-time progress updates during model loading and inference.
- WebGPU-accelerated generation (requires Chrome 113+ or Edge 113+).
- Full client-side execution—no data is sent to external servers.
Challenges & Optimizations
- Model Quantization: The language model weights are quantized to 4-bit (q4 / q4f16), with the remaining modules kept in fp16/fp32, to shrink the download and speed up loading.
- Responsiveness: Inference runs in a Web Worker, so the main thread stays free and the UI does not freeze during loading or generation.
- Browser Compatibility: WebGPU is still rolling out across browsers but is critical for performance; a feature-detection fallback is sketched below.
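As a rough sketch of that fallback (not part of the demo), the device can be chosen at runtime by feature detection and then used in place of the hard-coded "webgpu" values in the worker's device map:
```javascript
// Illustrative fallback sketch (not from the demo): prefer WebGPU, otherwise
// fall back to WASM (CPU execution via onnxruntime-web).
async function pickDevice() {
  if ("gpu" in navigator) {
    const adapter = await navigator.gpu.requestAdapter();
    if (adapter) return "webgpu";
  }
  return "wasm";
}

// The worker's per-module device map could then use this value instead of
// hard-coding "webgpu", e.g. { language_model: device, lm_head: device, ... }.
const device = await pickDevice();
console.log(`Running inference on: ${device}`);
```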
Conclusion
Running DeepSeek Janus-Pro-1B in the browser showcases the potential of client-side AI. With tools like Transformers.js and WebGPU, complex models can now operate efficiently in constrained environments while preserving user privacy.
Next Steps:
- Experiment with different prompts and model configurations.
- Explore fine-tuning the model for domain-specific tasks.
- Monitor WebGPU adoption to ensure broader compatibility.
For developers, this marks an exciting shift toward decentralized, user-centric AI applications. Dive into the example code and start building! 🚀