Source code for pipecat.services.moondream.vision

#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import asyncio
from typing import AsyncGenerator

from loguru import logger
from PIL import Image

from pipecat.frames.frames import ErrorFrame, Frame, TextFrame, VisionImageRawFrame
from pipecat.services.vision_service import VisionService

try:
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
except ModuleNotFoundError as e:
    logger.error(f"Exception: {e}")
    logger.error("In order to use Moondream, you need to `pip install pipecat-ai[moondream]`.")
    raise Exception(f"Missing module(s): {e}")


def detect_device():
    """Pick the torch device and dtype to run the model with.

    Backends are probed in a fixed order and the first available one wins:

    1. Intel XPU (only if ``intel_extension_for_pytorch`` can be imported)
       -> ``float32``
    2. CUDA -> ``float16``
    3. Apple MPS -> ``float16``
    4. CPU (fallback) -> ``float32``

    Returns:
        A ``(torch.device, torch.dtype)`` tuple.
    """
    try:
        # Imported only to find out whether the extension is installed; the
        # name itself is never used. Presumably the import is what makes
        # ``torch.xpu`` usable -- TODO confirm against the IPEX docs.
        import intel_extension_for_pytorch  # noqa: F401

        xpu_ready = torch.xpu.is_available()
    except ImportError:
        # Extension not installed: skip the XPU option.
        xpu_ready = False

    if xpu_ready:
        return torch.device("xpu"), torch.float32
    if torch.cuda.is_available():
        return torch.device("cuda"), torch.float16
    if torch.backends.mps.is_available():
        return torch.device("mps"), torch.float16
    return torch.device("cpu"), torch.float32
class MoondreamService(VisionService):
    """Vision service that answers questions about images with Moondream.

    The tokenizer and model are loaded once, at construction time, through
    ``transformers``; each incoming vision frame is then answered by running
    the model in a worker thread.
    """

    def __init__(
        self, *, model="vikhyatk/moondream2", revision="2024-08-26", use_cpu=False, **kwargs
    ):
        """Load the Moondream tokenizer and model.

        Args:
            model: Model identifier handed to ``from_pretrained`` and recorded
                via ``set_model_name``.
            revision: Model revision handed to both ``from_pretrained`` calls.
            use_cpu: When true, skip device detection and run on the CPU in
                ``float32``.
            **kwargs: Forwarded unchanged to ``VisionService.__init__``.
        """
        super().__init__(**kwargs)
        self.set_model_name(model)

        if use_cpu:
            device, dtype = torch.device("cpu"), torch.float32
        else:
            device, dtype = detect_device()

        self._tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)

        logger.debug("Loading Moondream model...")

        # NOTE(review): trust_remote_code=True lets transformers run code that
        # ships with the model repository; `revision` is passed alongside it.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model, trust_remote_code=True, revision=revision
        )
        self._model = loaded_model.to(device=device, dtype=dtype)
        self._model.eval()

        logger.debug("Loaded Moondream model")

    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
        """Answer the question in ``frame.text`` about the image in ``frame``.

        Args:
            frame: Vision frame providing the raw image (``format``, ``size``,
                ``image``) and the question (``text``).

        Yields:
            An ``ErrorFrame`` if no model is available; otherwise a single
            ``TextFrame`` holding the model's answer.
        """
        if not self._model:
            logger.error(f"{self} error: Moondream model not available ({self.model_name})")
            yield ErrorFrame("Moondream model not available")
            return

        logger.debug(f"Analyzing image: {frame}")

        def describe(request: VisionImageRawFrame):
            # Rebuild a PIL image from the raw frame, encode it, then ask the
            # model the frame's question about it.
            picture = Image.frombytes(request.format, request.size, request.image)
            embeddings = self._model.encode_image(picture)
            return self._model.answer_question(
                image_embeds=embeddings, question=request.text, tokenizer=self._tokenizer
            )

        # The synchronous model calls run in a worker thread rather than
        # directly inside this coroutine.
        answer = await asyncio.to_thread(describe, frame)

        yield TextFrame(text=answer)