Source code for pipecat.services.vision_service

#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Vision service implementation.

Provides base classes and implementations for computer vision services that can
analyze images and generate textual descriptions or answers to questions about
visual content.
"""

from abc import abstractmethod
from typing import AsyncGenerator

from pipecat.frames.frames import Frame, VisionImageRawFrame
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_service import AIService



[docs]
class VisionService(AIService):
    """Base class for vision services.

    Provides common functionality for vision services that process images and
    generate textual responses. Handles image frame processing and integrates
    with the AI service infrastructure for metrics and lifecycle management.

    Args:
        **kwargs: Additional arguments passed to the parent AIService.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._describe_text = None


[docs]
    @abstractmethod
    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
        """Process a vision image frame and generate results.

        This method must be implemented by subclasses to provide actual computer
        vision functionality such as image description, object detection, or
        visual question answering.

        Args:
            frame: The vision image frame to process, containing image data.

        Yields:
            Frame: Frames containing the vision analysis results, typically TextFrame
            objects with descriptions or answers.
        """
        pass



[docs]
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Process frames, handling vision image frames for analysis.

        Automatically processes VisionImageRawFrame objects by calling run_vision
        and handles metrics tracking. Other frames are passed through unchanged.

        Args:
            frame: The frame to process.
            direction: The direction of frame processing.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, VisionImageRawFrame):
            await self.start_processing_metrics()
            await self.process_generator(self.run_vision(frame))
            await self.stop_processing_metrics()
        else:
            await self.push_frame(frame, direction)