#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

import base64
import io
import json
from enum import Enum
from typing import List, Literal, Optional

from PIL import Image
from pydantic import BaseModel, Field

from pipecat.frames.frames import ImageRawFrame

#
# Client events
#


class MediaChunk(BaseModel):
    mimeType: str
    data: str


class ContentPart(BaseModel):
    text: Optional[str] = Field(default=None, validate_default=False)
    inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)


class Turn(BaseModel):
    role: Literal["user", "model"] = "user"
    parts: List[ContentPart]


class StartSensitivity(str, Enum):
    """Determines how start of speech is detected."""

    UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
    HIGH = "START_SENSITIVITY_HIGH"  # Detect start of speech more often
    LOW = "START_SENSITIVITY_LOW"  # Detect start of speech less often


class EndSensitivity(str, Enum):
    """Determines how end of speech is detected."""

    UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
    HIGH = "END_SENSITIVITY_HIGH"  # End speech more often
    LOW = "END_SENSITIVITY_LOW"  # End speech less often


class AutomaticActivityDetection(BaseModel):
    """Configures automatic detection of activity."""

    disabled: Optional[bool] = None
    start_of_speech_sensitivity: Optional[StartSensitivity] = None
    prefix_padding_ms: Optional[int] = None
    end_of_speech_sensitivity: Optional[EndSensitivity] = None
    silence_duration_ms: Optional[int] = None

class RealtimeInputConfig(BaseModel):
    """Configures the realtime input behavior."""

    automatic_activity_detection: Optional[AutomaticActivityDetection] = None

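# Example (sketch): a realtime input config that keeps automatic VAD enabled
# but makes end-of-speech detection less eager. The values shown are
# illustrative, not recommended defaults.
#
#     vad_config = RealtimeInputConfig(
#         automatic_activity_detection=AutomaticActivityDetection(
#             start_of_speech_sensitivity=StartSensitivity.HIGH,
#             end_of_speech_sensitivity=EndSensitivity.LOW,
#             silence_duration_ms=800,  # illustrative value
#         )
#     )
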
class RealtimeInput(BaseModel):
    mediaChunks: List[MediaChunk]


class ClientContent(BaseModel):
    turns: Optional[List[Turn]] = None
    turnComplete: bool = False

class AudioInputMessage(BaseModel):
    realtimeInput: RealtimeInput

    @classmethod
    def from_raw_audio(cls, raw_audio: bytes, sample_rate: int) -> "AudioInputMessage":
        # Base64-encode the raw PCM and tag the chunk with its sample rate.
        data = base64.b64encode(raw_audio).decode("utf-8")
        return cls(
            realtimeInput=RealtimeInput(
                mediaChunks=[MediaChunk(mimeType=f"audio/pcm;rate={sample_rate}", data=data)]
            )
        )

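# Example (sketch): serializing raw PCM for the websocket, assuming `raw_pcm`
# holds 16 kHz mono PCM bytes. `model_dump_json` is the standard Pydantic v2
# serializer.
#
#     msg = AudioInputMessage.from_raw_audio(raw_pcm, sample_rate=16000)
#     payload = msg.model_dump_json(exclude_none=True)
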
class VideoInputMessage(BaseModel):
    realtimeInput: RealtimeInput

    @classmethod
    def from_image_frame(cls, frame: ImageRawFrame) -> "VideoInputMessage":
        # Re-encode the raw frame as JPEG before base64-encoding it.
        buffer = io.BytesIO()
        Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
        data = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return cls(
            realtimeInput=RealtimeInput(
                mediaChunks=[MediaChunk(mimeType="image/jpeg", data=data)]
            )
        )

class ClientContentMessage(BaseModel):
    clientContent: ClientContent


class SystemInstruction(BaseModel):
    parts: List[ContentPart]


class AudioTranscriptionConfig(BaseModel):
    pass


class Setup(BaseModel):
    model: str
    system_instruction: Optional[SystemInstruction] = None
    tools: Optional[List[dict]] = None
    generation_config: Optional[dict] = None
    input_audio_transcription: Optional[AudioTranscriptionConfig] = None
    output_audio_transcription: Optional[AudioTranscriptionConfig] = None
    realtime_input_config: Optional[RealtimeInputConfig] = None


class Config(BaseModel):
    setup: Setup

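# Example (sketch): a minimal session setup message. The model name is
# illustrative; substitute the Live API model you are targeting.
#
#     config = Config(
#         setup=Setup(
#             model="models/gemini-2.0-flash-live-001",  # illustrative
#             output_audio_transcription=AudioTranscriptionConfig(),
#         )
#     )
#     setup_json = config.model_dump_json(exclude_none=True)
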
#
# Server events
#

class SetupComplete(BaseModel):
    pass


class InlineData(BaseModel):
    mimeType: str
    data: str


class Part(BaseModel):
    inlineData: Optional[InlineData] = None
    text: Optional[str] = None


class ModelTurn(BaseModel):
    parts: List[Part]


class ServerContentInterrupted(BaseModel):
    interrupted: bool


class ServerContentTurnComplete(BaseModel):
    turnComplete: bool


class BidiGenerateContentTranscription(BaseModel):
    text: str


class ServerContent(BaseModel):
    modelTurn: Optional[ModelTurn] = None
    interrupted: Optional[bool] = None
    turnComplete: Optional[bool] = None
    inputTranscription: Optional[BidiGenerateContentTranscription] = None
    outputTranscription: Optional[BidiGenerateContentTranscription] = None


class FunctionCall(BaseModel):
    id: str
    name: str
    args: dict


class ToolCall(BaseModel):
    functionCalls: List[FunctionCall]

class Modality(str, Enum):
    """Modality types in token counts."""

    UNSPECIFIED = "MODALITY_UNSPECIFIED"
    TEXT = "TEXT"
    IMAGE = "IMAGE"
    AUDIO = "AUDIO"
    VIDEO = "VIDEO"


class ModalityTokenCount(BaseModel):
    """Token count for a specific modality."""

    modality: Modality
    tokenCount: int


class UsageMetadata(BaseModel):
    """Usage metadata about the response."""

    promptTokenCount: Optional[int] = None
    cachedContentTokenCount: Optional[int] = None
    responseTokenCount: Optional[int] = None
    toolUsePromptTokenCount: Optional[int] = None
    thoughtsTokenCount: Optional[int] = None
    totalTokenCount: Optional[int] = None
    promptTokensDetails: Optional[List[ModalityTokenCount]] = None
    cacheTokensDetails: Optional[List[ModalityTokenCount]] = None
    responseTokensDetails: Optional[List[ModalityTokenCount]] = None
    toolUsePromptTokensDetails: Optional[List[ModalityTokenCount]] = None


class ServerEvent(BaseModel):
    setupComplete: Optional[SetupComplete] = None
    serverContent: Optional[ServerContent] = None
    toolCall: Optional[ToolCall] = None
    usageMetadata: Optional[UsageMetadata] = None

def parse_server_event(message: str) -> Optional[ServerEvent]:
    """Parse a JSON-encoded server event, returning None if it cannot be validated."""
    try:
        evt = json.loads(message)
        return ServerEvent.model_validate(evt)
    except Exception as e:
        print(f"Error parsing server event: {e}")
        return None

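# Example (sketch): decoding an incoming websocket text frame, where `raw` is
# a hypothetical JSON string received from the server.
#
#     evt = parse_server_event(raw)
#     if evt and evt.serverContent and evt.serverContent.modelTurn:
#         for part in evt.serverContent.modelTurn.parts:
#             ...  # handle part.text or part.inlineData audio
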
class ContextWindowCompressionConfig(BaseModel):
    """Configuration for context window compression."""

    sliding_window: Optional[bool] = Field(default=True)
    trigger_tokens: Optional[int] = Field(default=None)