#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
#
import base64
import io
import json
from enum import Enum
from typing import List, Literal, Optional

from PIL import Image
from pydantic import BaseModel, Field

from pipecat.frames.frames import ImageRawFrame
#
# Client events
#
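

# NOTE: ContentPart below references MediaChunk, which is not defined in this
# excerpt. This is an assumed minimal definition: an inline media blob carrying
# a MIME type and base64-encoded data.
class MediaChunk(BaseModel):
    mimeType: str
    data: str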


class ContentPart(BaseModel):
    text: Optional[str] = Field(default=None, validate_default=False)
    inlineData: Optional[MediaChunk] = Field(default=None, validate_default=False)


class Turn(BaseModel):
    role: Literal["user", "model"] = "user"
    parts: List[ContentPart]


class StartSensitivity(str, Enum):
    """Determines how start of speech is detected."""

    UNSPECIFIED = "START_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
    HIGH = "START_SENSITIVITY_HIGH"  # Detect start of speech more often
    LOW = "START_SENSITIVITY_LOW"  # Detect start of speech less often


class EndSensitivity(str, Enum):
    """Determines how end of speech is detected."""

    UNSPECIFIED = "END_SENSITIVITY_UNSPECIFIED"  # Default is HIGH
    HIGH = "END_SENSITIVITY_HIGH"  # Detect end of speech more often
    LOW = "END_SENSITIVITY_LOW"  # Detect end of speech less often


class AutomaticActivityDetection(BaseModel):
    """Configures automatic detection of activity."""

    disabled: Optional[bool] = None
    start_of_speech_sensitivity: Optional[StartSensitivity] = None
    prefix_padding_ms: Optional[int] = None
    end_of_speech_sensitivity: Optional[EndSensitivity] = None
    silence_duration_ms: Optional[int] = None
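

# NOTE: Setup below references RealtimeInputConfig, which is not defined in this
# excerpt. This is an assumed minimal sketch of the expected shape (realtime
# input settings wrapping the automatic activity detection config) so the
# module is importable as shown.
class RealtimeInputConfig(BaseModel):
    """Configures realtime input behavior, including automatic activity detection."""

    automatic_activity_detection: Optional[AutomaticActivityDetection] = None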


class ClientContent(BaseModel):
    turns: Optional[List[Turn]] = None
    turnComplete: bool = False


class ClientContentMessage(BaseModel):
    clientContent: ClientContent


class SystemInstruction(BaseModel):
    parts: List[ContentPart]


class AudioTranscriptionConfig(BaseModel):
    pass


class Setup(BaseModel):
    model: str
    system_instruction: Optional[SystemInstruction] = None
    tools: Optional[List[dict]] = None
    generation_config: Optional[dict] = None
    input_audio_transcription: Optional[AudioTranscriptionConfig] = None
    output_audio_transcription: Optional[AudioTranscriptionConfig] = None
    realtime_input_config: Optional[RealtimeInputConfig] = None


class Config(BaseModel):
    setup: Setup
#
# Server events
#


class SetupComplete(BaseModel):
    pass


class InlineData(BaseModel):
    mimeType: str
    data: str


class Part(BaseModel):
    inlineData: Optional[InlineData] = None
    text: Optional[str] = None


class ModelTurn(BaseModel):
    parts: List[Part]


class ServerContentInterrupted(BaseModel):
    interrupted: bool


class ServerContentTurnComplete(BaseModel):
    turnComplete: bool


class BidiGenerateContentTranscription(BaseModel):
    text: str


class ServerContent(BaseModel):
    modelTurn: Optional[ModelTurn] = None
    interrupted: Optional[bool] = None
    turnComplete: Optional[bool] = None
    inputTranscription: Optional[BidiGenerateContentTranscription] = None
    outputTranscription: Optional[BidiGenerateContentTranscription] = None


class FunctionCall(BaseModel):
    id: str
    name: str
    args: dict
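

# NOTE: ServerEvent below references ToolCall, which is not defined in this
# excerpt. This is an assumed minimal definition: a tool call event carrying
# one or more function calls.
class ToolCall(BaseModel):
    functionCalls: List[FunctionCall]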


class Modality(str, Enum):
    """Modality types in token counts."""

    UNSPECIFIED = "MODALITY_UNSPECIFIED"
    TEXT = "TEXT"
    IMAGE = "IMAGE"
    AUDIO = "AUDIO"
    VIDEO = "VIDEO"


class ModalityTokenCount(BaseModel):
    """Token count for a specific modality."""

    modality: Modality
    tokenCount: int
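

# NOTE: ServerEvent below references UsageMetadata, which is not defined in this
# excerpt. This is an assumed minimal definition mirroring the Live API usage
# metadata; field coverage may differ in the full source.
class UsageMetadata(BaseModel):
    promptTokenCount: Optional[int] = None
    responseTokenCount: Optional[int] = None
    totalTokenCount: Optional[int] = None
    promptTokensDetails: Optional[List[ModalityTokenCount]] = None
    responseTokensDetails: Optional[List[ModalityTokenCount]] = None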


class ServerEvent(BaseModel):
    setupComplete: Optional[SetupComplete] = None
    serverContent: Optional[ServerContent] = None
    toolCall: Optional[ToolCall] = None
    usageMetadata: Optional[UsageMetadata] = None


def parse_server_event(message: str) -> Optional[ServerEvent]:
    """Parse a raw JSON message from the server into a ServerEvent, or return None on failure."""
    try:
        evt = json.loads(message)
        return ServerEvent.model_validate(evt)
    except Exception as e:
        print(f"Error parsing server event: {e}")
        return None


class ContextWindowCompressionConfig(BaseModel):
    """Configuration for context window compression."""

    sliding_window: Optional[bool] = Field(default=True)
    trigger_tokens: Optional[int] = Field(default=None)
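

# Illustrative usage sketch (not part of the original module): shows how a
# client Setup message might be serialized and how a server payload is parsed
# back into a ServerEvent. The model id below is a placeholder.
if __name__ == "__main__":
    config = Config(
        setup=Setup(
            model="models/gemini-2.0-flash-live-001",  # placeholder model id
            generation_config={"response_modalities": ["AUDIO"]},
        )
    )
    # Pydantic v2: drop None fields so only provided settings are sent.
    print(config.model_dump_json(exclude_none=True))

    # Parse a simulated server event.
    raw = '{"serverContent": {"turnComplete": true}}'
    evt = parse_server_event(raw)
    print(evt.serverContent.turnComplete if evt and evt.serverContent else None)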