|
|
from __future__ import annotations |
|
|
|
|
|
from typing import ( |
|
|
Any, |
|
|
Literal, |
|
|
Union, |
|
|
) |
|
|
|
|
|
from openai.types.realtime.realtime_audio_formats import ( |
|
|
RealtimeAudioFormats as OpenAIRealtimeAudioFormats, |
|
|
) |
|
|
from typing_extensions import NotRequired, TypeAlias, TypedDict |
|
|
|
|
|
from agents.prompts import Prompt |
|
|
|
|
|
from ..guardrail import OutputGuardrail |
|
|
from ..handoffs import Handoff |
|
|
from ..model_settings import ToolChoice |
|
|
from ..tool import Tool |
|
|
|
|
|
RealtimeModelName: TypeAlias = Union[ |
|
|
Literal[ |
|
|
"gpt-realtime", |
|
|
"gpt-realtime-2025-08-28", |
|
|
"gpt-4o-realtime-preview", |
|
|
"gpt-4o-mini-realtime-preview", |
|
|
"gpt-4o-realtime-preview-2025-06-03", |
|
|
"gpt-4o-realtime-preview-2024-12-17", |
|
|
"gpt-4o-realtime-preview-2024-10-01", |
|
|
"gpt-4o-mini-realtime-preview-2024-12-17", |
|
|
], |
|
|
str, |
|
|
] |
|
|
"""The name of a realtime model.""" |
|
|
|
|
|
|
|
|
RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str] |
|
|
"""The audio format for realtime audio streams.""" |
|
|
|
|
|
|
|
|
class RealtimeClientMessage(TypedDict): |
|
|
"""A raw message to be sent to the model.""" |
|
|
|
|
|
type: str |
|
|
"""The type of the message.""" |
|
|
|
|
|
other_data: NotRequired[dict[str, Any]] |
|
|
"""Merged into the message body.""" |
|
|
|
|
|
|
|
|
class RealtimeInputAudioTranscriptionConfig(TypedDict): |
|
|
"""Configuration for audio transcription in realtime sessions.""" |
|
|
|
|
|
language: NotRequired[str] |
|
|
"""The language code for transcription.""" |
|
|
|
|
|
model: NotRequired[Literal["gpt-4o-transcribe", "gpt-4o-mini-transcribe", "whisper-1"] | str] |
|
|
"""The transcription model to use.""" |
|
|
|
|
|
prompt: NotRequired[str] |
|
|
"""An optional prompt to guide transcription.""" |
|
|
|
|
|
|
|
|
class RealtimeInputAudioNoiseReductionConfig(TypedDict): |
|
|
"""Noise reduction configuration for input audio.""" |
|
|
|
|
|
type: NotRequired[Literal["near_field", "far_field"]] |
|
|
"""Noise reduction mode to apply to input audio.""" |
|
|
|
|
|
|
|
|
class RealtimeTurnDetectionConfig(TypedDict): |
|
|
"""Turn detection config. Allows extra vendor keys if needed.""" |
|
|
|
|
|
type: NotRequired[Literal["semantic_vad", "server_vad"]] |
|
|
"""The type of voice activity detection to use.""" |
|
|
|
|
|
create_response: NotRequired[bool] |
|
|
"""Whether to create a response when a turn is detected.""" |
|
|
|
|
|
eagerness: NotRequired[Literal["auto", "low", "medium", "high"]] |
|
|
"""How eagerly to detect turn boundaries.""" |
|
|
|
|
|
interrupt_response: NotRequired[bool] |
|
|
"""Whether to allow interrupting the assistant's response.""" |
|
|
|
|
|
prefix_padding_ms: NotRequired[int] |
|
|
"""Padding time in milliseconds before turn detection.""" |
|
|
|
|
|
silence_duration_ms: NotRequired[int] |
|
|
"""Duration of silence in milliseconds to trigger turn detection.""" |
|
|
|
|
|
threshold: NotRequired[float] |
|
|
"""The threshold for voice activity detection.""" |
|
|
|
|
|
idle_timeout_ms: NotRequired[int] |
|
|
"""Threshold for server-vad to trigger a response if the user is idle for this duration.""" |
|
|
|
|
|
|
|
|
class RealtimeSessionModelSettings(TypedDict): |
|
|
"""Model settings for a realtime model session.""" |
|
|
|
|
|
model_name: NotRequired[RealtimeModelName] |
|
|
"""The name of the realtime model to use.""" |
|
|
|
|
|
instructions: NotRequired[str] |
|
|
"""System instructions for the model.""" |
|
|
|
|
|
prompt: NotRequired[Prompt] |
|
|
"""The prompt to use for the model.""" |
|
|
|
|
|
modalities: NotRequired[list[Literal["text", "audio"]]] |
|
|
"""The modalities the model should support.""" |
|
|
|
|
|
voice: NotRequired[str] |
|
|
"""The voice to use for audio output.""" |
|
|
|
|
|
speed: NotRequired[float] |
|
|
"""The speed of the model's responses.""" |
|
|
|
|
|
input_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats] |
|
|
"""The format for input audio streams.""" |
|
|
|
|
|
output_audio_format: NotRequired[RealtimeAudioFormat | OpenAIRealtimeAudioFormats] |
|
|
"""The format for output audio streams.""" |
|
|
|
|
|
input_audio_transcription: NotRequired[RealtimeInputAudioTranscriptionConfig] |
|
|
"""Configuration for transcribing input audio.""" |
|
|
|
|
|
input_audio_noise_reduction: NotRequired[RealtimeInputAudioNoiseReductionConfig | None] |
|
|
"""Noise reduction configuration for input audio.""" |
|
|
|
|
|
turn_detection: NotRequired[RealtimeTurnDetectionConfig] |
|
|
"""Configuration for detecting conversation turns.""" |
|
|
|
|
|
tool_choice: NotRequired[ToolChoice] |
|
|
"""How the model should choose which tools to call.""" |
|
|
|
|
|
tools: NotRequired[list[Tool]] |
|
|
"""List of tools available to the model.""" |
|
|
|
|
|
handoffs: NotRequired[list[Handoff]] |
|
|
"""List of handoff configurations.""" |
|
|
|
|
|
tracing: NotRequired[RealtimeModelTracingConfig | None] |
|
|
"""Configuration for request tracing.""" |
|
|
|
|
|
|
|
|
class RealtimeGuardrailsSettings(TypedDict): |
|
|
"""Settings for output guardrails in realtime sessions.""" |
|
|
|
|
|
debounce_text_length: NotRequired[int] |
|
|
""" |
|
|
The minimum number of characters to accumulate before running guardrails on transcript |
|
|
deltas. Defaults to 100. Guardrails run every time the accumulated text reaches |
|
|
1x, 2x, 3x, etc. times this threshold. |
|
|
""" |
|
|
|
|
|
|
|
|
class RealtimeModelTracingConfig(TypedDict): |
|
|
"""Configuration for tracing in realtime model sessions.""" |
|
|
|
|
|
workflow_name: NotRequired[str] |
|
|
"""The workflow name to use for tracing.""" |
|
|
|
|
|
group_id: NotRequired[str] |
|
|
"""A group identifier to use for tracing, to link multiple traces together.""" |
|
|
|
|
|
metadata: NotRequired[dict[str, Any]] |
|
|
"""Additional metadata to include with the trace.""" |
|
|
|
|
|
|
|
|
class RealtimeRunConfig(TypedDict): |
|
|
"""Configuration for running a realtime agent session.""" |
|
|
|
|
|
model_settings: NotRequired[RealtimeSessionModelSettings] |
|
|
"""Settings for the realtime model session.""" |
|
|
|
|
|
output_guardrails: NotRequired[list[OutputGuardrail[Any]]] |
|
|
"""List of output guardrails to run on the agent's responses.""" |
|
|
|
|
|
guardrails_settings: NotRequired[RealtimeGuardrailsSettings] |
|
|
"""Settings for guardrail execution.""" |
|
|
|
|
|
tracing_disabled: NotRequired[bool] |
|
|
"""Whether tracing is disabled for this run.""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RealtimeUserInputText(TypedDict): |
|
|
"""A text input from the user.""" |
|
|
|
|
|
type: Literal["input_text"] |
|
|
"""The type identifier for text input.""" |
|
|
|
|
|
text: str |
|
|
"""The text content from the user.""" |
|
|
|
|
|
|
|
|
class RealtimeUserInputImage(TypedDict, total=False): |
|
|
"""An image input from the user (Realtime).""" |
|
|
|
|
|
type: Literal["input_image"] |
|
|
image_url: str |
|
|
detail: NotRequired[Literal["auto", "low", "high"] | str] |
|
|
|
|
|
|
|
|
class RealtimeUserInputMessage(TypedDict): |
|
|
"""A message input from the user.""" |
|
|
|
|
|
type: Literal["message"] |
|
|
"""The type identifier for message inputs.""" |
|
|
|
|
|
role: Literal["user"] |
|
|
"""The role identifier for user messages.""" |
|
|
|
|
|
content: list[RealtimeUserInputText | RealtimeUserInputImage] |
|
|
"""List of content items (text and image) in the message.""" |
|
|
|
|
|
|
|
|
RealtimeUserInput: TypeAlias = Union[str, RealtimeUserInputMessage] |
|
|
"""User input that can be a string or structured message.""" |
|
|
|