|
| 1 | +from pydantic import BaseModel, Field |
| 2 | + |
| 3 | + |
class SpeechToTextRequest(BaseModel):
    # Request model for a speech-to-text call.
    #
    # Required fields: model_id, cloud_storage_url, seed.
    # timestamps_granularity defaults to "word"; every other field is optional
    # and defaults to None.
    model_id: str = Field(..., description="Model ID for speech-to-text")
    cloud_storage_url: str = Field(..., description="Cloud storage URL of the audio to transcribe")
    language_code: str | None = Field(None, description="ISO-639-1 or ISO-639-3 language code")
    tag_audio_events: bool | None = Field(None, description="Annotate sounds like (laughter) in transcript")
    num_speakers: int | None = Field(None, description="Max speakers predicted")
    # Typed as a plain str: the values listed in the description are not
    # enforced by validation here.
    timestamps_granularity: str = Field(default="word", description="Timing precision: none, word, or character")
    diarize: bool | None = Field(None, description="Annotate which speaker is talking")
    diarization_threshold: float | None = Field(None, description="Speaker separation sensitivity")
    temperature: float | None = Field(None, description="Randomness control")
    # Required here (no default), unlike the optional seed on TextToDialogueRequest.
    seed: int = Field(..., description="Seed for deterministic sampling")
| 15 | + |
| 16 | + |
class SpeechToTextWord(BaseModel):
    # A single element of a transcript.
    #
    # Only `text` is required. `type` falls back to "word"; the timing,
    # speaker and log-probability fields default to None.
    text: str = Field(
        ...,
        description="The word text",
    )
    type: str = Field(
        "word",
        description="Type of text element (word, spacing, etc.)",
    )
    start: float | None = Field(
        default=None,
        description="Start time in seconds (when timestamps enabled)",
    )
    end: float | None = Field(
        default=None,
        description="End time in seconds (when timestamps enabled)",
    )
    speaker_id: str | None = Field(
        default=None,
        description="Speaker identifier when diarization is enabled",
    )
    logprob: float | None = Field(
        default=None,
        description="Log probability of the word",
    )
| 24 | + |
| 25 | + |
class SpeechToTextResponse(BaseModel):
    # Response model for a speech-to-text call.
    #
    # `language_code` and `text` are required; `language_probability` and the
    # list of SpeechToTextWord entries in `words` default to None.
    language_code: str = Field(
        ...,
        description="Detected or specified language code",
    )
    language_probability: float | None = Field(
        default=None,
        description="Confidence of language detection",
    )
    text: str = Field(
        ...,
        description="Full transcript text",
    )
    words: list[SpeechToTextWord] | None = Field(
        default=None,
        description="Word-level timing information",
    )
| 31 | + |
| 32 | + |
class TextToSpeechVoiceSettings(BaseModel):
    # Optional voice-tuning parameters for text-to-speech.
    #
    # Every field defaults to None. No numeric ranges are validated here.
    stability: float | None = Field(
        default=None,
        description="Voice stability",
    )
    similarity_boost: float | None = Field(
        default=None,
        description="Similarity boost",
    )
    style: float | None = Field(
        default=None,
        description="Style exaggeration",
    )
    use_speaker_boost: bool | None = Field(
        default=None,
        description="Boost similarity to original speaker",
    )
    speed: float | None = Field(
        default=None,
        description="Speech speed",
    )
| 39 | + |
| 40 | + |
class TextToSpeechRequest(BaseModel):
    # Request model for a text-to-speech call.
    #
    # Required fields: text, model_id, seed. The language code, nested
    # TextToSpeechVoiceSettings and normalization mode default to None.
    text: str = Field(
        ...,
        description="Text to convert to speech",
    )
    model_id: str = Field(
        ...,
        description="Model ID for TTS",
    )
    language_code: str | None = Field(
        default=None,
        description="ISO-639-1 or ISO-639-3 language code",
    )
    voice_settings: TextToSpeechVoiceSettings | None = Field(
        default=None,
        description="Voice settings",
    )
    seed: int = Field(
        ...,
        description="Seed for deterministic sampling",
    )
    # Plain str: the modes named in the description are not enforced here.
    apply_text_normalization: str | None = Field(
        default=None,
        description="Text normalization mode: auto, on, off",
    )
| 48 | + |
| 49 | + |
class TextToSoundEffectsRequest(BaseModel):
    # Request model for generating a sound effect from a text prompt.
    #
    # text, duration_seconds and prompt_influence are required; `loop`
    # defaults to None.
    text: str = Field(
        ...,
        description="Text prompt to convert into a sound effect",
    )
    duration_seconds: float = Field(
        ...,
        description="Duration of generated sound in seconds",
    )
    prompt_influence: float = Field(
        ...,
        description="How closely generation follows the prompt",
    )
    loop: bool | None = Field(
        default=None,
        description="Whether to create a smoothly looping sound effect",
    )
| 55 | + |
| 56 | + |
class AddVoiceRequest(BaseModel):
    # Request model for adding a voice. Both fields are required.
    name: str = Field(
        ...,
        description="Name that identifies the voice",
    )
    remove_background_noise: bool = Field(
        ...,
        description="Remove background noise from voice samples",
    )
| 60 | + |
| 61 | + |
class AddVoiceResponse(BaseModel):
    # Response model for adding a voice; carries the single required voice_id.
    voice_id: str = Field(
        ...,
        description="The newly created voice's unique identifier",
    )
| 64 | + |
| 65 | + |
class SpeechToSpeechRequest(BaseModel):
    # Request model for a speech-to-speech call. All four fields are required.
    model_id: str = Field(
        ...,
        description="Model ID for speech-to-speech",
    )
    # Typed as str (described as a JSON string), not a nested settings model
    # as on TextToSpeechRequest.
    voice_settings: str = Field(
        ...,
        description="JSON string of voice settings",
    )
    seed: int = Field(
        ...,
        description="Seed for deterministic sampling",
    )
    remove_background_noise: bool = Field(
        ...,
        description="Remove background noise from input audio",
    )
| 71 | + |
| 72 | + |
class DialogueInput(BaseModel):
    # One dialogue segment: the text to speak and the voice to speak it with.
    # Both fields are required.
    text: str = Field(
        ...,
        description="Text content to convert to speech",
    )
    voice_id: str = Field(
        ...,
        description="Voice identifier for this dialogue segment",
    )
| 76 | + |
| 77 | + |
class DialogueSettings(BaseModel):
    # Optional settings for dialogue generation.
    # The 0-1 range in the description is informational only; it is not
    # validated here.
    stability: float | None = Field(
        default=None,
        description="Voice stability (0-1)",
    )
| 80 | + |
| 81 | + |
class TextToDialogueRequest(BaseModel):
    # Request model for generating speech from a list of dialogue segments.
    #
    # `inputs` and `model_id` are required; everything else, including
    # `seed`, defaults to None.
    inputs: list[DialogueInput] = Field(
        ...,
        description="List of dialogue segments",
    )
    model_id: str = Field(
        ...,
        description="Model ID for dialogue generation",
    )
    language_code: str | None = Field(
        default=None,
        description="ISO-639-1 language code",
    )
    settings: DialogueSettings | None = Field(
        default=None,
        description="Voice settings",
    )
    seed: int | None = Field(
        default=None,
        description="Seed for deterministic sampling",
    )
    # Plain str: the modes named in the description are not enforced here.
    apply_text_normalization: str | None = Field(
        default=None,
        description="Text normalization mode: auto, on, off",
    )
0 commit comments