Skip to content

Commit 1bb956f

Browse files
[API Nodes] add ElevenLabs nodes (Comfy-Org#12207)
* feat(api-nodes): add ElevenLabs API nodes * added price badge for ElevenLabsInstantVoiceClone node --------- Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>
1 parent 96d6bd1 commit 1bb956f

File tree

3 files changed

+1016
-0
lines changed

3 files changed

+1016
-0
lines changed

comfy_api_nodes/apis/elevenlabs.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
from pydantic import BaseModel, Field
2+
3+
4+
class SpeechToTextRequest(BaseModel):
    """Request body for an ElevenLabs speech-to-text call.

    ``model_id``, ``cloud_storage_url`` and ``seed`` are required. All other
    fields default to ``None``, except ``timestamps_granularity`` which
    defaults to ``"word"``.
    """

    # Required fields declared without extra metadata.
    model_id: str
    cloud_storage_url: str  # presumably the location of the input audio -- confirm with caller
    language_code: str | None = Field(
        default=None,
        description="ISO-639-1 or ISO-639-3 language code",
    )
    tag_audio_events: bool | None = Field(
        default=None,
        description="Annotate sounds like (laughter) in transcript",
    )
    num_speakers: int | None = Field(
        default=None,
        description="Max speakers predicted",
    )
    timestamps_granularity: str = Field(
        "word",
        description="Timing precision: none, word, or character",
    )
    diarize: bool | None = Field(
        default=None,
        description="Annotate which speaker is talking",
    )
    diarization_threshold: float | None = Field(
        default=None,
        description="Speaker separation sensitivity",
    )
    temperature: float | None = Field(
        default=None,
        description="Randomness control",
    )
    seed: int = Field(description="Seed for deterministic sampling")
15+
16+
17+
class SpeechToTextWord(BaseModel):
    """One element of a transcript, with optional timing and speaker data.

    Only ``text`` is required. ``type`` defaults to ``"word"`` and the
    remaining fields default to ``None``.
    """

    text: str = Field(description="The word text")
    # Field name mirrors the response payload key, hence the builtin-shadowing name.
    type: str = Field(
        "word",
        description="Type of text element (word, spacing, etc.)",
    )
    start: float | None = Field(
        default=None,
        description="Start time in seconds (when timestamps enabled)",
    )
    end: float | None = Field(
        default=None,
        description="End time in seconds (when timestamps enabled)",
    )
    speaker_id: str | None = Field(
        default=None,
        description="Speaker identifier when diarization is enabled",
    )
    logprob: float | None = Field(
        default=None,
        description="Log probability of the word",
    )
24+
25+
26+
class SpeechToTextResponse(BaseModel):
    """Response body of a speech-to-text call.

    ``language_code`` and ``text`` are required; ``language_probability`` and
    ``words`` default to ``None``.
    """

    language_code: str = Field(description="Detected or specified language code")
    language_probability: float | None = Field(
        default=None,
        description="Confidence of language detection",
    )
    text: str = Field(description="Full transcript text")
    words: list[SpeechToTextWord] | None = Field(
        default=None,
        description="Word-level timing information",
    )
31+
32+
33+
class TextToSpeechVoiceSettings(BaseModel):
    """Optional voice tuning parameters for text-to-speech.

    Every field defaults to ``None``; no value ranges are enforced by this
    model.
    """

    stability: float | None = Field(default=None, description="Voice stability")
    similarity_boost: float | None = Field(default=None, description="Similarity boost")
    style: float | None = Field(default=None, description="Style exaggeration")
    use_speaker_boost: bool | None = Field(
        default=None,
        description="Boost similarity to original speaker",
    )
    speed: float | None = Field(default=None, description="Speech speed")
39+
40+
41+
class TextToSpeechRequest(BaseModel):
    """Request body for a text-to-speech call.

    ``text``, ``model_id`` and ``seed`` are required; the remaining fields
    default to ``None``.
    """

    text: str = Field(description="Text to convert to speech")
    model_id: str = Field(description="Model ID for TTS")
    language_code: str | None = Field(
        default=None,
        description="ISO-639-1 or ISO-639-3 language code",
    )
    voice_settings: TextToSpeechVoiceSettings | None = Field(
        default=None,
        description="Voice settings",
    )
    seed: int = Field(description="Seed for deterministic sampling")
    apply_text_normalization: str | None = Field(
        default=None,
        description="Text normalization mode: auto, on, off",
    )
48+
49+
50+
class TextToSoundEffectsRequest(BaseModel):
    """Request body for generating a sound effect from a text prompt.

    ``text``, ``duration_seconds`` and ``prompt_influence`` are required;
    ``loop`` defaults to ``None``.
    """

    text: str = Field(description="Text prompt to convert into a sound effect")
    duration_seconds: float = Field(
        description="Duration of generated sound in seconds",
    )
    prompt_influence: float = Field(
        description="How closely generation follows the prompt",
    )
    loop: bool | None = Field(
        default=None,
        description="Whether to create a smoothly looping sound effect",
    )
55+
56+
57+
class AddVoiceRequest(BaseModel):
    """Request fields for adding a voice; both fields are required."""

    name: str = Field(description="Name that identifies the voice")
    remove_background_noise: bool = Field(
        description="Remove background noise from voice samples",
    )
60+
61+
62+
class AddVoiceResponse(BaseModel):
    """Response returned after a voice is added; carries the new voice's ID."""

    voice_id: str = Field(
        description="The newly created voice's unique identifier",
    )
64+
65+
66+
class SpeechToSpeechRequest(BaseModel):
    """Request fields for a speech-to-speech call; all fields are required.

    Unlike ``TextToSpeechRequest``, ``voice_settings`` is typed as a plain
    string here and is expected to hold JSON text.
    """

    model_id: str = Field(description="Model ID for speech-to-speech")
    # Kept as ``str`` on purpose: the description states this is a JSON string.
    voice_settings: str = Field(description="JSON string of voice settings")
    seed: int = Field(description="Seed for deterministic sampling")
    remove_background_noise: bool = Field(
        description="Remove background noise from input audio",
    )
71+
72+
73+
class DialogueInput(BaseModel):
    """A single dialogue segment: the text to speak and the voice to use."""

    text: str = Field(description="Text content to convert to speech")
    voice_id: str = Field(
        description="Voice identifier for this dialogue segment",
    )
76+
77+
78+
class DialogueSettings(BaseModel):
    """Optional settings for dialogue generation.

    The 0-1 range mentioned in the field description is informational only;
    this model does not enforce it.
    """

    stability: float | None = Field(
        default=None,
        description="Voice stability (0-1)",
    )
80+
81+
82+
class TextToDialogueRequest(BaseModel):
    """Request body for a multi-segment text-to-dialogue call.

    ``inputs`` and ``model_id`` are required. The remaining fields, including
    ``seed``, default to ``None``.
    """

    inputs: list[DialogueInput] = Field(description="List of dialogue segments")
    model_id: str = Field(description="Model ID for dialogue generation")
    language_code: str | None = Field(
        default=None,
        description="ISO-639-1 language code",
    )
    settings: DialogueSettings | None = Field(
        default=None,
        description="Voice settings",
    )
    seed: int | None = Field(
        default=None,
        description="Seed for deterministic sampling",
    )
    apply_text_normalization: str | None = Field(
        default=None,
        description="Text normalization mode: auto, on, off",
    )

0 commit comments

Comments
 (0)