@k0ta0uchi
Created September 7, 2025 00:59
A script for using chatterbox as if it were the voicebox API
import torchaudio as ta
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import uvicorn
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List, Optional
import sounddevice as sd
import torch
import uuid
import pyopenjtalk
import re
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger("chatterbox.models.t3.inference.alignment_stream_analyzer").setLevel(logging.ERROR)
# --- Model Loading ---
print("Loading model...")
multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded.")
AUDIO_PROMPT_PATH="base.mp3"
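# Reference audio used as the voice prompt for the generate() call below;
# "base.mp3" must exist next to this script (the path is the author's choice —
# swapping in another sample should change the cloned voice).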
# --- FastAPI App ---
app = FastAPI()
# --- Pydantic Models ---
class Mora(BaseModel):
    text: str
    consonant: Optional[str] = None
    consonant_length: Optional[float] = None
    vowel: str
    vowel_length: float
    pitch: float

class AccentPhrase(BaseModel):
    moras: List[Mora]
    accent: int
    pause_mora: Optional[Mora] = None
    is_interrogative: Optional[bool] = False

class SynthesisRequestBody(BaseModel):
    accent_phrases: List[AccentPhrase]
    speedScale: float = 1.0
    pitchScale: float = 0.0
    intonationScale: float = 1.0
    volumeScale: float = 1.0
    prePhonemeLength: float = 0.1
    postPhonemeLength: float = 0.1
    outputSamplingRate: int = 24000
    outputStereo: bool = False
    kana: Optional[str] = None

class SpeakerStyle(BaseModel):
    name: str
    id: int
    type: str

class SupportedFeatures(BaseModel):
    permitted_synthesis_morphing: str

class Speaker(BaseModel):
    name: str
    speaker_uuid: str
    styles: List[SpeakerStyle]
    version: str
    supported_features: SupportedFeatures
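# These models mirror the request/response shapes of the imitated speech API:
# an utterance is a list of accent phrases made of moras, plus the *Scale and
# *PhonemeLength synthesis parameters returned by /audio_query.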
# --- Core Functions ---
def text_to_accent_phrases(text: str):
    labels = pyopenjtalk.extract_fullcontext(text)
    accent_phrases = []
    current_moras = []
    accent_position = 0
    for label in labels:
        # Extract phoneme, context, and accent information from the label.
        # This is simplified parsing logic; a more robust implementation would be needed for production.
        mora_match = re.search(r"/A:(\-?\d+)\+(\d+)\+(\d+)", label)
        if not mora_match:
            continue
        phoneme_match = re.search(r"-(.*?)\+", label)
        if not phoneme_match:
            continue
        phoneme = phoneme_match.group(1)
        # Simple split of the phoneme into consonant and vowel.
        # This is a very rough approximation.
        if phoneme in "aiueoN":
            consonant = None
            vowel = phoneme
        else:
            consonant = phoneme[0]
            vowel = phoneme[1:] if len(phoneme) > 1 else 'a'  # fallback
        current_moras.append({
            "text": "?",  # pyopenjtalk doesn't directly give the mora text
            "consonant": consonant,
            "consonant_length": 0.0,  # placeholder
            "vowel": vowel,
            "vowel_length": 0.0,  # placeholder
            "pitch": 0.0  # placeholder
        })
        # Accent phrase boundary detection (simplified)
        if mora_match.group(2) == "1":
            accent_position = len(current_moras)
        if "/F:" in label:
            accent_phrases.append({
                "moras": current_moras,
                "accent": accent_position,
                "is_interrogative": text.endswith("?") or text.endswith("？")
            })
            current_moras = []
            accent_position = 0
    if current_moras:
        accent_phrases.append({
            "moras": current_moras,
            "accent": accent_position,
            "is_interrogative": text.endswith("?") or text.endswith("？")
        })
    return accent_phrases
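# Illustrative only: how the two regexes above pull fields out of a
# full-context label. The label below is a hand-written approximation of
# pyopenjtalk output, not a real value.
#   label = "k^o-n+n=i/A:-2+2+5/B:.../F:5_4#0_xx.../J:..."
#   re.search(r"-(.*?)\+", label).group(1)                   # -> "n"
#   re.search(r"/A:(\-?\d+)\+(\d+)\+(\d+)", label).groups()  # -> ("-2", "2", "5")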
def synthesize(text: str):
    """Synthesizes audio from text and returns it as a waveform."""
    wav = multilingual_model.generate(
        text,
        language_id="ja",
        audio_prompt_path=AUDIO_PROMPT_PATH
    )
    return wav
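# generate() returns a waveform tensor; the callers below play it with
# sounddevice and save it with torchaudio at the model's sample rate
# (multilingual_model.sr).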
# --- API Endpoints ---
@app.get("/speakers", response_model=List[Speaker])
async def get_speakers(core_version: Optional[str] = Query(None)):
    return [
        {
            "name": "Normal",
            "speaker_uuid": str(uuid.uuid4()),
            "styles": [
                {
                    "name": "Normal",
                    "id": 0,
                    "type": "talk"
                }
            ],
            "version": "1.0.0",
            "supported_features": {
                "permitted_synthesis_morphing": "ALL"
            }
        }
    ]
@app.post("/audio_query", response_model=SynthesisRequestBody)
async def audio_query(
    text: str = Query(..., description="Text to synthesize"),
    speaker: int = Query(..., description="Speaker ID"),
    core_version: Optional[str] = Query(None, description="Core version")
):
    accent_phrases = text_to_accent_phrases(text)
    kana = "".join(pyopenjtalk.g2p(text, kana=True).split())
    return {
        "accent_phrases": accent_phrases,
        "speedScale": 1.0,
        "pitchScale": 0.0,
        "intonationScale": 1.0,
        "volumeScale": 1.0,
        "prePhonemeLength": 0.1,
        "postPhonemeLength": 0.1,
        "outputSamplingRate": 24000,
        "outputStereo": False,
        "kana": kana
    }
@app.post("/synthesis")
async def api_synthesis(
    body: SynthesisRequestBody,
    speaker: int = Query(..., description="Speaker ID. Currently, only one speaker (Normal, ID 0) is available, so this parameter is effectively ignored."),
    enable_interrogative_upspeak: bool = Query(True),
    core_version: Optional[str] = Query(None)
):
    text_to_speak = body.kana or "これはテストです"
    wav = synthesize(text_to_speak)
    print("Playing audio from API request...")
    sd.play(wav.cpu().numpy().T, multilingual_model.sr)
    sd.wait()
    print("Done playing audio from API request.")
    output_path = "api_output.wav"
    ta.save(output_path, wav.cpu(), multilingual_model.sr)  # move to CPU before saving
    return {"message": "Audio synthesized and played successfully", "path": output_path}
# --- Interactive CLI ---
def interactive_mode():
    print("\nInteractive mode. Enter text to synthesize, or 'quit' to exit.")
    while True:
        try:
            text = input("> ")
            if text.lower() == 'quit':
                break
            if text:
                print("Synthesizing...")
                wav = synthesize(text)
                print("Playing audio...")
                sd.play(wav.cpu().numpy().T, multilingual_model.sr)
                sd.wait()
                print("Done.")
        except KeyboardInterrupt:
            print("\nExiting interactive mode.")
            break
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    interactive_mode()
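# Note: uvicorn is imported but the __main__ block only starts the CLI.
# To expose the FastAPI endpoints instead, one option (an assumption, not part
# of the original flow) is to run e.g.:
#   uvicorn.run(app, host="127.0.0.1", port=8000)
# or, from a shell (the module name here is hypothetical):
#   uvicorn chatterbox_server:app --port 8000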