A script for using Chatterbox as if it were the VOICEVOX API.
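The script exposes VOICEVOX's two-step flow: /audio_query builds a query from text, and /synthesis speaks it. A minimal client sketch follows, assuming the file below is saved as app.py and served with `uvicorn app:app --port 8000` (module name, host, and port are assumptions, not part of the script):

import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port for the uvicorn server
text = "こんにちは"

# Step 1: build the audio query (VOICEVOX-style /audio_query).
query = requests.post(f"{BASE}/audio_query", params={"text": text, "speaker": 0}).json()

# Step 2: send the query back for synthesis. This server plays the audio on
# its own output device and writes api_output.wav server-side; the JSON
# response only reports the path.
resp = requests.post(f"{BASE}/synthesis", params={"speaker": 0}, json=query)
print(resp.json())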
import torchaudio as ta
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import uvicorn
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List, Optional
import sounddevice as sd
import torch
import uuid
import pyopenjtalk
import re
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger("chatterbox.models.t3.inference.alignment_stream_analyzer").setLevel(logging.ERROR)
# --- Model Loading ---
print("Loading model...")
multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded.")

# Reference voice for cloning; every request is synthesized with this prompt.
AUDIO_PROMPT_PATH = "base.mp3"

# --- FastAPI App ---
app = FastAPI()
# --- Pydantic Models (mirroring the VOICEVOX request/response schema) ---
class Mora(BaseModel):
    text: str
    consonant: Optional[str] = None
    consonant_length: Optional[float] = None
    vowel: str
    vowel_length: float
    pitch: float

class AccentPhrase(BaseModel):
    moras: List[Mora]
    accent: int
    pause_mora: Optional[Mora] = None
    is_interrogative: Optional[bool] = False

class SynthesisRequestBody(BaseModel):
    accent_phrases: List[AccentPhrase]
    speedScale: float = 1.0
    pitchScale: float = 0.0
    intonationScale: float = 1.0
    volumeScale: float = 1.0
    prePhonemeLength: float = 0.1
    postPhonemeLength: float = 0.1
    outputSamplingRate: int = 24000
    outputStereo: bool = False
    kana: Optional[str] = None

class SpeakerStyle(BaseModel):
    name: str
    id: int
    type: str

class SupportedFeatures(BaseModel):
    permitted_synthesis_morphing: str

class Speaker(BaseModel):
    name: str
    speaker_uuid: str
    styles: List[SpeakerStyle]
    version: str
    supported_features: SupportedFeatures
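
# A minimal body accepted by /synthesis (everything except accent_phrases
# falls back to the defaults above); the kana value is only an example:
#   {"accent_phrases": [], "kana": "コンニチワ"}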
# --- Core Functions ---
def text_to_accent_phrases(text: str):
    labels = pyopenjtalk.extract_fullcontext(text)
    accent_phrases = []
    current_moras = []
    accent_position = 0
    for label in labels:
        # Extract phoneme, context, and accent information from the label.
        # This is simplified parsing logic; a more robust implementation
        # would be needed for production.
        mora_match = re.search(r"/A:(\-?\d+)\+(\d+)\+(\d+)", label)
        if not mora_match:
            continue
        phoneme_match = re.search(r"-(.*?)\+", label)
        if not phoneme_match:
            continue
        phoneme = phoneme_match.group(1)
        # Simple split of the phoneme into consonant and vowel.
        # This is a very rough approximation.
        if phoneme in "aiueoN":
            consonant = None
            vowel = phoneme
        else:
            consonant = phoneme[0]
            vowel = phoneme[1:] if len(phoneme) > 1 else 'a'  # fallback
        current_moras.append({
            "text": "?",  # pyopenjtalk doesn't directly give mora text
            "consonant": consonant,
            "consonant_length": 0.0,  # Placeholder
            "vowel": vowel,
            "vowel_length": 0.0,  # Placeholder
            "pitch": 0.0  # Placeholder
        })
        # Accent phrase boundary detection (simplified)
        if mora_match.group(2) == "1":
            accent_position = len(current_moras)
        if "/F:" in label:
            accent_phrases.append({
                "moras": current_moras,
                "accent": accent_position,
                "is_interrogative": text.endswith("?") or text.endswith("？")
            })
            current_moras = []
            accent_position = 0
    if current_moras:
        accent_phrases.append({
            "moras": current_moras,
            "accent": accent_position,
            "is_interrogative": text.endswith("?") or text.endswith("？")
        })
    return accent_phrases
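
# Illustrative shape of the return value (text/length/pitch fields are the
# placeholders noted above; actual phonemes depend on pyopenjtalk's labels):
#   [{"moras": [{"text": "?", "consonant": "k", "consonant_length": 0.0,
#                "vowel": "o", "vowel_length": 0.0, "pitch": 0.0}, ...],
#     "accent": 1, "is_interrogative": False}]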
def synthesize(text: str):
    """Synthesizes audio from text and returns it as a waveform tensor."""
    wav = multilingual_model.generate(
        text,
        language_id="ja",
        audio_prompt_path=AUDIO_PROMPT_PATH
    )
    return wav
# --- API Endpoints ---
@app.get("/speakers", response_model=List[Speaker])
async def get_speakers(core_version: Optional[str] = Query(None)):
    # NOTE: the UUID is regenerated on every call; clients that cache speaker
    # info by speaker_uuid may prefer a fixed value here.
    return [
        {
            "name": "Normal",
            "speaker_uuid": str(uuid.uuid4()),
            "styles": [
                {
                    "name": "Normal",
                    "id": 0,
                    "type": "talk"
                }
            ],
            "version": "1.0.0",
            "supported_features": {
                "permitted_synthesis_morphing": "ALL"
            }
        }
    ]
| @app.post("/audio_query", response_model=SynthesisRequestBody) | |
| async def audio_query( | |
| text: str = Query(..., description="Text to synthesize"), | |
| speaker: int = Query(..., description="Speaker ID"), | |
| core_version: Optional[str] = Query(None, description="Core version") | |
| ): | |
| accent_phrases = text_to_accent_phrases(text) | |
| kana = "".join(pyopenjtalk.g2p(text, kana=True).split()) | |
| return { | |
| "accent_phrases": accent_phrases, | |
| "speedScale": 1.0, | |
| "pitchScale": 0.0, | |
| "intonationScale": 1.0, | |
| "volumeScale": 1.0, | |
| "prePhonemeLength": 0.1, | |
| "postPhonemeLength": 0.1, | |
| "outputSamplingRate": 24000, | |
| "outputStereo": False, | |
| "kana": kana | |
| } | |
| @app.post("/synthesis") | |
| async def api_synthesis( | |
| body: SynthesisRequestBody, | |
| speaker: int = Query(..., description="Speaker ID. Currently, only one speaker (Normal, ID 0) is available, so this parameter is effectively ignored."), | |
| enable_interrogative_upspeak: bool = Query(True), | |
| core_version: Optional[str] = Query(None) | |
| ): | |
| text_to_speak = body.kana or "これはテストです" | |
| wav = synthesize(text_to_speak) | |
| print("Playing audio from API request...") | |
| sd.play(wav.cpu().numpy().T, multilingual_model.sr) | |
| sd.wait() | |
| print("Done playing audio from API request.") | |
| output_path = "api_output.wav" | |
| ta.save(output_path, wav, multilingual_model.sr) | |
| return {"message": "Audio synthesized and played successfully", "path": output_path} | |
# --- Interactive CLI ---
def interactive_mode():
    print("\nInteractive mode. Enter text to synthesize, or 'quit' to exit.")
    while True:
        try:
            text = input("> ")
            if text.lower() == 'quit':
                break
            if text:
                print("Synthesizing...")
                wav = synthesize(text)
                print("Playing audio...")
                sd.play(wav.cpu().numpy().T, multilingual_model.sr)
                sd.wait()
                print("Done.")
        except KeyboardInterrupt:
            print("\nExiting interactive mode.")
            break
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Running the file directly starts only the CLI; serve the API with the
    # uvicorn CLI instead, e.g. `uvicorn <module>:app`.
    interactive_mode()