@k0ta0uchi
Created September 7, 2025 00:59
A script for using chatterbox as if it were the voicebox API
import torchaudio as ta
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
import uvicorn
from fastapi import FastAPI, Query
from pydantic import BaseModel
from typing import List, Optional
import sounddevice as sd
import torch
import uuid
import pyopenjtalk
import re
import warnings
import logging
warnings.filterwarnings("ignore")
logging.getLogger("chatterbox.models.t3.inference.alignment_stream_analyzer").setLevel(logging.ERROR)
# --- Model Loading ---
print("Loading model...")
multilingual_model = ChatterboxMultilingualTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded.")
AUDIO_PROMPT_PATH="base.mp3"
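# Reference audio used as the voice prompt for the generate() call below;
# "base.mp3" must exist next to this script (the path is the author's choice —
# swapping in another sample should change the cloned voice).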
# --- FastAPI App ---
app = FastAPI()
# --- Pydantic Models ---
class Mora(BaseModel):
    text: str
    consonant: Optional[str] = None
    consonant_length: Optional[float] = None
    vowel: str
    vowel_length: float
    pitch: float

class AccentPhrase(BaseModel):
    moras: List[Mora]
    accent: int
    pause_mora: Optional[Mora] = None
    is_interrogative: Optional[bool] = False

class SynthesisRequestBody(BaseModel):
    accent_phrases: List[AccentPhrase]
    speedScale: float = 1.0
    pitchScale: float = 0.0
    intonationScale: float = 1.0
    volumeScale: float = 1.0
    prePhonemeLength: float = 0.1
    postPhonemeLength: float = 0.1
    outputSamplingRate: int = 24000
    outputStereo: bool = False
    kana: Optional[str] = None

class SpeakerStyle(BaseModel):
    name: str
    id: int
    type: str

class SupportedFeatures(BaseModel):
    permitted_synthesis_morphing: str

class Speaker(BaseModel):
    name: str
    speaker_uuid: str
    styles: List[SpeakerStyle]
    version: str
    supported_features: SupportedFeatures
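# These models mirror the request/response shapes of the imitated speech API:
# an utterance is a list of accent phrases made of moras, plus the *Scale and
# *PhonemeLength synthesis parameters returned by /audio_query.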
# --- Core Functions ---
def text_to_accent_phrases(text: str):
    labels = pyopenjtalk.extract_fullcontext(text)
    accent_phrases = []
    current_moras = []
    accent_position = 0
    for label in labels:
        # Extract phoneme, context, and accent information from the label.
        # This is simplified parsing logic; a more robust implementation would be needed for production.
        mora_match = re.search(r"/A:(\-?\d+)\+(\d+)\+(\d+)", label)
        if not mora_match:
            continue
        phoneme_match = re.search(r"-(.*?)\+", label)
        if not phoneme_match:
            continue
        phoneme = phoneme_match.group(1)
        # Simple split of the phoneme into consonant and vowel.
        # This is a very rough approximation.
        if phoneme in "aiueoN":
            consonant = None
            vowel = phoneme
        else:
            consonant = phoneme[0]
            vowel = phoneme[1:] if len(phoneme) > 1 else 'a'  # fallback
        current_moras.append({
            "text": "?",  # pyopenjtalk doesn't directly give the mora text
            "consonant": consonant,
            "consonant_length": 0.0,  # placeholder
            "vowel": vowel,
            "vowel_length": 0.0,  # placeholder
            "pitch": 0.0  # placeholder
        })
        # Accent phrase boundary detection (simplified)
        if mora_match.group(2) == "1":
            accent_position = len(current_moras)
        if "/F:" in label:
            accent_phrases.append({
                "moras": current_moras,
                "accent": accent_position,
                "is_interrogative": text.endswith("?") or text.endswith("？")
            })
            current_moras = []
            accent_position = 0
    if current_moras:
        accent_phrases.append({
            "moras": current_moras,
            "accent": accent_position,
            "is_interrogative": text.endswith("?") or text.endswith("？")
        })
    return accent_phrases
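# Illustrative only: how the two regexes above pull fields out of a
# full-context label. The label below is a hand-written approximation of
# pyopenjtalk output, not a real value.
#   label = "k^o-n+n=i/A:-2+2+5/B:.../F:5_4#0_xx.../J:..."
#   re.search(r"-(.*?)\+", label).group(1)                   # -> "n"
#   re.search(r"/A:(\-?\d+)\+(\d+)\+(\d+)", label).groups()  # -> ("-2", "2", "5")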
def synthesize(text: str):
    """Synthesizes audio from text and returns it as a waveform."""
    wav = multilingual_model.generate(
        text,
        language_id="ja",
        audio_prompt_path=AUDIO_PROMPT_PATH
    )
    return wav
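# generate() returns a waveform tensor; the callers below play it with
# sounddevice and save it with torchaudio at the model's sample rate
# (multilingual_model.sr).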
# --- API Endpoints ---
@app.get("/speakers", response_model=List[Speaker])
async def get_speakers(core_version: Optional[str] = Query(None)):
    return [
        {
            "name": "Normal",
            "speaker_uuid": str(uuid.uuid4()),
            "styles": [
                {
                    "name": "Normal",
                    "id": 0,
                    "type": "talk"
                }
            ],
            "version": "1.0.0",
            "supported_features": {
                "permitted_synthesis_morphing": "ALL"
            }
        }
    ]
@app.post("/audio_query", response_model=SynthesisRequestBody)
async def audio_query(
    text: str = Query(..., description="Text to synthesize"),
    speaker: int = Query(..., description="Speaker ID"),
    core_version: Optional[str] = Query(None, description="Core version")
):
    accent_phrases = text_to_accent_phrases(text)
    kana = "".join(pyopenjtalk.g2p(text, kana=True).split())
    return {
        "accent_phrases": accent_phrases,
        "speedScale": 1.0,
        "pitchScale": 0.0,
        "intonationScale": 1.0,
        "volumeScale": 1.0,
        "prePhonemeLength": 0.1,
        "postPhonemeLength": 0.1,
        "outputSamplingRate": 24000,
        "outputStereo": False,
        "kana": kana
    }
@app.post("/synthesis")
async def api_synthesis(
    body: SynthesisRequestBody,
    speaker: int = Query(..., description="Speaker ID. Currently, only one speaker (Normal, ID 0) is available, so this parameter is effectively ignored."),
    enable_interrogative_upspeak: bool = Query(True),
    core_version: Optional[str] = Query(None)
):
    text_to_speak = body.kana or "これはテストです"
    wav = synthesize(text_to_speak)
    print("Playing audio from API request...")
    sd.play(wav.cpu().numpy().T, multilingual_model.sr)
    sd.wait()
    print("Done playing audio from API request.")
    output_path = "api_output.wav"
    ta.save(output_path, wav.cpu(), multilingual_model.sr)  # move to CPU before saving
    return {"message": "Audio synthesized and played successfully", "path": output_path}
# --- Interactive CLI ---
def interactive_mode():
    print("\nInteractive mode. Enter text to synthesize, or 'quit' to exit.")
    while True:
        try:
            text = input("> ")
            if text.lower() == 'quit':
                break
            if text:
                print("Synthesizing...")
                wav = synthesize(text)
                print("Playing audio...")
                sd.play(wav.cpu().numpy().T, multilingual_model.sr)
                sd.wait()
                print("Done.")
        except KeyboardInterrupt:
            print("\nExiting interactive mode.")
            break
        except Exception as e:
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    interactive_mode()
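# Note: uvicorn is imported but the __main__ block only starts the CLI.
# To expose the FastAPI endpoints instead, one option (an assumption, not part
# of the original flow) is to run e.g.:
#   uvicorn.run(app, host="127.0.0.1", port=8000)
# or, from a shell (the module name here is hypothetical):
#   uvicorn chatterbox_server:app --port 8000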