Skip to content

Instantly share code, notes, and snippets.

@milljm
Last active April 11, 2025 02:05
Show Gist options
  • Select an option

  • Save milljm/52830a77b9017159b29dd52427836277 to your computer and use it in GitHub Desktop.

Select an option

Save milljm/52830a77b9017159b29dd52427836277 to your computer and use it in GitHub Desktop.
Kokoro txt --> speech
#!/usr/bin/env python3
""" a quick text to speech tool """
import os
import sys
import argparse
import sounddevice as sd
import warnings
warnings.filterwarnings('ignore')
try:
from IPython.display import display, Audio
from kokoro import KPipeline
import soundfile as sf
except ImportError:
print('There were errors importing the necessary libraries.\nBe sure to have the following '
'libraries installed\n(all available via Conda and PIP):\n\n\tkokoro, IPython,'
'soundfile, and sounddevice\n\n')
sys.exit(1)
def verify_args(args):
    """Ensure there is something to speak before any further work is done.

    Args:
        args: Parsed namespace carrying ``input`` (value of ``-i/--input``,
            or ``None``) and ``args`` (list of free-text words, possibly
            empty).

    Returns:
        The same ``args`` object, unchanged, when at least one of the two
        sources is present.

    Raises:
        SystemExit: With exit status 1 when neither an input file nor any
            text was supplied.
    """
    if args.input or args.args:
        return args
    # Usage errors belong on stderr so they never mix with regular output.
    print('You need to specify an input file I will read, or just throw what you want said as '
          'an argument', file=sys.stderr)
    sys.exit(1)
def parse_args(argv, piped_input):
    """Parse the command line and merge in any text received on stdin.

    Args:
        argv: Command-line tokens, without the program name.
        piped_input: Text read from stdin, or ``None``/empty when nothing
            was piped in.

    Returns:
        The parsed ``argparse.Namespace`` after it has passed
        ``verify_args``.

    Raises:
        SystemExit: From argparse on malformed options, or from
            ``verify_args`` when there is nothing to speak.
    """
    parser = argparse.ArgumentParser(description='Using kokoro to generate text to speech')
    parser.add_argument('-i', '--input', nargs='?',
                        help='path to a text file to read aloud')
    parser.add_argument('-a', '--accent', nargs='?', default='b',
                        help='Accent. Choose from: a (american), b (british). Default: b')
    parser.add_argument('--voice', nargs='?', default='af_heart',
                        help='Voice to use. Default: af_heart. '
                             'Combined voices with: af_heart,af_bella')
    parser.add_argument('-s', '--speed', nargs='?', type=float, default=1.0,
                        help='speed at which to speak')
    parser.add_argument('--stream', action='store_true',
                        help='Stream audio directly')
    # Whatever tokens remain on the command line are kept as free text.
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args(argv)
    # Piped text takes precedence over positional words, but a pipe that
    # delivered only whitespace must neither mask them nor pass validation.
    if piped_input and piped_input.strip():
        args.args = [piped_input]
    return verify_args(args)
def _load_text(args):
    """Return the text to speak: the input file's contents, else the joined words."""
    if not args.input:
        return ' '.join(args.args)
    # Attempt the read directly instead of checking existence first, so a
    # directory or an unreadable file is reported the same way as a missing one.
    try:
        with open(args.input, 'r', encoding='utf-8') as handle:
            return handle.read()
    except OSError:
        print('path to input file not found or readable', file=sys.stderr)
        sys.exit(1)


def txt2voice(args):
    """Synthesize speech for the requested text and play or save it.

    Args:
        args: Parsed namespace providing ``input``, ``args``, ``accent``,
            ``voice``, ``speed`` and ``stream``.

    Each item produced by the pipeline is printed (text and phonemes) and
    then either played through ``sounddevice`` (``args.stream`` true) or
    written to ``<index>.wav`` in the current working directory.

    Raises:
        SystemExit: With exit status 1 when ``args.input`` is set but the
            file cannot be opened.
    """
    # Resolve the text first: a bad path should fail before the pipeline
    # (and whatever it loads) is constructed.
    text = _load_text(args)
    # Rate used for both playback and file output.
    # NOTE(review): presumably the model's native output rate - confirm.
    sample_rate = 24000

    pipeline = KPipeline(lang_code=args.accent, repo_id='hexgrad/Kokoro-82M')
    generator = pipeline(text, voice=args.voice, speed=args.speed, split_pattern=r'\n+')
    for i, (gs, ps, audio) in enumerate(generator):
        print(f'Voice:\t{args.voice}\nGraphemes/Text:\n{gs}\n\nPhonemes:\n{ps}\n')
        if args.stream:
            sd.play(audio, sample_rate)
            # Block until playback ends so consecutive chunks do not overlap.
            sd.wait()
        else:
            print(f'writing file:\t{i}.wav')
            sf.write(f'{i}.wav', audio, sample_rate)
if __name__ == '__main__':
    # Read stdin in full only when it is not an interactive terminal
    # (e.g. text was piped in); otherwise there is no piped text.
    piped_input = None if sys.stdin.isatty() else sys.stdin.read()
    args = parse_args(sys.argv[1:], piped_input)
    txt2voice(args)
@milljm
Copy link
Author

milljm commented Mar 17, 2025

hahah same. The closest I got:

ɔˈθːːˈɒɹɪtˌiː

@WilkAndy
Copy link

Wow, well done! That is actually really good!! :-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment