-
-
Save milljm/52830a77b9017159b29dd52427836277 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python3 | |
| """ a quick text to speech tool """ | |
| import os | |
| import sys | |
| import argparse | |
| import sounddevice as sd | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| try: | |
| from IPython.display import display, Audio | |
| from kokoro import KPipeline | |
| import soundfile as sf | |
| except ImportError: | |
| print('There were errors importing the necessary libraries.\nBe sure to have the following ' | |
| 'libraries installed\n(all available via Conda and PIP):\n\n\tkokoro, IPython,' | |
| 'soundfile, and sounddevice\n\n') | |
| sys.exit(1) | |
def verify_args(args):
    """ returns args untouched when there is something to speak, otherwise exits with status 1 """
    # Either an input file (-i) or free-form positional text is enough.
    if args.input or args.args:
        return args
    print('You need to specify an input file I will read, or just throw what you want said as '
          'an argument')
    sys.exit(1)
def parse_args(argv, piped_input):
    """ parses arguments

    argv:        command-line tokens, without the program name
    piped_input: text read from stdin, or None / '' when nothing was piped

    Returns the argparse namespace after verify_args() has accepted it.
    Piped text that contains anything besides whitespace replaces the
    positional text given on the command line.
    """
    parser = argparse.ArgumentParser(description='Using kokoro to generate text to speech')
    parser.add_argument('-i', '--input', nargs='?',
                        help='path to a text file whose contents will be spoken')
    parser.add_argument('-a', '--accent', nargs='?', default='b',
                        help='Accent. Choose from: a (american), b (british). Default: b')
    parser.add_argument('--voice', nargs='?', default='af_heart',
                        help='Voice to use. Default: af_heart. '
                             'Combined voices with: af_heart,af_bella')
    parser.add_argument('-s', '--speed', nargs='?', type=float, default=1.0,
                        help='speed at which to speak')
    parser.add_argument('--stream', action='store_true',
                        help='Stream audio directly')
    # Everything left over after the options is the text to speak.
    parser.add_argument('args', nargs=argparse.REMAINDER)
    args = parser.parse_args(argv)
    # A blank pipe (e.g. `echo | ./speak.py hello`) must not clobber the text
    # that was given on the command line, so only real content overrides it.
    if piped_input and piped_input.strip():
        args.args = [piped_input]
    return verify_args(args)
def _read_text(args):
    """ returns the text to speak: the contents of args.input, else the joined positional words """
    if not args.input:
        return ' '.join(args.args)
    # Try the open directly instead of testing os.path.exists() first: a
    # directory or an unreadable file passes an existence test but still fails
    # to open, and that case should get the same friendly message.
    try:
        with open(args.input, 'r', encoding='utf-8') as handle:
            return handle.read()
    except OSError:
        print('path to input file not found or readable')
        sys.exit(1)


def txt2voice(args):
    """ turns the requested text into speech with kokoro

    args: namespace carrying input, args, accent, voice, speed and stream.

    Each chunk produced by the pipeline (the text is split on runs of
    newlines) is either played immediately when args.stream is set, or written
    to '<index>.wav' in the current directory. Exits with status 1 when
    args.input cannot be opened.
    """
    # Sample rate handed to both the player and the wav writer.
    sample_rate = 24000
    # Resolve the text first so a bad --input path fails before the pipeline
    # is constructed (presumably the slow step, since it loads the model).
    text = _read_text(args)
    pipeline = KPipeline(lang_code=args.accent, repo_id='hexgrad/Kokoro-82M')
    generator = pipeline(text, voice=args.voice, speed=args.speed, split_pattern=r'\n+')
    for i, (gs, ps, audio) in enumerate(generator):
        print(f'Voice:\t{args.voice}\nGraphemes/Text:\n{gs}\n\nPhonemes:\n{ps}\n')
        if args.stream:
            sd.play(audio, sample_rate)
            # Block until this chunk finishes so chunks do not overlap.
            sd.wait()
        else:
            print(f'writing file:\t{i}.wav')
            sf.write(f'{i}.wav', audio, sample_rate)
if __name__ == '__main__':
    # Text piped on stdin (e.g. `echo hello | ./speak.py --stream`) is passed
    # along separately from the command-line tokens; an interactive terminal
    # supplies none.
    piped_input = None
    if not sys.stdin.isatty():
        piped_input = sys.stdin.read()
    args = parse_args(sys.argv[1:], piped_input)
    txt2voice(args)
Oh you're right... From everything I've already read, it's supposed to be as simple as ending with a ?. If I find out how to do it, I'll post again and tag ya.
Edit, pasting what I could find about directly using Phonemes, so I don't have to search for it again:
Customize pronunciation with Markdown link syntax and /slashes/ like [Kokoro](/kˈOkəɹO/)
To adjust intonation, try punctuation ;:,.!?—…"()“” or stress ˈ and ˌ
Lower stress [1 level](-1) or [2 levels](-2)
Raise stress 1 level [or](+2) 2 levels (only works on less stressed, usually short words)
ref: https://huggingface.co/spaces/hexgrad/Kokoro-TTS
echo '[Kokoro](/kˈOkəɹO/)' | ./speak.py --stream
@WilkAndy
Yeah that was a bit interesting. I had to begin understanding how to use Phonemes directly.
echo '[How are you?](/hˌW ɑː jˈːːːuˇ?/)' | ./speak.py --stream --speed .8
^^^ ^ upward pitch (or supposedly does, it's hard to tell)
\ holding "like shhhh!"
vs what the system does by default:
echo '[How are you?](/hˌW ɑː juː?/)' | ./speak.py --stream --speed .8
ref: https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
Hmmm, I can see I could spend a long time having fun with the intonation!
echo '[How are you? How are you? How are you?](/hˌW ɑː ju? hˌW ɑː jˈːːːuˇ? hˌW ɑː jˈːːːu↗︎?/)' | ./speak.py --stream
Yeah, me too. I think I am going to try and reproduce Amelia Tyler's "Authority" line in BG3 😄
You should also try mixing some voices. I found --voice af_nicole,af_heart --accent b --speed .8 pretty darn good. Oh, and be sure to try af_nicole by itself! It is a very distinctive voice.
Yeah, me too. I think I am going to try and reproduce Amelia Tyler's "Authority" line in BG3 😄
That's going to be super hard! She's got such perfect intonation :-)
hahaha just posting to possibly get ahead of this, and because I don't want to strike a nerve with anyone:
I only mention 'Amelia Tyler' because she is perfect. And I only want to learn the phonetics/intonation process. Might as well pick from the best!
OK, after 34 minutes I admit defeat. My computer will never turn into Amelia Tyler, boohoo.
hahah same. The closest I got:
ɔˈθːːˈɒɹɪtˌiː
Wow, well done! That is actually really good !! :-)
I think this is really cool. One failing is that the intonation is not super accurate though, e.g. "Is this a question" sounds the same as "Is this a question?" (with question mark). Is there a way of marking the text with ? @milljm