-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathProgram.cs
106 lines (81 loc) · 7.62 KB
/
Program.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
namespace KokoroSharp;
using KokoroSharp.Core;
using KokoroSharp.Processing;
using System.Diagnostics;
/// <summary> Sample program that reads a line of text from the console, then speaks it back with a Kokoro voice. </summary>
/// <remarks> Intended as an introduction to the lower-level parts of the library, for users interested in advanced tasks. </remarks>
internal class Program {
// Relative weights of the voices blended in 'Main' (a: sarah, b: nicole, c: the third voice of the advanced example); any number of voices can be mixed.
// .. Declared as a member instead of a local inside 'Main' for hot-reload support.
static (int a, int b, int c) Mix {
    get { return (2, 10, 5); }
}
/// <summary> Interactive walkthrough: speaks a welcome line, then keeps reading lines from the console and speaking them back. </summary>
/// <remarks> Returns as soon as a null, empty or whitespace-only line is read. Everything after the `continue;` statement stays unreachable until that statement is commented out. </remarks>
static void Main(string[] _) {
    // You'll need to download the model first. You can find it in https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0.
    using KokoroTTS tts = KokoroTTS.LoadModel(); // The high level inference engine provided by KokoroSharp. We instantiate once, cache it, and reuse it.
    //KokoroVoiceManager.LoadVoicesFromPath("voices"); // The voices are pre-bundled with the package in "/voices", but can still be loaded manually from a different path if needed.
    KokoroVoice sarah = KokoroVoiceManager.GetVoice("af_sarah"); // Once the voices are loaded, they can be retrieved instantly from memory.
    KokoroVoice nicole = KokoroVoiceManager.GetVoice("af_nicole"); // Kokoro always needs a voice for inference.

    // You can check out the available/loaded voices by iterating through them:
    foreach (var voice in KokoroVoiceManager.Voices) { Debug.WriteLine(voice.Name); }
    foreach (var voice in KokoroVoiceManager.GetVoices(KokoroLanguage.AmericanEnglish)) { Debug.WriteLine(voice.Name); }

    tts.Speak("Welcome.", KokoroVoiceManager.GetVoice("af_heart")); // ..and synthesize speech with one line of code!

    // You can access and subscribe to various callbacks regarding speech to stay informed:
    tts.OnSpeechStarted += (s) => Debug.WriteLine($"Started: {new string(s.PhonemesToSpeak)}");
    tts.OnSpeechProgressed += (p) => Debug.WriteLine($"Progress: {new string(p.SpokenText_BestGuess)}");
    tts.OnSpeechCompleted += (c) => Debug.WriteLine($"Completed: {new string(c.PhonemesSpoken)}");
    tts.OnSpeechCanceled += (c) => Debug.WriteLine($"Canceled: {new string(c.SpokenText_BestGuess)}");

    // Playback used by the advanced section of the loop. It is declared out here and created at most once,
    // .. so that every loop iteration enqueues to the very same instance instead of constructing a new one each time.
    // NOTE(review): if `KokoroPlayback` is disposable, it should also be disposed before returning — confirm against the library.
    KokoroPlayback playback = null;

    while (true) {
        Console.Write("Type text to speak: ");
        string txt = Console.ReadLine();
        if (string.IsNullOrWhiteSpace(txt)) { return; }

        // The easiest way to do text-to-speech with Kokoro is by invoking `tts.Speak()`/`tts.SpeakFast()` directly with input text.
        tts.SpeakFast(txt, KokoroVoiceManager.Mix([(sarah, Mix.a), (nicole, Mix.b)])); // Segmented with various rules (see `Segmentation.cs`). Getting an ~instant response, with a potential quality hit.
        //tts.Speak(txt, KokoroVoiceManager.Mix([(sarah, Mix.a), (nicole, Mix.b)])); // Without segmentations; increasing the playback response time, but may offer increased quality.
        continue; // Comment out this line to proceed.

        // Although, what's MORE SUITABLE for more advanced tasks, is the `tts.EnqueueJob` method,
        // .. because it allows queueing up multiple *inference jobs* to the engine asynchronously,
        // .. and when, in order, one gets completed, the audio is also being played back in order.
        tts.StopPlayback(); // Immediately stops any ongoing playbacks and jobs invoked via `Speak`/`SpeakFast`.

        // Note that the `KokoroTTS` instance hosts its own instance of `KokoroPlayback`, for convenience,
        // .. but for anything more advanced than `SpeakFast`, you'll need to provide your own, or an alternative.
        playback ??= new KokoroPlayback(); // Created on first use only; later iterations reuse it.
        // *KokoroPlayback* equivalent of `tts.StopPlayback()` is 'playback.StopPlayback()'.
        playback.NicifySamples = true; // Optionally, trim the otherwise silent samples, for even faster responses.

        // Example of building a pipeline config; nothing below consumes it.
        var segmentationStrategy = new KokoroTTSPipelineConfig() { SecondsOfPauseBetweenProperSegments = new(CommaPause: 0f) };

        // From here on, these will enqueue to the same `playback` instance, ensuring audio will not overlap.
        // Also, the callbacks are built-in inside `KokoroTTS`, so if you want them, you'd have to create your own.
        // Feel free to check out how it's done there, use it as an example, and tweak it to your liking!
        int[] tokens = Tokenizer.Tokenize(txt); // (1D array)
        List<int[]> ttokens = SegmentationSystem.SplitToSegments(tokens, new DefaultSegmentationConfig() { MaxFirstSegmentLength = 100 }); // (2D array)

        // Mixing voices is easy, and you can mix as many as you want together, even ones intended for different languages!
        // .. Note that doing that might result in potential artifacts on the spoken text when the mixed weight is high.
        var mixedVoice = KokoroVoiceManager.Mix([(sarah, Mix.a), (nicole, Mix.b), (KokoroVoiceManager.GetVoice("hf_beta"), Mix.c)]);
        // The library will try to infer the desired language, but if you wanna be sure the language does indeed match, you need to specify so.
        mixedVoice.Rename("Mixed Voice", KokoroLanguage.BritishEnglish, KokoroGender.Female);

        // You can inference with a 1D token array, waiting until the full inference completes before hearing back (up to 510 tokens).
        tts.EnqueueJob(KokoroJob.Create(tokens, mixedVoice, speed:1f, playback.Enqueue));

        // Or with 2D token array, processing them segment-by-segment, hearing back as quickly as possible (same with `tts.SpeakFast()`).
        // .. 2D arrays are not restricted by the 510 token limit, because none of the segments will surpass that.
        tts.EnqueueJob(KokoroJob.Create(ttokens, sarah, speed:1f, playback.Enqueue));

        // BTW, you can customize the pipeline in any way you want. Here's an example on how to add a 2 second pause.
        tts.EnqueueJob(KokoroJob.Create(Tokenizer.Tokenize("Pausing for 2 sec"), nicole, 1, playback.Enqueue)); // just for clarity to know when the pause occurs.
        tts.EnqueueJob(new KokoroPauseJob() { PauseTime = 2f, OnComplete = playback.Enqueue });

        // And can also manually load the voice from the path you want, as a float array...
        float[,,] michaelNPY = NumSharp.np.Load<float[,,]>(@"voices/am_michael.npy");
        tts.EnqueueJob(KokoroJob.Create(ttokens, michaelNPY, speed:0.8f, playback.Enqueue));

        // ...or as a KokoroVoice. Those types are fully interchangeable with each other.
        KokoroVoice onyxVoice = KokoroVoice.FromPath(@"voices/am_onyx.npy");
        tts.EnqueueJob(KokoroJob.Create(ttokens, onyxVoice, speed:1.2f, playback.Enqueue));
    }
}
/// <summary> Simple example of a "Pause" job that will cause the playback to wait for a fixed amount of seconds before playing back the next audio in queue. </summary>
/// <remarks> Of course, this pause could have happened mid-segment if we chose to, but for the sake of simplicity, it'll just delay the next speaker. </remarks>
internal sealed class KokoroPauseJob : KokoroJob {
    /// <summary> Length of the pause, in seconds. Negative and NaN values are treated as zero. </summary>
    public float PauseTime { get; init; }

    /// <summary> Callback that receives the zero-filled sample buffer; nothing is invoked when it is left unset. </summary>
    public Action<float[]> OnComplete { get; init; }

    /// <summary> Hands a zero-filled sample array worth 'PauseTime' seconds to 'OnComplete', then marks the job as completed. </summary>
    /// <param name="model"> Not used by this job. </param>
    public override void Progress(KokoroModel model) {
        // A negative length would make the array allocation throw before the job could be marked as completed,
        // .. so anything that is not a positive duration (negative values, NaN) collapses to an empty buffer.
        float seconds = PauseTime > 0f ? PauseTime : 0f;
        int sampleCount = (int) Math.Round(KokoroPlayback.waveFormat.SampleRate * seconds);

        OnComplete?.Invoke(new float[sampleCount]);
        State = KokoroJobState.Completed;
    }
}
}