Replies: 88 comments 824 replies
-
Hello @SWivid, do you think it is possible to fine-tune the pretrained model on a new language? I am planning to add another language: Italian + English (to avoid catastrophic forgetting). |
Beta Was this translation helpful? Give feedback.
-
audio files of maybe 3 - 12s duration, i'm not sure what's good, and their transcripts /your_dataset
|-- metadata.csv
|-- wavs/
| |-- audio_0001.wav
| |-- audio_0002.wav
| `-- ... metadata.csv contents:
python scripts/prepare_csv_wavs.py <path_to_your_dataset> <F5-TTS_repo_data_path>/<dataset_name>_pinyin example: python scripts/prepare_csv_wavs.py /my_pc/your_dataset /my_pc/F5-TTS/data/your_dataset_pinyin
Set the dataset name to the name of your dataset in the F5-TTS data folder: dataset_name = "your_dataset". Play around with these parameters and see what gives the best results. Set max samples to 2, or whatever you see fit: max_samples = 2. Also play around with the learning rate (I don't know which value is best): learning_rate = 5e-06. Change epochs and warmup to whatever you see fit for your dataset: epochs = 10 # use linear decay, thus epochs control the slope
num_warmup_updates = 20 # warmup steps adjust this to your dataset size, eg for 100 audio files and 2 max samples, maybe 500 last_per_steps = 500 # save last checkpoint per steps
hopefully we find good hyperparams for good finetuning results could put it doesn't handle other tokenizers, always assumes english dataset and import sys, os
sys.path.append(os.getcwd())
from pathlib import Path
import json
import shutil
import argparse
from tqdm import tqdm
from datasets.arrow_writer import ArrowWriter
from model.utils import (
convert_char_to_pinyin,
)
# Location of the pretrained model's vocab.txt, resolved relative to this
# script's grandparent directory. Fine-tuning runs copy this file into the
# output dataset directory instead of writing a dataset-derived vocabulary.
PRETRAINED_VOCAB_PATH = Path(__file__).parent.parent / "data/Emilia_ZH_EN_pinyin/vocab.txt"
def is_csv_wavs_format(input_dataset_dir):
    """Report whether a directory follows the ``metadata.csv`` + ``wavs/`` layout.

    Args:
        input_dataset_dir: Dataset root, as a string or path-like object.

    Returns:
        True only if ``metadata.csv`` is a regular file and ``wavs`` is a
        directory, both directly under the dataset root; False otherwise.
    """
    root = Path(input_dataset_dir)
    # is_file() / is_dir() already return False for missing paths, so no
    # separate exists() probe is required.
    if not (root / "metadata.csv").is_file():
        return False
    return (root / "wavs").is_dir()
def prepare_csv_wavs_dir(input_dir):
    """Collect training samples from a ``metadata.csv`` + ``wavs/`` dataset.

    Every (audio path, text) pair listed in ``metadata.csv`` is checked for
    an existing audio file; missing files are reported on stdout and skipped.
    Text is always passed through ``convert_char_to_pinyin`` with
    ``polyphone=True`` (the "pinyin" tokenizer is assumed).

    Args:
        input_dir: Dataset root containing ``metadata.csv`` and ``wavs/``.

    Returns:
        A tuple ``(samples, durations, vocab)`` where ``samples`` is a list
        of dicts with keys ``"audio_path"``, ``"text"`` and ``"duration"``,
        ``durations`` is the list of per-sample durations in the same order,
        and ``vocab`` is the set of all elements found in the converted texts.

    Raises:
        AssertionError: If ``input_dir`` is not in csv_wavs format.
    """
    assert is_csv_wavs_format(input_dir), f"not csv_wavs format: {input_dir}"
    csv_path = Path(input_dir) / "metadata.csv"

    samples = []
    lengths = []
    tokens_seen = set()
    for audio_path, transcript in read_audio_text_pairs(csv_path.as_posix()):
        if not Path(audio_path).exists():
            print(f"audio {audio_path} not found, skipping")
            continue

        seconds = get_audio_duration(audio_path)
        # assume tokenizer = "pinyin" ("pinyin" | "char")
        converted = convert_char_to_pinyin([transcript], polyphone=True)[0]

        samples.append({"audio_path": audio_path, "text": converted, "duration": seconds})
        lengths.append(seconds)
        # Each element of the converted text becomes a vocabulary entry.
        tokens_seen.update(converted)

    return samples, lengths, tokens_seen
def get_audio_duration(audio_path):
    """Return the playback duration of an audio file in seconds.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        Duration in seconds as a float: number of frames divided by the
        sample rate.
    """
    # Function-scope import: torchaudio is only loaded when a duration is
    # actually requested.
    import torchaudio

    audio, sample_rate = torchaudio.load(audio_path)
    # torchaudio.load returns a (channels, frames) tensor by default, so the
    # frame count alone determines the duration. Dividing by the channel
    # count as well would under-report multi-channel (e.g. stereo) files.
    return audio.shape[1] / sample_rate
def read_audio_text_pairs(csv_file_path):
    """Read ``(audio_path, text)`` pairs from a pipe-delimited metadata file.

    The first row is treated as a header and skipped. Rows with fewer than
    two columns are ignored; columns beyond the second are discarded. Audio
    paths are resolved relative to the directory containing the CSV file.

    Args:
        csv_file_path: Path to the ``|``-delimited, UTF-8 encoded CSV file.

    Returns:
        A list of ``(audio_file_path, text)`` tuples, where the path is a
        POSIX-style string and both fields are whitespace-stripped. An empty
        file yields an empty list.
    """
    import csv

    base_dir = Path(csv_file_path).parent
    audio_text_pairs = []
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter='|')
        # Skip the header row; the None default keeps a completely empty
        # file from raising StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) < 2:
                continue
            audio_file = row[0].strip()  # First column: audio file path
            text = row[1].strip()  # Second column: text
            audio_text_pairs.append(((base_dir / audio_file).as_posix(), text))
    return audio_text_pairs
def save_prepped_dataset(out_dir, result, duration_list, text_vocab_set, is_finetune):
    """Write the prepared dataset, durations and vocabulary to ``out_dir``.

    Creates ``out_dir`` if needed and writes three files into it:
    ``raw.arrow`` (one record per entry of ``result``), ``duration.json``
    (``{"duration": duration_list}``) and ``vocab.txt``.

    Args:
        out_dir: Output directory (string or path-like); created with
            parents if it does not exist.
        result: Iterable of sample records to write to ``raw.arrow``.
        duration_list: Per-sample durations, dumped to ``duration.json`` and
            summed (divided by 3600) for the printed total in hours.
        text_vocab_set: Vocabulary entries collected from the dataset;
            written sorted, one per line, when not fine-tuning.
        is_finetune: When true, ``vocab.txt`` is a copy of the pretrained
            vocabulary at ``PRETRAINED_VOCAB_PATH`` instead of the
            dataset-derived vocabulary.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)
    print(f"\nSaving to {out_dir} ...")

    # Records are streamed through ArrowWriter one at a time; building the
    # whole table with Dataset.from_dict was noted to run out of memory.
    raw_arrow_path = out_dir / "raw.arrow"
    with ArrowWriter(path=raw_arrow_path.as_posix(), writer_batch_size=1) as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    dur_json_path = out_dir / "duration.json"
    with open(dur_json_path.as_posix(), 'w', encoding='utf-8') as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # Optional when planning to fine-tune on de/fr etc. with the pinyin
    # tokenizer: extend text_vocab_set with chr(32..126) and chr(192..255)
    # before writing.
    voca_out_path = out_dir / "vocab.txt"
    if is_finetune:
        # Fine-tuning reuses the pretrained vocabulary verbatim.
        shutil.copy2(PRETRAINED_VOCAB_PATH.as_posix(), voca_out_path)
    else:
        # Written exactly once, with an explicit encoding so non-ASCII
        # tokens do not depend on the platform's default codec.
        with open(voca_out_path, "w", encoding="utf-8") as f:
            f.writelines(f"{vocab}\n" for vocab in sorted(text_vocab_set))

    dataset_name = out_dir.stem
    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
def prepare_and_save_set(inp_dir, out_dir, is_finetune: bool = True):
    """Prepare a csv_wavs dataset and save it in one step.

    Args:
        inp_dir: Input dataset directory, handed to ``prepare_csv_wavs_dir``.
        out_dir: Output directory, handed to ``save_prepped_dataset``.
        is_finetune: Forwarded to ``save_prepped_dataset``; when true, the
            pretrained vocab file must already exist.

    Raises:
        AssertionError: If ``is_finetune`` is true and
            ``PRETRAINED_VOCAB_PATH`` does not exist.
    """
    # Check for the pretrained vocab up front so a fine-tune run fails
    # before any dataset processing starts.
    assert not is_finetune or PRETRAINED_VOCAB_PATH.exists(), f"pretrained vocab.txt not found: {PRETRAINED_VOCAB_PATH}"
    samples, lengths, vocabulary = prepare_csv_wavs_dir(inp_dir)
    save_prepped_dataset(out_dir, samples, lengths, vocabulary, is_finetune)
def cli():
    """Parse command-line arguments and run dataset preparation.

    Takes two positional arguments, ``inp_dir`` and ``out_dir``, plus an
    optional ``--pretrain`` flag. Without the flag the run is treated as a
    fine-tune.
    """
    # finetune: python script.py /path/to/input_dir /path/to/output_dir
    # pretrain: python script.py /path/to/input_dir /path/to/output_dir --pretrain
    parser = argparse.ArgumentParser(description="Prepare and save dataset.")
    positional_args = (
        ("inp_dir", "Input directory containing the data."),
        ("out_dir", "Output directory to save the prepared data."),
    )
    for arg_name, help_text in positional_args:
        parser.add_argument(arg_name, type=str, help=help_text)
    parser.add_argument(
        "--pretrain",
        action="store_true",
        help="Enable for new pretrain, otherwise is a fine-tune",
    )

    opts = parser.parse_args()
    fine_tuning = not opts.pretrain
    prepare_and_save_set(opts.inp_dir, opts.out_dir, is_finetune=fine_tuning)
# Run the command-line entry point only when executed as a script, not on import.
if __name__ == "__main__":
    cli() |
Beta Was this translation helpful? Give feedback.
-
@kunibald413 Thank you for the script. I've already created something similar here. #62 (comment) can you update this part i think be nice to have like this
before
after
|
Beta Was this translation helpful? Give feedback.
-
The code @kunibald413 has provided works. However, when training it seems to initialize from a model with random weights. Can we initialize from the trained model weights instead? |
Beta Was this translation helpful? Give feedback.
-
Just started my Spanish finetune on the Facebook LibriSpeech dataset. It's a single 4090, so it will take a while. |
Beta Was this translation helpful? Give feedback.
-
I am using a Chinese dataset (about 33hr) to fine-tune my model. The loss is continuously decreasing, and the generated voice tone is getting closer to the target. However, as the training steps increase, the pronunciation of words is becoming increasingly unclear.
parm
|
Beta Was this translation helpful? Give feedback.
-
hi i just create gradio interface for easy user-friendly and accessible for beginners you can see here Features
|
Beta Was this translation helpful? Give feedback.
-
can confirm also training work i training 3 language indonesia-italian-english eng: https://vocaroo.com/1mGEFlRNgouY italian: https://voca.ro/1l6SYplhnSxz (Quattro imperdibili appuntamenti con l’Orchestra da Camera di Caserta e solisti internazionali.) indonesia: it even can do code switching (eng-indonesia): using same config as train ![]() |
Beta Was this translation helpful? Give feedback.
-
Hi, I was just wondering why you dont try to train on small data first instead of starting with a large dataset. For me, I trained for only 40 hours greek and with 20 hours (LibriTTS-R) focused on English, and it’s working fine speak very well. in half a day about with the 4090, and after about 100k to 150k steps, the model can speak greek and english in same time, very well and have great zero shot , try see if ths working for you i hope this help |
Beta Was this translation helpful? Give feedback.
-
here the setting i use
|
Beta Was this translation helpful? Give feedback.
-
Hi all, this is very important and might be confusing for some. You need to copy the original model If you start training without copying this model, it will train from scratch! I’ve created a script called finetune-cli.py that can automate this process. However, before running the script, you need to update all the settings accordingly. Please make sure to do this before you start. or you can run simple run simple change only the dataname my_speak in 3090 with about 60-80 hours dataset working well for 4090 like say @JarodMica working very well and also with very big dataset about the vocab i dont replace anything because suport all symbols in language i train make sure if suport all symbols in your language you want to train if miss symbols not working correct or another idea it's in case miss symbols , you can simple covert all symbols in english language , here how check the vocab in finetune_gradio.py make sure in data/project_name/ you have inside metadata.csv for all text thats why i make gradio_finetune.py to dont confuse for begin users i hope this help |
Beta Was this translation helpful? Give feedback.
-
@jpgallegoar I’m trying to train in Spanish as an experiment , let see this take some hours. I just hope the dataset I’m using is okay since I don’t speak Spanish. I’ll let you know soon. |
Beta Was this translation helpful? Give feedback.
-
Given a large dataset, how important is it that the transcription is 1-1 with the source audio? The reason I ask, most of my datasets are built using a Whisper model, and they often do some text compression and correct misspoken words or stutter. Is this TTS-architecture forgiving for those kinds of variations or inconsistencies in transcription, or should I consider using a more verbose Whisper model for creating this dataset? |
Beta Was this translation helpful? Give feedback.
-
Has anyone here tried finetuning the base model on a single speaker dataset? I tried finetuning with a 6 hr English dataset, but I don’t hear any difference after the training. |
Beta Was this translation helpful? Give feedback.
-
After much testing, I'm gonna have to give up on the spanish finetune for now. Anyway, if anyone wants it, here is the model: Link |
Beta Was this translation helpful? Give feedback.
-
i ran 50k steps on a 10 hours of professional Arabic audio recordeing segmented in 2 to 17 seconds chunks, with very good transcribe. but the results are no good. it feels like the chinese is over powering it. any suggestions? can it be the vocab.txt file? |
Beta Was this translation helpful? Give feedback.
-
Hello, has anyone experimented with creating a finetune on another language while adding english audios in the dataset to prevent catastrophic forgetting?
Thanks! |
Beta Was this translation helpful? Give feedback.
-
Hello everyone. I am trying to finetune a model that was previously already finetuned on German. The model which I want to finetune sounds good, but as soon as I try to finetune it on a small single speaker dataset (around 10 minutes) the output is basically just noise. It almost sounds like it is training from scratch and not finetuning the model. Has anybody have similar issues or am I doing something wrong? Or is a dataset of 10 minutes too small? With others this was plenty enough. |
Beta Was this translation helpful? Give feedback.
-
Hello. I am trying to do single-speaker finetuning in English. Is training for only 100 epochs too little? Thanks |
Beta Was this translation helpful? Give feedback.
-
alternatively if overtraining is a big problem, modify the learning rate to be slightly less so the model doesn't converge as quickly.
Get Outlook for Android<https://aka.ms/AAb9ysg>
…________________________________
From: Alykasym Begov ***@***.***>
Sent: Saturday, January 4, 2025 1:43:12 PM
To: SWivid/F5-TTS ***@***.***>
Cc: Robert Agee ***@***.***>; Mention ***@***.***>
Subject: Re: [SWivid/F5-TTS] Finetune practice (Discussion #57)
100 epochs seems too much because:
1. The base model already supports English language, so it doesn't need to learn the language but the voice, which doesn't need much training.
2. 6.5 hrs is a small dataset and training it for more than 20-30 epochs will easily overfit the model.
For starters, maybe you can just use Auto Settings from the WebUI, only changing the batch size to match your VRAM, and set the epoch count to something like 1000000, but manually stop around 15-20 epochs during training.
—
Reply to this email directly, view it on GitHub<#57 (reply in thread)>, or unsubscribe<https://github.com/notifications/unsubscribe-auth/AMWGMPIEKLQRKTUA36G74CL2JATUBAVCNFSM6AAAAABP4ADUBSVHI2DSMVQWIX3LMV43URDJONRXK43TNFXW4Q3PNVWWK3TUHMYTCNZTGQ4TIMQ>.
You are receiving this because you were mentioned.Message ID: ***@***.***>
|
Beta Was this translation helpful? Give feedback.
-
Has anyone had any luck training Arabic? Any ideas? I have more audio for the same speaker, more than 100 hours, if that would help. |
Beta Was this translation helpful? Give feedback.
-
Have you guys found a good solution for splitting long audio files into shorter ones? |
Beta Was this translation helpful? Give feedback.
-
Hi @jpgallegoar, Thanks for your Spanish model, it works great. Could you please share your experience?
Thanks |
Beta Was this translation helpful? Give feedback.
-
I've found F5 for Turkish https://huggingface.co/marduk-ra/F5-TTS-Turkish but it has some turkish character problem like ı,ü,ö,ç Unfortunately, I only have 6800xt. What do you suggest? What could be the problem? I'm considering hiring GPU and train in the cloud. |
Beta Was this translation helpful? Give feedback.
-
Has anyone tested training on fp32 vs fp16 vs bf16? Is there a noticeable quality dropoff? Which is the best? |
Beta Was this translation helpful? Give feedback.
-
I'm finetuning models with F5-TTS via Pinokio but i'm struggling to identify how to use the models i've trained Would a kind person possibly update the Gradio UI for Pinokio and add an ability to automatically find and be able to select any of the finetuned models that have been trained / created to make it easy please? 😊 |
Beta Was this translation helpful? Give feedback.
-
I'm training 200hrs for pt-br reaching 1M steps, using google colab, half with A100 and half with T4, but it still not perfect, it is actually doing a little inference, but have some misspellings, and for numbers, just does not work. Is it possible to finetune it with a new dataset with only numbers and misspells? will it destroy the previous trainings? |
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
-
Has anyone able to train this on Multiple-4090 GPU's setup (2 or more)?? I am getting this - #728 (comment) |
Beta Was this translation helpful? Give feedback.
-
Has anyone tried parallelized training with multi GPUs? I mean getting parallel performance, not only more VRAM. Is it even possible? |
Beta Was this translation helpful? Give feedback.
-
Full finetune is currently supported, lora or adapter not yet.
Set checkpoint_path to the pretrained model dir in test_train.py; model/trainer.py will load from there to resume. Reuse the vocab.txt under data/Emilia_ZH_EN_pinyin (Emilia_ZH_EN_pinyin <- tokenizer = "pinyin"; dataset_name = "Emilia_ZH_EN" in the test_train.py setting).
For the dataset, see model/dataset.py. You just need e.g. the audio path, the text (tokenized; leverage the convert_char_to_pinyin func in model/utils.py, see script/prepare_xxxx.py), and the duration of the audio in seconds.
grad_accumulation_steps could be used to simulate a large batch size. Also consider other settings, e.g. few warmup steps, 1e-4 lr, etc.
We didn't specifically experiment with finetuning, so if you get positive results, welcome to share :)
Some helpful issues, #16 #27
Welcome to share your successful results with finetuning, maybe also start a new tutorial doc helping others to get start with it.
Many Thanks !
Beta Was this translation helpful? Give feedback.
All reactions