diff --git a/Taskfile.yml b/Taskfile.yml index d5a9d6584..18ea676e5 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -49,6 +49,16 @@ tasks: poetry run python -W ignore utils/taskcluster_downloader.py --mode=model {{.CLI_ARGS}} + config-generator: + desc: Create a training config for a language pair + summary: | + Generates a training config YAML such as: configs/bs-en-spring-2024.yml + Example: `task config-generator -- en fi` + deps: [poetry-install-utils] + cmds: + - >- + PYTHONPATH=$(pwd) poetry run python -W ignore utils/config_generator.py {{.CLI_ARGS}} + opuscleaner: desc: Run the opuscleaner tool. deps: [poetry-install-opuscleaner] diff --git a/configs/bs-en-spring-2024.yml b/configs/bs-en-spring-2024.yml new file mode 100644 index 000000000..b3b501500 --- /dev/null +++ b/configs/bs-en-spring-2024.yml @@ -0,0 +1,118 @@ +# The initial configuration was generated using: +# task config-generator -- bs en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: bs + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-bos + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 94,895,603 sentences + # + # Skipped datasets: + # - opus_MultiHPLT/v1.1 - ignored datasets (240,013 sentences) + # - opus_Ubuntu/v14.10 - not enough data 
(0 sentences) + # - mtdata_ELRC-wikipedia_health-1-bos-eng - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-bos-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-bos - duplicate with opus + # - mtdata_Statmt-ccaligned-1-bos_BA-eng - duplicate with opus + train: + - opus_NLLB/v1 # 79,334,034 sentences + - opus_OpenSubtitles/v2018 # 14,041,160 sentences + - opus_XLEnt/v1.2 # 266,696 sentences + - opus_Tanzil/v1 # 246,913 sentences + - opus_HPLT/v1.1 # 240,015 sentences + - opus_WikiMatrix/v1 # 210,691 sentences + - opus_CCAligned/v1 # 192,099 sentences + - opus_GNOME/v1 # 164,960 sentences + - opus_SETIMES/v2 # 138,387 sentences + - opus_wikimedia/v20230407 # 28,167 sentences + - opus_QED/v2.0a # 12,541 sentences + - opus_TED2020/v1 # 11,638 sentences + - opus_NeuLab-TedTalks/v1 # 6,136 sentences + - opus_EUbookshop/v2 # 558 sentences + - opus_Tatoeba/v2023-04-12 # 515 sentences + - opus_tldr-pages/v2023-08-29 # 479 sentences + - opus_ELRC-3047-wikipedia_health/v1 # 205 sentences + - opus_ELRC-wikipedia_health/v1 # 205 sentences + - opus_ELRC_2922/v1 # 204 sentences + - mtdata_Neulab-tedtalks_test-1-eng-bos # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # ~8,982,298 sentences + mono-src: + - news-crawl_news.2018 # ~8,849 sentences (1.0M) + - news-crawl_news.2019 # ~920,353 sentences (104M) + - news-crawl_news.2020 # ~1,734,513 sentences (196M) + - news-crawl_news.2021 # ~2,079,646 sentences (235M) + - news-crawl_news.2022 # ~2,132,743 sentences (241M) + - news-crawl_news.2023 # ~2,106,194 sentences (238M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - 
news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/cs-en-spring-2024.yml b/configs/cs-en-spring-2024.yml new file mode 100644 index 000000000..960dfe671 --- /dev/null +++ b/configs/cs-en-spring-2024.yml @@ -0,0 +1,234 @@ +# The initial configuration was generated using: +# task config-generator -- cs en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: cs + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - 
mtdata_Lindat-khresmoi_summary_dev-2-ces-eng + - mtdata_Neulab-tedtalks_dev-1-eng-ces + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt19 + - sacrebleu_aug-mix_wmt18/test-ts + - sacrebleu_aug-mix_wmt16 + - sacrebleu_aug-mix_wmt14 + - sacrebleu_aug-mix_wmt13 + - sacrebleu_aug-mix_wmt11 + - sacrebleu_aug-mix_wmt09 + - sacrebleu_aug-mix_wmt08/nc + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt20 + - sacrebleu_wmt18 + - sacrebleu_wmt17 + - sacrebleu_wmt15 + - sacrebleu_wmt14/full + - sacrebleu_wmt12 + - sacrebleu_wmt10 + - sacrebleu_wmt08 + - sacrebleu_multi30k/2016 + + # The training data contains: + # 213,550,488 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (56,307,029 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_WikiTitles/v3 - ignored datasets (0 sentences) + # - mtdata_ELRC-euipo_2017-1-ces-eng - duplicate with opus + # - mtdata_ELRC-emea-1-ces-eng - duplicate with opus + # - mtdata_ELRC-vaccination-1-ces-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-ces-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-ces-eng - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-ces-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ces - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-ces-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-ces-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-3-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ces - 
duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ces - duplicate with opus + # - mtdata_Statmt-europarl-9-ces-eng - duplicate with opus + # - mtdata_Statmt-europarl-7-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-14-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-15-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-16-ces-eng - duplicate with opus + # - mtdata_Statmt-europarl-10-ces-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-ces_CZ-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-ces-eng - duplicate with opus + train: + - opus_NLLB/v1 # 56,307,029 sentences + - opus_ParaCrawl/v9 # 50,633,505 sentences + - opus_OpenSubtitles/v2018 # 42,346,436 sentences + - opus_StanfordNLP-NMT/v1.0 # 15,793,121 sentences + - opus_ELRC-EMEA/v1 # 12,891,707 sentences + - opus_CCAligned/v1 # 12,730,121 sentences + - opus_DGT/v2019 # 5,207,753 sentences + - opus_LinguaTools-WikiTitles/v2014 # 4,813,030 sentences + - opus_XLEnt/v1.2 # 3,894,132 sentences + - opus_JRC-Acquis/v3.0 # 1,273,411 sentences + - opus_ELRC-5067-SciPar/v1 # 1,064,385 sentences + - opus_EMEA/v3 # 1,053,385 sentences + - opus_ELRC-2713-EMEA/v1 # 779,083 sentences + - opus_ELRC_2682/v1 # 779,082 sentences + - opus_Europarl/v8 # 647,095 sentences + - opus_WikiMatrix/v1 # 519,195 sentences + - opus_EUbookshop/v2 # 455,472 sentences + - opus_QED/v2.0a # 441,508 sentences + - opus_ELITR-ECA/v1 # 295,788 sentences + - opus_Tanzil/v1 # 233,399 sentences + - opus_News-Commentary/v16 # 218,509 sentences + - opus_TED2020/v1 # 170,611 sentences + - opus_wikimedia/v20230407 # 146,717 sentences + - opus_KDE4/v2 # 134,071 sentences + - opus_ELRC-presscorner_covid/v1 # 129,652 sentences + - opus_NeuLab-TedTalks/v1 # 111,107 sentences + - opus_ECB/v1 # 63,716 sentences + - opus_bible-uedin/v1 # 62,151 sentences + - 
opus_WMT-News/v2019 # 44,859 sentences + - opus_Tatoeba/v2023-04-12 # 34,628 sentences + - opus_PHP/v1 # 32,983 sentences + - opus_Wikipedia/v1.0 # 27,723 sentences + - opus_ELRC-3564-EUR_LEX_covid/v1 # 22,637 sentences + - opus_ELRC-EUR_LEX/v1 # 22,637 sentences + - opus_GlobalVoices/v2018q4 # 18,876 sentences + - opus_ELRC-427-Electronic_Exchange_/v1 # 17,357 sentences + - opus_ELRC-2012-EUIPO_2017/v1 # 15,945 sentences + - opus_ELRC-EUIPO_2017/v1 # 15,945 sentences + - opus_ELRC-antibiotic/v1 # 15,678 sentences + - opus_ELRC-2874-EU_publications_medi/v1 # 13,161 sentences + - opus_ELRC-EU_publications/v1 # 13,161 sentences + - opus_ELRC-EUROPARL_covid/v1 # 11,142 sentences + - opus_EUconst/v1 # 9,953 sentences + - opus_ELRC-3605-presscorner_covid/v1 # 6,229 sentences + - opus_ELRC-2406-Czech_Supreme_Audit/v1 # 4,771 sentences + - opus_ELRC_3382/v1 # 3,722 sentences + - opus_TildeMODEL/v2018 # 3,100 sentences + - opus_ELRC-2405-Czech_Supreme_Audit/v1 # 2,868 sentences + - opus_ECDC/v2016-03-16 # 2,559 sentences + - opus_ELRC-3463-EC_EUROPA_covid/v1 # 2,386 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,386 sentences + - opus_ELRC-40-Information_Portal_C/v1 # 1,828 sentences + - opus_ELRC-Information_Portal/v1 # 1,828 sentences + - opus_ELRC-3062-wikipedia_health/v1 # 1,146 sentences + - opus_ELRC-wikipedia_health/v1 # 1,146 sentences + - opus_ELRC_2922/v1 # 1,145 sentences + - opus_ELRC-3201-antibiotic/v1 # 965 sentences + - opus_ELRC-3292-EUROPARL_covid/v1 # 557 sentences + - opus_ELRC-2749-vaccination/v1 # 520 sentences + - opus_ELRC-vaccination/v1 # 520 sentences + - opus_ELRC-2404-Czech_Supreme_Audit/v1 # 403 sentences + - opus_ELRC_2923/v1 # 319 sentences + - opus_ELRC-2407-Czech_Supreme_Audit/v1 # 234 sentences + - mtdata_ELRC-information_portal_czech_president_czech_castle-1-ces-eng + - mtdata_ELRC-electronic_exchange_social_security_information-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2018_reports-1-ces-eng + - 
mtdata_ELRC-czech_supreme_audit_office_2008_2017_reports-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2003_2017_press_releases-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2018_press_releases-1-ces-eng + - mtdata_ELRC-eu_publications_medical_v2-1-ces-eng + - mtdata_EU-eac_forms-1-ces-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-ces-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-ces-eng # ~533,693 sentences (60.3 MB) + - mtdata_Lindat-khresmoi_summary_test-2-ces-eng # ~11,808 sentences (1.3 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ces # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-commoncrawl_wmt13-1-ces-eng # ~8,126,649 sentences (918.3 MB) + - mtdata_Statmt-europarl_wmt13-7-ces-eng # ~5,819,755 sentences (657.6 MB) + - mtdata_Statmt-news_commentary_wmt18-13-ces-eng # ~1,001,393 sentences (113.2 MB) + - mtdata_Statmt-wiki_titles-1-ces-eng # ~45,242 sentences (5.1 MB) + - mtdata_Statmt-wiki_titles-2-ces-eng # ~47,995 sentences (5.4 MB) + - mtdata_Tilde-eesc-2017-ces-eng # ~1,157,475 sentences (130.8 MB) + - mtdata_Tilde-ema-2016-ces-eng # ~244,524 sentences (27.6 MB) + - mtdata_Tilde-rapid-2019-ces-eng # ~255,063 sentences (28.8 MB) + + # The monolingual data contains: + # ~55,777,868 sentences + mono-src: + - news-crawl_news.2007 # ~34,513 sentences (3.9M) + - news-crawl_news.2008 # ~1,840,707 sentences (208M) + - news-crawl_news.2009 # ~2,079,646 sentences (235M) + - news-crawl_news.2010 # ~1,247,787 sentences (141M) + - news-crawl_news.2011 # ~3,185,840 sentences (360M) + - news-crawl_news.2012 # ~2,964,601 sentences (335M) + - news-crawl_news.2013 # ~3,389,380 sentences (383M) + - news-crawl_news.2014 # ~2,973,451 sentences (336M) + - news-crawl_news.2015 # ~3,026,548 sentences (342M) + - news-crawl_news.2016 # ~2,159,292 sentences (244M) + - news-crawl_news.2017 # ~2,849,557 sentences (322M) + - news-crawl_news.2018 # ~2,637,168 sentences (298M) + - news-crawl_news.2019 # ~5,513,274 sentences (623M) + - 
news-crawl_news.2020 # ~7,451,327 sentences (842M) + - news-crawl_news.2021 # ~5,265,486 sentences (595M) + - news-crawl_news.2022 # ~3,884,955 sentences (439M) + - news-crawl_news.2023 # ~5,274,336 sentences (596M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/da-en-spring-2024.yml b/configs/da-en-spring-2024.yml new file mode 100644 index 000000000..336f0c593 --- /dev/null +++ b/configs/da-en-spring-2024.yml @@ -0,0 +1,235 @@ +# The initial configuration was generated using: +# task config-generator -- da en --name spring-2024 +# +# The documentation 
for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: da + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-dan + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 161,668,955 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (52,273,664 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-www.norden.org-1-dan-eng - duplicate with opus + # - mtdata_ELRC-mst.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-ufm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.dst.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.dma.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.geus.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-naturstyrelsen.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.trm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-um.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.aarhus2017.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.odense.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.visitvejle.com-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.visitdenmark.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-slks.dk-1-dan-eng - duplicate with opus 
+ # - mtdata_ELRC-natmus.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-gallery_denmark-1-dan-eng - duplicate with opus + # - mtdata_ELRC-royal_danish_library-1-dan-eng - duplicate with opus + # - mtdata_ELRC-danish_fsa-1-dan-eng - duplicate with opus + # - mtdata_ELRC-uk.fm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-spillemyndigheden.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-rigsrevisionen.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-dan-eng - duplicate with opus + # - mtdata_ELRC-emea-1-dan-eng - duplicate with opus + # - mtdata_ELRC-vaccination-1-dan-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-dan-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-dan-eng - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-dan-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-dan - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-dan-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-dan-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-dan - duplicate with opus + # - mtdata_Statmt-europarl-7-dan-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-dan_DK-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-dan-eng - duplicate with opus + train: + - opus_NLLB/v1 # 52,273,664 sentences + - opus_ParaCrawl/v9 # 34,207,840 sentences + - opus_OpenSubtitles/v2018 # 14,474,569 sentences + - opus_ELRC-4248-NTEU_TierA/v1 # 13,756,130 sentences + - 
opus_ELRC-EMEA/v1 # 12,556,334 sentences + - opus_CCAligned/v1 # 10,738,610 sentences + - opus_DGT/v2019 # 5,152,323 sentences + - opus_EUbookshop/v2 # 4,980,755 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,084,707 sentences + - opus_XLEnt/v1.2 # 3,042,401 sentences + - opus_Europarl/v8 # 1,991,647 sentences + - opus_EMEA/v3 # 1,093,780 sentences + - opus_JRC-Acquis/v3.0 # 808,916 sentences + - opus_ELRC-2716-EMEA/v1 # 775,676 sentences + - opus_ELRC_2682/v1 # 775,675 sentences + - opus_WikiMatrix/v1 # 436,052 sentences + - opus_KDE4/v2 # 194,410 sentences + - opus_QED/v2.0a # 175,384 sentences + - opus_ELRC-presscorner_covid/v1 # 145,352 sentences + - opus_ECB/v1 # 138,154 sentences + - opus_ELITR-ECA/v1 # 135,384 sentences + - opus_TED2020/v1 # 72,113 sentences + - opus_wikimedia/v20230407 # 69,969 sentences + - opus_bible-uedin/v1 # 62,113 sentences + - opus_NeuLab-TedTalks/v1 # 48,462 sentences + - opus_ELRC-847-mst.dk/v1 # 36,750 sentences + - opus_ELRC-730-www.norden.org/v1 # 36,626 sentences + - opus_ELRC-www.norden.org/v1 # 36,626 sentences + - opus_Tatoeba/v2023-04-12 # 32,790 sentences + - opus_ELRC-850-www.dst.dk/v1 # 22,817 sentences + - opus_ELRC-848-laegemiddelstyrelsen/v1 # 22,700 sentences + - opus_ELRC-3567-EUR_LEX_covid/v1 # 21,239 sentences + - opus_ELRC-EUR_LEX/v1 # 21,239 sentences + - opus_ELRC-2013-EUIPO_2017/v1 # 17,269 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,269 sentences + - opus_ELRA-W0214/v1 # 16,243 sentences + - opus_ELRC-antibiotic/v1 # 13,310 sentences + - opus_ELRC-2877-EU_publications_medi/v1 # 13,243 sentences + - opus_ELRC-EU_publications/v1 # 13,243 sentences + - opus_ELRC-851-www.vikingeskibsmuse/v1 # 12,404 sentences + - opus_ELRC-EUROPARL_covid/v1 # 11,723 sentences + - opus_ELRC-849-ufm.dk/v1 # 10,054 sentences + - opus_EUconst/v1 # 10,032 sentences + - opus_ELRC-1062-rigsrevisionen.dk/v1 # 8,234 sentences + - opus_GlobalVoices/v2018q4 # 7,311 sentences + - opus_ELRC-904-uk.fm.dk/v1 # 6,949 sentences + - 
opus_ELRC-3608-presscorner_covid/v1 # 6,262 sentences + - opus_ELRC-892-slks.dk/v1 # 4,956 sentences + - opus_ELRC-885-www.aarhus2017.dk/v1 # 4,709 sentences + - opus_TildeMODEL/v2018 # 4,420 sentences + - opus_ELRC-397-Danish_Higher_Educat/v1 # 4,395 sentences + - opus_ELRA-W0157/v1 # 4,394 sentences + - opus_ELRC-439-Danish_Higher_Educat/v1 # 4,149 sentences + - opus_ELRC-893-natmus.dk/v1 # 3,950 sentences + - opus_ELRC-394-Danish_Higher_Educat/v1 # 3,719 sentences + - opus_ELRC_3382/v1 # 3,406 sentences + - opus_ELRC-905-spillemyndigheden.dk/v1 # 3,355 sentences + - opus_ELRC-856-naturstyrelsen.dk/v1 # 3,118 sentences + - opus_ELRC-859-um.dk/v1 # 3,055 sentences + - opus_ELRC-857-www.trm.dk/v1 # 3,015 sentences + - opus_ELRC-852-www.dma.dk/v1 # 3,010 sentences + - opus_ELRC-3466-EC_EUROPA_covid/v1 # 2,804 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,804 sentences + - opus_ECDC/v2016-03-16 # 2,578 sentences + - opus_ELRC-897-Denmark_Space_Instit/v1 # 1,940 sentences + - opus_ELRC-899-Danish_FSA/v1 # 1,931 sentences + - opus_ELRC-426-Danish_Higher_Educat/v1 # 1,886 sentences + - opus_ELRC-854-www.geus.dk/v1 # 1,655 sentences + - opus_ELRC-891-www.visitdenmark.dk/v1 # 1,603 sentences + - opus_ELRC-895-Royal_Danish_Library/v1 # 1,547 sentences + - opus_ELRC-889-www.visitvejle.com/v1 # 1,472 sentences + - opus_ELRC-886-www.odense.dk/v1 # 1,427 sentences + - opus_ELRC-901-Denmark_Prosecution_/v1 # 1,163 sentences + - opus_ELRC-900-Danish_Working_Envir/v1 # 1,138 sentences + - opus_ELRC-890-Holstebro_Kunstmuseu/v1 # 1,023 sentences + - opus_ELRC-3204-antibiotic/v1 # 801 sentences + - opus_ELRC-894-Gallery_Denmark/v1 # 769 sentences + - opus_ELRC-3295-EUROPARL_covid/v1 # 634 sentences + - opus_ELRC-3066-wikipedia_health/v1 # 523 sentences + - opus_ELRC-wikipedia_health/v1 # 523 sentences + - opus_ELRC_2922/v1 # 522 sentences + - opus_tldr-pages/v2023-08-29 # 495 sentences + - opus_ELRC-2754-vaccination/v1 # 462 sentences + - opus_ELRC-vaccination/v1 # 462 sentences + - 
opus_ELRC_2923/v1 # 389 sentences + - mtdata_ELRC-danish_higher_education_science_3-1-dan-eng + - mtdata_ELRC-danish_higher_education_science_2-1-dan-eng + - mtdata_ELRC-danish_higher_education_science-1-dan-eng + - mtdata_ELRC-danish_higher_education_science_4-1-dan-eng + - mtdata_ELRC-laegemiddelstyrelsen.dk-1-dan-eng + - mtdata_ELRC-www.vikingeskibsmuseet.dk-1-dan-eng + - mtdata_ELRC-holstebro_kunstmuseum-1-dan-eng + - mtdata_ELRC-denmark_space_institute-1-dan-eng + - mtdata_ELRC-danish_working_environment_authority-1-dan-eng + - mtdata_ELRC-denmark_prosecution_service-1-dan-eng + - mtdata_ELRC-eu_publications_medical_v2-1-dan-eng + - mtdata_ELRC-nteu_tierb-1-dan-eng + - mtdata_EU-eac_forms-1-dan-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-dan-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-dan-eng # ~1,040,518 sentences (117.6 MB) + - mtdata_Neulab-tedtalks_test-1-eng-dan # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-dan-eng # ~1,936,973 sentences (218.9 MB) + - mtdata_Tilde-ema-2016-dan-eng # ~215,232 sentences (24.3 MB) + - mtdata_Tilde-rapid-2016-dan-eng # ~451,067 sentences (51.0 MB) + + # The monolingual data contains: + # ~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 
sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/el-en-spring-2024.yml b/configs/el-en-spring-2024.yml new file mode 100644 index 000000000..7a416e140 --- /dev/null +++ b/configs/el-en-spring-2024.yml @@ -0,0 +1,260 @@ +# The initial configuration was generated using: +# task config-generator -- el en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: el + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ell + - flores_aug-mix_dev + - sacrebleu_aug-mix_mtedx/test + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_mtedx/valid + + # The training data contains: + # 159,976,981 sentences + # + # Skipped 
datasets: + # - opus_CCMatrix/v1 - ignored datasets (49,262,631 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (3,583,975 sentences) + # - opus_ELRC-3857-State_Budget_Executi/v1 - not enough data (180 sentences) + # - opus_ELRC-745-Convention_against_T/v1 - not enough data (165 sentences) + # - opus_ELRA-W0309/v1 - not enough data (164 sentences) + # - opus_ELRC-656-Macroeconomic_Develo/v1 - not enough data (151 sentences) + # - opus_ELRC-496-Convention_transfer_/v1 - not enough data (121 sentences) + # - opus_ELRA-W0196/v1 - not enough data (120 sentences) + # - opus_ELRA-W0207/v1 - not enough data (101 sentences) + # - opus_ELRA-W0308/v1 - not enough data (87 sentences) + # - opus_ELRC-662-Expression_interest/v1 - not enough data (85 sentences) + # - opus_ELRA-W0209/v1 - not enough data (84 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (65 sentences) + # - opus_ELRC-658-Methodological_Recon/v1 - not enough data (45 sentences) + # - opus_ELRA-W0208/v1 - not enough data (44 sentences) + # - opus_ELRC-1022-COMPULSORY_EXPROPRIA/v1 - not enough data (38 sentences) + # - opus_ELRC-3856-PRESS/v1 - not enough data (35 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (30 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (29 sentences) + # - opus_ELRC-1021-Commitment_Property_/v1 - not enough data (23 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (22 sentences) + # - opus_ELRA-W0301/v1 - not enough data (16 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-greek_administration-1-ell-eng - duplicate with opus + # - mtdata_ELRC-greek_law-1-ell-eng - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-ell-eng - duplicate with opus + # - mtdata_ELRC-press_releases_pio-1-ell-eng - duplicate with opus + # - mtdata_ELRC-constitution_greece-1-ell-eng - duplicate with opus + # - mtdata_ELRC-emea-1-ell-eng - duplicate with opus + # - 
mtdata_ELRC-vaccination-1-ell-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-ell-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-ell-eng - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-ell-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ell - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-ell-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-ell-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ell - duplicate with opus + # - mtdata_Statmt-europarl-7-ell-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-ell_GR-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-ell-eng - duplicate with opus + train: + - opus_NLLB/v1 # 49,262,631 sentences + - opus_OpenSubtitles/v2018 # 40,492,942 sentences + - opus_ParaCrawl/v9 # 21,402,471 sentences + - opus_ELRC-EMEA/v1 # 13,691,653 sentences + - opus_CCAligned/v1 # 8,878,509 sentences + - opus_DGT/v2019 # 5,099,790 sentences + - opus_EUbookshop/v2 # 4,022,952 sentences + - opus_MaCoCu/v2 # 3,583,978 sentences + - opus_XLEnt/v1.2 # 2,949,219 sentences + - opus_LinguaTools-WikiTitles/v2014 # 1,850,804 sentences + - opus_Europarl/v8 # 1,292,180 sentences + - opus_EMEA/v3 # 1,073,225 sentences + - opus_ELRC-2711-EMEA/v1 # 781,988 sentences + - opus_ELRC_2682/v1 # 781,987 sentences + - opus_ELRC-5067-SciPar/v1 # 742,987 sentences + - opus_WikiMatrix/v1 # 620,802 sentences + - opus_wikimedia/v20230407 # 589,733 sentences + - opus_QED/v2.0a # 550,438 sentences + - opus_ELITR-ECA/v1 # 381,561 
sentences + - opus_TED2020/v1 # 269,407 sentences + - opus_SETIMES/v2 # 227,168 sentences + - opus_NeuLab-TedTalks/v1 # 153,493 sentences + - opus_ELRC-presscorner_covid/v1 # 152,003 sentences + - opus_KDE4/v2 # 144,894 sentences + - opus_GlobalVoices/v2018q4 # 120,421 sentences + - opus_ELRC-Press_Releases/v1 # 117,171 sentences + - opus_Wikipedia/v1.0 # 104,076 sentences + - opus_ECB/v1 # 102,986 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_ELRA-W0202/v1 # 61,967 sentences + - opus_Tatoeba/v2023-04-12 # 25,995 sentences + - opus_ELRC-3562-EUR_LEX_covid/v1 # 23,024 sentences + - opus_ELRC-EUR_LEX/v1 # 23,024 sentences + - opus_ELRC-842-Quarterly_Reports_Pa/v1 # 21,248 sentences + - opus_ELRA-W0243/v1 # 21,247 sentences + - opus_ELRC-1175-EUIPO_2017/v1 # 20,027 sentences + - opus_ELRC-EUIPO_2017/v1 # 20,027 sentences + - opus_ELRC-EUROPARL_covid/v1 # 19,587 sentences + - opus_JRC-Acquis/v3.0 # 17,717 sentences + - opus_GNOME/v1 # 17,389 sentences + - opus_ELRC-843-collection_reports_G/v1 # 16,286 sentences + - opus_ELRA-W0244/v1 # 16,285 sentences + - opus_ELRC-antibiotic/v1 # 16,083 sentences + - opus_ELRC-2872-EU_publications_medi/v1 # 13,092 sentences + - opus_ELRC-EU_publications/v1 # 13,092 sentences + - opus_ELRC-649-Greek_administration/v1 # 12,510 sentences + - opus_ELRA-W0203/v1 # 12,509 sentences + - opus_EUconst/v1 # 9,990 sentences + - opus_SPC/v1 # 8,181 sentences + - opus_ELRC-3603-presscorner_covid/v1 # 6,635 sentences + - opus_ELRC-936-Prime_Minister_Helle/v1 # 5,323 sentences + - opus_ELRA-W0272/v1 # 5,322 sentences + - opus_TildeMODEL/v2018 # 5,238 sentences + - opus_ELRC-1787-Press_Releases_PIO/v1 # 5,163 sentences + - opus_ELRC-PIO_Publication/v1 # 3,949 sentences + - opus_ELRC-1984-Hellenic_Gaming_Comm/v1 # 3,875 sentences + - opus_ELRC_3382/v1 # 3,818 sentences + - opus_ELRC-932-Hellenic_Foreign_Aff/v1 # 3,471 sentences + - opus_ELRA-W0271/v1 # 3,470 sentences + - opus_ELRC-1067-PIO_Publication_Wind/v1 # 2,629 sentences + - 
opus_ECDC/v2016-03-16 # 2,531 sentences + - opus_ELRC-3461-EC_EUROPA_covid/v1 # 2,234 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,234 sentences + - opus_ELRC-1981-EQF_Referencing_Repo/v1 # 2,100 sentences + - opus_ELRC-652-Greek_law/v1 # 1,980 sentences + - opus_ELRA-W0205/v1 # 1,979 sentences + - opus_ELRC-3058-wikipedia_health/v1 # 1,871 sentences + - opus_ELRC-wikipedia_health/v1 # 1,871 sentences + - opus_ELRC-1020-collection_about_Cyp/v1 # 1,870 sentences + - opus_ELRC_2922/v1 # 1,870 sentences + - opus_ELRC-1070-Press_Information_Cy/v1 # 1,863 sentences + - opus_ELRC-1970-governmental_about_M/v1 # 1,435 sentences + - opus_ELRC-1065-PIO_Publication_Cypr/v1 # 1,321 sentences + - opus_Books/v1 # 1,285 sentences + - opus_ELRC-419-Greek_legislation_An/v1 # 1,070 sentences + - opus_ELRA-W0164/v1 # 1,069 sentences + - opus_ELRC-1986-Constitution_Greece/v1 # 1,022 sentences + - opus_ELRC-3199-antibiotic/v1 # 990 sentences + - opus_ELRC-5160-Press_Releases_PIO/v1 # 955 sentences + - opus_ELRC-3290-EUROPARL_covid/v1 # 692 sentences + - opus_ELRC-2735-vaccination/v1 # 519 sentences + - opus_ELRC-vaccination/v1 # 519 sentences + - opus_ELRC-663-Memorandum_a_ESM/v1 # 434 sentences + - opus_ELRA-W0210/v1 # 433 sentences + - opus_ELRC_2923/v1 # 420 sentences + - opus_ELRC-646-International_Judici/v1 # 289 sentences + - opus_ELRA-W0307/v1 # 288 sentences + - mtdata_ELRC-rights_arrested-1-ell-eng + - mtdata_ELRC-swedish_social_security-1-ell-eng + - mtdata_ELRC-greek_legislation_anticorruption_plan-1-ell-eng + - mtdata_ELRC-convention_transfer_sentenced_persons-1-ell-eng + - mtdata_ELRC-international_judicial_cooperation_civil_matters-1-ell-eng + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-ell-eng + - mtdata_ELRC-macroeconomic_developments-1-ell-eng + - mtdata_ELRC-methodological_reconciliation-1-ell-eng + - mtdata_ELRC-expression_interest-1-ell-eng + - mtdata_ELRC-memorandum_a_esm_programme-1-ell-eng + - 
mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng + - mtdata_ELRC-quarterly_reports_parliamentary_budget-1-ell-eng + - mtdata_ELRC-collection_reports_greek_power_corporation-1-ell-eng + - mtdata_ELRC-hellenic_foreign_affairs_announcements-1-ell-eng + - mtdata_ELRC-prime_minister_hellenic-1-ell-eng + - mtdata_ELRC-collection_about_cyprus_problem-1-ell-eng + - mtdata_ELRC-commitment_property_open-1-ell-eng + - mtdata_ELRC-compulsory_expropriation_process_greece-1-ell-eng + - mtdata_ELRC-pio_publication_cyprus_has_always_been_europe_2017-1-ell-eng + - mtdata_ELRC-pio_publication_window_cyprus-1-ell-eng + - mtdata_ELRC-press_information_cyprus-1-ell-eng + - mtdata_ELRC-governmental_about_migration_policy-1-ell-eng + - mtdata_ELRC-eqf_referencing_report-1-ell-eng + - mtdata_ELRC-hellenic_gaming_commission-1-ell-eng + - mtdata_ELRC-eu_publications_medical_v2-1-ell-eng + - mtdata_EU-eac_forms-1-ell-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-ell-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-ell-eng # ~1,178,828 sentences (133.2 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ell # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-ell-eng # ~2,393,754 sentences (270.5 MB) + - mtdata_Tilde-ema-2016-ell-eng # ~244,548 sentences (27.6 MB) + - mtdata_Tilde-rapid-2016-ell-eng # ~586,564 sentences (66.3 MB) + + # The monolingual data contains: + # ~27,097,343 sentences + mono-src: + - news-crawl_news.2015 # ~1,115,044 sentences (126M) + - news-crawl_news.2019 # ~2,398,230 sentences (271M) + - news-crawl_news.2020 # ~5,327,433 sentences (602M) + - news-crawl_news.2021 # ~5,238,938 sentences (592M) + - news-crawl_news.2022 # ~6,725,663 sentences (760M) + - news-crawl_news.2023 # ~6,292,035 sentences (711M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences 
(609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-bs-spring-2024.yml b/configs/en-bs-spring-2024.yml new file mode 100644 index 000000000..f9f99bde7 --- /dev/null +++ b/configs/en-bs-spring-2024.yml @@ -0,0 +1,118 @@ +# The initial configuration was generated using: +# task config-generator -- en bs --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: bs + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 
500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-bos + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 94,895,603 sentences + # + # Skipped datasets: + # - opus_MultiHPLT/v1.1 - ignored datasets (240,013 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-bos-eng - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-bos-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-bos - duplicate with opus + # - mtdata_Statmt-ccaligned-1-bos_BA-eng - duplicate with opus + train: + - opus_NLLB/v1 # 79,334,034 sentences + - opus_OpenSubtitles/v2018 # 14,041,160 sentences + - opus_XLEnt/v1.2 # 266,696 sentences + - opus_Tanzil/v1 # 246,913 sentences + - opus_HPLT/v1.1 # 240,015 sentences + - opus_WikiMatrix/v1 # 210,691 sentences + - opus_CCAligned/v1 # 192,099 sentences + - opus_GNOME/v1 # 164,960 sentences + - opus_SETIMES/v2 # 138,387 sentences + - opus_wikimedia/v20230407 # 28,167 sentences + - opus_QED/v2.0a # 12,541 sentences + - opus_TED2020/v1 # 11,638 sentences + - opus_NeuLab-TedTalks/v1 # 6,136 sentences + - opus_EUbookshop/v2 # 558 sentences + - opus_Tatoeba/v2023-04-12 # 515 sentences + - opus_tldr-pages/v2023-08-29 # 479 sentences + - opus_ELRC-3047-wikipedia_health/v1 # 205 sentences + - opus_ELRC-wikipedia_health/v1 # 205 sentences + - opus_ELRC_2922/v1 # 204 sentences + - mtdata_Neulab-tedtalks_test-1-eng-bos # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 
# ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~8,982,298 sentences + mono-trg: + - news-crawl_news.2018 # ~8,849 sentences (1.0M) + - news-crawl_news.2019 # ~920,353 sentences (104M) + - news-crawl_news.2020 # ~1,734,513 sentences (196M) + - news-crawl_news.2021 # ~2,079,646 sentences (235M) + - news-crawl_news.2022 # ~2,132,743 sentences (241M) + - news-crawl_news.2023 # ~2,106,194 sentences (238M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-cs-spring-2024.yml b/configs/en-cs-spring-2024.yml new file mode 100644 index 000000000..c71aaf91c --- /dev/null +++ b/configs/en-cs-spring-2024.yml @@ -0,0 +1,234 @@ +# The initial configuration was generated using: +# task config-generator -- en cs --name 
spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: cs + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Lindat-khresmoi_summary_dev-2-ces-eng + - mtdata_Neulab-tedtalks_dev-1-eng-ces + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt19 + - sacrebleu_aug-mix_wmt18/test-ts + - sacrebleu_aug-mix_wmt16 + - sacrebleu_aug-mix_wmt14 + - sacrebleu_aug-mix_wmt13 + - sacrebleu_aug-mix_wmt11 + - sacrebleu_aug-mix_wmt09 + - sacrebleu_aug-mix_wmt08/nc + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt20 + - sacrebleu_wmt18 + - sacrebleu_wmt17 + - sacrebleu_wmt15 + - sacrebleu_wmt14/full + - sacrebleu_wmt12 + - sacrebleu_wmt10 + - sacrebleu_wmt08 + - sacrebleu_multi30k/2016 + + # The training data contains: + # 213,550,488 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (56,307,029 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_WikiTitles/v3 - ignored datasets (0 sentences) + # - mtdata_ELRC-euipo_2017-1-ces-eng - duplicate with opus + # - mtdata_ELRC-emea-1-ces-eng - duplicate with opus + # - mtdata_ELRC-vaccination-1-ces-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-ces-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-ces-eng - duplicate with opus 
+ # - mtdata_ELRC-europarl_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-ces-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-ces-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ces - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-ces-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-ces-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-3-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ces - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ces - duplicate with opus + # - mtdata_Statmt-europarl-9-ces-eng - duplicate with opus + # - mtdata_Statmt-europarl-7-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-14-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-15-ces-eng - duplicate with opus + # - mtdata_Statmt-news_commentary-16-ces-eng - duplicate with opus + # - mtdata_Statmt-europarl-10-ces-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-ces_CZ-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-ces-eng - duplicate with opus + train: + - opus_NLLB/v1 # 56,307,029 sentences + - opus_ParaCrawl/v9 # 50,633,505 sentences + - opus_OpenSubtitles/v2018 # 42,346,436 sentences + - opus_StanfordNLP-NMT/v1.0 # 15,793,121 sentences + - opus_ELRC-EMEA/v1 # 12,891,707 sentences + - opus_CCAligned/v1 # 12,730,121 sentences + - opus_DGT/v2019 # 5,207,753 sentences + - opus_LinguaTools-WikiTitles/v2014 # 4,813,030 sentences + - opus_XLEnt/v1.2 # 3,894,132 sentences + - opus_JRC-Acquis/v3.0 # 1,273,411 sentences + - opus_ELRC-5067-SciPar/v1 # 1,064,385 sentences + - opus_EMEA/v3 # 1,053,385 sentences + - opus_ELRC-2713-EMEA/v1 # 779,083 sentences + - opus_ELRC_2682/v1 # 
779,082 sentences + - opus_Europarl/v8 # 647,095 sentences + - opus_WikiMatrix/v1 # 519,195 sentences + - opus_EUbookshop/v2 # 455,472 sentences + - opus_QED/v2.0a # 441,508 sentences + - opus_ELITR-ECA/v1 # 295,788 sentences + - opus_Tanzil/v1 # 233,399 sentences + - opus_News-Commentary/v16 # 218,509 sentences + - opus_TED2020/v1 # 170,611 sentences + - opus_wikimedia/v20230407 # 146,717 sentences + - opus_KDE4/v2 # 134,071 sentences + - opus_ELRC-presscorner_covid/v1 # 129,652 sentences + - opus_NeuLab-TedTalks/v1 # 111,107 sentences + - opus_ECB/v1 # 63,716 sentences + - opus_bible-uedin/v1 # 62,151 sentences + - opus_WMT-News/v2019 # 44,859 sentences + - opus_Tatoeba/v2023-04-12 # 34,628 sentences + - opus_PHP/v1 # 32,983 sentences + - opus_Wikipedia/v1.0 # 27,723 sentences + - opus_ELRC-3564-EUR_LEX_covid/v1 # 22,637 sentences + - opus_ELRC-EUR_LEX/v1 # 22,637 sentences + - opus_GlobalVoices/v2018q4 # 18,876 sentences + - opus_ELRC-427-Electronic_Exchange_/v1 # 17,357 sentences + - opus_ELRC-2012-EUIPO_2017/v1 # 15,945 sentences + - opus_ELRC-EUIPO_2017/v1 # 15,945 sentences + - opus_ELRC-antibiotic/v1 # 15,678 sentences + - opus_ELRC-2874-EU_publications_medi/v1 # 13,161 sentences + - opus_ELRC-EU_publications/v1 # 13,161 sentences + - opus_ELRC-EUROPARL_covid/v1 # 11,142 sentences + - opus_EUconst/v1 # 9,953 sentences + - opus_ELRC-3605-presscorner_covid/v1 # 6,229 sentences + - opus_ELRC-2406-Czech_Supreme_Audit/v1 # 4,771 sentences + - opus_ELRC_3382/v1 # 3,722 sentences + - opus_TildeMODEL/v2018 # 3,100 sentences + - opus_ELRC-2405-Czech_Supreme_Audit/v1 # 2,868 sentences + - opus_ECDC/v2016-03-16 # 2,559 sentences + - opus_ELRC-3463-EC_EUROPA_covid/v1 # 2,386 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,386 sentences + - opus_ELRC-40-Information_Portal_C/v1 # 1,828 sentences + - opus_ELRC-Information_Portal/v1 # 1,828 sentences + - opus_ELRC-3062-wikipedia_health/v1 # 1,146 sentences + - opus_ELRC-wikipedia_health/v1 # 1,146 sentences + - opus_ELRC_2922/v1 
# 1,145 sentences + - opus_ELRC-3201-antibiotic/v1 # 965 sentences + - opus_ELRC-3292-EUROPARL_covid/v1 # 557 sentences + - opus_ELRC-2749-vaccination/v1 # 520 sentences + - opus_ELRC-vaccination/v1 # 520 sentences + - opus_ELRC-2404-Czech_Supreme_Audit/v1 # 403 sentences + - opus_ELRC_2923/v1 # 319 sentences + - opus_ELRC-2407-Czech_Supreme_Audit/v1 # 234 sentences + - mtdata_ELRC-information_portal_czech_president_czech_castle-1-ces-eng + - mtdata_ELRC-electronic_exchange_social_security_information-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2018_reports-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2008_2017_reports-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2003_2017_press_releases-1-ces-eng + - mtdata_ELRC-czech_supreme_audit_office_2018_press_releases-1-ces-eng + - mtdata_ELRC-eu_publications_medical_v2-1-ces-eng + - mtdata_EU-eac_forms-1-ces-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-ces-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-ces-eng # ~533,693 sentences (60.3 MB) + - mtdata_Lindat-khresmoi_summary_test-2-ces-eng # ~11,808 sentences (1.3 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ces # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-commoncrawl_wmt13-1-ces-eng # ~8,126,649 sentences (918.3 MB) + - mtdata_Statmt-europarl_wmt13-7-ces-eng # ~5,819,755 sentences (657.6 MB) + - mtdata_Statmt-news_commentary_wmt18-13-ces-eng # ~1,001,393 sentences (113.2 MB) + - mtdata_Statmt-wiki_titles-1-ces-eng # ~45,242 sentences (5.1 MB) + - mtdata_Statmt-wiki_titles-2-ces-eng # ~47,995 sentences (5.4 MB) + - mtdata_Tilde-eesc-2017-ces-eng # ~1,157,475 sentences (130.8 MB) + - mtdata_Tilde-ema-2016-ces-eng # ~244,524 sentences (27.6 MB) + - mtdata_Tilde-rapid-2019-ces-eng # ~255,063 sentences (28.8 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # 
~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~55,777,868 sentences + mono-trg: + - news-crawl_news.2007 # ~34,513 sentences (3.9M) + - news-crawl_news.2008 # ~1,840,707 sentences (208M) + - news-crawl_news.2009 # ~2,079,646 sentences (235M) + - news-crawl_news.2010 # ~1,247,787 sentences (141M) + - news-crawl_news.2011 # ~3,185,840 sentences (360M) + - news-crawl_news.2012 # ~2,964,601 sentences (335M) + - news-crawl_news.2013 # ~3,389,380 sentences (383M) + - news-crawl_news.2014 # ~2,973,451 sentences (336M) + - news-crawl_news.2015 # ~3,026,548 sentences (342M) + - news-crawl_news.2016 # ~2,159,292 sentences (244M) + - news-crawl_news.2017 # ~2,849,557 sentences (322M) + - news-crawl_news.2018 # ~2,637,168 sentences (298M) + - news-crawl_news.2019 # ~5,513,274 sentences (623M) + - news-crawl_news.2020 # ~7,451,327 sentences (842M) + - news-crawl_news.2021 # ~5,265,486 sentences (595M) + - news-crawl_news.2022 # ~3,884,955 sentences (439M) + - news-crawl_news.2023 # ~5,274,336 sentences (596M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + 
early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-da-spring-2024.yml b/configs/en-da-spring-2024.yml new file mode 100644 index 000000000..2861b50ec --- /dev/null +++ b/configs/en-da-spring-2024.yml @@ -0,0 +1,235 @@ +# The initial configuration was generated using: +# task config-generator -- en da --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: da + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-dan + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 161,668,955 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (52,273,664 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-www.norden.org-1-dan-eng - duplicate with opus + # - mtdata_ELRC-mst.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-ufm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.dst.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.dma.dk-1-dan-eng - 
duplicate with opus + # - mtdata_ELRC-www.geus.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-naturstyrelsen.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.trm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-um.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.aarhus2017.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.odense.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.visitvejle.com-1-dan-eng - duplicate with opus + # - mtdata_ELRC-www.visitdenmark.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-slks.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-natmus.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-gallery_denmark-1-dan-eng - duplicate with opus + # - mtdata_ELRC-royal_danish_library-1-dan-eng - duplicate with opus + # - mtdata_ELRC-danish_fsa-1-dan-eng - duplicate with opus + # - mtdata_ELRC-uk.fm.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-spillemyndigheden.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-rigsrevisionen.dk-1-dan-eng - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-dan-eng - duplicate with opus + # - mtdata_ELRC-emea-1-dan-eng - duplicate with opus + # - mtdata_ELRC-vaccination-1-dan-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-dan-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-dan-eng - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-dan-eng - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-dan-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-dan - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-dan-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-dan-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-dan - duplicate with 
opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-dan - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-dan - duplicate with opus + # - mtdata_Statmt-europarl-7-dan-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-dan_DK-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-dan-eng - duplicate with opus + train: + - opus_NLLB/v1 # 52,273,664 sentences + - opus_ParaCrawl/v9 # 34,207,840 sentences + - opus_OpenSubtitles/v2018 # 14,474,569 sentences + - opus_ELRC-4248-NTEU_TierA/v1 # 13,756,130 sentences + - opus_ELRC-EMEA/v1 # 12,556,334 sentences + - opus_CCAligned/v1 # 10,738,610 sentences + - opus_DGT/v2019 # 5,152,323 sentences + - opus_EUbookshop/v2 # 4,980,755 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,084,707 sentences + - opus_XLEnt/v1.2 # 3,042,401 sentences + - opus_Europarl/v8 # 1,991,647 sentences + - opus_EMEA/v3 # 1,093,780 sentences + - opus_JRC-Acquis/v3.0 # 808,916 sentences + - opus_ELRC-2716-EMEA/v1 # 775,676 sentences + - opus_ELRC_2682/v1 # 775,675 sentences + - opus_WikiMatrix/v1 # 436,052 sentences + - opus_KDE4/v2 # 194,410 sentences + - opus_QED/v2.0a # 175,384 sentences + - opus_ELRC-presscorner_covid/v1 # 145,352 sentences + - opus_ECB/v1 # 138,154 sentences + - opus_ELITR-ECA/v1 # 135,384 sentences + - opus_TED2020/v1 # 72,113 sentences + - opus_wikimedia/v20230407 # 69,969 sentences + - opus_bible-uedin/v1 # 62,113 sentences + - opus_NeuLab-TedTalks/v1 # 48,462 sentences + - opus_ELRC-847-mst.dk/v1 # 36,750 sentences + - opus_ELRC-730-www.norden.org/v1 # 36,626 sentences + - opus_ELRC-www.norden.org/v1 # 36,626 sentences + - opus_Tatoeba/v2023-04-12 # 32,790 sentences + - opus_ELRC-850-www.dst.dk/v1 # 22,817 sentences + - opus_ELRC-848-laegemiddelstyrelsen/v1 # 22,700 sentences + - opus_ELRC-3567-EUR_LEX_covid/v1 # 21,239 sentences + - opus_ELRC-EUR_LEX/v1 # 21,239 sentences + - opus_ELRC-2013-EUIPO_2017/v1 # 17,269 sentences + - 
opus_ELRC-EUIPO_2017/v1 # 17,269 sentences + - opus_ELRA-W0214/v1 # 16,243 sentences + - opus_ELRC-antibiotic/v1 # 13,310 sentences + - opus_ELRC-2877-EU_publications_medi/v1 # 13,243 sentences + - opus_ELRC-EU_publications/v1 # 13,243 sentences + - opus_ELRC-851-www.vikingeskibsmuse/v1 # 12,404 sentences + - opus_ELRC-EUROPARL_covid/v1 # 11,723 sentences + - opus_ELRC-849-ufm.dk/v1 # 10,054 sentences + - opus_EUconst/v1 # 10,032 sentences + - opus_ELRC-1062-rigsrevisionen.dk/v1 # 8,234 sentences + - opus_GlobalVoices/v2018q4 # 7,311 sentences + - opus_ELRC-904-uk.fm.dk/v1 # 6,949 sentences + - opus_ELRC-3608-presscorner_covid/v1 # 6,262 sentences + - opus_ELRC-892-slks.dk/v1 # 4,956 sentences + - opus_ELRC-885-www.aarhus2017.dk/v1 # 4,709 sentences + - opus_TildeMODEL/v2018 # 4,420 sentences + - opus_ELRC-397-Danish_Higher_Educat/v1 # 4,395 sentences + - opus_ELRA-W0157/v1 # 4,394 sentences + - opus_ELRC-439-Danish_Higher_Educat/v1 # 4,149 sentences + - opus_ELRC-893-natmus.dk/v1 # 3,950 sentences + - opus_ELRC-394-Danish_Higher_Educat/v1 # 3,719 sentences + - opus_ELRC_3382/v1 # 3,406 sentences + - opus_ELRC-905-spillemyndigheden.dk/v1 # 3,355 sentences + - opus_ELRC-856-naturstyrelsen.dk/v1 # 3,118 sentences + - opus_ELRC-859-um.dk/v1 # 3,055 sentences + - opus_ELRC-857-www.trm.dk/v1 # 3,015 sentences + - opus_ELRC-852-www.dma.dk/v1 # 3,010 sentences + - opus_ELRC-3466-EC_EUROPA_covid/v1 # 2,804 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,804 sentences + - opus_ECDC/v2016-03-16 # 2,578 sentences + - opus_ELRC-897-Denmark_Space_Instit/v1 # 1,940 sentences + - opus_ELRC-899-Danish_FSA/v1 # 1,931 sentences + - opus_ELRC-426-Danish_Higher_Educat/v1 # 1,886 sentences + - opus_ELRC-854-www.geus.dk/v1 # 1,655 sentences + - opus_ELRC-891-www.visitdenmark.dk/v1 # 1,603 sentences + - opus_ELRC-895-Royal_Danish_Library/v1 # 1,547 sentences + - opus_ELRC-889-www.visitvejle.com/v1 # 1,472 sentences + - opus_ELRC-886-www.odense.dk/v1 # 1,427 sentences + - 
opus_ELRC-901-Denmark_Prosecution_/v1 # 1,163 sentences + - opus_ELRC-900-Danish_Working_Envir/v1 # 1,138 sentences + - opus_ELRC-890-Holstebro_Kunstmuseu/v1 # 1,023 sentences + - opus_ELRC-3204-antibiotic/v1 # 801 sentences + - opus_ELRC-894-Gallery_Denmark/v1 # 769 sentences + - opus_ELRC-3295-EUROPARL_covid/v1 # 634 sentences + - opus_ELRC-3066-wikipedia_health/v1 # 523 sentences + - opus_ELRC-wikipedia_health/v1 # 523 sentences + - opus_ELRC_2922/v1 # 522 sentences + - opus_tldr-pages/v2023-08-29 # 495 sentences + - opus_ELRC-2754-vaccination/v1 # 462 sentences + - opus_ELRC-vaccination/v1 # 462 sentences + - opus_ELRC_2923/v1 # 389 sentences + - mtdata_ELRC-danish_higher_education_science_3-1-dan-eng + - mtdata_ELRC-danish_higher_education_science_2-1-dan-eng + - mtdata_ELRC-danish_higher_education_science-1-dan-eng + - mtdata_ELRC-danish_higher_education_science_4-1-dan-eng + - mtdata_ELRC-laegemiddelstyrelsen.dk-1-dan-eng + - mtdata_ELRC-www.vikingeskibsmuseet.dk-1-dan-eng + - mtdata_ELRC-holstebro_kunstmuseum-1-dan-eng + - mtdata_ELRC-denmark_space_institute-1-dan-eng + - mtdata_ELRC-danish_working_environment_authority-1-dan-eng + - mtdata_ELRC-denmark_prosecution_service-1-dan-eng + - mtdata_ELRC-eu_publications_medical_v2-1-dan-eng + - mtdata_ELRC-nteu_tierb-1-dan-eng + - mtdata_EU-eac_forms-1-dan-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-dan-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-dan-eng # ~1,040,518 sentences (117.6 MB) + - mtdata_Neulab-tedtalks_test-1-eng-dan # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-dan-eng # ~1,936,973 sentences (218.9 MB) + - mtdata_Tilde-ema-2016-dan-eng # ~215,232 sentences (24.3 MB) + - mtdata_Tilde-rapid-2016-dan-eng # ~451,067 sentences (51.0 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 
sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-el-spring-2024.yml b/configs/en-el-spring-2024.yml new file mode 100644 index 000000000..c5ba9165e --- /dev/null +++ b/configs/en-el-spring-2024.yml @@ -0,0 +1,260 @@ +# The initial configuration was generated using: +# task config-generator -- en el --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: el + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + 
mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ell + - flores_aug-mix_dev + - sacrebleu_aug-mix_mtedx/test + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_mtedx/valid + + # The training data contains: + # 159,976,981 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (49,262,631 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (3,583,975 sentences) + # - opus_ELRC-3857-State_Budget_Executi/v1 - not enough data (180 sentences) + # - opus_ELRC-745-Convention_against_T/v1 - not enough data (165 sentences) + # - opus_ELRA-W0309/v1 - not enough data (164 sentences) + # - opus_ELRC-656-Macroeconomic_Develo/v1 - not enough data (151 sentences) + # - opus_ELRC-496-Convention_transfer_/v1 - not enough data (121 sentences) + # - opus_ELRA-W0196/v1 - not enough data (120 sentences) + # - opus_ELRA-W0207/v1 - not enough data (101 sentences) + # - opus_ELRA-W0308/v1 - not enough data (87 sentences) + # - opus_ELRC-662-Expression_interest/v1 - not enough data (85 sentences) + # - opus_ELRA-W0209/v1 - not enough data (84 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (65 sentences) + # - opus_ELRC-658-Methodological_Recon/v1 - not enough data (45 sentences) + # - opus_ELRA-W0208/v1 - not enough data (44 sentences) + # - opus_ELRC-1022-COMPULSORY_EXPROPRIA/v1 - not enough data (38 sentences) + # - opus_ELRC-3856-PRESS/v1 - not enough data (35 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (30 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (29 sentences) + # - 
opus_ELRC-1021-Commitment_Property_/v1 - not enough data (23 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (22 sentences) + # - opus_ELRA-W0301/v1 - not enough data (16 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-greek_administration-1-ell-eng - duplicate with opus + # - mtdata_ELRC-greek_law-1-ell-eng - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-ell-eng - duplicate with opus + # - mtdata_ELRC-press_releases_pio-1-ell-eng - duplicate with opus + # - mtdata_ELRC-constitution_greece-1-ell-eng - duplicate with opus + # - mtdata_ELRC-emea-1-ell-eng - duplicate with opus + # - mtdata_ELRC-vaccination-1-ell-eng - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-ell-eng - duplicate with opus + # - mtdata_ELRC-antibiotic-1-ell-eng - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-ell-eng - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-ell-eng - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ell - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-ell-eng - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-ell-eng - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ell - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ell - duplicate with opus + # - mtdata_Statmt-europarl-7-ell-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-ell_GR-eng - duplicate with opus + # - mtdata_Tilde-ecb-2017-ell-eng - duplicate with opus + train: + - opus_NLLB/v1 # 49,262,631 sentences + - opus_OpenSubtitles/v2018 # 40,492,942 sentences + - opus_ParaCrawl/v9 # 21,402,471 sentences + - opus_ELRC-EMEA/v1 # 13,691,653 sentences + - 
opus_CCAligned/v1 # 8,878,509 sentences + - opus_DGT/v2019 # 5,099,790 sentences + - opus_EUbookshop/v2 # 4,022,952 sentences + - opus_MaCoCu/v2 # 3,583,978 sentences + - opus_XLEnt/v1.2 # 2,949,219 sentences + - opus_LinguaTools-WikiTitles/v2014 # 1,850,804 sentences + - opus_Europarl/v8 # 1,292,180 sentences + - opus_EMEA/v3 # 1,073,225 sentences + - opus_ELRC-2711-EMEA/v1 # 781,988 sentences + - opus_ELRC_2682/v1 # 781,987 sentences + - opus_ELRC-5067-SciPar/v1 # 742,987 sentences + - opus_WikiMatrix/v1 # 620,802 sentences + - opus_wikimedia/v20230407 # 589,733 sentences + - opus_QED/v2.0a # 550,438 sentences + - opus_ELITR-ECA/v1 # 381,561 sentences + - opus_TED2020/v1 # 269,407 sentences + - opus_SETIMES/v2 # 227,168 sentences + - opus_NeuLab-TedTalks/v1 # 153,493 sentences + - opus_ELRC-presscorner_covid/v1 # 152,003 sentences + - opus_KDE4/v2 # 144,894 sentences + - opus_GlobalVoices/v2018q4 # 120,421 sentences + - opus_ELRC-Press_Releases/v1 # 117,171 sentences + - opus_Wikipedia/v1.0 # 104,076 sentences + - opus_ECB/v1 # 102,986 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_ELRA-W0202/v1 # 61,967 sentences + - opus_Tatoeba/v2023-04-12 # 25,995 sentences + - opus_ELRC-3562-EUR_LEX_covid/v1 # 23,024 sentences + - opus_ELRC-EUR_LEX/v1 # 23,024 sentences + - opus_ELRC-842-Quarterly_Reports_Pa/v1 # 21,248 sentences + - opus_ELRA-W0243/v1 # 21,247 sentences + - opus_ELRC-1175-EUIPO_2017/v1 # 20,027 sentences + - opus_ELRC-EUIPO_2017/v1 # 20,027 sentences + - opus_ELRC-EUROPARL_covid/v1 # 19,587 sentences + - opus_JRC-Acquis/v3.0 # 17,717 sentences + - opus_GNOME/v1 # 17,389 sentences + - opus_ELRC-843-collection_reports_G/v1 # 16,286 sentences + - opus_ELRA-W0244/v1 # 16,285 sentences + - opus_ELRC-antibiotic/v1 # 16,083 sentences + - opus_ELRC-2872-EU_publications_medi/v1 # 13,092 sentences + - opus_ELRC-EU_publications/v1 # 13,092 sentences + - opus_ELRC-649-Greek_administration/v1 # 12,510 sentences + - opus_ELRA-W0203/v1 # 12,509 sentences + 
- opus_EUconst/v1 # 9,990 sentences + - opus_SPC/v1 # 8,181 sentences + - opus_ELRC-3603-presscorner_covid/v1 # 6,635 sentences + - opus_ELRC-936-Prime_Minister_Helle/v1 # 5,323 sentences + - opus_ELRA-W0272/v1 # 5,322 sentences + - opus_TildeMODEL/v2018 # 5,238 sentences + - opus_ELRC-1787-Press_Releases_PIO/v1 # 5,163 sentences + - opus_ELRC-PIO_Publication/v1 # 3,949 sentences + - opus_ELRC-1984-Hellenic_Gaming_Comm/v1 # 3,875 sentences + - opus_ELRC_3382/v1 # 3,818 sentences + - opus_ELRC-932-Hellenic_Foreign_Aff/v1 # 3,471 sentences + - opus_ELRA-W0271/v1 # 3,470 sentences + - opus_ELRC-1067-PIO_Publication_Wind/v1 # 2,629 sentences + - opus_ECDC/v2016-03-16 # 2,531 sentences + - opus_ELRC-3461-EC_EUROPA_covid/v1 # 2,234 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,234 sentences + - opus_ELRC-1981-EQF_Referencing_Repo/v1 # 2,100 sentences + - opus_ELRC-652-Greek_law/v1 # 1,980 sentences + - opus_ELRA-W0205/v1 # 1,979 sentences + - opus_ELRC-3058-wikipedia_health/v1 # 1,871 sentences + - opus_ELRC-wikipedia_health/v1 # 1,871 sentences + - opus_ELRC-1020-collection_about_Cyp/v1 # 1,870 sentences + - opus_ELRC_2922/v1 # 1,870 sentences + - opus_ELRC-1070-Press_Information_Cy/v1 # 1,863 sentences + - opus_ELRC-1970-governmental_about_M/v1 # 1,435 sentences + - opus_ELRC-1065-PIO_Publication_Cypr/v1 # 1,321 sentences + - opus_Books/v1 # 1,285 sentences + - opus_ELRC-419-Greek_legislation_An/v1 # 1,070 sentences + - opus_ELRA-W0164/v1 # 1,069 sentences + - opus_ELRC-1986-Constitution_Greece/v1 # 1,022 sentences + - opus_ELRC-3199-antibiotic/v1 # 990 sentences + - opus_ELRC-5160-Press_Releases_PIO/v1 # 955 sentences + - opus_ELRC-3290-EUROPARL_covid/v1 # 692 sentences + - opus_ELRC-2735-vaccination/v1 # 519 sentences + - opus_ELRC-vaccination/v1 # 519 sentences + - opus_ELRC-663-Memorandum_a_ESM/v1 # 434 sentences + - opus_ELRA-W0210/v1 # 433 sentences + - opus_ELRC_2923/v1 # 420 sentences + - opus_ELRC-646-International_Judici/v1 # 289 sentences + - opus_ELRA-W0307/v1 # 
288 sentences + - mtdata_ELRC-rights_arrested-1-ell-eng + - mtdata_ELRC-swedish_social_security-1-ell-eng + - mtdata_ELRC-greek_legislation_anticorruption_plan-1-ell-eng + - mtdata_ELRC-convention_transfer_sentenced_persons-1-ell-eng + - mtdata_ELRC-international_judicial_cooperation_civil_matters-1-ell-eng + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-ell-eng + - mtdata_ELRC-macroeconomic_developments-1-ell-eng + - mtdata_ELRC-methodological_reconciliation-1-ell-eng + - mtdata_ELRC-expression_interest-1-ell-eng + - mtdata_ELRC-memorandum_a_esm_programme-1-ell-eng + - mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng + - mtdata_ELRC-quarterly_reports_parliamentary_budget-1-ell-eng + - mtdata_ELRC-collection_reports_greek_power_corporation-1-ell-eng + - mtdata_ELRC-hellenic_foreign_affairs_announcements-1-ell-eng + - mtdata_ELRC-prime_minister_hellenic-1-ell-eng + - mtdata_ELRC-collection_about_cyprus_problem-1-ell-eng + - mtdata_ELRC-commitment_property_open-1-ell-eng + - mtdata_ELRC-compulsory_expropriation_process_greece-1-ell-eng + - mtdata_ELRC-pio_publication_cyprus_has_always_been_europe_2017-1-ell-eng + - mtdata_ELRC-pio_publication_window_cyprus-1-ell-eng + - mtdata_ELRC-press_information_cyprus-1-ell-eng + - mtdata_ELRC-governmental_about_migration_policy-1-ell-eng + - mtdata_ELRC-eqf_referencing_report-1-ell-eng + - mtdata_ELRC-hellenic_gaming_commission-1-ell-eng + - mtdata_ELRC-eu_publications_medical_v2-1-ell-eng + - mtdata_EU-eac_forms-1-ell-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-ell-eng # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-ell-eng # ~1,178,828 sentences (133.2 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ell # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-ell-eng # ~2,393,754 sentences (270.5 MB) + - mtdata_Tilde-ema-2016-ell-eng # ~244,548 sentences (27.6 MB) + - mtdata_Tilde-rapid-2016-ell-eng # ~586,564 sentences 
(66.3 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~27,097,343 sentences + mono-trg: + - news-crawl_news.2015 # ~1,115,044 sentences (126M) + - news-crawl_news.2019 # ~2,398,230 sentences (271M) + - news-crawl_news.2020 # ~5,327,433 sentences (602M) + - news-crawl_news.2021 # ~5,238,938 sentences (592M) + - news-crawl_news.2022 # ~6,725,663 sentences (760M) + - news-crawl_news.2023 # ~6,292,035 sentences (711M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-fi-spring-2024.yml b/configs/en-fi-spring-2024.yml new file mode 100644 index 
000000000..6d0d3c8a7 --- /dev/null +++ b/configs/en-fi-spring-2024.yml @@ -0,0 +1,242 @@ +# The initial configuration was generated using: +# task config-generator -- en fi --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: fi + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-fin + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt18 + - sacrebleu_aug-mix_wmt17 + - sacrebleu_aug-mix_wmt17/tworefs + - sacrebleu_aug-mix_wmt16/B + - sacrebleu_aug-mix_wmt15 + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt19 + - sacrebleu_wmt18/test-ts + - sacrebleu_wmt17/B + - sacrebleu_wmt16 + - sacrebleu_wmt16/tworefs + + # The training data contains: + # 180,578,066 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (35,982,562 sentences) + # - opus_ELRC-401-Swedish_Labour_Part2/v1 - not enough data (171 sentences) + # - opus_ELRC-406-Swedish_Labour_Part1/v1 - not enough data (41 sentences) + # - opus_ELRC-436-Swedish_Food/v1 - not enough data (16 sentences) + # - opus_ELRA-W0305/v1 - not enough data (15 sentences) + # - opus_MultiHPLT/v1.1 - ignored datasets (0 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-hallituskausi_2007_2011-1-eng-fin - duplicate with opus + # - 
mtdata_ELRC-hallituskausi_2011_2015-1-eng-fin - duplicate with opus + # - mtdata_ELRC-www.norden.org-1-eng-fin - duplicate with opus + # - mtdata_ELRC-www.vtv.fi-1-eng-fin - duplicate with opus + # - mtdata_ELRC-www.visitestonia.com-1-eng-fin - duplicate with opus + # - mtdata_ELRC-valtioneuvosto.fi-1-eng-fin - duplicate with opus + # - mtdata_ELRC-vnk.fi-1-eng-fin - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-fin - duplicate with opus + # - mtdata_ELRC-www.turku.fi-1-eng-fin - duplicate with opus + # - mtdata_ELRC-www.vero.fi-1-eng-fin - duplicate with opus + # - mtdata_ELRC-emea-1-eng-fin - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-fin - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-fin - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-fin - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-fin - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-fin - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-fin - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-fin - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-fin - duplicate with opus + # - mtdata_EU-ecdc-1-eng-fin - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-fin - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-fin - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-fin - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-3-eng-fin - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-fin - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-fin - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-fin - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-fin - duplicate with opus + # - mtdata_Statmt-europarl-9-fin-eng - duplicate with opus + # - mtdata_Statmt-europarl-7-fin-eng - duplicate with opus + # - mtdata_Statmt-europarl-10-fin-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-fin_FI - duplicate with opus + # - 
mtdata_Tilde-ecb-2017-eng-fin - duplicate with opus + train: + - opus_NLLB/v1 # 35,982,562 sentences + - opus_ParaCrawl/v9 # 31,315,914 sentences + - opus_OpenSubtitles/v2018 # 27,281,566 sentences + - opus_HPLT/v1.1 # 25,176,714 sentences + - opus_ELRC-EMEA/v1 # 13,287,447 sentences + - opus_ELRC-4239-NTEU_TierA/v1 # 12,855,266 sentences + - opus_CCAligned/v1 # 9,699,433 sentences + - opus_DGT/v2019 # 5,079,631 sentences + - opus_LinguaTools-WikiTitles/v2014 # 5,059,694 sentences + - opus_TildeMODEL/v2018 # 3,059,563 sentences + - opus_XLEnt/v1.2 # 2,630,648 sentences + - opus_EUbookshop/v2 # 2,039,833 sentences + - opus_Europarl/v8 # 1,969,624 sentences + - opus_EMEA/v3 # 1,083,857 sentences + - opus_ELRC-2708-EMEA/v1 # 753,744 sentences + - opus_ELRC-5067-SciPar/v1 # 457,342 sentences + - opus_WikiMatrix/v1 # 375,724 sentences + - opus_ELITR-ECA/v1 # 375,254 sentences + - opus_ECB/v1 # 157,603 sentences + - opus_ELRC-www.turku.fi/v1 # 141,917 sentences + - opus_ELRC-Finnish_Information/v1 # 127,638 sentences + - opus_ELRC-www.visitestonia.com/v1 # 124,120 sentences + - opus_KDE4/v2 # 108,073 sentences + - opus_ELRC-presscorner_covid/v1 # 107,841 sentences + - opus_QED/v2.0a # 102,273 sentences + - opus_infopankki/v1 # 84,645 sentences + - opus_Tatoeba/v2023-04-12 # 81,684 sentences + - opus_GNOME/v1 # 62,184 sentences + - opus_bible-uedin/v1 # 62,026 sentences + - opus_ELRC-724-Hallituskausi_2007_2/v1 # 53,844 sentences + - opus_ELRA-W0220/v1 # 53,843 sentences + - opus_ELRC-1769-valtioneuvosto.fi/v1 # 49,317 sentences + - opus_ELRC-valtioneuvosto.fi/v1 # 49,317 sentences + - opus_TED2020/v1 # 44,447 sentences + - opus_wikimedia/v20230407 # 43,055 sentences + - opus_ELRC-735-www.norden.org/v1 # 42,742 sentences + - opus_ELRC-www.norden.org/v1 # 42,742 sentences + - opus_ELRC-1127-www.vtv.fi/v1 # 42,724 sentences + - opus_ELRC-www.vtv.fi/v1 # 42,724 sentences + - opus_WMT-News/v2019 # 36,741 sentences + - opus_ELRC-1771-vnk.fi/v1 # 31,527 sentences + - 
opus_ELRC-vnk.fi/v1 # 31,527 sentences + - opus_ELRC-725-Hallituskausi_2011_2/v1 # 31,476 sentences + - opus_PHP/v1 # 27,879 sentences + - opus_NeuLab-TedTalks/v1 # 26,761 sentences + - opus_ELRC-3559-EUR_LEX_covid/v1 # 21,742 sentences + - opus_ELRC-EUR_LEX/v1 # 21,742 sentences + - opus_ELRC-2036-www.vero.fi/v1 # 21,285 sentences + - opus_ELRC-www.vero.fi/v1 # 21,285 sentences + - opus_JRC-Acquis/v3.0 # 19,665 sentences + - opus_ELRC-2032-www.turku.fi/v1 # 17,674 sentences + - opus_ELRC-2017-EUIPO_2017/v1 # 16,802 sentences + - opus_ELRC-EUIPO_2017/v1 # 16,802 sentences + - opus_ELRC-EUROPARL_covid/v1 # 14,964 sentences + - opus_ELRC-1128-www.visitestonia.com/v1 # 14,616 sentences + - opus_ELRC-2869-EU_publications_medi/v1 # 12,943 sentences + - opus_ELRC-EU_publications/v1 # 12,943 sentences + - opus_ELRC-antibiotic/v1 # 11,241 sentences + - opus_EUconst/v1 # 10,026 sentences + - opus_ELRC-716-Finnish_Information_/v1 # 9,942 sentences + - opus_ELRA-W0217/v1 # 9,941 sentences + - opus_ELRC-3600-presscorner_covid/v1 # 6,760 sentences + - opus_Books/v1 # 3,645 sentences + - opus_ELRC_3382/v1 # 3,358 sentences + - opus_ECDC/v2016-03-16 # 2,618 sentences + - opus_ELRC-3458-EC_EUROPA_covid/v1 # 2,600 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,600 sentences + - opus_ELRC-Swedish_Labour/v1 # 1,780 sentences + - opus_ELRC-4995-Finnish_Financial_MT/v1 # 1,001 sentences + - opus_ELRC-3196-antibiotic/v1 # 885 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 843 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 842 sentences + - opus_ELRC-3287-EUROPARL_covid/v1 # 696 sentences + - opus_ELRC-2739-vaccination/v1 # 471 sentences + - opus_ELRC-vaccination/v1 # 471 sentences + - opus_ELRC_2923/v1 # 396 sentences + - opus_ELRC-3045-wikipedia_health/v1 # 334 sentences + - opus_ELRC-wikipedia_health/v1 # 334 sentences + - opus_ELRC_2922/v1 # 333 sentences + - mtdata_ELRC-swedish_labour_part2-1-eng-fin + - mtdata_ELRC-swedish_labour_part1-1-eng-fin + - 
mtdata_ELRC-swedish_social_security-1-eng-fin + - mtdata_ELRC-swedish_food-1-eng-fin + - mtdata_ELRC-finnish_information_bank-1-eng-fin + - mtdata_ELRC-eu_publications_medical_v2-1-eng-fin + - mtdata_ELRC-nteu_tierb-1-eng-fin + - mtdata_EU-eac_forms-1-eng-fin # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-fin # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-fin # ~1,039,474 sentences (117.5 MB) + - mtdata_Neulab-tedtalks_test-1-eng-fin # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-wiki_titles-1-fin-eng # ~45,145 sentences (5.1 MB) + - mtdata_Statmt-newsdev_fien-2015-fin-eng # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_enfi-2015-eng-fin # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-fin # ~1,759,784 sentences (198.9 MB) + - mtdata_Tilde-ema-2016-eng-fin # ~222,060 sentences (25.1 MB) + - mtdata_Tilde-airbaltic-1-eng-fin # ~754 sentences (85.2 kB) + - mtdata_Tilde-rapid-2016-eng-fin # ~365,302 sentences (41.3 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) 
+ + # The monolingual data contains: + # ~11,592,916 sentences + mono-trg: + - news-crawl_news.2015 # ~1,601,769 sentences (181M) + - news-crawl_news.2016 # ~1,336,283 sentences (151M) + - news-crawl_news.2017 # ~1,265,486 sentences (143M) + - news-crawl_news.2018 # ~1,035,398 sentences (117M) + - news-crawl_news.2019 # ~1,672,566 sentences (189M) + - news-crawl_news.2020 # ~1,407,079 sentences (159M) + - news-crawl_news.2021 # ~1,106,194 sentences (125M) + - news-crawl_news.2022 # ~1,070,796 sentences (121M) + - news-crawl_news.2023 # ~1,097,345 sentences (124M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-hr-spring-2024.yml b/configs/en-hr-spring-2024.yml new file mode 100644 index 000000000..a6f29917d --- /dev/null +++ b/configs/en-hr-spring-2024.yml @@ -0,0 +1,225 @@ +# The initial configuration was generated using: +# task config-generator -- en hr --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: hr + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-hrv + - flores_aug-mix_dev + test: + - 
flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 99,724,833 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (18,797,643 sentences) + # - opus_MultiHPLT/v1.1 - ignored datasets (9,310,276 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (2,266,005 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-croatian_bank-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-croatian_mine_action-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-agriculture-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-emea-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-hrv - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-hrv - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-hrv - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-hrv_HR - duplicate with opus + train: + - opus_OpenSubtitles/v2018 # 35,131,729 sentences + - opus_NLLB/v1 # 18,797,643 sentences + - opus_ELRC-EMEA/v1 # 10,890,456 sentences + - opus_CCAligned/v1 # 9,376,190 sentences + - opus_HPLT/v1.1 # 9,310,369 sentences + - 
opus_ParaCrawl/v9 # 3,240,485 sentences + - opus_XLEnt/v1.2 # 2,844,710 sentences + - opus_ELRC-4142-NTEU_TierA/v1 # 2,290,893 sentences + - opus_MaCoCu/v2 # 2,266,007 sentences + - opus_ELRC-5067-SciPar/v1 # 806,581 sentences + - opus_TildeMODEL/v2018 # 745,616 sentences + - opus_DGT/v2019 # 722,182 sentences + - opus_ELRC-2706-EMEA/v1 # 650,030 sentences + - opus_WikiMatrix/v1 # 259,499 sentences + - opus_QED/v2.0a # 208,129 sentences + - opus_SETIMES/v2 # 205,910 sentences + - opus_TED2020/v1 # 197,411 sentences + - opus_ELITR-ECA/v1 # 181,038 sentences + - opus_EuroPat/v3 # 154,775 sentences + - opus_ELRC-presscorner_covid/v1 # 140,795 sentences + - opus_ELRC-Regional_Development/v1 # 136,809 sentences + - opus_NeuLab-TedTalks/v1 # 128,233 sentences + - opus_ELRC-Rural_Development/v1 # 105,562 sentences + - opus_hrenWaC/v1 # 99,001 sentences + - opus_KDE4/v2 # 87,333 sentences + - opus_TedTalks/v1 # 86,348 sentences + - opus_ELRC-2542-Agriculture/v1 # 68,376 sentences + - opus_bible-uedin/v1 # 62,179 sentences + - opus_ELRC-4329-PRINCIPLE_MVEP_legal/v1 # 44,460 sentences + - opus_wikimedia/v20230407 # 42,034 sentences + - opus_GNOME/v1 # 35,429 sentences + - opus_ELRC-3556-EUR_LEX_covid/v1 # 22,010 sentences + - opus_ELRC-EUR_LEX/v1 # 22,010 sentences + - opus_ELRC-651-government_websites_/v1 # 21,341 sentences + - opus_ELRC-government_websites/v1 # 21,341 sentences + - opus_ELRA-W0204/v1 # 21,340 sentences + - opus_ELRC-943-Journal_Croatian_Ass/v1 # 18,478 sentences + - opus_ELRA-W0273/v1 # 18,477 sentences + - opus_ELRC-1015-Croatian_Mine_Action/v1 # 17,602 sentences + - opus_ELRA-W0131/v1 # 17,601 sentences + - opus_ELRC-1174-EUIPO_2017/v1 # 17,205 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,205 sentences + - opus_ELRC-2866-EU_publications_medi/v1 # 12,837 sentences + - opus_ELRC-EU_publications/v1 # 12,837 sentences + - opus_ELRC-921-studies_challenges_C/v1 # 11,781 sentences + - opus_ELRA-W0266/v1 # 11,780 sentences + - opus_ELRC-915-statistical_reports_/v1 
# 11,738 sentences + - opus_ELRC-statistical_reports/v1 # 11,738 sentences + - opus_ELRA-W0264/v1 # 11,737 sentences + - opus_ELRC-788-Croatian_Bank/v1 # 11,708 sentences + - opus_ELRA-W0226/v1 # 11,707 sentences + - opus_ELRC-EUROPARL_covid/v1 # 10,175 sentences + - opus_ELRC-2541-Regional_Development/v1 # 7,911 sentences + - opus_ELRC-3597-presscorner_covid/v1 # 6,645 sentences + - opus_EUbookshop/v2 # 6,104 sentences + - opus_ELRC-992-Rural_Development_Pr/v1 # 5,202 sentences + - opus_ELRC_3382/v1 # 3,671 sentences + - opus_ELRC-989-Foreign_Affairs_Croa/v1 # 3,103 sentences + - opus_ELRC-Foreign_Affairs/v1 # 3,103 sentences + - opus_ELRA-W0293/v1 # 3,102 sentences + - opus_ELRC-3478-EC_EUROPA_covid/v1 # 2,595 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,595 sentences + - opus_Tatoeba/v2023-04-12 # 2,454 sentences + - opus_ELRC-991-Croatian_Journal_Fis/v1 # 2,408 sentences + - opus_ELRA-W0294/v1 # 2,407 sentences + - opus_ELRC-1080-Acts_Biological_Land/v1 # 2,329 sentences + - opus_ELRA-W0142/v1 # 2,328 sentences + - opus_ELRC-1058-University_Library_Z/v1 # 2,310 sentences + - opus_ELRA-W0135/v1 # 2,309 sentences + - opus_ELRC-986-Embassy_Finland_Zagr/v1 # 1,967 sentences + - opus_ELRA-W0292/v1 # 1,966 sentences + - opus_ELRC-1159-Swedish_Migration_Bo/v1 # 1,112 sentences + - opus_ELRC-Swedish_Migration/v1 # 1,112 sentences + - opus_ELRC-3193-antibiotic/v1 # 1,070 sentences + - opus_ELRC-antibiotic/v1 # 1,070 sentences + - opus_ELRC-984-Government_Cooperati/v1 # 1,026 sentences + - opus_ELRA-W0291/v1 # 1,025 sentences + - opus_ELRC-996-nature_protection_st/v1 # 970 sentences + - opus_ELRC-825-Croatian_Swedish_Cri/v1 # 907 sentences + - opus_ELRA-W0238/v1 # 906 sentences + - opus_ELRC-2753-vaccination/v1 # 509 sentences + - opus_ELRC-vaccination/v1 # 509 sentences + - opus_ELRC_2922/v1 # 485 sentences + - opus_ELRC-3284-EUROPARL_covid/v1 # 475 sentences + - opus_ELRC_2923/v1 # 288 sentences + - mtdata_ELRC-government_websites_croatian-1-eng-hrv + - 
mtdata_ELRC-croatian_swedish_crime_victim_compensation_support_authority-1-eng-hrv + - mtdata_ELRC-statistical_reports_studies_croatian_bureau_statistics-1-eng-hrv + - mtdata_ELRC-studies_challenges_croatian_accession_union_croatian_institute_finance-1-eng-hrv + - mtdata_ELRC-journal_croatian_association_civil_engineers-1-eng-hrv + - mtdata_ELRC-government_cooperation_ngos-1-eng-hrv + - mtdata_ELRC-embassy_finland_zagreb-1-eng-hrv + - mtdata_ELRC-foreign_affairs_croatia-1-eng-hrv + - mtdata_ELRC-croatian_journal_fisheries-1-eng-hrv + - mtdata_ELRC-rural_development_programme_period_2014_2020_croatian_rural_development_programme-1-eng-hrv + - mtdata_ELRC-nature_protection_strategy_croatia-1-eng-hrv + - mtdata_ELRC-university_library_zagreb-1-eng-hrv + - mtdata_ELRC-acts_biological_landscape_diversity_environmental_protection-1-eng-hrv + - mtdata_ELRC-swedish_migration_board_migrationsverket-1-eng-hrv + - mtdata_ELRC-regional_development_funds-1-eng-hrv + - mtdata_ELRC-eu_publications_medical_v2-1-eng-hrv + - mtdata_ELRC-wikipedia_health-1-eng-hrv + - mtdata_ELRC-nteu_tierb-1-eng-hrv + - mtdata_EU-eac_reference-1-eng-hrv # ~31,162 sentences (3.5 MB) + - mtdata_Neulab-tedtalks_test-1-eng-hrv # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-hrv # ~216,663 sentences (24.5 MB) + - mtdata_Tilde-ema-2016-eng-hrv # ~209,283 sentences (23.6 MB) + - mtdata_Tilde-ecb-2017-eng-hrv # ~876 sentences (99.0 kB) + - mtdata_Tilde-rapid-2016-eng-hrv # ~45,055 sentences (5.1 MB) + - mtdata_Tilde-worldbank-1-eng-hrv # ~1,566 sentences (177.0 kB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # 
~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~11,498,228 sentences + mono-trg: + - news-crawl_news.2014 # ~46,902 sentences (5.3M) + - news-crawl_news.2019 # ~1,398,230 sentences (158M) + - news-crawl_news.2020 # ~2,610,619 sentences (295M) + - news-crawl_news.2021 # ~2,398,230 sentences (271M) + - news-crawl_news.2022 # ~2,592,920 sentences (293M) + - news-crawl_news.2023 # ~2,451,327 sentences (277M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-hu-spring-2024.yml b/configs/en-hu-spring-2024.yml new file mode 100644 index 000000000..4392942d7 --- /dev/null +++ b/configs/en-hu-spring-2024.yml @@ -0,0 +1,190 @@ +# The initial configuration was generated using: +# task config-generator -- en hu --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: hu + best-model: 
chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Lindat-khresmoi_summary_dev-2-eng-hun + - mtdata_Neulab-tedtalks_dev-1-eng-hun + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt08 + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt09 + + # The training data contains: + # 147,468,240 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (36,435,409 sentences) + # - opus_ELRC-EMEA/v1 - not enough data (0 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-euipo_2017-1-eng-hun - duplicate with opus + # - mtdata_ELRC-emea-1-eng-hun - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-hun - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-hun - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-hun - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-hun - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-hun - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-hun - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-hun - duplicate with opus + # - mtdata_EU-ecdc-1-eng-hun - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-hun - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-hun - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-hun - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-hun - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-hun - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-hun - duplicate with opus 
+ # - mtdata_ParaCrawl-paracrawl-9-eng-hun - duplicate with opus + # - mtdata_Statmt-europarl-7-hun-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-hun_HU - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-hun - duplicate with opus + train: + - opus_OpenSubtitles/v2018 # 42,655,519 sentences + - opus_NLLB/v1 # 36,435,409 sentences + - opus_ParaCrawl/v9 # 36,433,273 sentences + - opus_CCAligned/v1 # 11,586,886 sentences + - opus_DGT/v2019 # 5,074,777 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,716,457 sentences + - opus_XLEnt/v1.2 # 3,609,156 sentences + - opus_TildeMODEL/v2018 # 1,958,734 sentences + - opus_EMEA/v3 # 1,050,606 sentences + - opus_ELRC-2715-EMEA/v1 # 772,359 sentences + - opus_Europarl/v8 # 625,178 sentences + - opus_WikiMatrix/v1 # 488,319 sentences + - opus_JRC-Acquis/v3.0 # 487,829 sentences + - opus_EUbookshop/v2 # 438,264 sentences + - opus_QED/v2.0a # 335,038 sentences + - opus_TED2020/v1 # 308,341 sentences + - opus_ELITR-ECA/v1 # 299,216 sentences + - opus_NeuLab-TedTalks/v1 # 159,437 sentences + - opus_ELRC-presscorner_covid/v1 # 139,284 sentences + - opus_Books/v1 # 137,151 sentences + - opus_KDE4/v2 # 120,657 sentences + - opus_Tatoeba/v2023-04-12 # 116,774 sentences + - opus_wikimedia/v20230407 # 91,028 sentences + - opus_ECB/v1 # 72,034 sentences + - opus_bible-uedin/v1 # 62,121 sentences + - opus_Wikipedia/v1.0 # 61,472 sentences + - opus_PHP/v1 # 35,423 sentences + - opus_ELRC-5067-SciPar/v1 # 27,422 sentences + - opus_ELRC-3566-EUR_LEX_covid/v1 # 22,271 sentences + - opus_ELRC-EUR_LEX/v1 # 22,271 sentences + - opus_ELRC-2019-EUIPO_2017/v1 # 17,038 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,038 sentences + - opus_GlobalVoices/v2018q4 # 15,362 sentences + - opus_ELRC-2876-EU_publications_medi/v1 # 13,026 sentences + - opus_ELRC-EU_publications/v1 # 13,026 sentences + - opus_ELRC-antibiotic/v1 # 10,272 sentences + - opus_EUconst/v1 # 8,748 sentences + - opus_ELRC-3607-presscorner_covid/v1 # 6,599 sentences + - 
opus_GNOME/v1 # 6,312 sentences + - opus_ELRC_3382/v1 # 3,564 sentences + - opus_WMT-News/v2019 # 3,027 sentences + - opus_ECDC/v2016-03-16 # 2,572 sentences + - opus_ELRC-3465-EC_EUROPA_covid/v1 # 2,497 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,497 sentences + - opus_ELRC-3203-antibiotic/v1 # 687 sentences + - opus_ELRC-2744-vaccination/v1 # 518 sentences + - opus_ELRC-vaccination/v1 # 518 sentences + - opus_ELRC-3294-EUROPARL_covid/v1 # 410 sentences + - opus_ELRC-EUROPARL_covid/v1 # 410 sentences + - opus_ELRC-3064-wikipedia_health/v1 # 401 sentences + - opus_ELRC-wikipedia_health/v1 # 401 sentences + - opus_ELRC_2922/v1 # 400 sentences + - opus_ELRC_2923/v1 # 211 sentences + - mtdata_ELRC-eu_publications_medical_v2-1-eng-hun + - mtdata_EU-eac_forms-1-eng-hun # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-hun # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-hun # ~546,644 sentences (61.8 MB) + - mtdata_Lindat-khresmoi_summary_test-2-eng-hun # ~11,808 sentences (1.3 MB) + - mtdata_Neulab-tedtalks_test-1-eng-hun # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-hun # ~1,098,560 sentences (124.1 MB) + - mtdata_Tilde-ema-2016-eng-hun # ~237,326 sentences (26.8 MB) + - mtdata_Tilde-rapid-2016-eng-hun # ~219,863 sentences (24.8 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # 
~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~30,008,843 sentences + mono-trg: + - news-crawl_news.2007 # ~26,548 sentences (3.0M) + - news-crawl_news.2008 # ~1,150,442 sentences (130M) + - news-crawl_news.2009 # ~1,070,796 sentences (121M) + - news-crawl_news.2011 # ~2,743,362 sentences (310M) + - news-crawl_news.2012 # ~2,495,575 sentences (282M) + - news-crawl_news.2013 # ~2,318,584 sentences (262M) + - news-crawl_news.2014 # ~1,876,106 sentences (212M) + - news-crawl_news.2015 # ~1,805,309 sentences (204M) + - news-crawl_news.2016 # ~1,752,212 sentences (198M) + - news-crawl_news.2017 # ~2,061,946 sentences (233M) + - news-crawl_news.2018 # ~1,814,159 sentences (205M) + - news-crawl_news.2019 # ~2,176,991 sentences (246M) + - news-crawl_news.2020 # ~2,238,938 sentences (253M) + - news-crawl_news.2021 # ~1,831,858 sentences (207M) + - news-crawl_news.2022 # ~2,274,336 sentences (257M) + - news-crawl_news.2023 # ~2,371,681 sentences (268M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-id-spring-2024.yml b/configs/en-id-spring-2024.yml new file mode 100644 index 000000000..3b743d12c --- /dev/null +++ b/configs/en-id-spring-2024.yml @@ -0,0 +1,117 @@ +# The initial configuration was generated using: +# task config-generator -- en id --name spring-2024 
+# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: id + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ind + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 102,103,778 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (70,545,705 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-ind - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-ind - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-14-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-15-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-16-eng-ind - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-ind_ID - duplicate with opus + train: + - opus_NLLB/v1 # 70,545,705 sentences + - opus_CCAligned/v1 # 15,700,345 sentences + - opus_OpenSubtitles/v2018 # 9,268,181 sentences + - opus_XLEnt/v1.2 # 4,179,174 sentences + - opus_WikiMatrix/v1 # 1,019,171 sentences + - opus_Tanzil/v1 # 393,552 sentences + - opus_wikimedia/v20230407 # 284,126 sentences + - opus_QED/v2.0a # 274,581 sentences + - opus_TED2020/v1 # 165,059 sentences + - opus_NeuLab-TedTalks/v1 # 95,295 
sentences + - opus_bible-uedin/v1 # 59,363 sentences + - opus_GNOME/v1 # 47,234 sentences + - opus_News-Commentary/v16 # 18,054 sentences + - opus_GlobalVoices/v2018q4 # 16,043 sentences + - opus_KDE4/v2 # 14,782 sentences + - opus_Tatoeba/v2023-04-12 # 10,550 sentences + - opus_tico-19/v2020-10-28 # 3,071 sentences + - opus_ELRC-3049-wikipedia_health/v1 # 2,680 sentences + - opus_ELRC-wikipedia_health/v1 # 2,680 sentences + - opus_ELRC_2922/v1 # 2,679 sentences + - opus_tldr-pages/v2023-08-29 # 1,453 sentences + - mtdata_Neulab-tedtalks_test-1-eng-ind # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + 
early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-lt-spring-2024.yml b/configs/en-lt-spring-2024.yml new file mode 100644 index 000000000..2a88aaa00 --- /dev/null +++ b/configs/en-lt-spring-2024.yml @@ -0,0 +1,192 @@ +# The initial configuration was generated using: +# task config-generator -- en lt --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: lt + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-lit + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt19/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt19 + + # The training data contains: + # 76,643,900 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (23,298,470 sentences) + # - opus_ELRC-3069-wikipedia_health/v1 - not enough data (136 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (136 sentences) + # - opus_ELRC_2922/v1 - not enough data (135 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-president_lithuania-1-eng-lit - duplicate with opus + # - mtdata_ELRC-www.lrs.lt-1-eng-lit - duplicate with opus + # - mtdata_ELRC-www.lb.lt-1-eng-lit - duplicate with opus + # - 
mtdata_ELRC-kam.lt-1-eng-lit - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-lit - duplicate with opus + # - mtdata_ELRC-emea-1-eng-lit - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-lit - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-lit - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-lit - duplicate with opus + # - mtdata_EU-ecdc-1-eng-lit - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-lit - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-3-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-lit - duplicate with opus + # - mtdata_Statmt-europarl-9-lit-eng - duplicate with opus + # - mtdata_Statmt-europarl-7-lit-eng - duplicate with opus + # - mtdata_Statmt-europarl-10-lit-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-lit_LT - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-lit - duplicate with opus + train: + - opus_NLLB/v1 # 23,298,470 sentences + - opus_ParaCrawl/v9 # 13,192,237 sentences + - opus_ELRC-EMEA/v1 # 11,487,359 sentences + - opus_ELRC-4270-NTEU_TierA/v1 # 8,061,918 sentences + - opus_CCAligned/v1 # 5,215,271 sentences + - opus_DGT/v2019 # 5,061,918 sentences + - opus_TildeMODEL/v2018 # 2,084,002 sentences + - opus_XLEnt/v1.2 # 1,642,943 sentences + - opus_OpenSubtitles/v2018 # 1,415,961 sentences + - opus_EMEA/v3 # 1,042,425 sentences + - opus_JRC-Acquis/v3.0 # 790,475 sentences + - opus_ELRC-2717-EMEA/v1 # 764,031 sentences + - 
opus_Europarl/v8 # 634,284 sentences + - opus_EUbookshop/v2 # 445,813 sentences + - opus_ELRC-5067-SciPar/v1 # 177,437 sentences + - opus_WikiMatrix/v1 # 157,526 sentences + - opus_ELITR-ECA/v1 # 147,678 sentences + - opus_ELRC-425-Lithuanian_legislati/v1 # 130,549 sentences + - opus_ELRC-presscorner_covid/v1 # 117,054 sentences + - opus_KDE4/v2 # 104,044 sentences + - opus_QED/v2.0a # 85,435 sentences + - opus_TED2020/v1 # 75,484 sentences + - opus_ECB/v1 # 69,805 sentences + - opus_bible-uedin/v1 # 62,187 sentences + - opus_GNOME/v1 # 59,776 sentences + - opus_NeuLab-TedTalks/v1 # 45,963 sentences + - opus_ELRC-591-www.lb.lt/v1 # 33,261 sentences + - opus_ELRC-3568-EUR_LEX_covid/v1 # 21,390 sentences + - opus_ELRC-EUR_LEX/v1 # 21,390 sentences + - opus_ELRC-405-President_Lithuania/v1 # 21,225 sentences + - opus_ELRA-W0160/v1 # 21,224 sentences + - opus_ELRC-2021-EUIPO_2017/v1 # 17,133 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,133 sentences + - opus_wikimedia/v20230407 # 14,454 sentences + - opus_ELRC-EUROPARL_covid/v1 # 13,851 sentences + - opus_ELRC-antibiotic/v1 # 12,602 sentences + - opus_ELRC-2878-EU_publications_medi/v1 # 12,581 sentences + - opus_ELRC-EU_publications/v1 # 12,581 sentences + - opus_EUconst/v1 # 10,171 sentences + - opus_ELRC-592-kam.lt/v1 # 8,531 sentences + - opus_Tatoeba/v2023-04-12 # 8,236 sentences + - opus_ELRC-3609-presscorner_covid/v1 # 6,462 sentences + - opus_WMT-News/v2019 # 5,998 sentences + - opus_ELRC_3382/v1 # 3,587 sentences + - opus_ECDC/v2016-03-16 # 2,546 sentences + - opus_ELRC-3467-EC_EUROPA_covid/v1 # 2,438 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,438 sentences + - opus_ELRC-590-www.lrs.lt/v1 # 1,771 sentences + - opus_ELRC-3205-antibiotic/v1 # 823 sentences + - opus_ELRC-3296-EUROPARL_covid/v1 # 553 sentences + - opus_ELRC-2740-vaccination/v1 # 546 sentences + - opus_ELRC-vaccination/v1 # 546 sentences + - opus_ELRC_2923/v1 # 384 sentences + - mtdata_ELRC-lithuanian_legislation_seimas_lithuania-1-eng-lit + - 
mtdata_ELRC-eu_publications_medical_v2-1-eng-lit + - mtdata_ELRC-wikipedia_health-1-eng-lit + - mtdata_ELRC-nteu_tierb-1-eng-lit + - mtdata_EU-eac_forms-1-eng-lit # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-lit # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-lit # ~510,025 sentences (57.6 MB) + - mtdata_Neulab-tedtalks_test-1-eng-lit # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-wiki_titles-1-lit-eng # ~15,267 sentences (1.7 MB) + - mtdata_Statmt-newsdev_enlt-2019-eng-lit # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_lten-2019-lit-eng # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-lit # ~1,149,015 sentences (129.8 MB) + - mtdata_Tilde-ema-2016-eng-lit # ~228,287 sentences (25.8 MB) + - mtdata_Tilde-airbaltic-1-eng-lit # ~962 sentences (108.7 kB) + - mtdata_Tilde-rapid-2016-eng-lit # ~180,798 sentences (20.4 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~5,442,476 sentences + mono-trg: + - news-crawl_news.2019 
# ~1,079,646 sentences (122M) + - news-crawl_news.2020 # ~1,088,495 sentences (123M) + - news-crawl_news.2021 # ~1,008,849 sentences (114M) + - news-crawl_news.2022 # ~1,079,646 sentences (122M) + - news-crawl_news.2023 # ~1,185,840 sentences (134M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-lv-spring-2024.yml b/configs/en-lv-spring-2024.yml new file mode 100644 index 000000000..6585c4ba8 --- /dev/null +++ b/configs/en-lv-spring-2024.yml @@ -0,0 +1,194 @@ +# The initial configuration was generated using: +# task config-generator -- en lv --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: lv + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt17/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt17 + + # The training data contains: + # 68,374,368 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - 
ignored datasets (16,685,969 sentences) + # - opus_ELRC-3089-wikipedia_health/v1 - not enough data (143 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (143 sentences) + # - opus_ELRC_2922/v1 - not enough data (142 sentences) + # - opus_ELRA-W0308/v1 - not enough data (108 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (84 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (23 sentences) + # - opus_ELRA-W0301/v1 - not enough data (20 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-mfa_latvia-1-eng-lav - duplicate with opus + # - mtdata_ELRC-state_latvian-1-eng-lav - duplicate with opus + # - mtdata_ELRC-www.visitestonia.com-1-eng-lav - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-lav - duplicate with opus + # - mtdata_ELRC-emea-1-eng-lav - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-lav - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-lav - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-covid19.gov.lv-1-eng-lav - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-lav - duplicate with opus + # - mtdata_EU-ecdc-1-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-lav - duplicate with opus + # - mtdata_Statmt-europarl-7-lav-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-lav_LV - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-lav - duplicate with opus + train: + - opus_NLLB/v1 # 16,685,969 sentences + - opus_ParaCrawl/v9 # 13,064,066 sentences + - 
opus_ELRC-EMEA/v1 # 11,795,507 sentences + - opus_ELRC-4269-NTEU_TierA/v1 # 8,072,484 sentences + - opus_DGT/v2019 # 5,072,124 sentences + - opus_CCAligned/v1 # 4,850,972 sentences + - opus_TildeMODEL/v2018 # 2,111,785 sentences + - opus_XLEnt/v1.2 # 1,295,887 sentences + - opus_EMEA/v3 # 1,030,272 sentences + - opus_JRC-Acquis/v3.0 # 793,589 sentences + - opus_ELRC-2729-EMEA/v1 # 783,490 sentences + - opus_Europarl/v8 # 639,318 sentences + - opus_OpenSubtitles/v2018 # 519,553 sentences + - opus_EUbookshop/v2 # 445,891 sentences + - opus_ELRC-5067-SciPar/v1 # 347,473 sentences + - opus_ELRC-presscorner_covid/v1 # 128,895 sentences + - opus_KDE4/v2 # 91,386 sentences + - opus_QED/v2.0a # 72,447 sentences + - opus_ECB/v1 # 65,374 sentences + - opus_ELITR-ECA/v1 # 64,115 sentences + - opus_TED2020/v1 # 55,488 sentences + - opus_ELRC-399-International_Agreem/v1 # 40,897 sentences + - opus_ELRA-W0158/v1 # 40,896 sentences + - opus_ELRC-3578-EUR_LEX_covid/v1 # 22,476 sentences + - opus_ELRC-EUR_LEX/v1 # 22,476 sentences + - opus_wikimedia/v20230407 # 21,295 sentences + - opus_ELRC-EUROPARL_covid/v1 # 17,831 sentences + - opus_ELRC-2022-EUIPO_2017/v1 # 17,255 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,255 sentences + - opus_bible-uedin/v1 # 15,885 sentences + - opus_ELRC-1130-www.visitestonia.com/v1 # 13,841 sentences + - opus_ELRC-www.visitestonia.com/v1 # 13,841 sentences + - opus_ELRC-2888-EU_publications_medi/v1 # 13,045 sentences + - opus_ELRC-EU_publications/v1 # 13,045 sentences + - opus_ELRC-antibiotic/v1 # 12,048 sentences + - opus_ELRC-715-Finance_Economics_Ba/v1 # 11,600 sentences + - opus_ELRA-W0216/v1 # 11,599 sentences + - opus_GNOME/v1 # 11,265 sentences + - opus_EUconst/v1 # 10,036 sentences + - opus_WMT-News/v2019 # 8,008 sentences + - opus_ELRC-402-MFA_Latvia/v1 # 7,195 sentences + - opus_ELRA-W0159/v1 # 7,194 sentences + - opus_ELRC-433-State_Latvian/v1 # 6,862 sentences + - opus_ELRA-W0169/v1 # 6,861 sentences + - opus_ELRC-3619-presscorner_covid/v1 # 
6,686 sentences + - opus_ELRC_3382/v1 # 3,737 sentences + - opus_ECDC/v2016-03-16 # 2,543 sentences + - opus_ELRC-3477-EC_EUROPA_covid/v1 # 2,407 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,407 sentences + - opus_ELRC-4994-Latvian_Financial_MT/v1 # 2,002 sentences + - opus_Tatoeba/v2023-04-12 # 1,814 sentences + - opus_ELRC-3453-covid19.gov.lv/v1 # 826 sentences + - opus_ELRC-3217-antibiotic/v1 # 809 sentences + - opus_ELRC-3306-EUROPARL_covid/v1 # 724 sentences + - opus_ELRC_2923/v1 # 580 sentences + - opus_ELRC-2741-vaccination/v1 # 521 sentences + - opus_ELRC-vaccination/v1 # 521 sentences + - mtdata_ELRC-international_agreements-1-eng-lav + - mtdata_ELRC-rights_arrested-1-eng-lav + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-eng-lav + - mtdata_ELRC-finance_economics_bank_latvia-1-eng-lav + - mtdata_ELRC-eu_publications_medical_v2-1-eng-lav + - mtdata_ELRC-wikipedia_health-1-eng-lav + - mtdata_ELRC-nteu_tierb-1-eng-lav + - mtdata_EU-eac_forms-1-eng-lav # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-lav # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-lav # ~524,054 sentences (59.2 MB) + - mtdata_Statmt-newsdev_lven-2017-lav-eng # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_enlv-2017-eng-lav # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-lav # ~1,122,956 sentences (126.9 MB) + - mtdata_Tilde-ema-2016-eng-lav # ~231,439 sentences (26.2 MB) + - mtdata_Tilde-airbaltic-1-eng-lav # ~1,050 sentences (118.7 kB) + - mtdata_Tilde-fold-1-eng-lav # ~10,070 sentences (1.1 MB) + - mtdata_Tilde-rapid-2016-eng-lav # ~198,906 sentences (22.5 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # 
~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~3,283,185 sentences + mono-trg: + - news-crawl_news.2015 # ~1,274,336 sentences (144M) + - news-crawl_news.2016 # ~1,017,699 sentences (115M) + - news-crawl_news.2017 # ~991,150 sentences (112M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-ro-spring-2024.yml b/configs/en-ro-spring-2024.yml new file mode 100644 index 000000000..553250c0b --- /dev/null +++ b/configs/en-ro-spring-2024.yml @@ -0,0 +1,219 @@ +# The initial configuration was generated using: +# task config-generator -- en ro --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: ro + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + 
dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ron + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt16/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt16 + + # The training data contains: + # 174,698,415 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (55,607,023 sentences) + # - opus_ELRA-W0308/v1 - not enough data (92 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (77 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (24 sentences) + # - opus_ELRA-W0301/v1 - not enough data (21 sentences) + # - opus_tldr-pages/v2023-08-29 - not enough data (9 sentences) + # - opus_ELRC-417-Swedish_Work_Environ/v1 - not enough data (8 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-romanian_literature-1-eng-ron - duplicate with opus + # - mtdata_ELRC-romanian_wikipedia-1-eng-ron - duplicate with opus + # - mtdata_ELRC-romanian_news-1-eng-ron - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir_spos-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir_newsletter-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir-1-eng-ron - duplicate with opus + # - mtdata_ELRC-emea-1-eng-ron - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-ron - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-ron - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-ron - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-ron - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-ron - duplicate with 
opus + # - mtdata_ELRC-eur_lex_covid-1-eng-ron - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-ron - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ron - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-ron - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-ron - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ron - duplicate with opus + # - mtdata_Statmt-europarl-7-ron-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-ron_RO - duplicate with opus + train: + - opus_NLLB/v1 # 55,607,023 sentences + - opus_OpenSubtitles/v2018 # 50,693,226 sentences + - opus_ParaCrawl/v9 # 25,048,962 sentences + - opus_ELRC-EMEA/v1 # 13,648,577 sentences + - opus_CCAligned/v1 # 10,525,602 sentences + - opus_DGT/v2019 # 3,541,661 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,421,073 sentences + - opus_XLEnt/v1.2 # 3,337,016 sentences + - opus_TildeMODEL/v2018 # 1,925,419 sentences + - opus_EMEA/v3 # 994,499 sentences + - opus_ELRC-2728-EMEA/v1 # 783,742 sentences + - opus_WikiMatrix/v1 # 631,486 sentences + - opus_JRC-Acquis/v3.0 # 455,171 sentences + - opus_QED/v2.0a # 438,832 sentences + - opus_Europarl/v8 # 400,356 sentences + - opus_Wikipedia/v1.0 # 360,499 sentences + - opus_TED2020/v1 # 328,491 sentences + - opus_EUbookshop/v2 # 324,553 sentences + - opus_wikimedia/v20230407 # 323,049 sentences + - opus_SETIMES/v2 # 213,047 sentences + - opus_NeuLab-TedTalks/v1 # 196,122 sentences + - opus_TED2013/v1.1 # 158,483 sentences + - opus_ELRC-presscorner_covid/v1 # 153,650 sentences + - opus_Tanzil/v1 # 136,175 sentences + - opus_ELRC-492-Romanian_Wikipedia/v1 # 132,230 sentences + - opus_ELRA-W0193/v1 # 132,229 sentences + - opus_KDE4/v2 # 114,741 sentences 
+ - opus_ELRC-493-Romanian_news/v1 # 98,099 sentences + - opus_ELRA-W0194/v1 # 98,098 sentences + - opus_ELITR-ECA/v1 # 92,826 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_PHP/v1 # 30,391 sentences + - opus_GNOME/v1 # 25,419 sentences + - opus_ELRC-3577-EUR_LEX_covid/v1 # 23,183 sentences + - opus_ELRC-EUR_LEX/v1 # 23,183 sentences + - opus_ELRC-1177-EUIPO_2017/v1 # 20,298 sentences + - opus_ELRC-EUIPO_2017/v1 # 20,298 sentences + - opus_Tatoeba/v2023-04-12 # 16,308 sentences + - opus_ELRC-wikipedia_health/v1 # 13,252 sentences + - opus_ELRC-2887-EU_publications_medi/v1 # 13,164 sentences + - opus_ELRC-EU_publications/v1 # 13,164 sentences + - opus_ELRC-930-studies_reports_stat/v1 # 12,043 sentences + - opus_ELRA-W0270/v1 # 12,042 sentences + - opus_ELRC-EUROPARL_covid/v1 # 10,906 sentences + - opus_WMT-News/v2019 # 7,996 sentences + - opus_ELRC-3618-presscorner_covid/v1 # 6,715 sentences + - opus_ELRC-435-Romanian_New_Crimina/v1 # 6,496 sentences + - opus_ELRA-W0170/v1 # 6,495 sentences + - opus_ELRC-491-Romanian_literature/v1 # 5,281 sentences + - opus_ELRA-W0192/v1 # 5,280 sentences + - opus_ELRC-1819-EIR/v1 # 4,994 sentences + - opus_GlobalVoices/v2018q4 # 4,454 sentences + - opus_ELRC-1992-Rural_Development_Pr/v1 # 4,186 sentences + - opus_ELRC-Rural_Development/v1 # 4,186 sentences + - opus_ELRC-654-Romanian_Ombudsman_a/v1 # 4,148 sentences + - opus_ELRA-W0206/v1 # 4,147 sentences + - opus_ELRC-1815-EIR_Newsletter/v1 # 3,788 sentences + - opus_ELRC_3382/v1 # 3,674 sentences + - opus_ELRC-1814-EIR_SPOS/v1 # 3,248 sentences + - opus_ECDC/v2016-03-16 # 2,556 sentences + - opus_ELRC-3476-EC_EUROPA_covid/v1 # 2,338 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,338 sentences + - opus_ELRC-3216-antibiotic/v1 # 1,035 sentences + - opus_ELRC-antibiotic/v1 # 1,035 sentences + - opus_ELRC-3087-wikipedia_health/v1 # 693 sentences + - opus_ELRC_2922/v1 # 692 sentences + - opus_ELRC-3305-EUROPARL_covid/v1 # 546 sentences + - opus_ELRC-2750-vaccination/v1 # 496 
sentences + - opus_ELRC-vaccination/v1 # 496 sentences + - opus_ELRC_2923/v1 # 319 sentences + - mtdata_ELRC-rights_arrested-1-eng-ron + - mtdata_ELRC-swedish_work_environment-1-eng-ron + - mtdata_ELRC-romanian_new_criminal_procedure_code-1-eng-ron + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-eng-ron + - mtdata_ELRC-romanian_ombudsman_archive-1-eng-ron + - mtdata_ELRC-studies_reports_statistical_culture_institute_cultural_research_training-1-eng-ron + - mtdata_ELRC-rural_development_programme_romania-1-eng-ron + - mtdata_ELRC-eu_publications_medical_v2-1-eng-ron + - mtdata_EU-eac_forms-1-eng-ron # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-ron # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-ron # ~389,297 sentences (44.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ron # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-newsdev_enro-2016-eng-ron # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_roen-2016-ron-eng # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-ron # ~1,026,056 sentences (115.9 MB) + - mtdata_Tilde-ema-2016-eng-ron # ~229,130 sentences (25.9 MB) + - mtdata_Tilde-ecb-2017-eng-ron # ~1,778 sentences (200.9 kB) + - mtdata_Tilde-rapid-2016-eng-ron # ~196,150 sentences (22.2 MB) + - mtdata_Tilde-worldbank-1-eng-ron # ~6,413 sentences (724.7 kB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # 
~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~24,920,348 sentences + mono-trg: + - news-crawl_news.2015 # ~1,088,495 sentences (123M) + - news-crawl_news.2016 # ~2,061,946 sentences (233M) + - news-crawl_news.2017 # ~2,247,787 sentences (254M) + - news-crawl_news.2018 # ~1,345,132 sentences (152M) + - news-crawl_news.2019 # ~3,283,185 sentences (371M) + - news-crawl_news.2020 # ~3,982,300 sentences (450M) + - news-crawl_news.2021 # ~3,353,982 sentences (379M) + - news-crawl_news.2022 # ~3,831,858 sentences (433M) + - news-crawl_news.2023 # ~3,725,663 sentences (421M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-ru-spring-2024.yml b/configs/en-ru-spring-2024.yml new file mode 100644 index 000000000..1f4a55075 --- /dev/null +++ b/configs/en-ru-spring-2024.yml @@ -0,0 +1,174 @@ +# The initial configuration was generated using: +# task config-generator -- en ru --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: ru + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: 
defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-rus + - mtdata_UN-un_dev-1-eng-rus + - flores_aug-mix_dev + - sacrebleu_aug-mix_mtedx/test + - sacrebleu_aug-mix_wmt20 + - sacrebleu_aug-mix_wmt18 + - sacrebleu_aug-mix_wmt17 + - sacrebleu_aug-mix_wmt15 + - sacrebleu_aug-mix_wmt14/full + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_mtedx/valid + - sacrebleu_wmt20/tworefs + - sacrebleu_wmt19 + - sacrebleu_wmt18/test-ts + - sacrebleu_wmt16 + - sacrebleu_wmt14 + - sacrebleu_wmt13 + + # The training data contains: + # 250,111,081 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (139,937,785 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_ELRC-3855-SWPS_University_Soci/v1 - not enough data (109 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_WikiTitles/v3 - ignored datasets (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-rus - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-rus - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-rus - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-rus - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-1_bonus-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-14-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-15-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-16-eng-rus - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-rus_RU - duplicate with opus + train: + - opus_NLLB/v1 # 139,937,785 sentences 
+ - opus_OpenSubtitles/v2018 # 25,910,105 sentences + - opus_UNPC/v1.0 # 25,173,398 sentences + - opus_CCAligned/v1 # 13,850,305 sentences + - opus_LinguaTools-WikiTitles/v2014 # 13,565,182 sentences + - opus_MultiUN/v1 # 11,654,416 sentences + - opus_XLEnt/v1.2 # 7,890,088 sentences + - opus_ParaCrawl/v9 # 5,378,016 sentences + - opus_WikiMatrix/v1 # 1,661,909 sentences + - opus_Tanzil/v1 # 1,067,840 sentences + - opus_Wikipedia/v1.0 # 572,717 sentences + - opus_QED/v2.0a # 563,700 sentences + - opus_wikimedia/v20230407 # 541,583 sentences + - opus_Tatoeba/v2023-04-12 # 540,675 sentences + - opus_TED2020/v1 # 390,015 sentences + - opus_News-Commentary/v16 # 265,809 sentences + - opus_NeuLab-TedTalks/v1 # 221,999 sentences + - opus_KDE4/v2 # 180,793 sentences + - opus_GlobalVoices/v2018q4 # 170,351 sentences + - opus_TED2013/v1.1 # 133,660 sentences + - opus_ELRC-5183-SciPar_Ukraine/v1 # 126,585 sentences + - opus_infopankki/v1 # 75,305 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_EUbookshop/v2 # 49,830 sentences + - opus_WMT-News/v2019 # 36,637 sentences + - opus_PHP/v1 # 30,064 sentences + - opus_Books/v1 # 17,496 sentences + - opus_TildeMODEL/v2018 # 10,977 sentences + - opus_MDN_Web_Docs/v2023-09-25 # 8,134 sentences + - opus_ada83/v1 # 4,122 sentences + - opus_ELRC-3075-wikipedia_health/v1 # 4,073 sentences + - opus_ELRC-wikipedia_health/v1 # 4,073 sentences + - opus_ELRC_2922/v1 # 4,072 sentences + - opus_tico-19/v2020-10-28 # 3,071 sentences + - opus_ELRC-5067-SciPar/v1 # 3,064 sentences + - opus_tldr-pages/v2023-08-29 # 1,037 sentences + - mtdata_Neulab-tedtalks_test-1-eng-rus # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-commoncrawl_wmt13-1-rus-eng # ~8,126,649 sentences (918.3 MB) + - mtdata_Statmt-news_commentary_wmt18-13-rus-eng # ~1,001,393 sentences (113.2 MB) + - mtdata_Statmt-wiki_titles-1-rus-eng # ~179,637 sentences (20.3 MB) + - mtdata_Statmt-wiki_titles-2-rus-eng # ~193,345 sentences (21.8 MB) + - 
mtdata_Tilde-airbaltic-1-eng-rus # ~1,288 sentences (145.6 kB) + - mtdata_Tilde-czechtourism-1-eng-rus # ~7,561 sentences (854.5 kB) + - mtdata_Tilde-worldbank-1-eng-rus # ~33,049 sentences (3.7 MB) + - mtdata_UN-un_test-1-eng-rus + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~90,385,836 sentences + mono-trg: + - news-crawl_news.2008 # ~19,469 sentences (2.2M) + - news-crawl_news.2009 # ~47,787 sentences (5.4M) + - news-crawl_news.2011 # ~4,876,106 sentences (551M) + - news-crawl_news.2012 # ~5,079,646 sentences (574M) + - news-crawl_news.2013 # ~7,327,433 sentences (828M) + - news-crawl_news.2014 # ~6,194,690 sentences (700M) + - news-crawl_news.2015 # ~5,433,628 sentences (614M) + - news-crawl_news.2016 # ~3,716,814 sentences (420M) + - news-crawl_news.2017 # ~4,451,327 sentences (503M) + - news-crawl_news.2018 # ~4,539,823 sentences (513M) + - news-crawl_news.2019 # ~6,955,752 sentences (786M) + - news-crawl_news.2020 # ~8,849,557 sentences (1.0G) + - 
news-crawl_news.2021 # ~8,115,044 sentences (917M) + - news-crawl_news.2022 # ~8,849,557 sentences (1.0G) + - news-crawl_news.2023 # ~15,929,203 sentences (1.8G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-sk-spring-2024.yml b/configs/en-sk-spring-2024.yml new file mode 100644 index 000000000..2ee3a729d --- /dev/null +++ b/configs/en-sk-spring-2024.yml @@ -0,0 +1,180 @@ +# The initial configuration was generated using: +# task config-generator -- en sk --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: sk + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-slk + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 111,168,672 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (38,096,241 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - 
opus_ELRC-3076-wikipedia_health/v1 - not enough data (134 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (134 sentences) + # - opus_ELRC_2922/v1 - not enough data (133 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-culture_slovak-1-eng-slk - duplicate with opus + # - mtdata_ELRC-justice_slovak-1-eng-slk - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-slk - duplicate with opus + # - mtdata_ELRC-emea-1-eng-slk - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-slk - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-slk - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-slk - duplicate with opus + # - mtdata_EU-ecdc-1-eng-slk - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-slk - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-slk - duplicate with opus + # - mtdata_Statmt-europarl-7-slk-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-slk_SK - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-slk - duplicate with opus + train: + - opus_NLLB/v1 # 38,096,241 sentences + - opus_ParaCrawl/v9 # 22,902,149 sentences + - opus_ELRC-EMEA/v1 # 12,032,452 sentences + - opus_OpenSubtitles/v2018 # 8,850,871 sentences + - opus_ELRC-4154-NTEU_TierA/v1 # 7,922,512 sentences + - opus_CCAligned/v1 # 6,938,181 sentences + - opus_DGT/v2019 # 5,118,830 sentences + - opus_XLEnt/v1.2 # 2,594,162 sentences + - opus_TildeMODEL/v2018 # 2,190,889 sentences + 
- opus_EMEA/v3 # 1,054,178 sentences + - opus_ELRC-2721-EMEA/v1 # 780,098 sentences + - opus_Europarl/v8 # 639,958 sentences + - opus_EUbookshop/v2 # 452,097 sentences + - opus_ELITR-ECA/v1 # 294,356 sentences + - opus_WikiMatrix/v1 # 178,985 sentences + - opus_QED/v2.0a # 173,727 sentences + - opus_ELRC-presscorner_covid/v1 # 142,656 sentences + - opus_ECB/v1 # 122,131 sentences + - opus_TED2020/v1 # 106,067 sentences + - opus_KDE4/v2 # 105,425 sentences + - opus_NeuLab-TedTalks/v1 # 67,607 sentences + - opus_bible-uedin/v1 # 62,159 sentences + - opus_ELRC-5067-SciPar/v1 # 60,468 sentences + - opus_JRC-Acquis/v3.0 # 35,744 sentences + - opus_PHP/v1 # 31,173 sentences + - opus_ELRC-3570-EUR_LEX_covid/v1 # 22,479 sentences + - opus_ELRC-EUR_LEX/v1 # 22,479 sentences + - opus_wikimedia/v20230407 # 18,819 sentences + - opus_ELRC-EUROPARL_covid/v1 # 16,369 sentences + - opus_ELRC-1179-EUIPO_2017/v1 # 16,313 sentences + - opus_ELRC-EUIPO_2017/v1 # 16,313 sentences + - opus_ELRC-2880-EU_publications_medi/v1 # 12,927 sentences + - opus_ELRC-EU_publications/v1 # 12,927 sentences + - opus_EUconst/v1 # 10,119 sentences + - opus_ELRC-1072-annual_reports_immig/v1 # 8,041 sentences + - opus_ELRA-W0136/v1 # 8,040 sentences + - opus_ELRC-3611-presscorner_covid/v1 # 6,982 sentences + - opus_ELRC-1073-annual_reports_Slova/v1 # 6,008 sentences + - opus_ELRA-W0137/v1 # 6,007 sentences + - opus_ELRC-1074-annual_reports_Stati/v1 # 5,614 sentences + - opus_ELRC_3382/v1 # 3,624 sentences + - opus_ELRC-488-Justice_Slovak/v1 # 2,896 sentences + - opus_ELRA-W0189/v1 # 2,895 sentences + - opus_ELRC-487-Culture_Slovak/v1 # 2,610 sentences + - opus_ELRA-W0188/v1 # 2,609 sentences + - opus_ECDC/v2016-03-16 # 2,526 sentences + - opus_ELRC-3469-EC_EUROPA_covid/v1 # 2,409 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,409 sentences + - opus_ELRC-3209-antibiotic/v1 # 1,010 sentences + - opus_ELRC-antibiotic/v1 # 1,010 sentences + - opus_ELRC-3298-EUROPARL_covid/v1 # 653 sentences + - 
opus_ELRC-2745-vaccination/v1 # 510 sentences + - opus_ELRC-vaccination/v1 # 510 sentences + - opus_ELRC_2923/v1 # 448 sentences + - mtdata_ELRC-annual_reports_immigration_asylum_policies_emn_contact_point_slovak-1-eng-slk + - mtdata_ELRC-annual_reports_slovak_centre_human_rights-1-eng-slk + - mtdata_ELRC-annual_reports_statistical_slovak-1-eng-slk + - mtdata_ELRC-eu_publications_medical_v2-1-eng-slk + - mtdata_ELRC-wikipedia_health-1-eng-slk + - mtdata_ELRC-nteu_tierb-1-eng-slk + - mtdata_EU-eac_forms-1-eng-slk # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-slk # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-slk # ~548,757 sentences (62.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-slk # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-slk # ~1,269,685 sentences (143.5 MB) + - mtdata_Tilde-ema-2016-eng-slk # ~238,237 sentences (26.9 MB) + - mtdata_Tilde-rapid-2016-eng-slk # ~214,164 sentences (24.2 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data 
contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-sl-spring-2024.yml b/configs/en-sl-spring-2024.yml new file mode 100644 index 000000000..26db44623 --- /dev/null +++ b/configs/en-sl-spring-2024.yml @@ -0,0 +1,182 @@ +# The initial configuration was generated using: +# task config-generator -- en sl --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: sl + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-slv + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 89,057,699 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (27,406,782 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (1,875,517 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-euipo_2017-1-eng-slv - duplicate with 
opus + # - mtdata_ELRC-emea-1-eng-slv - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-slv - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-slv - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-slv - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-slv - duplicate with opus + # - mtdata_EU-ecdc-1-eng-slv - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-slv - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-slv - duplicate with opus + # - mtdata_Statmt-europarl-7-slv-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-slv_SI - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-slv - duplicate with opus + train: + - opus_NLLB/v1 # 27,406,782 sentences + - opus_OpenSubtitles/v2018 # 19,641,457 sentences + - opus_ELRC-EMEA/v1 # 13,288,992 sentences + - opus_ParaCrawl/v9 # 9,516,259 sentences + - opus_DGT/v2019 # 5,125,455 sentences + - opus_CCAligned/v1 # 4,366,555 sentences + - opus_TildeMODEL/v2018 # 2,048,216 sentences + - opus_MaCoCu/v2 # 1,875,518 sentences + - opus_EMEA/v3 # 1,045,041 sentences + - opus_XLEnt/v1.2 # 861,509 sentences + - opus_ELRC-2727-EMEA/v1 # 766,139 sentences + - opus_Europarl/v8 # 624,803 sentences + - opus_EUbookshop/v2 # 405,653 sentences + - opus_WikiMatrix/v1 # 318,028 sentences + - opus_ELRC-5067-SciPar/v1 # 300,017 sentences + - opus_ELRC-presscorner_covid/v1 # 150,090 sentences + - opus_Wikipedia/v1.0 # 140,124 sentences + - opus_KDE4/v2 # 119,645 sentences + - opus_ELITR-ECA/v1 # 102,934 sentences + - 
opus_ECB/v1 # 89,634 sentences + - opus_QED/v2.0a # 79,196 sentences + - opus_wikimedia/v20230407 # 73,612 sentences + - opus_ELRC-490-Secretariat_General_/v1 # 63,070 sentences + - opus_ELRA-W0191/v1 # 63,069 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_ELRC-489-Secretariat_General_/v1 # 55,185 sentences + - opus_ELRA-W0190/v1 # 55,184 sentences + - opus_JRC-Acquis/v3.0 # 53,390 sentences + - opus_TED2020/v1 # 44,340 sentences + - opus_PHP/v1 # 30,550 sentences + - opus_NeuLab-TedTalks/v1 # 22,856 sentences + - opus_ELRC-3576-EUR_LEX_covid/v1 # 22,381 sentences + - opus_ELRC-EUR_LEX/v1 # 22,381 sentences + - opus_ELRC-1180-EUIPO_2017/v1 # 19,767 sentences + - opus_ELRC-EUIPO_2017/v1 # 19,767 sentences + - opus_ELRC-wikipedia_health/v1 # 17,519 sentences + - opus_ELRC-antibiotic/v1 # 16,166 sentences + - opus_ELRC-EUROPARL_covid/v1 # 15,269 sentences + - opus_TED2013/v1.1 # 14,960 sentences + - opus_ELRC-2886-EU_publications_medi/v1 # 13,209 sentences + - opus_ELRC-EU_publications/v1 # 13,209 sentences + - opus_ELRC-924-statistical_reports_/v1 # 11,860 sentences + - opus_ELRC-statistical_reports/v1 # 11,860 sentences + - opus_ELRA-W0267/v1 # 11,859 sentences + - opus_EUconst/v1 # 8,807 sentences + - opus_GNOME/v1 # 8,070 sentences + - opus_ELRC-3617-presscorner_covid/v1 # 6,995 sentences + - opus_ELRC-923-chapters_Youth_2010/v1 # 5,866 sentences + - opus_Tatoeba/v2023-04-12 # 4,302 sentences + - opus_ELRC_3382/v1 # 3,633 sentences + - opus_ECDC/v2016-03-16 # 2,546 sentences + - opus_ELRC-3475-EC_EUROPA_covid/v1 # 2,534 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,534 sentences + - opus_ELRC-2328-Agriculture_Forestry/v1 # 1,551 sentences + - opus_ELRC-3215-antibiotic/v1 # 986 sentences + - opus_ELRC-3085-wikipedia_health/v1 # 960 sentences + - opus_ELRC_2922/v1 # 959 sentences + - opus_ELRC-3304-EUROPARL_covid/v1 # 816 sentences + - opus_ELRC-2737-vaccination/v1 # 492 sentences + - opus_ELRC-vaccination/v1 # 492 sentences + - opus_ELRC_2923/v1 # 451 
sentences + - mtdata_ELRC-secretariat_general_part1-1-eng-slv + - mtdata_ELRC-secretariat_general_part2-1-eng-slv + - mtdata_ELRC-chapters_youth_2010_social_profile_young_people_slovenia_publication-1-eng-slv + - mtdata_ELRC-statistical_reports_statistical_slovenia-1-eng-slv + - mtdata_ELRC-agriculture_forestry_food_slovenia-1-eng-slv + - mtdata_ELRC-eu_publications_medical_v2-1-eng-slv + - mtdata_EU-eac_forms-1-eng-slv # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-slv # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-slv # ~539,490 sentences (61.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-slv # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-slv # ~1,116,707 sentences (126.2 MB) + - mtdata_Tilde-ema-2016-eng-slv # ~223,681 sentences (25.3 MB) + - mtdata_Tilde-rapid-2016-eng-slv # ~203,695 sentences (23.0 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' 
+ mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-sr-spring-2024.yml b/configs/en-sr-spring-2024.yml new file mode 100644 index 000000000..6de158fae --- /dev/null +++ b/configs/en-sr-spring-2024.yml @@ -0,0 +1,131 @@ +# The initial configuration was generated using: +# task config-generator -- en sr --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: sr + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-srp + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 78,565,711 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (26,510,872 sentences) + # - opus_MultiHPLT/v1.1 - ignored datasets (3,904,384 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (114 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (113 sentences) + 
# - opus_tldr-pages/v2023-08-29 - not enough data (26 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-srp - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-srp - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-srp - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-srp_RS - duplicate with opus + train: + - opus_OpenSubtitles/v2018 # 42,635,098 sentences + - opus_NLLB/v1 # 26,510,872 sentences + - opus_HPLT/v1.1 # 3,904,423 sentences + - opus_CCAligned/v1 # 1,992,692 sentences + - opus_XLEnt/v1.2 # 1,474,447 sentences + - opus_WikiMatrix/v1 # 395,569 sentences + - opus_GoURMET/v1 # 329,004 sentences + - opus_QED/v2.0a # 284,942 sentences + - opus_TED2020/v1 # 260,966 sentences + - opus_SETIMES/v2 # 225,169 sentences + - opus_wikimedia/v20230407 # 217,199 sentences + - opus_NeuLab-TedTalks/v1 # 152,477 sentences + - opus_bible-uedin/v1 # 62,131 sentences + - opus_KDE4/v2 # 60,827 sentences + - opus_Tatoeba/v2023-04-12 # 21,760 sentences + - opus_GlobalVoices/v2018q4 # 20,309 sentences + - opus_ELRC-wikipedia_health/v1 # 12,707 sentences + - opus_TildeMODEL/v2018 # 2,024 sentences + - opus_EUbookshop/v2 # 1,608 sentences + - opus_ELRC-3041-wikipedia_health/v1 # 744 sentences + - opus_ELRC_2922/v1 # 743 sentences + - mtdata_ELRC-swedish_social_security-1-eng-srp + - mtdata_Neulab-tedtalks_test-1-eng-srp # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-worldbank-1-eng-srp # ~2,533 sentences (286.3 kB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 
# ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~35,920,209 sentences + mono-trg: + - news-crawl_news.2008 # ~3,522 sentences (398K) + - news-crawl_news.2009 # ~18,584 sentences (2.1M) + - news-crawl_news.2010 # ~9,734 sentences (1.1M) + - news-crawl_news.2011 # ~2,530 sentences (286K) + - news-crawl_news.2018 # ~18,584 sentences (2.1M) + - news-crawl_news.2019 # ~1,929,203 sentences (218M) + - news-crawl_news.2020 # ~5,619,469 sentences (635M) + - news-crawl_news.2021 # ~8,849,557 sentences (1.0G) + - news-crawl_news.2022 # ~9,734,513 sentences (1.1G) + - news-crawl_news.2023 # ~9,734,513 sentences (1.1G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-sv-spring-2024.yml b/configs/en-sv-spring-2024.yml new file mode 100644 index 000000000..59db07584 --- /dev/null +++ b/configs/en-sv-spring-2024.yml @@ -0,0 +1,239 @@ +# The initial configuration was generated using: +# task config-generator -- en sv --name spring-2024 +# +# The documentation for this config can be found here: +# 
https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: sv + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Lindat-khresmoi_summary_dev-2-eng-swe + - mtdata_Neulab-tedtalks_dev-1-eng-swe + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 211,400,324 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (77,008,059 sentences) + # - opus_RF/v1 - not enough data (180 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-swedish_labour_part2-1-eng-swe - duplicate with opus + # - mtdata_ELRC-swedish_labour_part1-1-eng-swe - duplicate with opus + # - mtdata_ELRC-swedish_food-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.norden.org-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.sida.se-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.visitestonia.com-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.vtv.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-valtioneuvosto.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-vnk.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.turku.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.vero.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-emea-1-eng-swe - 
duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-swe - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-swe - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-swe - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-swe - duplicate with opus + # - mtdata_EU-ecdc-1-eng-swe - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-swe - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-swe - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-swe - duplicate with opus + # - mtdata_Statmt-europarl-7-swe-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-swe_SE - duplicate with opus + train: + - opus_NLLB/v1 # 77,008,059 sentences + - opus_ParaCrawl/v9 # 49,110,322 sentences + - opus_OpenSubtitles/v2018 # 17,660,152 sentences + - opus_ELRC-4268-NTEU_TierA/v1 # 12,737,597 sentences + - opus_CCAligned/v1 # 12,544,114 sentences + - opus_ELRC-EMEA/v1 # 12,083,941 sentences + - opus_LinguaTools-WikiTitles/v2014 # 8,058,690 sentences + - opus_DGT/v2019 # 5,139,521 sentences + - opus_XLEnt/v1.2 # 3,674,011 sentences + - opus_TildeMODEL/v2018 # 3,234,207 sentences + - opus_EUbookshop/v2 # 1,915,479 sentences + - opus_Europarl/v8 # 1,892,723 sentences + - opus_EMEA/v3 # 1,086,217 sentences + - opus_JRC-Acquis/v3.0 # 792,924 sentences + - opus_ELRC-2725-EMEA/v1 # 759,846 sentences + - opus_ELRC-5067-SciPar/v1 # 670,816 sentences + - opus_WikiMatrix/v1 # 546,289 sentences + - opus_ELITR-ECA/v1 # 
389,808 sentences + - opus_KDE4/v2 # 232,485 sentences + - opus_QED/v2.0a # 171,126 sentences + - opus_ELRC-presscorner_covid/v1 # 147,973 sentences + - opus_Tanzil/v1 # 127,493 sentences + - opus_ELRC-Swedish_Migration/v1 # 124,398 sentences + - opus_TED2020/v1 # 120,718 sentences + - opus_ELRC-www.turku.fi/v1 # 107,773 sentences + - opus_NeuLab-TedTalks/v1 # 69,332 sentences + - opus_wikimedia/v20230407 # 63,135 sentences + - opus_bible-uedin/v1 # 62,137 sentences + - opus_infopankki/v1 # 51,749 sentences + - opus_ELRC-1770-valtioneuvosto.fi/v1 # 49,084 sentences + - opus_ELRC-valtioneuvosto.fi/v1 # 49,084 sentences + - opus_ELRC-1133-www.vtv.fi/v1 # 46,501 sentences + - opus_ELRC-www.vtv.fi/v1 # 46,501 sentences + - opus_ELRC-734-www.norden.org/v1 # 37,763 sentences + - opus_ELRC-www.norden.org/v1 # 37,763 sentences + - opus_ELRC-1772-vnk.fi/v1 # 33,627 sentences + - opus_ELRC-vnk.fi/v1 # 33,627 sentences + - opus_WikiSource/v1 # 33,283 sentences + - opus_ELRC-817-Swedish_Audit_Riksre/v1 # 30,352 sentences + - opus_PHP/v1 # 30,198 sentences + - opus_Tatoeba/v2023-04-12 # 27,050 sentences + - opus_ELRC-3574-EUR_LEX_covid/v1 # 22,445 sentences + - opus_ELRC-EUR_LEX/v1 # 22,445 sentences + - opus_ELRC-2037-www.vero.fi/v1 # 22,317 sentences + - opus_ELRC-www.vero.fi/v1 # 22,317 sentences + - opus_ELRC-2026-EUIPO_2017/v1 # 16,947 sentences + - opus_ELRC-EUIPO_2017/v1 # 16,947 sentences + - opus_ELRC-EUROPARL_covid/v1 # 15,681 sentences + - opus_ELRC-antibiotic/v1 # 15,452 sentences + - opus_ELRC-1131-www.visitestonia.com/v1 # 14,335 sentences + - opus_ELRC-www.visitestonia.com/v1 # 14,335 sentences + - opus_ELRC-2884-EU_publications_medi/v1 # 13,100 sentences + - opus_ELRC-EU_publications/v1 # 13,100 sentences + - opus_ELRC-802-Swedish_Competition_/v1 # 11,374 sentences + - opus_ELRC-wikipedia_health/v1 # 10,395 sentences + - opus_ELRC-928-Annual_Reports_Swedi/v1 # 10,227 sentences + - opus_EUconst/v1 # 9,954 sentences + - opus_ELRC-2033-www.turku.fi/v1 # 9,706 
sentences + - opus_GlobalVoices/v2018q4 # 8,793 sentences + - opus_ELRC-829-Swedish_Migration_Bo/v1 # 8,366 sentences + - opus_ELRA-W0239/v1 # 8,365 sentences + - opus_ELRC-417-Swedish_Work_Environ/v1 # 7,475 sentences + - opus_ELRC-3615-presscorner_covid/v1 # 6,856 sentences + - opus_ELRC-744-Finnish_Information_/v1 # 6,819 sentences + - opus_ELRC-Finnish_Information/v1 # 6,819 sentences + - opus_ELRA-W0222/v1 # 6,818 sentences + - opus_ELRC_3382/v1 # 3,760 sentences + - opus_Books/v1 # 3,095 sentences + - opus_ELRC-Swedish_Labour/v1 # 2,778 sentences + - opus_ECDC/v2016-03-16 # 2,528 sentences + - opus_ELRC-1013-Sweden_a_Pocket/v1 # 2,200 sentences + - opus_ELRA-W0130/v1 # 2,199 sentences + - opus_ELRC-712-Social_Insurance_Frs/v1 # 1,953 sentences + - opus_ELRA-W0213/v1 # 1,952 sentences + - opus_ELRC-3473-EC_EUROPA_covid/v1 # 1,858 sentences + - opus_ELRC-EC_EUROPA/v1 # 1,858 sentences + - opus_ELRC-401-Swedish_Labour_Part2/v1 # 1,768 sentences + - opus_ELRC-929-www.sida.se/v1 # 1,545 sentences + - opus_ELRC-823-Swedish_Swedish_Crim/v1 # 1,503 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 1,447 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 1,446 sentences + - opus_ELRC-436-Swedish_Food/v1 # 1,147 sentences + - opus_ELRA-W0305/v1 # 1,146 sentences + - opus_ELRC-406-Swedish_Labour_Part1/v1 # 1,011 sentences + - opus_ELRC-3213-antibiotic/v1 # 953 sentences + - opus_ELRC-830-Swedish_Economic_Reg/v1 # 949 sentences + - opus_ELRC-3302-EUROPARL_covid/v1 # 844 sentences + - opus_tldr-pages/v2023-08-29 # 566 sentences + - opus_ELRC-3082-wikipedia_health/v1 # 535 sentences + - opus_ELRC_2922/v1 # 534 sentences + - opus_ELRC_2923/v1 # 499 sentences + - opus_ELRC-2752-vaccination/v1 # 497 sentences + - opus_ELRC-vaccination/v1 # 497 sentences + - mtdata_ELRC-swedish_social_security-1-eng-swe + - mtdata_ELRC-swedish_work_environment-1-eng-swe + - mtdata_ELRC-social_insurance_försäkringskassan-1-eng-swe + - mtdata_ELRC-finnish_information_bank-1-eng-swe + - 
mtdata_ELRC-swedish_competition_authority_konkurrensverket-1-eng-swe + - mtdata_ELRC-swedish_audit_riksrevisionen-1-eng-swe + - mtdata_ELRC-swedish_swedish_crime_victim_compensation_support_authority-1-eng-swe + - mtdata_ELRC-swedish_migration_board_migrationsverket-1-eng-swe + - mtdata_ELRC-swedish_economic_regional_growth_tillväxtverket-1-eng-swe + - mtdata_ELRC-annual_reports_swedish_pension_system-1-eng-swe + - mtdata_ELRC-sweden_a_pocket_guide_book-1-eng-swe + - mtdata_ELRC-eu_publications_medical_v2-1-eng-swe + - mtdata_ELRC-nteu_tierb-1-eng-swe + - mtdata_EU-eac_forms-1-eng-swe # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-swe # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-swe # ~980,674 sentences (110.8 MB) + - mtdata_Lindat-khresmoi_summary_test-2-eng-swe # ~11,808 sentences (1.3 MB) + - mtdata_Neulab-tedtalks_test-1-eng-swe # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-dcep_wmt17-1-swe-eng # ~1,137,607 sentences (128.5 MB) + - mtdata_Statmt-books_wmt17-1-swe-eng # ~2,797 sentences (316.2 kB) + - mtdata_Tilde-eesc-2017-eng-swe # ~1,798,328 sentences (203.2 MB) + - mtdata_Tilde-ema-2016-eng-swe # ~215,912 sentences (24.4 MB) + - mtdata_Tilde-ecb-2017-eng-swe # ~3,314 sentences (374.5 kB) + - mtdata_Tilde-rapid-2016-eng-swe # ~400,648 sentences (45.3 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + 
- news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-tr-spring-2024.yml b/configs/en-tr-spring-2024.yml new file mode 100644 index 000000000..816433fde --- /dev/null +++ b/configs/en-tr-spring-2024.yml @@ -0,0 +1,144 @@ +# The initial configuration was generated using: +# task config-generator -- en tr --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: tr + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-tur + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt18/test-ts + - sacrebleu_aug-mix_wmt16 + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - 
flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt18 + - sacrebleu_wmt17 + - sacrebleu_wmt16/dev + + # The training data contains: + # 121,323,758 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (47,045,956 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (1,646,740 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-tur - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-tur - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-tur - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-tur - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-tur_TR - duplicate with opus + train: + - opus_NLLB/v1 # 47,045,956 sentences + - opus_OpenSubtitles/v2018 # 44,986,121 sentences + - opus_CCAligned/v1 # 13,650,311 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,915,248 sentences + - opus_XLEnt/v1.2 # 3,809,464 sentences + - opus_MaCoCu/v2 # 1,646,741 sentences + - opus_GoURMET/v1 # 1,308,307 sentences + - opus_Tanzil/v1 # 1,189,967 sentences + - opus_Tatoeba/v2023-04-12 # 676,920 sentences + - opus_wikimedia/v20230407 # 668,099 sentences + - opus_QED/v2.0a # 482,964 sentences + - opus_WikiMatrix/v1 # 477,736 sentences + - opus_TED2020/v1 # 378,033 sentences + - opus_SETIMES/v2 # 207,678 sentences + - opus_NeuLab-TedTalks/v1 # 195,641 sentences + - opus_Wikipedia/v1.0 # 159,979 sentences + - opus_KDE4/v2 # 153,438 sentences + - opus_TED2013/v1.1 # 137,028 sentences + - opus_bible-uedin/v1 # 60,411 sentences + - opus_infopankki/v1 # 44,030 sentences + - opus_Bianet/v1 # 34,770 sentences + - opus_PHP/v1 # 32,713 sentences + - opus_EUbookshop/v2 # 23,706 sentences + - opus_WMT-News/v2019 # 20,016 sentences + - opus_GlobalVoices/v2018q4 # 7,838 sentences + - opus_ELRC-3057-wikipedia_health/v1 # 2,368 sentences + - 
opus_ELRC-wikipedia_health/v1 # 2,368 sentences + - opus_ELRC_2922/v1 # 2,367 sentences + - opus_tldr-pages/v2023-08-29 # 1,956 sentences + - opus_TildeMODEL/v2018 # 1,584 sentences + - mtdata_EU-eac_forms-1-eng-tur # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-tur # ~31,162 sentences (3.5 MB) + - mtdata_Neulab-tedtalks_test-1-eng-tur # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-newsdev_tren-2016-tur-eng # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_entr-2016-eng-tur # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-worldbank-1-eng-tur # ~1,827 sentences (206.5 kB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~20,230,124 sentences + mono-trg: + - news-crawl_news.2010 # ~38 sentences (4.4K) + - news-crawl_news.2017 # ~1,194,690 sentences (135M) + - news-crawl_news.2018 # ~1,964,601 sentences (222M) + - news-crawl_news.2019 # ~3,168,141 sentences (358M) + - news-crawl_news.2020 # ~3,716,814 sentences (420M) + - news-crawl_news.2021 # 
~3,814,159 sentences (431M) + - news-crawl_news.2022 # ~3,575,221 sentences (404M) + - news-crawl_news.2023 # ~2,796,460 sentences (316M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-uk-spring-2024.yml b/configs/en-uk-spring-2024.yml new file mode 100644 index 000000000..32b4eb83a --- /dev/null +++ b/configs/en-uk-spring-2024.yml @@ -0,0 +1,134 @@ +# The initial configuration was generated using: +# task config-generator -- en uk --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: uk + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ukr + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 58,968,083 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (20,240,171 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (6,406,288 sentences) + # - opus_GNOME/v1 
- not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-ukr - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-ukr - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ukr - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-ukr_UA - duplicate with opus + train: + - opus_NLLB/v1 # 20,240,171 sentences + - opus_ParaCrawl/v9 # 14,079,832 sentences + - opus_CCAligned/v1 # 8,547,377 sentences + - opus_MaCoCu/v2 # 6,406,294 sentences + - opus_XLEnt/v1.2 # 3,671,061 sentences + - opus_SUMMA/v1 # 1,574,611 sentences + - opus_OpenSubtitles/v2018 # 877,780 sentences + - opus_wikimedia/v20230407 # 757,910 sentences + - opus_WikiMatrix/v1 # 681,115 sentences + - opus_ELRC-5214-A_Lexicon_Named/v1 # 495,403 sentences + - opus_ELRC-5183-SciPar_Ukraine/v1 # 306,813 sentences + - opus_KDE4/v2 # 233,611 sentences + - opus_QED/v2.0a # 215,630 sentences + - opus_TED2020/v1 # 208,141 sentences + - opus_Tatoeba/v2023-04-12 # 175,502 sentences + - opus_ELRC-5179-acts_Ukrainian/v1 # 129,942 sentences + - opus_ELRC-5180-Official_Parliament_/v1 # 116,260 sentences + - opus_NeuLab-TedTalks/v1 # 115,474 sentences + - opus_ELRC-5181-Official_Parliament_/v1 # 61,012 sentences + - opus_ELRC-5174-French_Polish_Ukrain/v1 # 36,228 sentences + - opus_bible-uedin/v1 # 15,901 sentences + - opus_ELRC-5182-Official_Parliament_/v1 # 8,800 sentences + - opus_ELRC-3043-wikipedia_health/v1 # 2,735 sentences + - opus_ELRC-wikipedia_health/v1 # 2,735 sentences + - opus_ELRC_2922/v1 # 2,734 sentences + - opus_EUbookshop/v2 # 1,793 sentences + - opus_TildeMODEL/v2018 # 1,628 sentences + - opus_ELRC-5217-Ukrainian_Legal_MT/v1 # 997 sentences + - opus_tldr-pages/v2023-08-29 # 593 sentences + - mtdata_Neulab-tedtalks_test-1-eng-ukr # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-worldbank-1-eng-ukr # ~2,011 sentences (227.3 kB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - 
news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~463,898 sentences + mono-trg: + - news-crawl_news.2008 # ~6,070 sentences (686K) + - news-crawl_news.2009 # ~30,088 sentences (3.4M) + - news-crawl_news.2010 # ~6,504 sentences (735K) + - news-crawl_news.2011 # ~58,407 sentences (6.6M) + - news-crawl_news.2012 # ~68,141 sentences (7.7M) + - news-crawl_news.2013 # ~82,300 sentences (9.3M) + - news-crawl_news.2014 # ~87,610 sentences (9.9M) + - news-crawl_news.2016 # ~39,823 sentences (4.5M) + - news-crawl_news.2018 # ~84,955 sentences (9.6M) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/en-vi-spring-2024.yml b/configs/en-vi-spring-2024.yml 
new file mode 100644 index 000000000..1d9d9de21 --- /dev/null +++ b/configs/en-vi-spring-2024.yml @@ -0,0 +1,111 @@ +# The initial configuration was generated using: +# task config-generator -- en vi --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: en + trg: vi + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-vie + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 69,085,316 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (50,092,444 sentences) + # - opus_GNOME/v1 - not enough data (149 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_XLEnt/v1.2 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-vie - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-vie - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-vie - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-vie_VN - duplicate with opus + train: + - opus_NLLB/v1 # 50,092,444 sentences + - opus_CCAligned/v1 # 12,394,417 sentences + - opus_OpenSubtitles/v2018 # 3,505,276 sentences + - opus_WikiMatrix/v1 # 1,073,752 sentences + - opus_wikimedia/v20230407 # 669,743 sentences + - opus_QED/v2.0a # 338,024 sentences + - opus_TED2020/v1 # 326,417 
sentences + - opus_NeuLab-TedTalks/v1 # 184,973 sentences + - opus_StanfordNLP-NMT/v1.0 # 133,167 sentences + - opus_ELRC-wikipedia_health/v1 # 126,413 sentences + - opus_bible-uedin/v1 # 124,390 sentences + - opus_Wikipedia/v1.0 # 58,116 sentences + - opus_KDE4/v2 # 42,782 sentences + - opus_Tatoeba/v2023-04-12 # 6,855 sentences + - opus_ELRC-3086-wikipedia_health/v1 # 4,274 sentences + - opus_ELRC_2922/v1 # 4,273 sentences + - mtdata_Neulab-tedtalks_test-1-eng-vie # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-src: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) + + # The monolingual data contains: + # ~0 sentences + mono-trg: [] +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 
20 + worker-classes: + default: gcp-spot diff --git a/configs/hr-en-spring-2024.yml b/configs/hr-en-spring-2024.yml new file mode 100644 index 000000000..f97abd59a --- /dev/null +++ b/configs/hr-en-spring-2024.yml @@ -0,0 +1,225 @@ +# The initial configuration was generated using: +# task config-generator -- hr en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: hr + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-hrv + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 99,724,833 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (18,797,643 sentences) + # - opus_MultiHPLT/v1.1 - ignored datasets (9,310,276 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (2,266,005 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-croatian_bank-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-croatian_mine_action-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-agriculture-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-emea-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-hrv - duplicate with opus + 
# - mtdata_ELRC-europarl_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-hrv - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-hrv - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-hrv - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-hrv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-hrv - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-hrv_HR - duplicate with opus + train: + - opus_OpenSubtitles/v2018 # 35,131,729 sentences + - opus_NLLB/v1 # 18,797,643 sentences + - opus_ELRC-EMEA/v1 # 10,890,456 sentences + - opus_CCAligned/v1 # 9,376,190 sentences + - opus_HPLT/v1.1 # 9,310,369 sentences + - opus_ParaCrawl/v9 # 3,240,485 sentences + - opus_XLEnt/v1.2 # 2,844,710 sentences + - opus_ELRC-4142-NTEU_TierA/v1 # 2,290,893 sentences + - opus_MaCoCu/v2 # 2,266,007 sentences + - opus_ELRC-5067-SciPar/v1 # 806,581 sentences + - opus_TildeMODEL/v2018 # 745,616 sentences + - opus_DGT/v2019 # 722,182 sentences + - opus_ELRC-2706-EMEA/v1 # 650,030 sentences + - opus_WikiMatrix/v1 # 259,499 sentences + - opus_QED/v2.0a # 208,129 sentences + - opus_SETIMES/v2 # 205,910 sentences + - opus_TED2020/v1 # 197,411 sentences + - opus_ELITR-ECA/v1 # 181,038 sentences + - opus_EuroPat/v3 # 154,775 sentences + - opus_ELRC-presscorner_covid/v1 # 140,795 sentences + - opus_ELRC-Regional_Development/v1 # 136,809 sentences + - opus_NeuLab-TedTalks/v1 # 128,233 sentences + - opus_ELRC-Rural_Development/v1 # 105,562 sentences + - opus_hrenWaC/v1 # 99,001 sentences + - opus_KDE4/v2 # 87,333 sentences + - opus_TedTalks/v1 # 86,348 sentences + - opus_ELRC-2542-Agriculture/v1 # 68,376 
sentences + - opus_bible-uedin/v1 # 62,179 sentences + - opus_ELRC-4329-PRINCIPLE_MVEP_legal/v1 # 44,460 sentences + - opus_wikimedia/v20230407 # 42,034 sentences + - opus_GNOME/v1 # 35,429 sentences + - opus_ELRC-3556-EUR_LEX_covid/v1 # 22,010 sentences + - opus_ELRC-EUR_LEX/v1 # 22,010 sentences + - opus_ELRC-651-government_websites_/v1 # 21,341 sentences + - opus_ELRC-government_websites/v1 # 21,341 sentences + - opus_ELRA-W0204/v1 # 21,340 sentences + - opus_ELRC-943-Journal_Croatian_Ass/v1 # 18,478 sentences + - opus_ELRA-W0273/v1 # 18,477 sentences + - opus_ELRC-1015-Croatian_Mine_Action/v1 # 17,602 sentences + - opus_ELRA-W0131/v1 # 17,601 sentences + - opus_ELRC-1174-EUIPO_2017/v1 # 17,205 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,205 sentences + - opus_ELRC-2866-EU_publications_medi/v1 # 12,837 sentences + - opus_ELRC-EU_publications/v1 # 12,837 sentences + - opus_ELRC-921-studies_challenges_C/v1 # 11,781 sentences + - opus_ELRA-W0266/v1 # 11,780 sentences + - opus_ELRC-915-statistical_reports_/v1 # 11,738 sentences + - opus_ELRC-statistical_reports/v1 # 11,738 sentences + - opus_ELRA-W0264/v1 # 11,737 sentences + - opus_ELRC-788-Croatian_Bank/v1 # 11,708 sentences + - opus_ELRA-W0226/v1 # 11,707 sentences + - opus_ELRC-EUROPARL_covid/v1 # 10,175 sentences + - opus_ELRC-2541-Regional_Development/v1 # 7,911 sentences + - opus_ELRC-3597-presscorner_covid/v1 # 6,645 sentences + - opus_EUbookshop/v2 # 6,104 sentences + - opus_ELRC-992-Rural_Development_Pr/v1 # 5,202 sentences + - opus_ELRC_3382/v1 # 3,671 sentences + - opus_ELRC-989-Foreign_Affairs_Croa/v1 # 3,103 sentences + - opus_ELRC-Foreign_Affairs/v1 # 3,103 sentences + - opus_ELRA-W0293/v1 # 3,102 sentences + - opus_ELRC-3478-EC_EUROPA_covid/v1 # 2,595 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,595 sentences + - opus_Tatoeba/v2023-04-12 # 2,454 sentences + - opus_ELRC-991-Croatian_Journal_Fis/v1 # 2,408 sentences + - opus_ELRA-W0294/v1 # 2,407 sentences + - opus_ELRC-1080-Acts_Biological_Land/v1 # 2,329 
sentences + - opus_ELRA-W0142/v1 # 2,328 sentences + - opus_ELRC-1058-University_Library_Z/v1 # 2,310 sentences + - opus_ELRA-W0135/v1 # 2,309 sentences + - opus_ELRC-986-Embassy_Finland_Zagr/v1 # 1,967 sentences + - opus_ELRA-W0292/v1 # 1,966 sentences + - opus_ELRC-1159-Swedish_Migration_Bo/v1 # 1,112 sentences + - opus_ELRC-Swedish_Migration/v1 # 1,112 sentences + - opus_ELRC-3193-antibiotic/v1 # 1,070 sentences + - opus_ELRC-antibiotic/v1 # 1,070 sentences + - opus_ELRC-984-Government_Cooperati/v1 # 1,026 sentences + - opus_ELRA-W0291/v1 # 1,025 sentences + - opus_ELRC-996-nature_protection_st/v1 # 970 sentences + - opus_ELRC-825-Croatian_Swedish_Cri/v1 # 907 sentences + - opus_ELRA-W0238/v1 # 906 sentences + - opus_ELRC-2753-vaccination/v1 # 509 sentences + - opus_ELRC-vaccination/v1 # 509 sentences + - opus_ELRC_2922/v1 # 485 sentences + - opus_ELRC-3284-EUROPARL_covid/v1 # 475 sentences + - opus_ELRC_2923/v1 # 288 sentences + - mtdata_ELRC-government_websites_croatian-1-eng-hrv + - mtdata_ELRC-croatian_swedish_crime_victim_compensation_support_authority-1-eng-hrv + - mtdata_ELRC-statistical_reports_studies_croatian_bureau_statistics-1-eng-hrv + - mtdata_ELRC-studies_challenges_croatian_accession_union_croatian_institute_finance-1-eng-hrv + - mtdata_ELRC-journal_croatian_association_civil_engineers-1-eng-hrv + - mtdata_ELRC-government_cooperation_ngos-1-eng-hrv + - mtdata_ELRC-embassy_finland_zagreb-1-eng-hrv + - mtdata_ELRC-foreign_affairs_croatia-1-eng-hrv + - mtdata_ELRC-croatian_journal_fisheries-1-eng-hrv + - mtdata_ELRC-rural_development_programme_period_2014_2020_croatian_rural_development_programme-1-eng-hrv + - mtdata_ELRC-nature_protection_strategy_croatia-1-eng-hrv + - mtdata_ELRC-university_library_zagreb-1-eng-hrv + - mtdata_ELRC-acts_biological_landscape_diversity_environmental_protection-1-eng-hrv + - mtdata_ELRC-swedish_migration_board_migrationsverket-1-eng-hrv + - mtdata_ELRC-regional_development_funds-1-eng-hrv + - 
mtdata_ELRC-eu_publications_medical_v2-1-eng-hrv + - mtdata_ELRC-wikipedia_health-1-eng-hrv + - mtdata_ELRC-nteu_tierb-1-eng-hrv + - mtdata_EU-eac_reference-1-eng-hrv # ~31,162 sentences (3.5 MB) + - mtdata_Neulab-tedtalks_test-1-eng-hrv # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-hrv # ~216,663 sentences (24.5 MB) + - mtdata_Tilde-ema-2016-eng-hrv # ~209,283 sentences (23.6 MB) + - mtdata_Tilde-ecb-2017-eng-hrv # ~876 sentences (99.0 kB) + - mtdata_Tilde-rapid-2016-eng-hrv # ~45,055 sentences (5.1 MB) + - mtdata_Tilde-worldbank-1-eng-hrv # ~1,566 sentences (177.0 kB) + + # The monolingual data contains: + # ~11,498,228 sentences + mono-src: + - news-crawl_news.2014 # ~46,902 sentences (5.3M) + - news-crawl_news.2019 # ~1,398,230 sentences (158M) + - news-crawl_news.2020 # ~2,610,619 sentences (295M) + - news-crawl_news.2021 # ~2,398,230 sentences (271M) + - news-crawl_news.2022 # ~2,592,920 sentences (293M) + - news-crawl_news.2023 # ~2,451,327 sentences (277M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # 
~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/id-en-spring-2024.yml b/configs/id-en-spring-2024.yml new file mode 100644 index 000000000..315d3d663 --- /dev/null +++ b/configs/id-en-spring-2024.yml @@ -0,0 +1,117 @@ +# The initial configuration was generated using: +# task config-generator -- id en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: id + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ind + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 102,103,778 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (70,545,705 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-ind - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-ind - duplicate with opus + # - 
mtdata_Neulab-tedtalks_train-1-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-14-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-15-eng-ind - duplicate with opus + # - mtdata_Statmt-news_commentary-16-eng-ind - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-ind_ID - duplicate with opus + train: + - opus_NLLB/v1 # 70,545,705 sentences + - opus_CCAligned/v1 # 15,700,345 sentences + - opus_OpenSubtitles/v2018 # 9,268,181 sentences + - opus_XLEnt/v1.2 # 4,179,174 sentences + - opus_WikiMatrix/v1 # 1,019,171 sentences + - opus_Tanzil/v1 # 393,552 sentences + - opus_wikimedia/v20230407 # 284,126 sentences + - opus_QED/v2.0a # 274,581 sentences + - opus_TED2020/v1 # 165,059 sentences + - opus_NeuLab-TedTalks/v1 # 95,295 sentences + - opus_bible-uedin/v1 # 59,363 sentences + - opus_GNOME/v1 # 47,234 sentences + - opus_News-Commentary/v16 # 18,054 sentences + - opus_GlobalVoices/v2018q4 # 16,043 sentences + - opus_KDE4/v2 # 14,782 sentences + - opus_Tatoeba/v2023-04-12 # 10,550 sentences + - opus_tico-19/v2020-10-28 # 3,071 sentences + - opus_ELRC-3049-wikipedia_health/v1 # 2,680 sentences + - opus_ELRC-wikipedia_health/v1 # 2,680 sentences + - opus_ELRC_2922/v1 # 2,679 sentences + - opus_tldr-pages/v2023-08-29 # 1,453 sentences + - mtdata_Neulab-tedtalks_test-1-eng-ind # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # ~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences 
(1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/lt-en-spring-2024.yml b/configs/lt-en-spring-2024.yml new file mode 100644 index 000000000..be55f5214 --- /dev/null +++ b/configs/lt-en-spring-2024.yml @@ -0,0 +1,192 @@ +# The initial configuration was generated using: +# task config-generator -- lt en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: lt + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-lit + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt19/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - 
flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt19 + + # The training data contains: + # 76,643,900 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (23,298,470 sentences) + # - opus_ELRC-3069-wikipedia_health/v1 - not enough data (136 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (136 sentences) + # - opus_ELRC_2922/v1 - not enough data (135 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-president_lithuania-1-eng-lit - duplicate with opus + # - mtdata_ELRC-www.lrs.lt-1-eng-lit - duplicate with opus + # - mtdata_ELRC-www.lb.lt-1-eng-lit - duplicate with opus + # - mtdata_ELRC-kam.lt-1-eng-lit - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-lit - duplicate with opus + # - mtdata_ELRC-emea-1-eng-lit - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-lit - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-lit - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-lit - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-lit - duplicate with opus + # - mtdata_EU-ecdc-1-eng-lit - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-lit - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-3-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-lit - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-lit - duplicate with opus + # - mtdata_Statmt-europarl-9-lit-eng - duplicate with opus + # - mtdata_Statmt-europarl-7-lit-eng - duplicate with opus + # - 
mtdata_Statmt-europarl-10-lit-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-lit_LT - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-lit - duplicate with opus + train: + - opus_NLLB/v1 # 23,298,470 sentences + - opus_ParaCrawl/v9 # 13,192,237 sentences + - opus_ELRC-EMEA/v1 # 11,487,359 sentences + - opus_ELRC-4270-NTEU_TierA/v1 # 8,061,918 sentences + - opus_CCAligned/v1 # 5,215,271 sentences + - opus_DGT/v2019 # 5,061,918 sentences + - opus_TildeMODEL/v2018 # 2,084,002 sentences + - opus_XLEnt/v1.2 # 1,642,943 sentences + - opus_OpenSubtitles/v2018 # 1,415,961 sentences + - opus_EMEA/v3 # 1,042,425 sentences + - opus_JRC-Acquis/v3.0 # 790,475 sentences + - opus_ELRC-2717-EMEA/v1 # 764,031 sentences + - opus_Europarl/v8 # 634,284 sentences + - opus_EUbookshop/v2 # 445,813 sentences + - opus_ELRC-5067-SciPar/v1 # 177,437 sentences + - opus_WikiMatrix/v1 # 157,526 sentences + - opus_ELITR-ECA/v1 # 147,678 sentences + - opus_ELRC-425-Lithuanian_legislati/v1 # 130,549 sentences + - opus_ELRC-presscorner_covid/v1 # 117,054 sentences + - opus_KDE4/v2 # 104,044 sentences + - opus_QED/v2.0a # 85,435 sentences + - opus_TED2020/v1 # 75,484 sentences + - opus_ECB/v1 # 69,805 sentences + - opus_bible-uedin/v1 # 62,187 sentences + - opus_GNOME/v1 # 59,776 sentences + - opus_NeuLab-TedTalks/v1 # 45,963 sentences + - opus_ELRC-591-www.lb.lt/v1 # 33,261 sentences + - opus_ELRC-3568-EUR_LEX_covid/v1 # 21,390 sentences + - opus_ELRC-EUR_LEX/v1 # 21,390 sentences + - opus_ELRC-405-President_Lithuania/v1 # 21,225 sentences + - opus_ELRA-W0160/v1 # 21,224 sentences + - opus_ELRC-2021-EUIPO_2017/v1 # 17,133 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,133 sentences + - opus_wikimedia/v20230407 # 14,454 sentences + - opus_ELRC-EUROPARL_covid/v1 # 13,851 sentences + - opus_ELRC-antibiotic/v1 # 12,602 sentences + - opus_ELRC-2878-EU_publications_medi/v1 # 12,581 sentences + - opus_ELRC-EU_publications/v1 # 12,581 sentences + - opus_EUconst/v1 # 10,171 sentences + - 
opus_ELRC-592-kam.lt/v1 # 8,531 sentences + - opus_Tatoeba/v2023-04-12 # 8,236 sentences + - opus_ELRC-3609-presscorner_covid/v1 # 6,462 sentences + - opus_WMT-News/v2019 # 5,998 sentences + - opus_ELRC_3382/v1 # 3,587 sentences + - opus_ECDC/v2016-03-16 # 2,546 sentences + - opus_ELRC-3467-EC_EUROPA_covid/v1 # 2,438 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,438 sentences + - opus_ELRC-590-www.lrs.lt/v1 # 1,771 sentences + - opus_ELRC-3205-antibiotic/v1 # 823 sentences + - opus_ELRC-3296-EUROPARL_covid/v1 # 553 sentences + - opus_ELRC-2740-vaccination/v1 # 546 sentences + - opus_ELRC-vaccination/v1 # 546 sentences + - opus_ELRC_2923/v1 # 384 sentences + - mtdata_ELRC-lithuanian_legislation_seimas_lithuania-1-eng-lit + - mtdata_ELRC-eu_publications_medical_v2-1-eng-lit + - mtdata_ELRC-wikipedia_health-1-eng-lit + - mtdata_ELRC-nteu_tierb-1-eng-lit + - mtdata_EU-eac_forms-1-eng-lit # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-lit # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-lit # ~510,025 sentences (57.6 MB) + - mtdata_Neulab-tedtalks_test-1-eng-lit # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-wiki_titles-1-lit-eng # ~15,267 sentences (1.7 MB) + - mtdata_Statmt-newsdev_enlt-2019-eng-lit # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_lten-2019-lit-eng # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-lit # ~1,149,015 sentences (129.8 MB) + - mtdata_Tilde-ema-2016-eng-lit # ~228,287 sentences (25.8 MB) + - mtdata_Tilde-airbaltic-1-eng-lit # ~962 sentences (108.7 kB) + - mtdata_Tilde-rapid-2016-eng-lit # ~180,798 sentences (20.4 MB) + + # The monolingual data contains: + # ~5,442,476 sentences + mono-src: + - news-crawl_news.2019 # ~1,079,646 sentences (122M) + - news-crawl_news.2020 # ~1,088,495 sentences (123M) + - news-crawl_news.2021 # ~1,008,849 sentences (114M) + - news-crawl_news.2022 # ~1,079,646 sentences (122M) + - news-crawl_news.2023 # ~1,185,840 sentences (134M) + + # The monolingual data contains: + # 
~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/lv-en-spring-2024.yml b/configs/lv-en-spring-2024.yml new file mode 100644 index 000000000..d37e982df --- /dev/null +++ b/configs/lv-en-spring-2024.yml @@ -0,0 +1,194 @@ +# The initial configuration was generated using: +# task config-generator -- lv en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: lv + trg: en + best-model: chrf + 
use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt17/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt17 + + # The training data contains: + # 68,374,368 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (16,685,969 sentences) + # - opus_ELRC-3089-wikipedia_health/v1 - not enough data (143 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (143 sentences) + # - opus_ELRC_2922/v1 - not enough data (142 sentences) + # - opus_ELRA-W0308/v1 - not enough data (108 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (84 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (23 sentences) + # - opus_ELRA-W0301/v1 - not enough data (20 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-mfa_latvia-1-eng-lav - duplicate with opus + # - mtdata_ELRC-state_latvian-1-eng-lav - duplicate with opus + # - mtdata_ELRC-www.visitestonia.com-1-eng-lav - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-lav - duplicate with opus + # - mtdata_ELRC-emea-1-eng-lav - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-lav - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-lav - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-covid19.gov.lv-1-eng-lav - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-lav - duplicate with opus + # - 
mtdata_ELRC-presscorner_covid-1-eng-lav - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-lav - duplicate with opus + # - mtdata_EU-ecdc-1-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-lav - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-lav - duplicate with opus + # - mtdata_Statmt-europarl-7-lav-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-lav_LV - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-lav - duplicate with opus + train: + - opus_NLLB/v1 # 16,685,969 sentences + - opus_ParaCrawl/v9 # 13,064,066 sentences + - opus_ELRC-EMEA/v1 # 11,795,507 sentences + - opus_ELRC-4269-NTEU_TierA/v1 # 8,072,484 sentences + - opus_DGT/v2019 # 5,072,124 sentences + - opus_CCAligned/v1 # 4,850,972 sentences + - opus_TildeMODEL/v2018 # 2,111,785 sentences + - opus_XLEnt/v1.2 # 1,295,887 sentences + - opus_EMEA/v3 # 1,030,272 sentences + - opus_JRC-Acquis/v3.0 # 793,589 sentences + - opus_ELRC-2729-EMEA/v1 # 783,490 sentences + - opus_Europarl/v8 # 639,318 sentences + - opus_OpenSubtitles/v2018 # 519,553 sentences + - opus_EUbookshop/v2 # 445,891 sentences + - opus_ELRC-5067-SciPar/v1 # 347,473 sentences + - opus_ELRC-presscorner_covid/v1 # 128,895 sentences + - opus_KDE4/v2 # 91,386 sentences + - opus_QED/v2.0a # 72,447 sentences + - opus_ECB/v1 # 65,374 sentences + - opus_ELITR-ECA/v1 # 64,115 sentences + - opus_TED2020/v1 # 55,488 sentences + - opus_ELRC-399-International_Agreem/v1 # 40,897 sentences + - opus_ELRA-W0158/v1 # 40,896 sentences + - opus_ELRC-3578-EUR_LEX_covid/v1 # 22,476 sentences + - opus_ELRC-EUR_LEX/v1 # 22,476 sentences + - opus_wikimedia/v20230407 # 21,295 sentences + - opus_ELRC-EUROPARL_covid/v1 # 17,831 sentences + - opus_ELRC-2022-EUIPO_2017/v1 # 17,255 sentences + - opus_ELRC-EUIPO_2017/v1 # 17,255 sentences + - opus_bible-uedin/v1 # 15,885 sentences + - 
opus_ELRC-1130-www.visitestonia.com/v1 # 13,841 sentences + - opus_ELRC-www.visitestonia.com/v1 # 13,841 sentences + - opus_ELRC-2888-EU_publications_medi/v1 # 13,045 sentences + - opus_ELRC-EU_publications/v1 # 13,045 sentences + - opus_ELRC-antibiotic/v1 # 12,048 sentences + - opus_ELRC-715-Finance_Economics_Ba/v1 # 11,600 sentences + - opus_ELRA-W0216/v1 # 11,599 sentences + - opus_GNOME/v1 # 11,265 sentences + - opus_EUconst/v1 # 10,036 sentences + - opus_WMT-News/v2019 # 8,008 sentences + - opus_ELRC-402-MFA_Latvia/v1 # 7,195 sentences + - opus_ELRA-W0159/v1 # 7,194 sentences + - opus_ELRC-433-State_Latvian/v1 # 6,862 sentences + - opus_ELRA-W0169/v1 # 6,861 sentences + - opus_ELRC-3619-presscorner_covid/v1 # 6,686 sentences + - opus_ELRC_3382/v1 # 3,737 sentences + - opus_ECDC/v2016-03-16 # 2,543 sentences + - opus_ELRC-3477-EC_EUROPA_covid/v1 # 2,407 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,407 sentences + - opus_ELRC-4994-Latvian_Financial_MT/v1 # 2,002 sentences + - opus_Tatoeba/v2023-04-12 # 1,814 sentences + - opus_ELRC-3453-covid19.gov.lv/v1 # 826 sentences + - opus_ELRC-3217-antibiotic/v1 # 809 sentences + - opus_ELRC-3306-EUROPARL_covid/v1 # 724 sentences + - opus_ELRC_2923/v1 # 580 sentences + - opus_ELRC-2741-vaccination/v1 # 521 sentences + - opus_ELRC-vaccination/v1 # 521 sentences + - mtdata_ELRC-international_agreements-1-eng-lav + - mtdata_ELRC-rights_arrested-1-eng-lav + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-eng-lav + - mtdata_ELRC-finance_economics_bank_latvia-1-eng-lav + - mtdata_ELRC-eu_publications_medical_v2-1-eng-lav + - mtdata_ELRC-wikipedia_health-1-eng-lav + - mtdata_ELRC-nteu_tierb-1-eng-lav + - mtdata_EU-eac_forms-1-eng-lav # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-lav # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-lav # ~524,054 sentences (59.2 MB) + - mtdata_Statmt-newsdev_lven-2017-lav-eng # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_enlv-2017-eng-lav # ~402,756 
sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-lav # ~1,122,956 sentences (126.9 MB) + - mtdata_Tilde-ema-2016-eng-lav # ~231,439 sentences (26.2 MB) + - mtdata_Tilde-airbaltic-1-eng-lav # ~1,050 sentences (118.7 kB) + - mtdata_Tilde-fold-1-eng-lav # ~10,070 sentences (1.1 MB) + - mtdata_Tilde-rapid-2016-eng-lav # ~198,906 sentences (22.5 MB) + + # The monolingual data contains: + # ~3,283,185 sentences + mono-src: + - news-crawl_news.2015 # ~1,274,336 sentences (144M) + - news-crawl_news.2016 # ~1,017,699 sentences (115M) + - news-crawl_news.2017 # ~991,150 sentences (112M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + 
split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/ro-en-spring-2024.yml b/configs/ro-en-spring-2024.yml new file mode 100644 index 000000000..669a8501c --- /dev/null +++ b/configs/ro-en-spring-2024.yml @@ -0,0 +1,219 @@ +# The initial configuration was generated using: +# task config-generator -- ro en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: ro + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-ron + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt16/dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt16 + + # The training data contains: + # 174,698,415 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (55,607,023 sentences) + # - opus_ELRA-W0308/v1 - not enough data (92 sentences) + # - opus_ELRC-648-Letter_rights_person/v1 - not enough data (77 sentences) + # - opus_ELRC-403-Rights_Arrested/v1 - not enough data (24 sentences) + # - opus_ELRA-W0301/v1 - not enough data (21 sentences) + # - opus_tldr-pages/v2023-08-29 - not enough data (9 sentences) + # - opus_ELRC-417-Swedish_Work_Environ/v1 - not enough data (8 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-romanian_literature-1-eng-ron - duplicate with opus + # - 
mtdata_ELRC-romanian_wikipedia-1-eng-ron - duplicate with opus + # - mtdata_ELRC-romanian_news-1-eng-ron - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir_spos-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir_newsletter-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eir-1-eng-ron - duplicate with opus + # - mtdata_ELRC-emea-1-eng-ron - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-ron - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-ron - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-ron - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-ron - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-ron - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-ron - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-ron - duplicate with opus + # - mtdata_EU-ecdc-1-eng-ron - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-ron - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-ron - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-ron - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-ron - duplicate with opus + # - mtdata_Statmt-europarl-7-ron-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-ron_RO - duplicate with opus + train: + - opus_NLLB/v1 # 55,607,023 sentences + - opus_OpenSubtitles/v2018 # 50,693,226 sentences + - opus_ParaCrawl/v9 # 25,048,962 sentences + - opus_ELRC-EMEA/v1 # 13,648,577 sentences + - opus_CCAligned/v1 # 10,525,602 sentences + - opus_DGT/v2019 # 3,541,661 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,421,073 sentences + - opus_XLEnt/v1.2 # 3,337,016 sentences + - opus_TildeMODEL/v2018 # 1,925,419 sentences + - opus_EMEA/v3 # 994,499 sentences + - 
opus_ELRC-2728-EMEA/v1 # 783,742 sentences + - opus_WikiMatrix/v1 # 631,486 sentences + - opus_JRC-Acquis/v3.0 # 455,171 sentences + - opus_QED/v2.0a # 438,832 sentences + - opus_Europarl/v8 # 400,356 sentences + - opus_Wikipedia/v1.0 # 360,499 sentences + - opus_TED2020/v1 # 328,491 sentences + - opus_EUbookshop/v2 # 324,553 sentences + - opus_wikimedia/v20230407 # 323,049 sentences + - opus_SETIMES/v2 # 213,047 sentences + - opus_NeuLab-TedTalks/v1 # 196,122 sentences + - opus_TED2013/v1.1 # 158,483 sentences + - opus_ELRC-presscorner_covid/v1 # 153,650 sentences + - opus_Tanzil/v1 # 136,175 sentences + - opus_ELRC-492-Romanian_Wikipedia/v1 # 132,230 sentences + - opus_ELRA-W0193/v1 # 132,229 sentences + - opus_KDE4/v2 # 114,741 sentences + - opus_ELRC-493-Romanian_news/v1 # 98,099 sentences + - opus_ELRA-W0194/v1 # 98,098 sentences + - opus_ELITR-ECA/v1 # 92,826 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_PHP/v1 # 30,391 sentences + - opus_GNOME/v1 # 25,419 sentences + - opus_ELRC-3577-EUR_LEX_covid/v1 # 23,183 sentences + - opus_ELRC-EUR_LEX/v1 # 23,183 sentences + - opus_ELRC-1177-EUIPO_2017/v1 # 20,298 sentences + - opus_ELRC-EUIPO_2017/v1 # 20,298 sentences + - opus_Tatoeba/v2023-04-12 # 16,308 sentences + - opus_ELRC-wikipedia_health/v1 # 13,252 sentences + - opus_ELRC-2887-EU_publications_medi/v1 # 13,164 sentences + - opus_ELRC-EU_publications/v1 # 13,164 sentences + - opus_ELRC-930-studies_reports_stat/v1 # 12,043 sentences + - opus_ELRA-W0270/v1 # 12,042 sentences + - opus_ELRC-EUROPARL_covid/v1 # 10,906 sentences + - opus_WMT-News/v2019 # 7,996 sentences + - opus_ELRC-3618-presscorner_covid/v1 # 6,715 sentences + - opus_ELRC-435-Romanian_New_Crimina/v1 # 6,496 sentences + - opus_ELRA-W0170/v1 # 6,495 sentences + - opus_ELRC-491-Romanian_literature/v1 # 5,281 sentences + - opus_ELRA-W0192/v1 # 5,280 sentences + - opus_ELRC-1819-EIR/v1 # 4,994 sentences + - opus_GlobalVoices/v2018q4 # 4,454 sentences + - 
opus_ELRC-1992-Rural_Development_Pr/v1 # 4,186 sentences + - opus_ELRC-Rural_Development/v1 # 4,186 sentences + - opus_ELRC-654-Romanian_Ombudsman_a/v1 # 4,148 sentences + - opus_ELRA-W0206/v1 # 4,147 sentences + - opus_ELRC-1815-EIR_Newsletter/v1 # 3,788 sentences + - opus_ELRC_3382/v1 # 3,674 sentences + - opus_ELRC-1814-EIR_SPOS/v1 # 3,248 sentences + - opus_ECDC/v2016-03-16 # 2,556 sentences + - opus_ELRC-3476-EC_EUROPA_covid/v1 # 2,338 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,338 sentences + - opus_ELRC-3216-antibiotic/v1 # 1,035 sentences + - opus_ELRC-antibiotic/v1 # 1,035 sentences + - opus_ELRC-3087-wikipedia_health/v1 # 693 sentences + - opus_ELRC_2922/v1 # 692 sentences + - opus_ELRC-3305-EUROPARL_covid/v1 # 546 sentences + - opus_ELRC-2750-vaccination/v1 # 496 sentences + - opus_ELRC-vaccination/v1 # 496 sentences + - opus_ELRC_2923/v1 # 319 sentences + - mtdata_ELRC-rights_arrested-1-eng-ron + - mtdata_ELRC-swedish_work_environment-1-eng-ron + - mtdata_ELRC-romanian_new_criminal_procedure_code-1-eng-ron + - mtdata_ELRC-letter_rights_persons_arrested_or_detained-1-eng-ron + - mtdata_ELRC-romanian_ombudsman_archive-1-eng-ron + - mtdata_ELRC-studies_reports_statistical_culture_institute_cultural_research_training-1-eng-ron + - mtdata_ELRC-rural_development_programme_romania-1-eng-ron + - mtdata_ELRC-eu_publications_medical_v2-1-eng-ron + - mtdata_EU-eac_forms-1-eng-ron # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-ron # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-ron # ~389,297 sentences (44.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-ron # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-newsdev_enro-2016-eng-ron # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_roen-2016-ron-eng # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-eesc-2017-eng-ron # ~1,026,056 sentences (115.9 MB) + - mtdata_Tilde-ema-2016-eng-ron # ~229,130 sentences (25.9 MB) + - mtdata_Tilde-ecb-2017-eng-ron # ~1,778 sentences (200.9 kB) + - 
mtdata_Tilde-rapid-2016-eng-ron # ~196,150 sentences (22.2 MB) + - mtdata_Tilde-worldbank-1-eng-ron # ~6,413 sentences (724.7 kB) + + # The monolingual data contains: + # ~24,920,348 sentences + mono-src: + - news-crawl_news.2015 # ~1,088,495 sentences (123M) + - news-crawl_news.2016 # ~2,061,946 sentences (233M) + - news-crawl_news.2017 # ~2,247,787 sentences (254M) + - news-crawl_news.2018 # ~1,345,132 sentences (152M) + - news-crawl_news.2019 # ~3,283,185 sentences (371M) + - news-crawl_news.2020 # ~3,982,300 sentences (450M) + - news-crawl_news.2021 # ~3,353,982 sentences (379M) + - news-crawl_news.2022 # ~3,831,858 sentences (433M) + - news-crawl_news.2023 # ~3,725,663 sentences (421M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + 
training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/ru-en-spring-2024.yml b/configs/ru-en-spring-2024.yml new file mode 100644 index 000000000..4c891ec35 --- /dev/null +++ b/configs/ru-en-spring-2024.yml @@ -0,0 +1,181 @@ +# The initial configuration was generated using: +# task config-generator -- ru en --name spring-2024 --fast +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/e880dbefe77f3428aed2d8ccc4f840fe347b025b/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: ru + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: one-stage + pretrained-models: + train-backwards: + urls: + - https://storage.googleapis.com/releng-translations-dev/models/ru-en/better-teacher/student + mode: use + type: default +datasets: + + # Skipped test/devtest datasets: + # - mtedx/valid - variant dataset + # - mtedx/test - variant dataset + # - wmt20/tworefs - variant dataset + # - wmt18/test-ts - variant dataset + # - wmt14/full - variant dataset + devtest: + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt19 + - sacrebleu_aug-mix_wmt17 + - sacrebleu_aug-mix_wmt15 + - sacrebleu_aug-mix_wmt13 + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt20 + - sacrebleu_wmt18 + - sacrebleu_wmt16 + - sacrebleu_wmt14 + + # The training data contains: + # 250,111,081 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets 
(139,937,785 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_ELRC-3855-SWPS_University_Soci/v1 - not enough data (109 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_WikiTitles/v3 - ignored datasets (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-rus - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-rus - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-rus - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-rus - duplicate with opus + # - mtdata_Neulab-tedtalks_test-1-eng-rus - duplicate with opus + # - mtdata_Neulab-tedtalks_dev-1-eng-rus - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-1_bonus-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-14-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-15-eng-rus - duplicate with opus + # - mtdata_Statmt-news_commentary-16-eng-rus - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-rus_RU - duplicate with opus + train: + - opus_NLLB/v1 # 139,937,785 sentences + - opus_OpenSubtitles/v2018 # 25,910,105 sentences + - opus_UNPC/v1.0 # 25,173,398 sentences + - opus_CCAligned/v1 # 13,850,305 sentences + - opus_LinguaTools-WikiTitles/v2014 # 13,565,182 sentences + - opus_MultiUN/v1 # 11,654,416 sentences + - opus_XLEnt/v1.2 # 7,890,088 sentences + - opus_ParaCrawl/v9 # 5,378,016 sentences + - opus_WikiMatrix/v1 # 1,661,909 sentences + - opus_Tanzil/v1 # 1,067,840 sentences + - opus_Wikipedia/v1.0 # 572,717 sentences + - opus_QED/v2.0a # 563,700 sentences + - opus_wikimedia/v20230407 # 541,583 sentences + - opus_Tatoeba/v2023-04-12 # 540,675 sentences + - opus_TED2020/v1 # 390,015 sentences + - opus_News-Commentary/v16 # 265,809 sentences + - opus_NeuLab-TedTalks/v1 # 221,999 sentences + - opus_KDE4/v2 # 180,793 sentences + - opus_GlobalVoices/v2018q4 # 170,351 sentences + - opus_TED2013/v1.1 # 133,660 sentences + - opus_ELRC-5183-SciPar_Ukraine/v1 # 126,585 sentences + - 
opus_infopankki/v1 # 75,305 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_EUbookshop/v2 # 49,830 sentences + - opus_WMT-News/v2019 # 36,637 sentences + - opus_PHP/v1 # 30,064 sentences + - opus_Books/v1 # 17,496 sentences + - opus_TildeMODEL/v2018 # 10,977 sentences + - opus_MDN_Web_Docs/v2023-09-25 # 8,134 sentences + - opus_ada83/v1 # 4,122 sentences + - opus_ELRC-3075-wikipedia_health/v1 # 4,073 sentences + - opus_ELRC-wikipedia_health/v1 # 4,073 sentences + - opus_ELRC_2922/v1 # 4,072 sentences + - opus_tico-19/v2020-10-28 # 3,071 sentences + - opus_ELRC-5067-SciPar/v1 # 3,064 sentences + - opus_tldr-pages/v2023-08-29 # 1,037 sentences + - mtdata_Statmt-commoncrawl_wmt13-1-rus-eng + - mtdata_Statmt-news_commentary_wmt18-13-rus-eng + - mtdata_Statmt-wiki_titles-1-rus-eng + - mtdata_Statmt-wiki_titles-2-rus-eng + - mtdata_Tilde-airbaltic-1-eng-rus + - mtdata_Tilde-czechtourism-1-eng-rus + - mtdata_Tilde-worldbank-1-eng-rus + - mtdata_UN-un_dev-1-eng-rus + - mtdata_UN-un_test-1-eng-rus + + # The monolingual data contains: + # ~90,385,836 sentences + mono-src: + - news-crawl_news.2008 # ~19,469 sentences (2.2M) + - news-crawl_news.2009 # ~47,787 sentences (5.4M) + - news-crawl_news.2011 # ~4,876,106 sentences (551M) + - news-crawl_news.2012 # ~5,079,646 sentences (574M) + - news-crawl_news.2013 # ~7,327,433 sentences (828M) + - news-crawl_news.2014 # ~6,194,690 sentences (700M) + - news-crawl_news.2015 # ~5,433,628 sentences (614M) + - news-crawl_news.2016 # ~3,716,814 sentences (420M) + - news-crawl_news.2017 # ~4,451,327 sentences (503M) + - news-crawl_news.2018 # ~4,539,823 sentences (513M) + - news-crawl_news.2019 # ~6,955,752 sentences (786M) + - news-crawl_news.2020 # ~8,849,557 sentences (1.0G) + - news-crawl_news.2021 # ~8,115,044 sentences (917M) + - news-crawl_news.2022 # ~8,849,557 sentences (1.0G) + - news-crawl_news.2023 # ~15,929,203 sentences (1.8G) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - 
news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/sk-en-spring-2024.yml b/configs/sk-en-spring-2024.yml new file mode 100644 index 000000000..30d52d8ae --- /dev/null +++ b/configs/sk-en-spring-2024.yml @@ -0,0 +1,180 @@ +# The initial configuration was generated using: +# task config-generator -- sk en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: sk + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: 
defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-slk + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 111,168,672 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (38,096,241 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_ELRC-3076-wikipedia_health/v1 - not enough data (134 sentences) + # - opus_ELRC-wikipedia_health/v1 - not enough data (134 sentences) + # - opus_ELRC_2922/v1 - not enough data (133 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-culture_slovak-1-eng-slk - duplicate with opus + # - mtdata_ELRC-justice_slovak-1-eng-slk - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-slk - duplicate with opus + # - mtdata_ELRC-emea-1-eng-slk - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-slk - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-slk - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-slk - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-slk - duplicate with opus + # - mtdata_EU-ecdc-1-eng-slk - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-slk - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-slk - duplicate with opus + # - 
mtdata_ParaCrawl-paracrawl-7.1-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-slk - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-slk - duplicate with opus + # - mtdata_Statmt-europarl-7-slk-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-slk_SK - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-slk - duplicate with opus + train: + - opus_NLLB/v1 # 38,096,241 sentences + - opus_ParaCrawl/v9 # 22,902,149 sentences + - opus_ELRC-EMEA/v1 # 12,032,452 sentences + - opus_OpenSubtitles/v2018 # 8,850,871 sentences + - opus_ELRC-4154-NTEU_TierA/v1 # 7,922,512 sentences + - opus_CCAligned/v1 # 6,938,181 sentences + - opus_DGT/v2019 # 5,118,830 sentences + - opus_XLEnt/v1.2 # 2,594,162 sentences + - opus_TildeMODEL/v2018 # 2,190,889 sentences + - opus_EMEA/v3 # 1,054,178 sentences + - opus_ELRC-2721-EMEA/v1 # 780,098 sentences + - opus_Europarl/v8 # 639,958 sentences + - opus_EUbookshop/v2 # 452,097 sentences + - opus_ELITR-ECA/v1 # 294,356 sentences + - opus_WikiMatrix/v1 # 178,985 sentences + - opus_QED/v2.0a # 173,727 sentences + - opus_ELRC-presscorner_covid/v1 # 142,656 sentences + - opus_ECB/v1 # 122,131 sentences + - opus_TED2020/v1 # 106,067 sentences + - opus_KDE4/v2 # 105,425 sentences + - opus_NeuLab-TedTalks/v1 # 67,607 sentences + - opus_bible-uedin/v1 # 62,159 sentences + - opus_ELRC-5067-SciPar/v1 # 60,468 sentences + - opus_JRC-Acquis/v3.0 # 35,744 sentences + - opus_PHP/v1 # 31,173 sentences + - opus_ELRC-3570-EUR_LEX_covid/v1 # 22,479 sentences + - opus_ELRC-EUR_LEX/v1 # 22,479 sentences + - opus_wikimedia/v20230407 # 18,819 sentences + - opus_ELRC-EUROPARL_covid/v1 # 16,369 sentences + - opus_ELRC-1179-EUIPO_2017/v1 # 16,313 sentences + - opus_ELRC-EUIPO_2017/v1 # 16,313 sentences + - opus_ELRC-2880-EU_publications_medi/v1 # 12,927 sentences + - opus_ELRC-EU_publications/v1 # 12,927 sentences + - opus_EUconst/v1 # 10,119 sentences + - opus_ELRC-1072-annual_reports_immig/v1 # 8,041 sentences + - 
opus_ELRA-W0136/v1 # 8,040 sentences + - opus_ELRC-3611-presscorner_covid/v1 # 6,982 sentences + - opus_ELRC-1073-annual_reports_Slova/v1 # 6,008 sentences + - opus_ELRA-W0137/v1 # 6,007 sentences + - opus_ELRC-1074-annual_reports_Stati/v1 # 5,614 sentences + - opus_ELRC_3382/v1 # 3,624 sentences + - opus_ELRC-488-Justice_Slovak/v1 # 2,896 sentences + - opus_ELRA-W0189/v1 # 2,895 sentences + - opus_ELRC-487-Culture_Slovak/v1 # 2,610 sentences + - opus_ELRA-W0188/v1 # 2,609 sentences + - opus_ECDC/v2016-03-16 # 2,526 sentences + - opus_ELRC-3469-EC_EUROPA_covid/v1 # 2,409 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,409 sentences + - opus_ELRC-3209-antibiotic/v1 # 1,010 sentences + - opus_ELRC-antibiotic/v1 # 1,010 sentences + - opus_ELRC-3298-EUROPARL_covid/v1 # 653 sentences + - opus_ELRC-2745-vaccination/v1 # 510 sentences + - opus_ELRC-vaccination/v1 # 510 sentences + - opus_ELRC_2923/v1 # 448 sentences + - mtdata_ELRC-annual_reports_immigration_asylum_policies_emn_contact_point_slovak-1-eng-slk + - mtdata_ELRC-annual_reports_slovak_centre_human_rights-1-eng-slk + - mtdata_ELRC-annual_reports_statistical_slovak-1-eng-slk + - mtdata_ELRC-eu_publications_medical_v2-1-eng-slk + - mtdata_ELRC-wikipedia_health-1-eng-slk + - mtdata_ELRC-nteu_tierb-1-eng-slk + - mtdata_EU-eac_forms-1-eng-slk # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-slk # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-slk # ~548,757 sentences (62.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-slk # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-slk # ~1,269,685 sentences (143.5 MB) + - mtdata_Tilde-ema-2016-eng-slk # ~238,237 sentences (26.9 MB) + - mtdata_Tilde-rapid-2016-eng-slk # ~214,164 sentences (24.2 MB) + + # The monolingual data contains: + # ~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - 
news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/sl-en-spring-2024.yml b/configs/sl-en-spring-2024.yml new file mode 100644 index 000000000..1b3a5613d --- /dev/null +++ b/configs/sl-en-spring-2024.yml @@ -0,0 +1,182 @@ +# The initial configuration was generated using: +# task config-generator -- sl en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: sl + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + 
mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-slv + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 89,057,699 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (27,406,782 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (1,875,517 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-euipo_2017-1-eng-slv - duplicate with opus + # - mtdata_ELRC-emea-1-eng-slv - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-slv - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-slv - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-slv - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-slv - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-slv - duplicate with opus + # - mtdata_EU-ecdc-1-eng-slv - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-slv - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-slv - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-slv - duplicate with opus + # - mtdata_Statmt-europarl-7-slv-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-slv_SI - duplicate with opus + # - mtdata_Tilde-ecb-2017-eng-slv - duplicate with opus + train: + - opus_NLLB/v1 # 27,406,782 sentences + - 
opus_OpenSubtitles/v2018 # 19,641,457 sentences + - opus_ELRC-EMEA/v1 # 13,288,992 sentences + - opus_ParaCrawl/v9 # 9,516,259 sentences + - opus_DGT/v2019 # 5,125,455 sentences + - opus_CCAligned/v1 # 4,366,555 sentences + - opus_TildeMODEL/v2018 # 2,048,216 sentences + - opus_MaCoCu/v2 # 1,875,518 sentences + - opus_EMEA/v3 # 1,045,041 sentences + - opus_XLEnt/v1.2 # 861,509 sentences + - opus_ELRC-2727-EMEA/v1 # 766,139 sentences + - opus_Europarl/v8 # 624,803 sentences + - opus_EUbookshop/v2 # 405,653 sentences + - opus_WikiMatrix/v1 # 318,028 sentences + - opus_ELRC-5067-SciPar/v1 # 300,017 sentences + - opus_ELRC-presscorner_covid/v1 # 150,090 sentences + - opus_Wikipedia/v1.0 # 140,124 sentences + - opus_KDE4/v2 # 119,645 sentences + - opus_ELITR-ECA/v1 # 102,934 sentences + - opus_ECB/v1 # 89,634 sentences + - opus_QED/v2.0a # 79,196 sentences + - opus_wikimedia/v20230407 # 73,612 sentences + - opus_ELRC-490-Secretariat_General_/v1 # 63,070 sentences + - opus_ELRA-W0191/v1 # 63,069 sentences + - opus_bible-uedin/v1 # 62,195 sentences + - opus_ELRC-489-Secretariat_General_/v1 # 55,185 sentences + - opus_ELRA-W0190/v1 # 55,184 sentences + - opus_JRC-Acquis/v3.0 # 53,390 sentences + - opus_TED2020/v1 # 44,340 sentences + - opus_PHP/v1 # 30,550 sentences + - opus_NeuLab-TedTalks/v1 # 22,856 sentences + - opus_ELRC-3576-EUR_LEX_covid/v1 # 22,381 sentences + - opus_ELRC-EUR_LEX/v1 # 22,381 sentences + - opus_ELRC-1180-EUIPO_2017/v1 # 19,767 sentences + - opus_ELRC-EUIPO_2017/v1 # 19,767 sentences + - opus_ELRC-wikipedia_health/v1 # 17,519 sentences + - opus_ELRC-antibiotic/v1 # 16,166 sentences + - opus_ELRC-EUROPARL_covid/v1 # 15,269 sentences + - opus_TED2013/v1.1 # 14,960 sentences + - opus_ELRC-2886-EU_publications_medi/v1 # 13,209 sentences + - opus_ELRC-EU_publications/v1 # 13,209 sentences + - opus_ELRC-924-statistical_reports_/v1 # 11,860 sentences + - opus_ELRC-statistical_reports/v1 # 11,860 sentences + - opus_ELRA-W0267/v1 # 11,859 sentences + - 
opus_EUconst/v1 # 8,807 sentences + - opus_GNOME/v1 # 8,070 sentences + - opus_ELRC-3617-presscorner_covid/v1 # 6,995 sentences + - opus_ELRC-923-chapters_Youth_2010/v1 # 5,866 sentences + - opus_Tatoeba/v2023-04-12 # 4,302 sentences + - opus_ELRC_3382/v1 # 3,633 sentences + - opus_ECDC/v2016-03-16 # 2,546 sentences + - opus_ELRC-3475-EC_EUROPA_covid/v1 # 2,534 sentences + - opus_ELRC-EC_EUROPA/v1 # 2,534 sentences + - opus_ELRC-2328-Agriculture_Forestry/v1 # 1,551 sentences + - opus_ELRC-3215-antibiotic/v1 # 986 sentences + - opus_ELRC-3085-wikipedia_health/v1 # 960 sentences + - opus_ELRC_2922/v1 # 959 sentences + - opus_ELRC-3304-EUROPARL_covid/v1 # 816 sentences + - opus_ELRC-2737-vaccination/v1 # 492 sentences + - opus_ELRC-vaccination/v1 # 492 sentences + - opus_ELRC_2923/v1 # 451 sentences + - mtdata_ELRC-secretariat_general_part1-1-eng-slv + - mtdata_ELRC-secretariat_general_part2-1-eng-slv + - mtdata_ELRC-chapters_youth_2010_social_profile_young_people_slovenia_publication-1-eng-slv + - mtdata_ELRC-statistical_reports_statistical_slovenia-1-eng-slv + - mtdata_ELRC-agriculture_forestry_food_slovenia-1-eng-slv + - mtdata_ELRC-eu_publications_medical_v2-1-eng-slv + - mtdata_EU-eac_forms-1-eng-slv # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-slv # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-slv # ~539,490 sentences (61.0 MB) + - mtdata_Neulab-tedtalks_test-1-eng-slv # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-eesc-2017-eng-slv # ~1,116,707 sentences (126.2 MB) + - mtdata_Tilde-ema-2016-eng-slv # ~223,681 sentences (25.3 MB) + - mtdata_Tilde-rapid-2016-eng-slv # ~203,695 sentences (23.0 MB) + + # The monolingual data contains: + # ~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # 
~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/sr-en-spring-2024.yml b/configs/sr-en-spring-2024.yml new file mode 100644 index 000000000..4d07e66e7 --- /dev/null +++ b/configs/sr-en-spring-2024.yml @@ -0,0 +1,131 @@ +# The initial configuration was generated using: +# task config-generator -- sr en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: sr + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + 
spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-srp + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 78,565,711 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (26,510,872 sentences) + # - opus_MultiHPLT/v1.1 - ignored datasets (3,904,384 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (114 sentences) + # - opus_ELRC-416-Swedish_Social_Secur/v1 - not enough data (113 sentences) + # - opus_tldr-pages/v2023-08-29 - not enough data (26 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-srp - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-srp - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-srp - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-srp_RS - duplicate with opus + train: + - opus_OpenSubtitles/v2018 # 42,635,098 sentences + - opus_NLLB/v1 # 26,510,872 sentences + - opus_HPLT/v1.1 # 3,904,423 sentences + - opus_CCAligned/v1 # 1,992,692 sentences + - opus_XLEnt/v1.2 # 1,474,447 sentences + - opus_WikiMatrix/v1 # 395,569 sentences + - opus_GoURMET/v1 # 329,004 sentences + - opus_QED/v2.0a # 284,942 sentences + - opus_TED2020/v1 # 260,966 sentences + - opus_SETIMES/v2 # 225,169 sentences + - opus_wikimedia/v20230407 # 217,199 sentences + - opus_NeuLab-TedTalks/v1 # 152,477 sentences + - opus_bible-uedin/v1 # 62,131 sentences + - opus_KDE4/v2 # 60,827 sentences + - opus_Tatoeba/v2023-04-12 # 21,760 sentences + - opus_GlobalVoices/v2018q4 # 20,309 sentences + - opus_ELRC-wikipedia_health/v1 # 12,707 sentences + - opus_TildeMODEL/v2018 # 2,024 
sentences + - opus_EUbookshop/v2 # 1,608 sentences + - opus_ELRC-3041-wikipedia_health/v1 # 744 sentences + - opus_ELRC_2922/v1 # 743 sentences + - mtdata_ELRC-swedish_social_security-1-eng-srp + - mtdata_Neulab-tedtalks_test-1-eng-srp # ~3,117,009 sentences (352.2 MB) + - mtdata_Tilde-worldbank-1-eng-srp # ~2,533 sentences (286.3 kB) + + # The monolingual data contains: + # ~35,920,209 sentences + mono-src: + - news-crawl_news.2008 # ~3,522 sentences (398K) + - news-crawl_news.2009 # ~18,584 sentences (2.1M) + - news-crawl_news.2010 # ~9,734 sentences (1.1M) + - news-crawl_news.2011 # ~2,530 sentences (286K) + - news-crawl_news.2018 # ~18,584 sentences (2.1M) + - news-crawl_news.2019 # ~1,929,203 sentences (218M) + - news-crawl_news.2020 # ~5,619,469 sentences (635M) + - news-crawl_news.2021 # ~8,849,557 sentences (1.0G) + - news-crawl_news.2022 # ~9,734,513 sentences (1.1G) + - news-crawl_news.2023 # ~9,734,513 sentences (1.1G) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: 
+ beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/sv-en-spring-2024.yml b/configs/sv-en-spring-2024.yml new file mode 100644 index 000000000..bd44178e6 --- /dev/null +++ b/configs/sv-en-spring-2024.yml @@ -0,0 +1,239 @@ +# The initial configuration was generated using: +# task config-generator -- sv en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: sv + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Lindat-khresmoi_summary_dev-2-eng-swe + - mtdata_Neulab-tedtalks_dev-1-eng-swe + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 211,400,324 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (77,008,059 sentences) + # - opus_RF/v1 - not enough data (180 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-swedish_labour_part2-1-eng-swe 
- duplicate with opus + # - mtdata_ELRC-swedish_labour_part1-1-eng-swe - duplicate with opus + # - mtdata_ELRC-swedish_food-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.norden.org-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.sida.se-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.visitestonia.com-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.vtv.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-valtioneuvosto.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-vnk.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-euipo_2017-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.turku.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-www.vero.fi-1-eng-swe - duplicate with opus + # - mtdata_ELRC-emea-1-eng-swe - duplicate with opus + # - mtdata_ELRC-vaccination-1-eng-swe - duplicate with opus + # - mtdata_ELRC-wikipedia_health-1-eng-swe - duplicate with opus + # - mtdata_ELRC-antibiotic-1-eng-swe - duplicate with opus + # - mtdata_ELRC-europarl_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-ec_europa_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-eur_lex_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-presscorner_covid-1-eng-swe - duplicate with opus + # - mtdata_ELRC-nteu_tiera-1-eng-swe - duplicate with opus + # - mtdata_EU-ecdc-1-eng-swe - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-swe - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-swe - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-6-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-7.1-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-8-eng-swe - duplicate with opus + # - mtdata_ParaCrawl-paracrawl-9-eng-swe - duplicate with opus + # - mtdata_Statmt-europarl-7-swe-eng - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-swe_SE - duplicate with opus + train: + - opus_NLLB/v1 # 77,008,059 sentences + - 
opus_ParaCrawl/v9 # 49,110,322 sentences + - opus_OpenSubtitles/v2018 # 17,660,152 sentences + - opus_ELRC-4268-NTEU_TierA/v1 # 12,737,597 sentences + - opus_CCAligned/v1 # 12,544,114 sentences + - opus_ELRC-EMEA/v1 # 12,083,941 sentences + - opus_LinguaTools-WikiTitles/v2014 # 8,058,690 sentences + - opus_DGT/v2019 # 5,139,521 sentences + - opus_XLEnt/v1.2 # 3,674,011 sentences + - opus_TildeMODEL/v2018 # 3,234,207 sentences + - opus_EUbookshop/v2 # 1,915,479 sentences + - opus_Europarl/v8 # 1,892,723 sentences + - opus_EMEA/v3 # 1,086,217 sentences + - opus_JRC-Acquis/v3.0 # 792,924 sentences + - opus_ELRC-2725-EMEA/v1 # 759,846 sentences + - opus_ELRC-5067-SciPar/v1 # 670,816 sentences + - opus_WikiMatrix/v1 # 546,289 sentences + - opus_ELITR-ECA/v1 # 389,808 sentences + - opus_KDE4/v2 # 232,485 sentences + - opus_QED/v2.0a # 171,126 sentences + - opus_ELRC-presscorner_covid/v1 # 147,973 sentences + - opus_Tanzil/v1 # 127,493 sentences + - opus_ELRC-Swedish_Migration/v1 # 124,398 sentences + - opus_TED2020/v1 # 120,718 sentences + - opus_ELRC-www.turku.fi/v1 # 107,773 sentences + - opus_NeuLab-TedTalks/v1 # 69,332 sentences + - opus_wikimedia/v20230407 # 63,135 sentences + - opus_bible-uedin/v1 # 62,137 sentences + - opus_infopankki/v1 # 51,749 sentences + - opus_ELRC-1770-valtioneuvosto.fi/v1 # 49,084 sentences + - opus_ELRC-valtioneuvosto.fi/v1 # 49,084 sentences + - opus_ELRC-1133-www.vtv.fi/v1 # 46,501 sentences + - opus_ELRC-www.vtv.fi/v1 # 46,501 sentences + - opus_ELRC-734-www.norden.org/v1 # 37,763 sentences + - opus_ELRC-www.norden.org/v1 # 37,763 sentences + - opus_ELRC-1772-vnk.fi/v1 # 33,627 sentences + - opus_ELRC-vnk.fi/v1 # 33,627 sentences + - opus_WikiSource/v1 # 33,283 sentences + - opus_ELRC-817-Swedish_Audit_Riksre/v1 # 30,352 sentences + - opus_PHP/v1 # 30,198 sentences + - opus_Tatoeba/v2023-04-12 # 27,050 sentences + - opus_ELRC-3574-EUR_LEX_covid/v1 # 22,445 sentences + - opus_ELRC-EUR_LEX/v1 # 22,445 sentences + - 
opus_ELRC-2037-www.vero.fi/v1 # 22,317 sentences + - opus_ELRC-www.vero.fi/v1 # 22,317 sentences + - opus_ELRC-2026-EUIPO_2017/v1 # 16,947 sentences + - opus_ELRC-EUIPO_2017/v1 # 16,947 sentences + - opus_ELRC-EUROPARL_covid/v1 # 15,681 sentences + - opus_ELRC-antibiotic/v1 # 15,452 sentences + - opus_ELRC-1131-www.visitestonia.com/v1 # 14,335 sentences + - opus_ELRC-www.visitestonia.com/v1 # 14,335 sentences + - opus_ELRC-2884-EU_publications_medi/v1 # 13,100 sentences + - opus_ELRC-EU_publications/v1 # 13,100 sentences + - opus_ELRC-802-Swedish_Competition_/v1 # 11,374 sentences + - opus_ELRC-wikipedia_health/v1 # 10,395 sentences + - opus_ELRC-928-Annual_Reports_Swedi/v1 # 10,227 sentences + - opus_EUconst/v1 # 9,954 sentences + - opus_ELRC-2033-www.turku.fi/v1 # 9,706 sentences + - opus_GlobalVoices/v2018q4 # 8,793 sentences + - opus_ELRC-829-Swedish_Migration_Bo/v1 # 8,366 sentences + - opus_ELRA-W0239/v1 # 8,365 sentences + - opus_ELRC-417-Swedish_Work_Environ/v1 # 7,475 sentences + - opus_ELRC-3615-presscorner_covid/v1 # 6,856 sentences + - opus_ELRC-744-Finnish_Information_/v1 # 6,819 sentences + - opus_ELRC-Finnish_Information/v1 # 6,819 sentences + - opus_ELRA-W0222/v1 # 6,818 sentences + - opus_ELRC_3382/v1 # 3,760 sentences + - opus_Books/v1 # 3,095 sentences + - opus_ELRC-Swedish_Labour/v1 # 2,778 sentences + - opus_ECDC/v2016-03-16 # 2,528 sentences + - opus_ELRC-1013-Sweden_a_Pocket/v1 # 2,200 sentences + - opus_ELRA-W0130/v1 # 2,199 sentences + - opus_ELRC-712-Social_Insurance_Frs/v1 # 1,953 sentences + - opus_ELRA-W0213/v1 # 1,952 sentences + - opus_ELRC-3473-EC_EUROPA_covid/v1 # 1,858 sentences + - opus_ELRC-EC_EUROPA/v1 # 1,858 sentences + - opus_ELRC-401-Swedish_Labour_Part2/v1 # 1,768 sentences + - opus_ELRC-929-www.sida.se/v1 # 1,545 sentences + - opus_ELRC-823-Swedish_Swedish_Crim/v1 # 1,503 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 1,447 sentences + - opus_ELRC-416-Swedish_Social_Secur/v1 # 1,446 sentences + - 
opus_ELRC-436-Swedish_Food/v1 # 1,147 sentences + - opus_ELRA-W0305/v1 # 1,146 sentences + - opus_ELRC-406-Swedish_Labour_Part1/v1 # 1,011 sentences + - opus_ELRC-3213-antibiotic/v1 # 953 sentences + - opus_ELRC-830-Swedish_Economic_Reg/v1 # 949 sentences + - opus_ELRC-3302-EUROPARL_covid/v1 # 844 sentences + - opus_tldr-pages/v2023-08-29 # 566 sentences + - opus_ELRC-3082-wikipedia_health/v1 # 535 sentences + - opus_ELRC_2922/v1 # 534 sentences + - opus_ELRC_2923/v1 # 499 sentences + - opus_ELRC-2752-vaccination/v1 # 497 sentences + - opus_ELRC-vaccination/v1 # 497 sentences + - mtdata_ELRC-swedish_social_security-1-eng-swe + - mtdata_ELRC-swedish_work_environment-1-eng-swe + - mtdata_ELRC-social_insurance_försäkringskassan-1-eng-swe + - mtdata_ELRC-finnish_information_bank-1-eng-swe + - mtdata_ELRC-swedish_competition_authority_konkurrensverket-1-eng-swe + - mtdata_ELRC-swedish_audit_riksrevisionen-1-eng-swe + - mtdata_ELRC-swedish_swedish_crime_victim_compensation_support_authority-1-eng-swe + - mtdata_ELRC-swedish_migration_board_migrationsverket-1-eng-swe + - mtdata_ELRC-swedish_economic_regional_growth_tillväxtverket-1-eng-swe + - mtdata_ELRC-annual_reports_swedish_pension_system-1-eng-swe + - mtdata_ELRC-sweden_a_pocket_guide_book-1-eng-swe + - mtdata_ELRC-eu_publications_medical_v2-1-eng-swe + - mtdata_ELRC-nteu_tierb-1-eng-swe + - mtdata_EU-eac_forms-1-eng-swe # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-swe # ~31,162 sentences (3.5 MB) + - mtdata_EU-dcep-1-eng-swe # ~980,674 sentences (110.8 MB) + - mtdata_Lindat-khresmoi_summary_test-2-eng-swe # ~11,808 sentences (1.3 MB) + - mtdata_Neulab-tedtalks_test-1-eng-swe # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-dcep_wmt17-1-swe-eng # ~1,137,607 sentences (128.5 MB) + - mtdata_Statmt-books_wmt17-1-swe-eng # ~2,797 sentences (316.2 kB) + - mtdata_Tilde-eesc-2017-eng-swe # ~1,798,328 sentences (203.2 MB) + - mtdata_Tilde-ema-2016-eng-swe # ~215,912 sentences (24.4 MB) + - 
mtdata_Tilde-ecb-2017-eng-swe # ~3,314 sentences (374.5 kB) + - mtdata_Tilde-rapid-2016-eng-swe # ~400,648 sentences (45.3 MB) + + # The monolingual data contains: + # ~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/tr-en-spring-2024.yml b/configs/tr-en-spring-2024.yml new file mode 100644 index 000000000..e56b22c38 --- /dev/null +++ b/configs/tr-en-spring-2024.yml @@ -0,0 +1,144 @@ +# The initial configuration was generated using: +# task config-generator -- tr en --name spring-2024 +# +# The documentation for this config can 
be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: tr + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-tur + - flores_aug-mix_dev + - sacrebleu_aug-mix_wmt18/test-ts + - sacrebleu_aug-mix_wmt16 + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + - sacrebleu_wmt18 + - sacrebleu_wmt17 + - sacrebleu_wmt16/dev + + # The training data contains: + # 121,323,758 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (47,045,956 sentences) + # - opus_MultiMaCoCu/v2 - ignored datasets (1,646,740 sentences) + # - opus_GNOME/v1 - not enough data (150 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-tur - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-tur - duplicate with opus + # - mtdata_LinguaTools-wikititles-2014-eng-tur - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-tur - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-tur_TR - duplicate with opus + train: + - opus_NLLB/v1 # 47,045,956 sentences + - opus_OpenSubtitles/v2018 # 44,986,121 sentences + - opus_CCAligned/v1 # 13,650,311 sentences + - opus_LinguaTools-WikiTitles/v2014 # 3,915,248 sentences + - opus_XLEnt/v1.2 # 3,809,464 sentences + - opus_MaCoCu/v2 # 1,646,741 sentences + - opus_GoURMET/v1 # 1,308,307 sentences + - opus_Tanzil/v1 # 1,189,967 
sentences + - opus_Tatoeba/v2023-04-12 # 676,920 sentences + - opus_wikimedia/v20230407 # 668,099 sentences + - opus_QED/v2.0a # 482,964 sentences + - opus_WikiMatrix/v1 # 477,736 sentences + - opus_TED2020/v1 # 378,033 sentences + - opus_SETIMES/v2 # 207,678 sentences + - opus_NeuLab-TedTalks/v1 # 195,641 sentences + - opus_Wikipedia/v1.0 # 159,979 sentences + - opus_KDE4/v2 # 153,438 sentences + - opus_TED2013/v1.1 # 137,028 sentences + - opus_bible-uedin/v1 # 60,411 sentences + - opus_infopankki/v1 # 44,030 sentences + - opus_Bianet/v1 # 34,770 sentences + - opus_PHP/v1 # 32,713 sentences + - opus_EUbookshop/v2 # 23,706 sentences + - opus_WMT-News/v2019 # 20,016 sentences + - opus_GlobalVoices/v2018q4 # 7,838 sentences + - opus_ELRC-3057-wikipedia_health/v1 # 2,368 sentences + - opus_ELRC-wikipedia_health/v1 # 2,368 sentences + - opus_ELRC_2922/v1 # 2,367 sentences + - opus_tldr-pages/v2023-08-29 # 1,956 sentences + - opus_TildeMODEL/v2018 # 1,584 sentences + - mtdata_EU-eac_forms-1-eng-tur # ~31,162 sentences (3.5 MB) + - mtdata_EU-eac_reference-1-eng-tur # ~31,162 sentences (3.5 MB) + - mtdata_Neulab-tedtalks_test-1-eng-tur # ~3,117,009 sentences (352.2 MB) + - mtdata_Statmt-newsdev_tren-2016-tur-eng # ~402,756 sentences (45.5 MB) + - mtdata_Statmt-newsdev_entr-2016-eng-tur # ~402,756 sentences (45.5 MB) + - mtdata_Tilde-worldbank-1-eng-tur # ~1,827 sentences (206.5 kB) + + # The monolingual data contains: + # ~20,230,124 sentences + mono-src: + - news-crawl_news.2010 # ~38 sentences (4.4K) + - news-crawl_news.2017 # ~1,194,690 sentences (135M) + - news-crawl_news.2018 # ~1,964,601 sentences (222M) + - news-crawl_news.2019 # ~3,168,141 sentences (358M) + - news-crawl_news.2020 # ~3,716,814 sentences (420M) + - news-crawl_news.2021 # ~3,814,159 sentences (431M) + - news-crawl_news.2022 # ~3,575,221 sentences (404M) + - news-crawl_news.2023 # ~2,796,460 sentences (316M) + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - 
news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/configs/vi-en-spring-2024.yml b/configs/vi-en-spring-2024.yml new file mode 100644 index 000000000..bd4202652 --- /dev/null +++ b/configs/vi-en-spring-2024.yml @@ -0,0 +1,111 @@ +# The initial configuration was generated using: +# task config-generator -- vi en --name spring-2024 +# +# The documentation for this config can be found here: +# https://github.com/mozilla/firefox-translations-training/blob/c2a6e7f8c899ba363c5058e200692bfd8e321299/taskcluster/configs/config.prod.yml +experiment: + name: spring-2024 + src: vi + trg: en + best-model: chrf + use-opuscleaner: 'true' + opuscleaner-mode: 
defaults + bicleaner: + default-threshold: 0.5 + dataset-thresholds: {} + mono-max-sentences-src: 500_000_000 + mono-max-sentences-trg: 200_000_000 + spm-sample-size: 10_000_000 + spm-vocab-size: 32000 + teacher-ensemble: 2 + teacher-mode: two-stage + pretrained-models: {} +datasets: + devtest: + - mtdata_Neulab-tedtalks_dev-1-eng-vie + - flores_aug-mix_dev + test: + - flores_devtest + - flores_aug-mix_devtest + - flores_aug-title_devtest + - flores_aug-upper_devtest + - flores_aug-typos_devtest + - flores_aug-noise_devtest + - flores_aug-inline-noise_devtest + + # The training data contains: + # 69,085,316 sentences + # + # Skipped datasets: + # - opus_CCMatrix/v1 - ignored datasets (50,092,444 sentences) + # - opus_GNOME/v1 - not enough data (149 sentences) + # - opus_Ubuntu/v14.10 - not enough data (0 sentences) + # - opus_XLEnt/v1.2 - not enough data (0 sentences) + # - mtdata_ELRC-wikipedia_health-1-eng-vie - duplicate with opus + # - mtdata_Facebook-wikimatrix-1-eng-vie - duplicate with opus + # - mtdata_Neulab-tedtalks_train-1-eng-vie - duplicate with opus + # - mtdata_Statmt-ccaligned-1-eng-vie_VN - duplicate with opus + train: + - opus_NLLB/v1 # 50,092,444 sentences + - opus_CCAligned/v1 # 12,394,417 sentences + - opus_OpenSubtitles/v2018 # 3,505,276 sentences + - opus_WikiMatrix/v1 # 1,073,752 sentences + - opus_wikimedia/v20230407 # 669,743 sentences + - opus_QED/v2.0a # 338,024 sentences + - opus_TED2020/v1 # 326,417 sentences + - opus_NeuLab-TedTalks/v1 # 184,973 sentences + - opus_StanfordNLP-NMT/v1.0 # 133,167 sentences + - opus_ELRC-wikipedia_health/v1 # 126,413 sentences + - opus_bible-uedin/v1 # 124,390 sentences + - opus_Wikipedia/v1.0 # 58,116 sentences + - opus_KDE4/v2 # 42,782 sentences + - opus_Tatoeba/v2023-04-12 # 6,855 sentences + - opus_ELRC-3086-wikipedia_health/v1 # 4,274 sentences + - opus_ELRC_2922/v1 # 4,273 sentences + - mtdata_Neulab-tedtalks_test-1-eng-vie # ~3,117,009 sentences (352.2 MB) + + # The monolingual data contains: + # 
~0 sentences + mono-src: [] + + # The monolingual data contains: + # ~195,823,002 sentences + mono-trg: + - news-crawl_news.2007 # ~1,557,522 sentences (176M) + - news-crawl_news.2008 # ~5,389,380 sentences (609M) + - news-crawl_news.2009 # ~6,557,522 sentences (741M) + - news-crawl_news.2010 # ~3,247,787 sentences (367M) + - news-crawl_news.2011 # ~6,318,584 sentences (714M) + - news-crawl_news.2012 # ~6,407,079 sentences (724M) + - news-crawl_news.2013 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2014 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2015 # ~10,619,469 sentences (1.2G) + - news-crawl_news.2016 # ~7,982,300 sentences (902M) + - news-crawl_news.2017 # ~11,504,424 sentences (1.3G) + - news-crawl_news.2018 # ~7,920,353 sentences (895M) + - news-crawl_news.2019 # ~17,699,115 sentences (2.0G) + - news-crawl_news.2020 # ~22,123,893 sentences (2.5G) + - news-crawl_news.2021 # ~21,238,938 sentences (2.4G) + - news-crawl_news.2022 # ~23,008,849 sentences (2.6G) + - news-crawl_news.2023 # ~23,008,849 sentences (2.6G) +marian-args: + decoding-backward: + beam-size: '12' + mini-batch-words: '2000' + decoding-teacher: + mini-batch-words: '4000' + precision: float16 + training-backward: + early-stopping: '5' + training-teacher: + early-stopping: '20' + training-student: + early-stopping: '20' + training-student-finetuned: + early-stopping: '20' +target-stage: all +wandb-publication: true +taskcluster: + split-chunks: 20 + worker-classes: + default: gcp-spot diff --git a/poetry.lock b/poetry.lock index 9659d55d4..a05f55a29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3811,4 +3811,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "9ef01d74291f3092911ace9f003f728c4f9d691a4e0fdcfe15ab05dd1fa5ad00" +content-hash = "f4c1131b0c136675710a2997b3ef56fd010e3808b64bdc7ded3727db31ed5f6a" diff --git a/pyproject.toml b/pyproject.toml index 9961b2e21..4ecdf00b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-39,6 +39,7 @@ websocket_client ="*" # This install group is for running tests. Note that any dependencies in the # pipeline are installed separately through the run_task test abstraction. This # list is only for things imported directly in the tests. +ruamel-yaml = "^0.18.6" [tool.poetry.group.tests.dependencies] mtdata="0.3.2" requests="2.26.0" diff --git a/taskcluster/configs/config.prod.yml b/taskcluster/configs/config.prod.yml index 7cbc5bd70..4ca63159c 100644 --- a/taskcluster/configs/config.prod.yml +++ b/taskcluster/configs/config.prod.yml @@ -9,7 +9,7 @@ # An "experiment" is an individual training run. experiment: # Provide an identifiable name for your experiment. - name: baseline_en_ru + name: baseline # The source and target languages. This is the language tag part of the # BCP 47 locale identifier. diff --git a/tests/test_find_corpus.py b/tests/test_find_corpus.py index f402c7e7a..e98a817d3 100644 --- a/tests/test_find_corpus.py +++ b/tests/test_find_corpus.py @@ -75,10 +75,6 @@ def test_opus(mock_opus_data, capsys): capsys, "The opus dataset outputs nicely.", """ - Fetching datasets from: - https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest - - ┌──────────────────────────────┐ │ OPUS - https://opus.nlpl.eu/ │ └──────────────────────────────┘ diff --git a/utils/config_generator.py b/utils/config_generator.py new file mode 100644 index 000000000..17f177e8f --- /dev/null +++ b/utils/config_generator.py @@ -0,0 +1,433 @@ +import argparse +import re +import subprocess +import sys +from io import StringIO +from pathlib import Path + +import ruamel.yaml + +from utils.find_corpus import ( + fetch_mtdata, + fetch_news_crawl, + fetch_opus, + fetch_sacrebleu, + get_remote_file_size, +) + +""" +Generate a training config for a language pair based on the latest production +training config, taskcluster/configs/config.prod.yml. 
# The repository root; this file lives one directory below it.
root_dir = Path(__file__).parent.parent
# The production config that is used as the template for every generated config.
prod_config_path = root_dir / "taskcluster/configs/config.prod.yml"

# Language pairs, keyed by (source, target), that have a pretrained student model available.
pretrained_student_models = {
    ("ru", "en"): "https://storage.googleapis.com/releng-translations-dev/models/ru-en/better-teacher/student"
}  # fmt: skip

skip_datasets = [
    # The NLLB dataset is based off of the CCMatrix dataset, and is mostly duplicated.
    "CCMatrix",
    # Skip Multi* datasets as they are generally multilingual versions of the original datasets.
    "MultiMaCoCu",
    "MultiHPLT",
    # In Russian, the WikiTitles data had its direction reversed. The `LinguaTools-WikiTitles`
    # version is fine.
    "WikiTitles",
]

# Do not include small datasets. This works around #508, and minimizes dataset tasks that
# won't bring a lot more data.
minimum_dataset_sentences = 200

flores_101_languages = {
    "af", "amh", "ar", "as", "ast", "az", "be", "bn", "bs", "bg", "ca", "ceb", "cs", "ckb", "cy",
    "da", "de", "el", "en", "et", "fa", "fi", "fr", "ful", "ga", "gl", "gu", "ha", "he", "hi",
    "hr", "hu", "hy", "ig", "id", "is", "it", "jv", "ja", "kam", "kn", "ka", "kk", "kea", "km",
    "ky", "ko", "lo", "lv", "ln", "lt", "lb", "lg", "luo", "ml", "mr", "mk", "mt", "mn", "mi",
    "ms", "my", "nl", "nb", "npi", "nso", "ny", "oc", "om", "or", "pa", "pl", "pt", "pus", "ro",
    "ru", "sk", "sl", "sna", "snd", "so", "es", "sr", "sv", "sw", "ta", "te", "tg", "tl", "th",
    "tr", "uk", "umb", "ur", "uz", "vi", "wo", "xh", "yo", "zh", "zu"
}  # fmt: skip


def get_git_revision_hash(remote_branch: str) -> str:
    """
    Return the hash of the most recent common ancestor of `remote_branch` and the local HEAD.

    The hash is embedded in a documentation link, so it should be something that will always
    be around. The prod config locally could be different than the remote one, but pointing
    at the merge base is better than pointing at a local commit that may never be pushed.

    Raises subprocess.CalledProcessError when the `git merge-base` command fails.
    """
    return (
        subprocess.check_output(["git", "merge-base", remote_branch, "HEAD"])
        .decode("ascii")
        .strip()
    )


def update_config(
    prod_config: dict, name: str, source: str, target: str, fast: bool
) -> dict[str, str]:
    """
    Mutate the loaded prod config in place so that it targets the given language pair.

    Returns a dict that maps a YAML section key (including its leading indentation) to the
    comment text that should be placed above that section.
    """
    experiment = prod_config["experiment"]

    # Update the prod config for this language pair.
    experiment["name"] = name
    experiment["src"] = source
    experiment["trg"] = target
    experiment["bicleaner"]["dataset-thresholds"] = {}

    pretrained_model = pretrained_student_models.get((source, target))
    if pretrained_model:
        # Switch to the one stage teacher mode, as the higher quality backtranslations lead
        # to issues with early stopping when switching between stages.
        experiment["teacher-mode"] = "one-stage"
        experiment["pretrained-models"]["train-backwards"]["urls"] = [pretrained_model]
    else:
        experiment["pretrained-models"] = {}

    datasets = prod_config["datasets"]

    # Clear out the base config.
    for section in ("train", "devtest", "test", "mono-src", "mono-trg"):
        datasets[section].clear()

    # ruamel.yaml only supports inline comments. This dict will do string matching to apply
    # comments to the top of a section.
    comment_section: dict[str, str] = {}

    add_train_data(source, target, datasets, comment_section, fast)
    add_test_data(
        source,
        target,
        datasets["test"],
        datasets["devtest"],
        comment_section,
    )
    # The keys carry the two space indentation that the dumped YAML uses for these sections.
    add_mono_data(source, datasets["mono-src"], "  mono-src:", comment_section)
    add_mono_data(target, datasets["mono-trg"], "  mono-trg:", comment_section)

    return comment_section


def add_train_data(
    source: str, target: str, datasets: dict, comment_section: dict[str, str], fast: bool
):
    """
    Fill in the training datasets from opus and mtdata.

    `datasets` is the "datasets" mapping of the config. The opus corpora are appended to its
    "train" list. The mtdata corpora are appended to "test", "devtest", or "train" depending
    on the suffix of the entry name. A summary comment is stored in `comment_section`.
    When `fast` is set, the remote size lookup for the mtdata entries is skipped.
    """
    print("Fetching opus")
    opus_datasets = fetch_opus(source, target)
    total_sentences = 0
    skipped_datasets = []
    visited_corpora = set()
    train = datasets["train"]

    for dataset in opus_datasets:
        sentences = dataset.alignment_pairs or 0
        corpus_key = dataset.corpus_key()
        # Some datasets are ignored or too small to be included.
        if dataset.corpus in skip_datasets:
            skipped_datasets.append(f"{corpus_key} - ignored datasets ({sentences:,} sentences)")
            continue
        if sentences < minimum_dataset_sentences:
            skipped_datasets.append(f"{corpus_key} - not enough data ({sentences:,} sentences)")
            continue

        visited_corpora.add(normalize_corpus_name(dataset.corpus))
        total_sentences += sentences
        train.append(corpus_key)
        train.yaml_add_eol_comment(
            f"{sentences:,} sentences".rjust(70 - len(corpus_key), " "),
            len(train) - 1,
        )

    print("Fetching mtdata")
    entries = fetch_mtdata(source, target)

    for corpus_key, entry in entries.items():
        # mtdata can have test and devtest data as well. This must be a single if/elif/else
        # chain, otherwise the "test" choice is overwritten by the "train" fallback.
        if entry.did.name.endswith("test"):
            dataset = datasets["test"]
        elif entry.did.name.endswith("dev"):
            dataset = datasets["devtest"]
        else:
            dataset = datasets["train"]

        corpus_name = normalize_corpus_name(entry.did.name)
        group_corpus_name = normalize_corpus_name(entry.did.group + entry.did.name)
        if corpus_name in visited_corpora or group_corpus_name in visited_corpora:
            skipped_datasets.append(f"{corpus_key} - duplicate with opus")
            continue

        if entry.did.name in skip_datasets:
            skipped_datasets.append(f"{entry.did.name} - ignored datasets")
            continue

        dataset.append(corpus_key)
        if fast:
            continue

        byte_size, display_size = get_remote_file_size(entry.url)
        if byte_size:
            # Don't add the sentences to the total, as these will be commented out by default.
            sentences = estimate_sentence_size(byte_size)
            dataset.yaml_add_eol_comment(
                f"~{sentences:,} sentences ".rjust(70 - len(corpus_key), " ")
                + f"({display_size})",
                # Index into the list that was appended to, which is not always "train".
                len(dataset) - 1,
            )

    comments = [
        "The training data contains:",
        f" {total_sentences:,} sentences",
    ]
    if skipped_datasets:
        comments.append("")
        comments.append("Skipped datasets:")
        comments.extend(f" - {d}" for d in skipped_datasets)

    comment_section["  train:"] = "\n".join(comments)


def normalize_corpus_name(corpus_name: str):
    """Normalize the corpus name so that it's easy to deduplicate between opus and mtdata."""

    # Remove the language tags at the end.
    #   mtdata_ELRC-vnk.fi-1-eng-fin
    #                       ^^^^^^^^
    corpus_name = re.sub(r"-\w{3}-\w{3}$", "", corpus_name)

    # Remove anything that is not a letter, including numbers. This is a little aggressive,
    # but should help deduplicate more datasets. For example:
    #   opus:   725-Hallituskausi_2011_2
    #   mtdata: hallituskausi_2011_2015-1-eng-fin
    corpus_name = re.sub(r"[^a-z]", "", corpus_name.lower())

    # Datasets could be split by train/test/dev. Remove the "train" word so that it will match
    # between Opus and mtdata.
    #   opus:   NeuLab-TedTalks/v1
    #   mtdata: Neulab-tedtalks_train-1-eng-fin
    #   mtdata: Neulab-tedtalks_test-1-eng-fin
    #   mtdata: Neulab-tedtalks_dev-1-eng-fin
    return re.sub(r"train$", "", corpus_name)


def add_test_data(
    source: str,
    target: str,
    test_datasets: list[str],
    devtest_datasets: list[str],
    comment_section: dict[str, str],
):
    """
    Append the flores and sacrebleu evaluation datasets to the test and devtest lists.

    The sacrebleu datasets alternate between the test list and the devtest list. Skipped
    datasets are reported through a comment stored in `comment_section`.
    """
    skipped_datasets = []
    print("Fetching flores")
    if source in flores_101_languages and target in flores_101_languages:
        test_datasets.append("flores_devtest")

        # Add augmented datasets to check performance for the specific cases
        test_datasets.extend(
            [
                "flores_aug-mix_devtest",
                "flores_aug-title_devtest",
                "flores_aug-upper_devtest",
                "flores_aug-typos_devtest",
                "flores_aug-noise_devtest",
                "flores_aug-inline-noise_devtest",
            ]
        )

        devtest_datasets.append("flores_aug-mix_dev")

    is_test = True  # Flip between devtest and test.
    print("Fetching sacrebleu")
    for d in fetch_sacrebleu(source, target):
        if d in skip_datasets:
            # This could be a dataset with a variant design.
            skipped_datasets.append(f"{d} - variant dataset")
            continue

        dataset_name = d.replace("sacrebleu_", "")
        if is_test:
            test_datasets.append(f"sacrebleu_{dataset_name}")
        else:
            devtest_datasets.append(f"sacrebleu_aug-mix_{dataset_name}")
        is_test = not is_test

    # Only write the comment when there is something to report.
    if skipped_datasets:
        comment_section["  devtest:"] = "\n".join(
            [
                "Skipped test/devtest datasets:",
                *[f" - {d}" for d in skipped_datasets],
            ]
        )


def estimate_sentence_size(bytes: int) -> int:
    """Estimate the sentences based on the compressed byte size"""
    # One dataset measured 113 bytes per sentence, use that as a rough estimate.
    bytes_per_sentence = 113
    return bytes // bytes_per_sentence


def add_mono_data(
    lang: str,
    datasets: list[str],
    comment_key: str,
    comment_section: dict[str, str],
):
    """
    Append the news-crawl datasets for `lang` to `datasets`.

    Each dataset with a known size gets an inline comment with its estimated sentence count,
    and the estimated total is stored in `comment_section` under `comment_key`.
    """
    print("Fetching newscrawl for", lang)
    sentence_count = 0
    for dataset in fetch_news_crawl(lang):
        datasets.append(dataset.name)
        if not dataset.size:
            continue
        sentences = estimate_sentence_size(dataset.size)
        sentence_count += sentences
        datasets.yaml_add_eol_comment(
            f"~{sentences:,} sentences ".rjust(50 - len(dataset.name), " ")
            + f"({dataset.display_size})",
            len(datasets) - 1,
        )

    comment_section[comment_key] = "\n".join(
        [
            "The monolingual data contains:",
            f" ~{sentence_count:,} sentences",
        ]
    )


def strip_comments(yaml_text: str) -> str:
    """
    ruamel.yaml preserves key ordering and comments. This function strips out the comments
    and the empty lines, and returns the remaining YAML text.
    """
    kept_lines = []
    for raw_line in yaml_text.splitlines():
        # Drop the lines that only contain a comment.
        if raw_line.strip().startswith("#"):
            continue

        # Remove any comments at the end. Only comments made up of word characters,
        # whitespace, dashes, and periods are matched.
        line = re.sub(r"#[\s\w\-.]*$", "", raw_line)

        # Don't add any empty lines.
        if line.strip():
            kept_lines.append(line.rstrip() + "\n")

    return "".join(kept_lines)


def apply_comments_to_yaml_string(yaml, prod_config, comment_section, remote_branch: str) -> str:
    """
    ruamel.yaml only supports inline comments, so do direct string manipulation to apply
    all the comments needed. Returns the final config text, including a generated header.
    """
    # Dump out the yaml to a string so that it can be manipulated.
    output_stream = StringIO()
    yaml.dump(prod_config, output_stream)
    yaml_string: str = output_stream.getvalue()
    yaml_string = apply_comment_section(comment_section, yaml_string)

    script_args = " ".join(sys.argv[1:])
    return "\n".join(
        [
            "# The initial configuration was generated using:",
            f"# task config-generator -- {script_args}",
            "#",
            "# The documentation for this config can be found here:",
            f"# https://github.com/mozilla/firefox-translations-training/blob/{get_git_revision_hash(remote_branch)}/taskcluster/configs/config.prod.yml",
            yaml_string,
        ]
    )


def apply_comment_section(comment_section: dict[str, str], yaml_string: str) -> str:
    """
    Insert each comment above the line that starts with its section key.

    The keys of `comment_section` include the leading indentation of the section, and that
    same indentation is applied to every line of the inserted comment.
    """
    for key, raw_comment in comment_section.items():
        # Find the indent amount for the key. This pattern matches any string, as it
        # accepts an empty indent.
        indent = re.match(r"\s*", key).group()

        # Indent the lines, and add the # comment.
        comment = "\n".join([f"{indent}# {line}" for line in raw_comment.splitlines()])

        yaml_string = yaml_string.replace(f"\n{key}", f"\n\n{comment}\n{key}")
    return yaml_string


def find_argument_errors(source: str, target: str, name: str) -> list[str]:
    """Return one message for every invalid command line value, or an empty list."""
    langtag_re = r"[a-z]{2,3}"
    errors = []
    if not re.fullmatch(langtag_re, source):
        errors.append("The source language should be a 2 or 3 letter lang tag.")
    if not re.fullmatch(langtag_re, target):
        errors.append("The target language should be a 2 or 3 letter lang tag.")
    if not re.fullmatch(r"[\w-]+", name):
        errors.append(
            "The name of the training config should only contain alphanumeric, underscores, and dashes."
        )
    return errors


def main() -> None:
    """Parse the command line, generate the config, and write it to the configs directory."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("source", metavar="SOURCE", type=str, help="The source language tag")
    parser.add_argument("target", metavar="TARGET", type=str, help="The target language tag")
    parser.add_argument(
        "--name",
        metavar="name",
        type=str,
        required=True,
        help="The name of the config, which gets constructed like so: configs/{source}-{target}-{name}.yml",
    )
    parser.add_argument(
        "--remote_branch",
        metavar="REF",
        type=str,
        default="origin/main",
        help="The remote branch that contains the config.prod.yml. Typically origin/main, or origin/release",
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help="Skip slow network requests like looking up dataset size",
    )

    args = parser.parse_args()

    # Validate the inputs, and stop on any invalid value rather than only an invalid name.
    errors = find_argument_errors(args.source, args.target, args.name)
    if errors:
        for error in errors:
            print(error, file=sys.stderr)
        sys.exit(1)

    # ruamel.yaml preserves comments and ordering unlike PyYAML
    yaml = ruamel.yaml.YAML()

    # Load the prod yaml.
    with prod_config_path.open() as f:
        yaml_string = f.read()
    yaml_string = strip_comments(yaml_string)
    prod_config = yaml.load(StringIO(yaml_string))

    comment_section = update_config(prod_config, args.name, args.source, args.target, args.fast)
    final_config = apply_comments_to_yaml_string(
        yaml, prod_config, comment_section, args.remote_branch
    )
    final_config_path = root_dir / "configs" / f"{args.source}-{args.target}-{args.name}.yml"

    print("Writing config to:", str(final_config_path))
    final_config_path.write_text(final_config)


if __name__ == "__main__":
    main()
datasets_typed = [OpusDataset(**corpus_data) for corpus_data in datasets.get("corpora", [])] - datasets_typed = sorted(datasets_typed, key=lambda x: x.alignment_pairs or 0, reverse=True) + return sorted(datasets_typed, key=lambda x: x.alignment_pairs or 0, reverse=True) + +def get_opus(source: str, target: str, download_url: bool): print("") print("┌──────────────────────────────┐") print("│ OPUS - https://opus.nlpl.eu/ │") print("└──────────────────────────────┘") + datasets = fetch_opus(source, target) + print_table( [ [ @@ -81,31 +84,33 @@ def get_opus(source: str, target: str, download_url: bool): *[ [ dataset.corpus, - dataset.name(), + dataset.corpus_key(), dataset.alignment_pairs, dataset.humanize_size(), dataset.url if download_url else dataset.website_url(), ] - for dataset in datasets_typed + for dataset in datasets if dataset.alignment_pairs ], ] ) - names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets["corpora"]] + names = [dataset.corpus_key() for dataset in datasets] print_yaml(names, exclude=["OPUS100v", "WMT-News"]) -def get_sacrebleu(source: str, target: str): +def fetch_sacrebleu(source: str, target: str) -> dict[str, dict[str, any]]: import sacrebleu - entries = [ - (name, entry) + return { + name: entry for name, entry in sacrebleu.DATASETS.items() if f"{source}-{target}" in entry or f"{target}-{source}" in entry - ] + } - names = [f"sacrebleu_{name}" for name, entry in entries] + +def get_sacrebleu(source: str, target: str): + datasets_dict = fetch_sacrebleu(source, target) print("") print("┌─────────────────────────────────────────────────┐") @@ -118,14 +123,14 @@ def get_sacrebleu(source: str, target: str): [ # name, - entry["description"], - ", ".join(entry["data"]), + dataset["description"], + ", ".join(dataset["data"]), ] - for name, entry in entries + for name, dataset in datasets_dict.items() ], ] ) - print_yaml(names) + print_yaml(list(f"sacrebleu_{name}" for name in datasets_dict.keys())) def get_size(tags: list[str]) -> 
def get_remote_file_size(
    url: str, display_not_200: bool = True, timeout: float = 1
) -> tuple[Optional[int], Optional[str]]:
    """
    Look up the size of a remote file with a HEAD request, following redirects.

    Args:
        url: The URL of the remote file.
        display_not_200: Print a message when the response status code is not 200.
        timeout: The number of seconds to wait on the server before giving up.

    Returns:
        A tuple of (size in bytes, humanized size). Both values are None when the
        request raises a requests exception or the status code is not 200. When the
        response has no Content-Length header, the size is reported as 0.
    """
    # Only the request itself can raise a RequestException, so keep the try minimal.
    try:
        response = requests.head(url, timeout=timeout, allow_redirects=True)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None, None

    if response.status_code != 200:
        if display_not_200:
            print(f"Failed to retrieve file information. Status code: {response.status_code}")
        return None, None

    int_size = int(response.headers.get("Content-Length", 0))
    return int_size, humanize.naturalsize(int_size)
class NewsCrawlDataset(NamedTuple):
    # The corpus key, e.g. "news-crawl_news.2013"
    name: str
    # The URL of the gzipped corpus.
    url: str
    # The size in bytes, computed from the rounded size in the file listing.
    size: Optional[int]
    # The size exactly as it is displayed in the file listing, e.g. "1.2G"
    display_size: Optional[str]


def fetch_news_crawl(lang: str) -> list[NewsCrawlDataset]:
    """
    List the News Crawl datasets that are available for a language.

    The file listing at https://data.statmt.org/news-crawl/{lang}/ is requested and
    scraped for the `news.{year}.{lang}.shuffled.deduped.gz` files and their sizes.
    The sizes use decimal multipliers (K, M, G, T), and a size with no unit is read
    as a byte count. Rows that do not have a size in this form are not matched.

    A message is printed and an empty list is returned when the listing can not be
    retrieved, or when no datasets are found in it.
    """
    base_url = f"https://data.statmt.org/news-crawl/{lang}/"
    response = requests.get(base_url, allow_redirects=True)

    if not response.ok:
        print("No newscrawl data was available for", lang)
        return []

    # NOTE(review): this assumes each file is listed on a single row, where the size
    # is delimited by tags or whitespace, e.g. ">1.2G<" - confirm against the page.
    regex = re.compile(
        r"""
        # Match the file name year.
        #   >news.2008.en.shuffled.deduped.gz<
        #         ^^^^
        >news\.(\d+)\.\w+\.shuffled\.deduped\.gz<

        # Lazily skip over the rest of the row. A greedy match here would leave only
        # the last digit of the size for the capture group, e.g. "6M" from "176M".
        [^\n]*?

        # Match the file size and unit. Requiring a delimiter on both sides stops
        # the date and time on the row from being read as a size.
        #   >176M<
        #    ^^^^
        [>\s](\d+(?:\.\d+)?)([KMGT]?)[<\s]
        """,
        re.VERBOSE,
    )

    matches = regex.findall(response.text)
    if not matches:
        print("The regex could not find newscrawl datasets for", lang)
        return []

    # Every unit the regex can match has an entry here, so the multiplier can never
    # be left unset, or be carried over from the previous row.
    multipliers = {
        "": 1,
        "K": 1_000,
        "M": 1_000_000,
        "G": 1_000_000_000,
        "T": 1_000_000_000_000,
    }

    datasets = []
    for year, size_number, size_unit in matches:
        name = f"news-crawl_news.{year}"
        url = f"https://data.statmt.org/news-crawl/{lang}/news.{year}.{lang}.shuffled.deduped.gz"
        size = int(float(size_number) * multipliers[size_unit])

        datasets.append(NewsCrawlDataset(name, url, size, f"{size_number}{size_unit}"))

    return datasets