% arXiv preprint (DOI 10.48550/arXiv.2406.06403); the note field records acceptance at Interspeech 2024.
@techreport{956768f8c6864281835256a4c0c468aa,
  author        = {Lux, Florian and Meyer, Sarina and Behringer, Lyonel and Zalkow, Frank and Do, Phat and Coler, Matt and Habets, Emanu{\"e}l A. P. and Vu, Ngoc Thang},
  title         = {Meta Learning Text-to-Speech Synthesis in over 7000 Languages},
  type          = {Working Paper},
  institution   = {arXiv},
  publisher     = {arXiv},
  year          = {2024},
  month         = jun,
  day           = {10},
  doi           = {10.48550/arXiv.2406.06403},
  eprint        = {2406.06403},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  note          = {Accepted at {Interspeech} 2024},
  language      = {English},
  keywords      = {cs.CL, cs.LG, cs.SD, eess.AS},
  abstract      = {In this work, we take on the challenging task of building a single text-to-speech synthesis system that is capable of generating speech in over 7000 languages, many of which lack sufficient data for traditional TTS development. By leveraging a novel integration of massively multilingual pretraining and meta learning to approximate language representations, our approach enables zero-shot speech synthesis in languages without any available data. We validate our system's performance through objective measures and human evaluation across a diverse linguistic landscape. By releasing our code and models publicly, we aim to empower communities with limited linguistic resources and foster further innovation in the field of speech technology.},
}