Multilingual sequence-to-sequence models perform poorly with increased
language coverage and fail to consistently generate text in the correct target
language in few-shot settings. To address these challenges, we propose mmT5, a
modular multilingual sequence-to-sequence model. mmT5 utilizes
language-specific modules during pre-training, which disentangle
language-specific information from language-agnostic information. We identify
representation drift during fine-tuning as a key limitation of modular
generative models and develop strategies that enable effective zero-shot
transfer. Our model outperforms mT5 at the same parameter sizes by a large
margin on representative natural language understanding and generation tasks in
40+ languages. Compared to mT5, mmT5 raises the rate of generating text in the
correct language under zero-shot settings from 7% to 99%, thereby greatly
alleviating the source language hallucination problem.
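The abstract does not specify how the language-specific modules are built. As a rough illustration only, the sketch below assumes they are bottleneck adapters selected by a language ID and placed after a shared (language-agnostic) transformer sub-layer; all names and sizes are hypothetical and this is not the authors' code.

# A minimal sketch (not the authors' implementation): one way to realize
# mmT5-style language-specific modules, assuming bottleneck adapters that
# are routed by language ID after a shared sub-layer. Names are hypothetical.
import torch
import torch.nn as nn

class LanguageModule(nn.Module):
    """Bottleneck adapter: down-project, nonlinearity, up-project, residual."""
    def __init__(self, d_model: int, d_bottleneck: int):
        super().__init__()
        self.down = nn.Linear(d_model, d_bottleneck)
        self.up = nn.Linear(d_bottleneck, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.up(torch.relu(self.down(x)))

class ModularLayer(nn.Module):
    """A shared (language-agnostic) sub-layer followed by one
    language-specific module per pre-training language."""
    def __init__(self, shared: nn.Module, languages: list[str],
                 d_model: int, d_bottleneck: int = 64):
        super().__init__()
        self.shared = shared
        self.lang_modules = nn.ModuleDict(
            {lang: LanguageModule(d_model, d_bottleneck) for lang in languages})

    def forward(self, x: torch.Tensor, lang: str) -> torch.Tensor:
        h = self.shared(x)                 # language-agnostic computation
        return self.lang_modules[lang](h)  # language-specific computation

# Hypothetical usage: route a batch through the Swahili module.
layer = ModularLayer(nn.Linear(512, 512), ["en", "de", "sw"], d_model=512)
out = layer(torch.randn(2, 10, 512), lang="sw")

The paper's strategies against representation drift at fine-tuning time (e.g. which parameters to freeze or update) are its own contribution and are not reproduced in this sketch.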
Description: mmT5: Modular Multilingual Pre-Training Solves Source Language Hallucinations
@misc{pfeiffer2023modular,
  author        = {Pfeiffer, Jonas and Piccinno, Francesco and Nicosia, Massimo and Wang, Xinyi and Reid, Machel and Ruder, Sebastian},
  title         = {mmT5: Modular Multilingual Pre-Training Solves Source Language Hallucinations},
  year          = {2023},
  eprint        = {2305.14224},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/2305.14224},
  keywords      = {low-resource nlp}
}