% Conference paper in the Findings of ACL: EMNLP 2023 volume.
% Case-sensitive words are brace-protected as whole words so that styles
% applying sentence casing keep their capitalisation.
@inproceedings{le-luu-2023-parallel,
  title     = {A Parallel Corpus for {Vietnamese} Central-Northern Dialect Text Transfer},
  author    = {Le, Thang and
               Luu, Anh},
  editor    = {Bouamor, Houda and
               Pino, Juan and
               Bali, Kalika},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2023},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-emnlp.925},
  doi       = {10.18653/v1/2023.findings-emnlp.925},
  pages     = {13839--13855},
  abstract  = {The Vietnamese language embodies dialectal variants closely attached to the nation{'}s three macro-regions: the Northern, Central and Southern regions. As the northern dialect forms the basis of the standard language, it{'}s considered the prestige dialect. While the northern dialect differs from the remaining two in certain aspects, it almost shares an identical lexicon with the southern dialect, making the textual attributes nearly interchangeable. In contrast, the central dialect possesses a number of unique vocabularies and is less mutually intelligible to the standard dialect. Through preliminary experiments, we observe that current NLP models do not possess understandings of the Vietnamese central dialect text, which most likely originates from the lack of resources. To facilitate research on this domain, we introduce a new parallel corpus for Vietnamese central-northern dialect text transfer. Via exhaustive benchmarking, we discover monolingual language models{'} superiority over their multilingual counterparts on the dialect transfer task. We further demonstrate that fine-tuned transfer models can seamlessly improve the performance of existing NLP systems on the central dialect domain with dedicated results in translation and text-image retrieval tasks.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID le-luu-2023-parallel. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="le-luu-2023-parallel">
    <!-- Title and authors of the paper -->
    <titleInfo>
      <title>A Parallel Corpus for Vietnamese Central-Northern Dialect Text Transfer</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Thang</namePart>
      <namePart type="family">Le</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anh</namePart>
      <namePart type="family">Luu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Houda</namePart>
        <namePart type="family">Bouamor</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="family">Pino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Singapore</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The Vietnamese language embodies dialectal variants closely attached to the nation’s three macro-regions: the Northern, Central and Southern regions. As the northern dialect forms the basis of the standard language, it’s considered the prestige dialect. While the northern dialect differs from the remaining two in certain aspects, it almost shares an identical lexicon with the southern dialect, making the textual attributes nearly interchangeable. In contrast, the central dialect possesses a number of unique vocabularies and is less mutually intelligible to the standard dialect. Through preliminary experiments, we observe that current NLP models do not possess understandings of the Vietnamese central dialect text, which most likely originates from the lack of resources. To facilitate research on this domain, we introduce a new parallel corpus for Vietnamese central-northern dialect text transfer. Via exhaustive benchmarking, we discover monolingual language models’ superiority over their multilingual counterparts on the dialect transfer task. We further demonstrate that fine-tuned transfer models can seamlessly improve the performance of existing NLP systems on the central dialect domain with dedicated results in translation and text-image retrieval tasks.</abstract>
    <!-- Identifiers and access location -->
    <identifier type="citekey">le-luu-2023-parallel</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-emnlp.925</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-emnlp.925</url>
    </location>
    <!-- Date and page extent within the host volume -->
    <part>
      <date>2023-12</date>
      <extent unit="page">
        <start>13839</start>
        <end>13855</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Parallel Corpus for Vietnamese Central-Northern Dialect Text Transfer
%A Le, Thang
%A Luu, Anh
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F le-luu-2023-parallel
%X The Vietnamese language embodies dialectal variants closely attached to the nation’s three macro-regions: the Northern, Central and Southern regions. As the northern dialect forms the basis of the standard language, it’s considered the prestige dialect. While the northern dialect differs from the remaining two in certain aspects, it almost shares an identical lexicon with the southern dialect, making the textual attributes nearly interchangeable. In contrast, the central dialect possesses a number of unique vocabularies and is less mutually intelligible to the standard dialect. Through preliminary experiments, we observe that current NLP models do not possess understandings of the Vietnamese central dialect text, which most likely originates from the lack of resources. To facilitate research on this domain, we introduce a new parallel corpus for Vietnamese central-northern dialect text transfer. Via exhaustive benchmarking, we discover monolingual language models’ superiority over their multilingual counterparts on the dialect transfer task. We further demonstrate that fine-tuned transfer models can seamlessly improve the performance of existing NLP systems on the central dialect domain with dedicated results in translation and text-image retrieval tasks.
%R 10.18653/v1/2023.findings-emnlp.925
%U https://aclanthology.org/2023.findings-emnlp.925
%U https://doi.org/10.18653/v1/2023.findings-emnlp.925
%P 13839-13855
Markdown (Informal)
[A Parallel Corpus for Vietnamese Central-Northern Dialect Text Transfer](https://aclanthology.org/2023.findings-emnlp.925) (Le & Luu, Findings 2023)
ACL