% ACL Anthology record 2025.naacl-short.12 (NAACL 2025, Volume 2: Short Papers).
% Proper nouns in the title are brace-protected as whole words so that styles
% which apply sentence casing keep their capitals.
@inproceedings{nguyen-etal-2025-improving,
  title     = {Improving {Vietnamese}-{English} Cross-Lingual Retrieval for Legal and General Domains},
  author    = {Nguyen, Toan Ngoc and
               Hai, Nam Le and
               Hieu, Nguyen Doan and
               Nguyen, Dai An and
               Van, Linh Ngo and
               Nguyen, Thien Huu and
               Dinh, Sang},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-short.12/},
  doi       = {10.18653/v1/2025.naacl-short.12},
  pages     = {142--153},
  isbn      = {979-8-89176-190-2},
  abstract  = {Document retrieval plays a crucial role in numerous question-answering systems, yet research has concentrated on the general knowledge domain and resource-rich languages like English. In contrast, it remains largely underexplored in low-resource languages and cross-lingual scenarios within specialized domain knowledge such as legal. We present a novel dataset designed for cross-lingual retrieval between Vietnamese and English, which not only covers the general domain but also extends to the legal field. Additionally, we propose auxiliary loss function and symmetrical training strategy that significantly enhance the performance of state-of-the-art models on these retrieval tasks. Our contributions offer a significant resource and methodology aimed at improving cross-lingual retrieval in both legal and general QA settings, facilitating further advancements in document retrieval research across multiple languages and a broader spectrum of specialized domains. All the resources related to our work can be accessed at \url{huggingface.co/datasets/bkai-foundation-models/crosslingual}.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey nguyen-etal-2025-improving -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="nguyen-etal-2025-improving">
    <titleInfo>
      <title>Improving Vietnamese-English Cross-Lingual Retrieval for Legal and General Domains</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Toan</namePart>
      <namePart type="given">Ngoc</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nam</namePart>
      <namePart type="given">Le</namePart>
      <namePart type="family">Hai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nguyen</namePart>
      <namePart type="given">Doan</namePart>
      <namePart type="family">Hieu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dai</namePart>
      <namePart type="given">An</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Linh</namePart>
      <namePart type="given">Ngo</namePart>
      <namePart type="family">Van</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thien</namePart>
      <namePart type="given">Huu</namePart>
      <namePart type="family">Nguyen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sang</namePart>
      <namePart type="family">Dinh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-190-2</identifier>
    </relatedItem>
    <abstract>Document retrieval plays a crucial role in numerous question-answering systems, yet research has concentrated on the general knowledge domain and resource-rich languages like English. In contrast, it remains largely underexplored in low-resource languages and cross-lingual scenarios within specialized domain knowledge such as legal. We present a novel dataset designed for cross-lingual retrieval between Vietnamese and English, which not only covers the general domain but also extends to the legal field. Additionally, we propose auxiliary loss function and symmetrical training strategy that significantly enhance the performance of state-of-the-art models on these retrieval tasks. Our contributions offer a significant resource and methodology aimed at improving cross-lingual retrieval in both legal and general QA settings, facilitating further advancements in document retrieval research across multiple languages and a broader spectrum of specialized domains. All the resources related to our work can be accessed at huggingface.co/datasets/bkai-foundation-models/crosslingual.</abstract>
    <!-- Identifiers and landing page for the paper itself -->
    <identifier type="citekey">nguyen-etal-2025-improving</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-short.12</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-short.12/</url>
    </location>
    <!-- Issue date and page range within the host volume -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>142</start>
        <end>153</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Vietnamese-English Cross-Lingual Retrieval for Legal and General Domains
%A Nguyen, Toan Ngoc
%A Hai, Nam Le
%A Hieu, Nguyen Doan
%A Nguyen, Dai An
%A Van, Linh Ngo
%A Nguyen, Thien Huu
%A Dinh, Sang
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-190-2
%F nguyen-etal-2025-improving
%X Document retrieval plays a crucial role in numerous question-answering systems, yet research has concentrated on the general knowledge domain and resource-rich languages like English. In contrast, it remains largely underexplored in low-resource languages and cross-lingual scenarios within specialized domain knowledge such as legal. We present a novel dataset designed for cross-lingual retrieval between Vietnamese and English, which not only covers the general domain but also extends to the legal field. Additionally, we propose auxiliary loss function and symmetrical training strategy that significantly enhance the performance of state-of-the-art models on these retrieval tasks. Our contributions offer a significant resource and methodology aimed at improving cross-lingual retrieval in both legal and general QA settings, facilitating further advancements in document retrieval research across multiple languages and a broader spectrum of specialized domains. All the resources related to our work can be accessed at huggingface.co/datasets/bkai-foundation-models/crosslingual.
%R 10.18653/v1/2025.naacl-short.12
%U https://aclanthology.org/2025.naacl-short.12/
%U https://doi.org/10.18653/v1/2025.naacl-short.12
%P 142-153
Markdown (Informal)
[Improving Vietnamese-English Cross-Lingual Retrieval for Legal and General Domains](https://aclanthology.org/2025.naacl-short.12/) (Nguyen et al., NAACL 2025)
ACL
- Toan Ngoc Nguyen, Nam Le Hai, Nguyen Doan Hieu, Dai An Nguyen, Linh Ngo Van, Thien Huu Nguyen, and Sang Dinh. 2025. Improving Vietnamese-English Cross-Lingual Retrieval for Legal and General Domains. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pages 142–153, Albuquerque, New Mexico. Association for Computational Linguistics.