% Conference paper in the LREC-COLING 2024 proceedings (record from aclanthology.org).
@inproceedings{xie-etal-2024-typos-correction,
  author    = {Xie, Guicai and
               Zhang, Ke and
               Duan, Lei and
               Zhang, Wei and
               Huang, Zeqian},
  title     = {Typos Correction Training against Misspellings from Text-to-Text Transformers},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages     = {16907--16918},
  url       = {https://aclanthology.org/2024.lrec-main.1470},
  abstract  = {Dense retrieval (DR) has become a mainstream approach to information seeking, where a system is required to return relevant information to a user query. In real-life applications, typoed queries resulting from the users{'} mistyping words or phonetic typing errors exist widely in search behaviors. Current dense retrievers experience a significant drop in retrieval effectiveness when they encounter typoed queries. Therefore, the search system requires the extra introduction of spell-checkers to deal with typos and then applies the DR model to perform robust matching. Herein, we argue that directly conducting the typos correction training would be beneficial to make an end-to-end retriever against misspellings. To this end, we propose a novel approach that can facilitate the incorporation of the spelling correction objective into the DR model using the encoder-decoder architecture. During typos correction training, we also develop a prompt-based augmentation technique to enhance the DR space alignment of the typoed query and its original query. Extensive experiments demonstrate that the effectiveness of our proposed end-to-end retriever significantly outperforms existing typos-aware training approaches and sophisticated training advanced retrievers. Our code is available at https://github.com/striver314/ToCoTR.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for a single conference paper, wrapped in a one-item collection. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="xie-etal-2024-typos-correction">
    <!-- Paper title -->
    <titleInfo>
      <title>Typos Correction Training against Misspellings from Text-to-Text Transformers</title>
    </titleInfo>
    <!-- Paper authors (personal names carrying the "author" role) -->
    <name type="personal">
      <namePart type="given">Guicai</namePart>
      <namePart type="family">Xie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ke</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lei</namePart>
      <namePart type="family">Duan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zeqian</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Dense retrieval (DR) has become a mainstream approach to information seeking, where a system is required to return relevant information to a user query. In real-life applications, typoed queries resulting from the users’ mistyping words or phonetic typing errors exist widely in search behaviors. Current dense retrievers experience a significant drop in retrieval effectiveness when they encounter typoed queries. Therefore, the search system requires the extra introduction of spell-checkers to deal with typos and then applies the DR model to perform robust matching. Herein, we argue that directly conducting the typos correction training would be beneficial to make an end-to-end retriever against misspellings. To this end, we propose a novel approach that can facilitate the incorporation of the spelling correction objective into the DR model using the encoder-decoder architecture. During typos correction training, we also develop a prompt-based augmentation technique to enhance the DR space alignment of the typoed query and its original query. Extensive experiments demonstrate that the effectiveness of our proposed end-to-end retriever significantly outperforms existing typos-aware training approaches and sophisticated training advanced retrievers. Our code is available at https://github.com/striver314/ToCoTR.</abstract>
    <!-- Citation key; same value as the mods ID attribute above -->
    <identifier type="citekey">xie-etal-2024-typos-correction</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.1470</url>
    </location>
    <!-- Date and page extent of the paper -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>16907</start>
        <end>16918</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Typos Correction Training against Misspellings from Text-to-Text Transformers
%A Xie, Guicai
%A Zhang, Ke
%A Duan, Lei
%A Zhang, Wei
%A Huang, Zeqian
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F xie-etal-2024-typos-correction
%X Dense retrieval (DR) has become a mainstream approach to information seeking, where a system is required to return relevant information to a user query. In real-life applications, typoed queries resulting from the users’ mistyping words or phonetic typing errors exist widely in search behaviors. Current dense retrievers experience a significant drop in retrieval effectiveness when they encounter typoed queries. Therefore, the search system requires the extra introduction of spell-checkers to deal with typos and then applies the DR model to perform robust matching. Herein, we argue that directly conducting the typos correction training would be beneficial to make an end-to-end retriever against misspellings. To this end, we propose a novel approach that can facilitate the incorporation of the spelling correction objective into the DR model using the encoder-decoder architecture. During typos correction training, we also develop a prompt-based augmentation technique to enhance the DR space alignment of the typoed query and its original query. Extensive experiments demonstrate that the effectiveness of our proposed end-to-end retriever significantly outperforms existing typos-aware training approaches and sophisticated training advanced retrievers. Our code is available at https://github.com/striver314/ToCoTR.
%U https://aclanthology.org/2024.lrec-main.1470
%P 16907-16918
Markdown (Informal)
[Typos Correction Training against Misspellings from Text-to-Text Transformers](https://aclanthology.org/2024.lrec-main.1470) (Xie et al., LREC-COLING 2024)
ACL