@article{ren-etal-2022-effective,
title = "Effective Approaches to Neural Query Language Identification",
author = "Ren, Xingzhang and
Yang, Baosong and
Liu, Dayiheng and
Zhang, Haibo and
Lv, Xiaoyu and
Yao, Liang and
Xie, Jun",
journal = "Computational Linguistics",
volume = "48",
number = "4",
month = dec,
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.cl-4.14/",
doi = "10.1162/coli_a_00451",
pages = "887--906",
abstract = "Query language identification (Q-LID) plays a crucial role in a cross-lingual search engine. There exist two main challenges in Q-LID: (1) insufficient contextual information in queries for disambiguation; and (2) the lack of query-style training examples for low-resource languages. In this article, we propose a neural Q-LID model by alleviating the above problems from both model architecture and data augmentation perspectives. Concretely, we build our model upon the advanced Transformer model. In order to enhance the discrimination of queries, a variety of external features (e.g., character, word, as well as script) are fed into the model and fused by a multi-scale attention mechanism. Moreover, to remedy the low resource challenge in this task, a novel machine translation{--}based strategy is proposed to automatically generate synthetic query-style data for low-resource languages. We contribute the first Q-LID test set called QID-21, which consists of search queries in 21 languages. Experimental results reveal that our model yields better classification accuracy than strong baselines and existing LID systems on both query and traditional LID tasks.1"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ren-etal-2022-effective">
<titleInfo>
<title>Effective Approaches to Neural Query Language Identification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xingzhang</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baosong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayiheng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haibo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoyu</namePart>
<namePart type="family">Lv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Query language identification (Q-LID) plays a crucial role in a cross-lingual search engine. There exist two main challenges in Q-LID: (1) insufficient contextual information in queries for disambiguation; and (2) the lack of query-style training examples for low-resource languages. In this article, we propose a neural Q-LID model by alleviating the above problems from both model architecture and data augmentation perspectives. Concretely, we build our model upon the advanced Transformer model. In order to enhance the discrimination of queries, a variety of external features (e.g., character, word, as well as script) are fed into the model and fused by a multi-scale attention mechanism. Moreover, to remedy the low resource challenge in this task, a novel machine translation–based strategy is proposed to automatically generate synthetic query-style data for low-resource languages. We contribute the first Q-LID test set called QID-21, which consists of search queries in 21 languages. Experimental results reveal that our model yields better classification accuracy than strong baselines and existing LID systems on both query and traditional LID tasks.1</abstract>
<identifier type="citekey">ren-etal-2022-effective</identifier>
<identifier type="doi">10.1162/coli_a_00451</identifier>
<location>
<url>https://aclanthology.org/2022.cl-4.14/</url>
</location>
<part>
<date>2022-12</date>
<detail type="volume"><number>48</number></detail>
<detail type="issue"><number>4</number></detail>
<extent unit="page">
<start>887</start>
<end>906</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Effective Approaches to Neural Query Language Identification
%A Ren, Xingzhang
%A Yang, Baosong
%A Liu, Dayiheng
%A Zhang, Haibo
%A Lv, Xiaoyu
%A Yao, Liang
%A Xie, Jun
%J Computational Linguistics
%D 2022
%8 December
%V 48
%N 4
%I MIT Press
%C Cambridge, MA
%F ren-etal-2022-effective
%X Query language identification (Q-LID) plays a crucial role in a cross-lingual search engine. There exist two main challenges in Q-LID: (1) insufficient contextual information in queries for disambiguation; and (2) the lack of query-style training examples for low-resource languages. In this article, we propose a neural Q-LID model by alleviating the above problems from both model architecture and data augmentation perspectives. Concretely, we build our model upon the advanced Transformer model. In order to enhance the discrimination of queries, a variety of external features (e.g., character, word, as well as script) are fed into the model and fused by a multi-scale attention mechanism. Moreover, to remedy the low resource challenge in this task, a novel machine translation–based strategy is proposed to automatically generate synthetic query-style data for low-resource languages. We contribute the first Q-LID test set called QID-21, which consists of search queries in 21 languages. Experimental results reveal that our model yields better classification accuracy than strong baselines and existing LID systems on both query and traditional LID tasks.1
%R 10.1162/coli_a_00451
%U https://aclanthology.org/2022.cl-4.14/
%U https://doi.org/10.1162/coli_a_00451
%P 887-906
Markdown (Informal)
[Effective Approaches to Neural Query Language Identification](https://aclanthology.org/2022.cl-4.14/) (Ren et al., CL 2022)
ACL