@inproceedings{corbeil-abdi-ghavidel-2021-assessing,
title = "Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task",
author = "Corbeil, Jean-Philippe and
Abdi Ghavidel, Hadi",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)",
month = sep,
year = "2021",
address = "Held Online",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.ranlp-1.35",
pages = "301--308",
abstract = "In the domain of natural language augmentation, the eligibility of generated samples remains not well understood. To gather insights around this eligibility issue, we apply a transformer-based similarity calculation within the BET framework based on backtranslation, in the context of automated paraphrase detection. While providing a rigorous statistical foundation to BET, we push their results by analyzing statistically the impacts of the level of qualification, and several sample sizes. We conducted a vast amount of experiments on the MRPC corpus using six pre-trained models: BERT, XLNet, Albert, RoBERTa, Electra, and DeBerta. We show that our method improves significantly these {``}base{''} models while using only a fraction of the corpus. Our results suggest that using some of those smaller pre-trained models, namely RoBERTa base and Electra base, helps us reach F1 scores very close to their large counterparts, as reported on the GLUE benchmark. On top of acting as a regularizer, the proposed method is efficient in dealing with data scarcity with improvements of around 3{\%} in F1 score for most pre-trained models, and more than 7.5{\%} in the case of Electra.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="corbeil-abdi-ghavidel-2021-assessing">
<titleInfo>
<title>Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jean-Philippe</namePart>
<namePart type="family">Corbeil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hadi</namePart>
<namePart type="family">Abdi Ghavidel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Held Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the domain of natural language augmentation, the eligibility of generated samples remains not well understood. To gather insights around this eligibility issue, we apply a transformer-based similarity calculation within the BET framework based on backtranslation, in the context of automated paraphrase detection. While providing a rigorous statistical foundation to BET, we push their results by analyzing statistically the impacts of the level of qualification, and several sample sizes. We conducted a vast amount of experiments on the MRPC corpus using six pre-trained models: BERT, XLNet, Albert, RoBERTa, Electra, and DeBerta. We show that our method improves significantly these “base” models while using only a fraction of the corpus. Our results suggest that using some of those smaller pre-trained models, namely RoBERTa base and Electra base, helps us reach F1 scores very close to their large counterparts, as reported on the GLUE benchmark. On top of acting as a regularizer, the proposed method is efficient in dealing with data scarcity with improvements of around 3% in F1 score for most pre-trained models, and more than 7.5% in the case of Electra.</abstract>
<identifier type="citekey">corbeil-abdi-ghavidel-2021-assessing</identifier>
<location>
<url>https://aclanthology.org/2021.ranlp-1.35</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>301</start>
<end>308</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task
%A Corbeil, Jean-Philippe
%A Abdi Ghavidel, Hadi
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Held Online
%F corbeil-abdi-ghavidel-2021-assessing
%X In the domain of natural language augmentation, the eligibility of generated samples remains not well understood. To gather insights around this eligibility issue, we apply a transformer-based similarity calculation within the BET framework based on backtranslation, in the context of automated paraphrase detection. While providing a rigorous statistical foundation to BET, we push their results by analyzing statistically the impacts of the level of qualification, and several sample sizes. We conducted a vast amount of experiments on the MRPC corpus using six pre-trained models: BERT, XLNet, Albert, RoBERTa, Electra, and DeBerta. We show that our method improves significantly these “base” models while using only a fraction of the corpus. Our results suggest that using some of those smaller pre-trained models, namely RoBERTa base and Electra base, helps us reach F1 scores very close to their large counterparts, as reported on the GLUE benchmark. On top of acting as a regularizer, the proposed method is efficient in dealing with data scarcity with improvements of around 3% in F1 score for most pre-trained models, and more than 7.5% in the case of Electra.
%U https://aclanthology.org/2021.ranlp-1.35
%P 301-308
Markdown (Informal)
[Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task](https://aclanthology.org/2021.ranlp-1.35) (Corbeil & Abdi Ghavidel, RANLP 2021)
ACL
Jean-Philippe Corbeil and Hadi Abdi Ghavidel. 2021. [Assessing the Eligibility of Backtranslated Samples Based on Semantic Similarity for the Paraphrase Identification Task](https://aclanthology.org/2021.ranlp-1.35). In *Proceedings of the International Conference on Recent Advances in Natural Language Processing (RANLP 2021)*, pages 301–308, Held Online. INCOMA Ltd.
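
The abstract above describes qualifying backtranslated augmentation samples with a transformer-based semantic-similarity score before using them for paraphrase identification. As a rough illustrative sketch of that general idea only (not the authors' BET implementation), the snippet below assumes the `sentence-transformers` library, an arbitrary similarity threshold, and a user-supplied round-trip translation function; all of these names and values are assumptions, not details from the paper.

```python
# Illustrative sketch: keep backtranslated augmentation candidates whose
# sentence-embedding cosine similarity to the original exceeds a threshold.
# The encoder, threshold, and `backtranslate` callable are assumptions made
# for illustration; they are not taken from the cited paper.
from typing import Callable, List, Tuple

from sentence_transformers import SentenceTransformer, util  # assumed dependency


def filter_backtranslations(
    originals: List[str],
    backtranslate: Callable[[str], str],   # hypothetical round-trip translation fn
    threshold: float = 0.85,               # assumed eligibility cut-off
    model_name: str = "all-MiniLM-L6-v2",  # assumed similarity encoder
) -> List[Tuple[str, str, float]]:
    """Return (original, backtranslation, score) triples whose cosine
    similarity meets `threshold`; drop the rest as ineligible samples."""
    encoder = SentenceTransformer(model_name)
    kept: List[Tuple[str, str, float]] = []
    for src in originals:
        candidate = backtranslate(src)  # e.g. an en -> fr -> en round trip
        emb = encoder.encode([src, candidate], convert_to_tensor=True)
        score = util.cos_sim(emb[0], emb[1]).item()
        if score >= threshold:
            kept.append((src, candidate, score))
    return kept
```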