@inproceedings{sukhareva-2020-context,
title = "Context-Aware Text Normalisation for Historical Dialects",
author = "Sukhareva, Maria",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.89",
doi = "10.18653/v1/2020.coling-main.89",
pages = "1023--1036",
abstract = "Context-aware historical text normalisation is a severely under-researched area. To fill the gap we propose a context-aware normalisation approach that relies on the state-of-the-art methods in neural machine translation and transfer learning. We propose a multidialect normaliser with a context-aware reranking of the candidates. The reranker relies on a word-level n-gram language model that is applied to the five best normalisation candidates. The results are evaluated on the historical multidialect datasets of German, Spanish, Portuguese and Slovene. We show that incorporating dialectal information into the training leads to an accuracy improvement on all the datasets. The context-aware reranking gives further improvement over the baseline. For three out of six datasets, we reach a significantly higher accuracy than reported in the previous studies. The other three results are comparable with the current state-of-the-art. The code for the reranker is published as open-source.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sukhareva-2020-context">
    <titleInfo>
      <title>Context-Aware Text Normalisation for Historical Dialects</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Maria</namePart>
      <namePart type="family">Sukhareva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 28th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Donia</namePart>
        <namePart type="family">Scott</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nuria</namePart>
        <namePart type="family">Bel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Context-aware historical text normalisation is a severely under-researched area. To fill the gap we propose a context-aware normalisation approach that relies on the state-of-the-art methods in neural machine translation and transfer learning. We propose a multidialect normaliser with a context-aware reranking of the candidates. The reranker relies on a word-level n-gram language model that is applied to the five best normalisation candidates. The results are evaluated on the historical multidialect datasets of German, Spanish, Portuguese and Slovene. We show that incorporating dialectal information into the training leads to an accuracy improvement on all the datasets. The context-aware reranking gives further improvement over the baseline. For three out of six datasets, we reach a significantly higher accuracy than reported in the previous studies. The other three results are comparable with the current state-of-the-art. The code for the reranker is published as open-source.</abstract>
    <identifier type="citekey">sukhareva-2020-context</identifier>
    <identifier type="doi">10.18653/v1/2020.coling-main.89</identifier>
    <location>
      <url>https://aclanthology.org/2020.coling-main.89</url>
    </location>
    <part>
      <date>2020-12</date>
      <extent unit="page">
        <start>1023</start>
        <end>1036</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Context-Aware Text Normalisation for Historical Dialects
%A Sukhareva, Maria
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F sukhareva-2020-context
%X Context-aware historical text normalisation is a severely under-researched area. To fill the gap we propose a context-aware normalisation approach that relies on the state-of-the-art methods in neural machine translation and transfer learning. We propose a multidialect normaliser with a context-aware reranking of the candidates. The reranker relies on a word-level n-gram language model that is applied to the five best normalisation candidates. The results are evaluated on the historical multidialect datasets of German, Spanish, Portuguese and Slovene. We show that incorporating dialectal information into the training leads to an accuracy improvement on all the datasets. The context-aware reranking gives further improvement over the baseline. For three out of six datasets, we reach a significantly higher accuracy than reported in the previous studies. The other three results are comparable with the current state-of-the-art. The code for the reranker is published as open-source.
%R 10.18653/v1/2020.coling-main.89
%U https://aclanthology.org/2020.coling-main.89
%U https://doi.org/10.18653/v1/2020.coling-main.89
%P 1023-1036
Markdown (Informal)
[Context-Aware Text Normalisation for Historical Dialects](https://aclanthology.org/2020.coling-main.89) (Sukhareva, COLING 2020)
ACL
Maria Sukhareva. 2020. Context-Aware Text Normalisation for Historical Dialects. In Proceedings of the 28th International Conference on Computational Linguistics, pages 1023–1036, Barcelona, Spain (Online). International Committee on Computational Linguistics.
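The abstract describes reranking the five best normalisation candidates for each token with a word-level n-gram language model over the already-normalised context. The paper's released reranker is not reproduced here; the snippet below is only a minimal Python sketch of that general idea, assuming an add-one-smoothed trigram model. The names (`TrigramLM`, `rerank`) and the toy training sentences and candidate list are invented for illustration.

```python
from collections import Counter
import math


class TrigramLM:
    """Tiny add-one-smoothed word-level trigram language model (illustrative only)."""

    def __init__(self, sentences):
        self.trigrams = Counter()
        self.bigrams = Counter()
        self.vocab = set()
        for sent in sentences:
            tokens = ["<s>", "<s>"] + sent + ["</s>"]
            self.vocab.update(tokens)
            for i in range(2, len(tokens)):
                self.bigrams[(tokens[i - 2], tokens[i - 1])] += 1
                self.trigrams[(tokens[i - 2], tokens[i - 1], tokens[i])] += 1

    def logprob(self, word, context):
        # log P(word | last two context tokens), with add-one smoothing
        padded = ["<s>", "<s>"] + list(context)
        h1, h2 = padded[-2], padded[-1]
        numerator = self.trigrams[(h1, h2, word)] + 1
        denominator = self.bigrams[(h1, h2)] + len(self.vocab)
        return math.log(numerator / denominator)


def rerank(candidates, left_context, lm):
    # Keep the candidate the LM scores highest given the normalised left context.
    return max(candidates, key=lambda c: lm.logprob(c, left_context))


# Toy usage: hypothetical normalised training sentences and a 5-best candidate list.
lm = TrigramLM([
    ["the", "old", "town", "was", "small"],
    ["the", "old", "man", "spoke"],
])
print(rerank(["olde", "old", "oldt", "olt", "hold"], ["the"], lm))  # -> "old"
```

In practice the candidates would come from the neural normaliser's n-best output rather than a hand-written list, and the language model would be trained on a much larger normalised corpus.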