@inproceedings{albertsson-etal-2016-similarity,
title = "Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes",
author = {Albertsson, Sarah and
Rennes, Evelina and
J{\"o}nsson, Arne},
editor = "Brunato, Dominique and
Dell{'}Orletta, Felice and
Venturi, Giulia and
Fran{\c{c}}ois, Thomas and
Blache, Philippe",
booktitle = "Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity ({CL}4{LC})",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/W16-4118",
pages = "154--163",
abstract = "Comparable or parallel corpora are beneficial for many NLP tasks. The automatic collection of corpora enables large-scale resources, even for less-resourced languages, which in turn can be useful for deducing rules and patterns for text rewriting algorithms, a subtask of automatic text simplification. We present two methods for the alignment of Swedish easy-to-read text segments to text segments from a reference corpus. The first method (M1) was originally developed for the task of text reuse detection, measuring sentence similarity by a modified version of a TF-IDF vector space model. A second method (M2), also accounting for part-of-speech tags, was developed, and the methods were compared. For evaluation, a crowdsourcing platform was built for human judgement data collection, and preliminary results showed that cosine similarity relates better to human ranks than the Dice coefficient. We also saw a tendency that including syntactic context to the TF-IDF vector space model is beneficial for this kind of paraphrase alignment task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="albertsson-etal-2016-similarity">
<titleInfo>
<title>Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Albertsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evelina</namePart>
<namePart type="family">Rennes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arne</namePart>
<namePart type="family">Jönsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dominique</namePart>
<namePart type="family">Brunato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Venturi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">François</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Comparable or parallel corpora are beneficial for many NLP tasks. The automatic collection of corpora enables large-scale resources, even for less-resourced languages, which in turn can be useful for deducing rules and patterns for text rewriting algorithms, a subtask of automatic text simplification. We present two methods for the alignment of Swedish easy-to-read text segments to text segments from a reference corpus. The first method (M1) was originally developed for the task of text reuse detection, measuring sentence similarity by a modified version of a TF-IDF vector space model. A second method (M2), also accounting for part-of-speech tags, was developed, and the methods were compared. For evaluation, a crowdsourcing platform was built for human judgement data collection, and preliminary results showed that cosine similarity relates better to human ranks than the Dice coefficient. We also saw a tendency that including syntactic context to the TF-IDF vector space model is beneficial for this kind of paraphrase alignment task.</abstract>
<identifier type="citekey">albertsson-etal-2016-similarity</identifier>
<location>
<url>https://aclanthology.org/W16-4118</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>154</start>
<end>163</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes
%A Albertsson, Sarah
%A Rennes, Evelina
%A Jönsson, Arne
%Y Brunato, Dominique
%Y Dell’Orletta, Felice
%Y Venturi, Giulia
%Y François, Thomas
%Y Blache, Philippe
%S Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity (CL4LC)
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F albertsson-etal-2016-similarity
%X Comparable or parallel corpora are beneficial for many NLP tasks. The automatic collection of corpora enables large-scale resources, even for less-resourced languages, which in turn can be useful for deducing rules and patterns for text rewriting algorithms, a subtask of automatic text simplification. We present two methods for the alignment of Swedish easy-to-read text segments to text segments from a reference corpus. The first method (M1) was originally developed for the task of text reuse detection, measuring sentence similarity by a modified version of a TF-IDF vector space model. A second method (M2), also accounting for part-of-speech tags, was developed, and the methods were compared. For evaluation, a crowdsourcing platform was built for human judgement data collection, and preliminary results showed that cosine similarity relates better to human ranks than the Dice coefficient. We also saw a tendency that including syntactic context to the TF-IDF vector space model is beneficial for this kind of paraphrase alignment task.
%U https://aclanthology.org/W16-4118
%P 154-163
Markdown (Informal)
[Similarity-Based Alignment of Monolingual Corpora for Text Simplification Purposes](https://aclanthology.org/W16-4118) (Albertsson et al., CL4LC 2016)
ACL