@inproceedings{frenzel-stede-2025-sentence,
title = "Sentence-Alignment in Semi-parallel Datasets",
author = "Frenzel, Steffen and
Stede, Manfred",
editor = "Kazantseva, Anna and
Szpakowicz, Stan and
Degaetano-Ortlieb, Stefania and
Bizzoni, Yuri and
Pagel, Janis",
booktitle = "Proceedings of the 9th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.latechclfl-1.9/",
doi = "10.18653/v1/2025.latechclfl-1.9",
pages = "87--96",
ISBN = "979-8-89176-241-1",
abstract = "In this paper, we are testing sentence alignment on complex, semi-parallel corpora, i.e., different versions of the same text that have been altered to some extent. We evaluate two hypotheses: To make alignment algorithms more efficient, we test the hypothesis that matching pairs can be found in the immediate vicinity of the source sentence and that it is sufficient to search for paraphrases in a `context window'. To improve the alignment quality on complex, semi-parallel texts, we test the implementation of a segmentation into Elementary Discourse Units (EDUs) in order to make more precise alignments at this level. Since EDUs are the smallest possible unit for communicating a full proposition, we assume that aligning at this level can improve the overall quality. Both hypotheses are tested and validated with several embedding models on varying degrees of parallel German datasets. The advantages and disadvantages of the different approaches are presented, and our next steps are outlined."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="frenzel-stede-2025-sentence">
<titleInfo>
<title>Sentence-Alignment in Semi-parallel Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Frenzel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manfred</namePart>
<namePart type="family">Stede</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janis</namePart>
<namePart type="family">Pagel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-241-1</identifier>
</relatedItem>
<abstract>In this paper, we are testing sentence alignment on complex, semi-parallel corpora, i.e., different versions of the same text that have been altered to some extent. We evaluate two hypotheses: To make alignment algorithms more efficient, we test the hypothesis that matching pairs can be found in the immediate vicinity of the source sentence and that it is sufficient to search for paraphrases in a ‘context window’. To improve the alignment quality on complex, semi-parallel texts, we test the implementation of a segmentation into Elementary Discourse Units (EDUs) in order to make more precise alignments at this level. Since EDUs are the smallest possible unit for communicating a full proposition, we assume that aligning at this level can improve the overall quality. Both hypotheses are tested and validated with several embedding models on varying degrees of parallel German datasets. The advantages and disadvantages of the different approaches are presented, and our next steps are outlined.</abstract>
<identifier type="citekey">frenzel-stede-2025-sentence</identifier>
<identifier type="doi">10.18653/v1/2025.latechclfl-1.9</identifier>
<location>
<url>https://aclanthology.org/2025.latechclfl-1.9/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>87</start>
<end>96</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sentence-Alignment in Semi-parallel Datasets
%A Frenzel, Steffen
%A Stede, Manfred
%Y Kazantseva, Anna
%Y Szpakowicz, Stan
%Y Degaetano-Ortlieb, Stefania
%Y Bizzoni, Yuri
%Y Pagel, Janis
%S Proceedings of the 9th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-241-1
%F frenzel-stede-2025-sentence
%X In this paper, we are testing sentence alignment on complex, semi-parallel corpora, i.e., different versions of the same text that have been altered to some extent. We evaluate two hypotheses: To make alignment algorithms more efficient, we test the hypothesis that matching pairs can be found in the immediate vicinity of the source sentence and that it is sufficient to search for paraphrases in a ‘context window’. To improve the alignment quality on complex, semi-parallel texts, we test the implementation of a segmentation into Elementary Discourse Units (EDUs) in order to make more precise alignments at this level. Since EDUs are the smallest possible unit for communicating a full proposition, we assume that aligning at this level can improve the overall quality. Both hypotheses are tested and validated with several embedding models on varying degrees of parallel German datasets. The advantages and disadvantages of the different approaches are presented, and our next steps are outlined.
%R 10.18653/v1/2025.latechclfl-1.9
%U https://aclanthology.org/2025.latechclfl-1.9/
%U https://doi.org/10.18653/v1/2025.latechclfl-1.9
%P 87-96
Markdown (Informal)
[Sentence-Alignment in Semi-parallel Datasets](https://aclanthology.org/2025.latechclfl-1.9/) (Frenzel & Stede, LaTeCHCLfL 2025)
ACL
- Steffen Frenzel and Manfred Stede. 2025. Sentence-Alignment in Semi-parallel Datasets. In Proceedings of the 9th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2025), pages 87–96, Albuquerque, New Mexico. Association for Computational Linguistics.