@inproceedings{chousa-etal-2020-spanalign,
title = "{S}pan{A}lign: Sentence Alignment Method based on Cross-Language Span Prediction and {ILP}",
author = "Chousa, Katsuki and
Nagata, Masaaki and
Nishino, Masaaki",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.418",
doi = "10.18653/v1/2020.coling-main.418",
pages = "4750--4761",
abstract = "We propose a novel method of automatic sentence alignment from noisy parallel documents. We first formalize the sentence alignment problem as the independent predictions of spans in the target document from sentences in the source document. We then introduce a total optimization method using integer linear programming to prevent span overlapping and obtain non-monotonic alignments. We implement cross-language span prediction by fine-tuning pre-trained multilingual language models based on BERT architecture and train them using pseudo-labeled data obtained from unsupervised sentence alignment method. While the baseline methods use sentence embeddings and assume monotonic alignment, our method can capture the token-to-token interaction between the tokens of source and target text and handle non-monotonic alignments. In sentence alignment experiments on English-Japanese, our method achieved 70.3 F1 scores, which are +8.0 points higher than the baseline method. In particular, our method improved by +53.9 F1 scores for extracting non-parallel sentences. Our method improved the downstream machine translation accuracy by 4.1 BLEU scores when the extracted bilingual sentences are used for fine-tuning a pre-trained Japanese-to-English translation model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chousa-etal-2020-spanalign">
<titleInfo>
<title>SpanAlign: Sentence Alignment Method based on Cross-Language Span Prediction and ILP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Katsuki</namePart>
<namePart type="family">Chousa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masaaki</namePart>
<namePart type="family">Nagata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masaaki</namePart>
<namePart type="family">Nishino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We propose a novel method of automatic sentence alignment from noisy parallel documents. We first formalize the sentence alignment problem as the independent predictions of spans in the target document from sentences in the source document. We then introduce a total optimization method using integer linear programming to prevent span overlapping and obtain non-monotonic alignments. We implement cross-language span prediction by fine-tuning pre-trained multilingual language models based on BERT architecture and train them using pseudo-labeled data obtained from unsupervised sentence alignment method. While the baseline methods use sentence embeddings and assume monotonic alignment, our method can capture the token-to-token interaction between the tokens of source and target text and handle non-monotonic alignments. In sentence alignment experiments on English-Japanese, our method achieved 70.3 F1 scores, which are +8.0 points higher than the baseline method. In particular, our method improved by +53.9 F1 scores for extracting non-parallel sentences. Our method improved the downstream machine translation accuracy by 4.1 BLEU scores when the extracted bilingual sentences are used for fine-tuning a pre-trained Japanese-to-English translation model.</abstract>
<identifier type="citekey">chousa-etal-2020-spanalign</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.418</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.418</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>4750</start>
<end>4761</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SpanAlign: Sentence Alignment Method based on Cross-Language Span Prediction and ILP
%A Chousa, Katsuki
%A Nagata, Masaaki
%A Nishino, Masaaki
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F chousa-etal-2020-spanalign
%X We propose a novel method of automatic sentence alignment from noisy parallel documents. We first formalize the sentence alignment problem as the independent predictions of spans in the target document from sentences in the source document. We then introduce a total optimization method using integer linear programming to prevent span overlapping and obtain non-monotonic alignments. We implement cross-language span prediction by fine-tuning pre-trained multilingual language models based on BERT architecture and train them using pseudo-labeled data obtained from unsupervised sentence alignment method. While the baseline methods use sentence embeddings and assume monotonic alignment, our method can capture the token-to-token interaction between the tokens of source and target text and handle non-monotonic alignments. In sentence alignment experiments on English-Japanese, our method achieved 70.3 F1 scores, which are +8.0 points higher than the baseline method. In particular, our method improved by +53.9 F1 scores for extracting non-parallel sentences. Our method improved the downstream machine translation accuracy by 4.1 BLEU scores when the extracted bilingual sentences are used for fine-tuning a pre-trained Japanese-to-English translation model.
%R 10.18653/v1/2020.coling-main.418
%U https://aclanthology.org/2020.coling-main.418
%U https://doi.org/10.18653/v1/2020.coling-main.418
%P 4750-4761
Markdown (Informal)
[SpanAlign: Sentence Alignment Method based on Cross-Language Span Prediction and ILP](https://aclanthology.org/2020.coling-main.418) (Chousa et al., COLING 2020)
ACL