@inproceedings{itamar-itai-2008-using,
title = "Using Movie Subtitles for Creating a Large-Scale Bilingual Corpora",
author = "Itamar, Einav and
Itai, Alon",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Tapias, Daniel",
booktitle = "Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)",
month = may,
year = "2008",
address = "Marrakech, Morocco",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2008/pdf/76_paper.pdf",
abstract = "This paper presents a method for compiling a large-scale bilingual corpus from a database of movie subtitles. To create the corpus, we propose an algorithm based on Gale and Churchs sentence alignment algorithm(1993). However, our algorithm not only relies on character length information, but also uses subtitle-timing information, which is encoded in the subtitle files. Timing is highly correlated between subtitles in different versions (for the same movie), since subtitles that match should be displayed at the same time. However, the absolute time values cant be used for alignment, since the timing is usually specified by frame numbers and not by real time, and converting it to real time values is not always possible, hence we use normalized subtitle duration instead. This results in a significant reduction in the alignment error rate.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="itamar-itai-2008-using">
<titleInfo>
<title>Using Movie Subtitles for Creating a Large-Scale Bilingual Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Einav</namePart>
<namePart type="family">Itamar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Itai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2008-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Marrakech, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a method for compiling a large-scale bilingual corpus from a database of movie subtitles. To create the corpus, we propose an algorithm based on Gale and Churchs sentence alignment algorithm(1993). However, our algorithm not only relies on character length information, but also uses subtitle-timing information, which is encoded in the subtitle files. Timing is highly correlated between subtitles in different versions (for the same movie), since subtitles that match should be displayed at the same time. However, the absolute time values cant be used for alignment, since the timing is usually specified by frame numbers and not by real time, and converting it to real time values is not always possible, hence we use normalized subtitle duration instead. This results in a significant reduction in the alignment error rate.</abstract>
<identifier type="citekey">itamar-itai-2008-using</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2008/pdf/76_paper.pdf</url>
</location>
<part>
<date>2008-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using Movie Subtitles for Creating a Large-Scale Bilingual Corpora
%A Itamar, Einav
%A Itai, Alon
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Tapias, Daniel
%S Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC’08)
%D 2008
%8 May
%I European Language Resources Association (ELRA)
%C Marrakech, Morocco
%F itamar-itai-2008-using
%X This paper presents a method for compiling a large-scale bilingual corpus from a database of movie subtitles. To create the corpus, we propose an algorithm based on Gale and Churchs sentence alignment algorithm(1993). However, our algorithm not only relies on character length information, but also uses subtitle-timing information, which is encoded in the subtitle files. Timing is highly correlated between subtitles in different versions (for the same movie), since subtitles that match should be displayed at the same time. However, the absolute time values cant be used for alignment, since the timing is usually specified by frame numbers and not by real time, and converting it to real time values is not always possible, hence we use normalized subtitle duration instead. This results in a significant reduction in the alignment error rate.
%U http://www.lrec-conf.org/proceedings/lrec2008/pdf/76_paper.pdf
Markdown (Informal)
[Using Movie Subtitles for Creating a Large-Scale Bilingual Corpora](http://www.lrec-conf.org/proceedings/lrec2008/pdf/76_paper.pdf) (Itamar & Itai, LREC 2008)
ACL