% Conference paper in the LREC'10 proceedings (Valletta, Malta, May 2010).
% The abstract text has its possessive apostrophes ("LDC's", "system's") restored
% and a duplicated article ("a detailed a description") removed.
@inproceedings{walker-etal-2010-large,
  title     = {Large Scale Multilingual Broadcast Data Collection to Support Machine Translation and Distillation Technology Development},
  author    = {Walker, Kevin and
               Caruso, Christopher and
               DiPersio, Denise},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Maegaard, Bente and
               Mariani, Joseph and
               Odijk, Jan and
               Piperidis, Stelios and
               Rosner, Mike and
               Tapias, Daniel},
  booktitle = {Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)},
  month     = may,
  year      = {2010},
  address   = {Valletta, Malta},
  publisher = {European Language Resources Association (ELRA)},
  url       = {http://www.lrec-conf.org/proceedings/lrec2010/pdf/667_Paper.pdf},
  abstract  = {The development of technologies to address machine translation and distillation of multilingual broadcast data depends heavily on the collection of large volumes of material from modern data providers. To address the needs of GALE researchers, the Linguistic Data Consortium (LDC) developed a system for collecting broadcast news and conversation from a variety of Arabic, Chinese and English broadcasters. The system is highly automated, easily extensible and robust and is capable of collecting, processing and evaluating hundreds of hours of content from several dozen sources per day. In addition to this extensive system, LDC manages three remote collection sites to maximize the variety of available broadcast data and has designed a portable broadcast collection platform to facilitate remote collection. This paper will present a detailed description of the design and implementation of LDC's collection system, the technical challenges and solutions to large scale broadcast data collection efforts and an overview of the system's operation. This paper will also discuss the challenges of managing remote collections, in particular, the strategies used to normalize data formats, naming conventions and delivery methods to achieve optimal integration of remotely-collected data into LDC's collection database and downstream tasking workflow.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="walker-etal-2010-large">
<titleInfo>
<title>Large Scale Multilingual Broadcast Data Collection to Support Machine Translation and Distillation Technology Development</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Walker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Caruso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denise</namePart>
<namePart type="family">DiPersio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The development of technologies to address machine translation and distillation of multilingual broadcast data depends heavily on the collection of large volumes of material from modern data providers. To address the needs of GALE researchers, the Linguistic Data Consortium (LDC) developed a system for collecting broadcast news and conversation from a variety of Arabic, Chinese and English broadcasters. The system is highly automated, easily extensible and robust and is capable of collecting, processing and evaluating hundreds of hours of content from several dozen sources per day. In addition to this extensive system, LDC manages three remote collection sites to maximize the variety of available broadcast data and has designed a portable broadcast collection platform to facilitate remote collection. This paper will present a detailed description of the design and implementation of LDC’s collection system, the technical challenges and solutions to large scale broadcast data collection efforts and an overview of the system’s operation. This paper will also discuss the challenges of managing remote collections, in particular, the strategies used to normalize data formats, naming conventions and delivery methods to achieve optimal integration of remotely-collected data into LDC’s collection database and downstream tasking workflow.</abstract>
<identifier type="citekey">walker-etal-2010-large</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/667_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Scale Multilingual Broadcast Data Collection to Support Machine Translation and Distillation Technology Development
%A Walker, Kevin
%A Caruso, Christopher
%A DiPersio, Denise
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F walker-etal-2010-large
%X The development of technologies to address machine translation and distillation of multilingual broadcast data depends heavily on the collection of large volumes of material from modern data providers. To address the needs of GALE researchers, the Linguistic Data Consortium (LDC) developed a system for collecting broadcast news and conversation from a variety of Arabic, Chinese and English broadcasters. The system is highly automated, easily extensible and robust and is capable of collecting, processing and evaluating hundreds of hours of content from several dozen sources per day. In addition to this extensive system, LDC manages three remote collection sites to maximize the variety of available broadcast data and has designed a portable broadcast collection platform to facilitate remote collection. This paper will present a detailed description of the design and implementation of LDC’s collection system, the technical challenges and solutions to large scale broadcast data collection efforts and an overview of the system’s operation. This paper will also discuss the challenges of managing remote collections, in particular, the strategies used to normalize data formats, naming conventions and delivery methods to achieve optimal integration of remotely-collected data into LDC’s collection database and downstream tasking workflow.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/667_Paper.pdf
Markdown (Informal)
[Large Scale Multilingual Broadcast Data Collection to Support Machine Translation and Distillation Technology Development](http://www.lrec-conf.org/proceedings/lrec2010/pdf/667_Paper.pdf) (Walker et al., LREC 2010)
ACL