@inproceedings{ramirez-sanchez-etal-2024-smartbic,
title = "{S}mart{B}i{C}: Smart Harvesting of Bilingual Corpora from the {I}nternet",
author = "Ram{\'\i}rez-S{\'a}nchez, Gema and
Ortiz Rojas, Sergio and
N{\'u}{\~n}ez Alcover, Alicia and
Mateiu, Tudor and
Forcada, Mikel and
Orzas, Pedro and
Carrillo, Almudena and
Nolasco, Giuseppe and
List{\'o}n, Noelia",
editor = "Scarton, Carolina and
Prescott, Charlotte and
Bayliss, Chris and
Oakley, Chris and
Wright, Joanna and
Wrigley, Stuart and
Song, Xingyi and
Gow-Smith, Edward and
Forcada, Mikel and
Moniz, Helena",
booktitle = "Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)",
month = jun,
year = "2024",
address = "Sheffield, UK",
publisher = "European Association for Machine Translation (EAMT)",
url = "https://aclanthology.org/2024.eamt-2.5",
pages = "8--9",
abstract = "SmartBiC, an 18-month innovation project funded by the Spanish Government, aims at improving the full process of collecting, filtering and selecting in-domain parallel content to be used for machine translation and language model tuning purposes in industrial settings. Based on state-of-the-art technology in the free/open-source parallel web corpora harvester Bitextor, SmartBic develops a web-based application around it including novel components such as a language- and domain-focused crawler and a domain-specific corpora selector. SmartBic also addresses specific industrial use cases for individual components of the Bitextor pipeline, such as parallel data cleaning. Relevant improvements to the current Bitextor pipeline will be publicly released.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ramirez-sanchez-etal-2024-smartbic">
<titleInfo>
<title>SmartBiC: Smart Harvesting of Bilingual Corpora from the Internet</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="family">Ortiz Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Núñez Alcover</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tudor</namePart>
<namePart type="family">Mateiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Orzas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Almudena</namePart>
<namePart type="family">Carrillo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Nolasco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noelia</namePart>
<namePart type="family">Listón</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Prescott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Bayliss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Oakley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joanna</namePart>
<namePart type="family">Wright</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stuart</namePart>
<namePart type="family">Wrigley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingyi</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="family">Gow-Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation (EAMT)</publisher>
<place>
<placeTerm type="text">Sheffield, UK</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>SmartBiC, an 18-month innovation project funded by the Spanish Government, aims at improving the full process of collecting, filtering and selecting in-domain parallel content to be used for machine translation and language model tuning purposes in industrial settings. Based on state-of-the-art technology in the free/open-source parallel web corpora harvester Bitextor, SmartBic develops a web-based application around it including novel components such as a language- and domain-focused crawler and a domain-specific corpora selector. SmartBic also addresses specific industrial use cases for individual components of the Bitextor pipeline, such as parallel data cleaning. Relevant improvements to the current Bitextor pipeline will be publicly released.</abstract>
<identifier type="citekey">ramirez-sanchez-etal-2024-smartbic</identifier>
<location>
<url>https://aclanthology.org/2024.eamt-2.5</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>8</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SmartBiC: Smart Harvesting of Bilingual Corpora from the Internet
%A Ramírez-Sánchez, Gema
%A Ortiz Rojas, Sergio
%A Núñez Alcover, Alicia
%A Mateiu, Tudor
%A Forcada, Mikel
%A Orzas, Pedro
%A Carrillo, Almudena
%A Nolasco, Giuseppe
%A Listón, Noelia
%Y Scarton, Carolina
%Y Prescott, Charlotte
%Y Bayliss, Chris
%Y Oakley, Chris
%Y Wright, Joanna
%Y Wrigley, Stuart
%Y Song, Xingyi
%Y Gow-Smith, Edward
%Y Forcada, Mikel
%Y Moniz, Helena
%S Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)
%D 2024
%8 June
%I European Association for Machine Translation (EAMT)
%C Sheffield, UK
%F ramirez-sanchez-etal-2024-smartbic
%X SmartBiC, an 18-month innovation project funded by the Spanish Government, aims at improving the full process of collecting, filtering and selecting in-domain parallel content to be used for machine translation and language model tuning purposes in industrial settings. Based on state-of-the-art technology in the free/open-source parallel web corpora harvester Bitextor, SmartBic develops a web-based application around it including novel components such as a language- and domain-focused crawler and a domain-specific corpora selector. SmartBic also addresses specific industrial use cases for individual components of the Bitextor pipeline, such as parallel data cleaning. Relevant improvements to the current Bitextor pipeline will be publicly released.
%U https://aclanthology.org/2024.eamt-2.5
%P 8-9
Markdown (Informal)
[SmartBiC: Smart Harvesting of Bilingual Corpora from the Internet](https://aclanthology.org/2024.eamt-2.5) (Ramírez-Sánchez et al., EAMT 2024)
ACL
- Gema Ramírez-Sánchez, Sergio Ortiz Rojas, Alicia Núñez Alcover, Tudor Mateiu, Mikel Forcada, Pedro Orzas, Almudena Carrillo, Giuseppe Nolasco, and Noelia Listón. 2024. SmartBiC: Smart Harvesting of Bilingual Corpora from the Internet. In Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2), pages 8–9, Sheffield, UK. European Association for Machine Translation (EAMT).