@inproceedings{behzad-zeldes-2020-cross,
title = "A Cross-Genre Ensemble Approach to Robust {R}eddit Part of Speech Tagging",
author = "Behzad, Shabnam and
Zeldes, Amir",
editor = {Barbaresi, Adrien and
Bildhauer, Felix and
Sch{\"a}fer, Roland and
Stemle, Egon},
booktitle = "Proceedings of the 12th Web as Corpus Workshop",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.wac-1.7",
pages = "50--56",
abstract = "Part of speech tagging is a fundamental NLP task often regarded as solved for high-resource languages such as English. Current state-of-the-art models have achieved high accuracy, especially on the news domain. However, when these models are applied to other corpora with different genres, and especially user-generated data from the Web, we see substantial drops in performance. In this work, we study how a state-of-the-art tagging model trained on different genres performs on Web content from unfiltered Reddit forum discussions. We report the results when training on different splits of the data, tested on Reddit. Our results show that even small amounts of in-domain data can outperform the contribution of data an order of magnitude larger coming from other Web domains. To make progress on out-of-domain tagging, we also evaluate an ensemble approach using multiple single-genre taggers as input features to a meta-classifier. We present state of the art performance on tagging Reddit data, as well as error analysis of the results of these models, and offer a typology of the most common error types among them, broken down by training corpus.",
language = "English",
ISBN = "979-10-95546-68-9",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="behzad-zeldes-2020-cross">
<titleInfo>
<title>A Cross-Genre Ensemble Approach to Robust Reddit Part of Speech Tagging</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shabnam</namePart>
<namePart type="family">Behzad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zeldes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Web as Corpus Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Bildhauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roland</namePart>
<namePart type="family">Schäfer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Egon</namePart>
<namePart type="family">Stemle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-68-9</identifier>
</relatedItem>
<abstract>Part of speech tagging is a fundamental NLP task often regarded as solved for high-resource languages such as English. Current state-of-the-art models have achieved high accuracy, especially on the news domain. However, when these models are applied to other corpora with different genres, and especially user-generated data from the Web, we see substantial drops in performance. In this work, we study how a state-of-the-art tagging model trained on different genres performs on Web content from unfiltered Reddit forum discussions. We report the results when training on different splits of the data, tested on Reddit. Our results show that even small amounts of in-domain data can outperform the contribution of data an order of magnitude larger coming from other Web domains. To make progress on out-of-domain tagging, we also evaluate an ensemble approach using multiple single-genre taggers as input features to a meta-classifier. We present state of the art performance on tagging Reddit data, as well as error analysis of the results of these models, and offer a typology of the most common error types among them, broken down by training corpus.</abstract>
<identifier type="citekey">behzad-zeldes-2020-cross</identifier>
<location>
<url>https://aclanthology.org/2020.wac-1.7</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>50</start>
<end>56</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Cross-Genre Ensemble Approach to Robust Reddit Part of Speech Tagging
%A Behzad, Shabnam
%A Zeldes, Amir
%Y Barbaresi, Adrien
%Y Bildhauer, Felix
%Y Schäfer, Roland
%Y Stemle, Egon
%S Proceedings of the 12th Web as Corpus Workshop
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-68-9
%G English
%F behzad-zeldes-2020-cross
%X Part of speech tagging is a fundamental NLP task often regarded as solved for high-resource languages such as English. Current state-of-the-art models have achieved high accuracy, especially on the news domain. However, when these models are applied to other corpora with different genres, and especially user-generated data from the Web, we see substantial drops in performance. In this work, we study how a state-of-the-art tagging model trained on different genres performs on Web content from unfiltered Reddit forum discussions. We report the results when training on different splits of the data, tested on Reddit. Our results show that even small amounts of in-domain data can outperform the contribution of data an order of magnitude larger coming from other Web domains. To make progress on out-of-domain tagging, we also evaluate an ensemble approach using multiple single-genre taggers as input features to a meta-classifier. We present state of the art performance on tagging Reddit data, as well as error analysis of the results of these models, and offer a typology of the most common error types among them, broken down by training corpus.
%U https://aclanthology.org/2020.wac-1.7
%P 50-56
Markdown (Informal)
[A Cross-Genre Ensemble Approach to Robust Reddit Part of Speech Tagging](https://aclanthology.org/2020.wac-1.7) (Behzad & Zeldes, WAC 2020)
ACL