@inproceedings{dorsch-wachsmuth-2020-semi,
title = "Semi-Supervised Cleansing of Web Argument Corpora",
author = "Dorsch, Jonas and
Wachsmuth, Henning",
editor = "Cabrio, Elena and
Villata, Serena",
booktitle = "Proceedings of the 7th Workshop on Argument Mining",
month = dec,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.argmining-1.3",
pages = "19--29",
abstract = "Debate portals and similar web platforms constitute one of the main text sources in computational argumentation research and its applications. While the corpora built upon these sources are rich of argumentatively relevant content and structure, they also include text that is irrelevant, or even detrimental, to their purpose. In this paper, we present a precision-oriented approach to detecting such irrelevant text in a semi-supervised way. Given a few seed examples, the approach automatically learns basic lexical patterns of relevance and irrelevance and then incrementally bootstraps new patterns from sentences matching the patterns. In the existing args.me corpus with 400k argumentative texts, our approach detects almost 87k irrelevant sentences, at a precision of 0.97 according to manual evaluation. With low effort, the approach can be adapted to other web argument corpora, providing a generic way to improve corpus quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dorsch-wachsmuth-2020-semi">
<titleInfo>
<title>Semi-Supervised Cleansing of Web Argument Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonas</namePart>
<namePart type="family">Dorsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henning</namePart>
<namePart type="family">Wachsmuth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Argument Mining</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Cabrio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serena</namePart>
<namePart type="family">Villata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Debate portals and similar web platforms constitute one of the main text sources in computational argumentation research and its applications. While the corpora built upon these sources are rich of argumentatively relevant content and structure, they also include text that is irrelevant, or even detrimental, to their purpose. In this paper, we present a precision-oriented approach to detecting such irrelevant text in a semi-supervised way. Given a few seed examples, the approach automatically learns basic lexical patterns of relevance and irrelevance and then incrementally bootstraps new patterns from sentences matching the patterns. In the existing args.me corpus with 400k argumentative texts, our approach detects almost 87k irrelevant sentences, at a precision of 0.97 according to manual evaluation. With low effort, the approach can be adapted to other web argument corpora, providing a generic way to improve corpus quality.</abstract>
<identifier type="citekey">dorsch-wachsmuth-2020-semi</identifier>
<location>
<url>https://aclanthology.org/2020.argmining-1.3</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>19</start>
<end>29</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semi-Supervised Cleansing of Web Argument Corpora
%A Dorsch, Jonas
%A Wachsmuth, Henning
%Y Cabrio, Elena
%Y Villata, Serena
%S Proceedings of the 7th Workshop on Argument Mining
%D 2020
%8 December
%I Association for Computational Linguistics
%C Online
%F dorsch-wachsmuth-2020-semi
%X Debate portals and similar web platforms constitute one of the main text sources in computational argumentation research and its applications. While the corpora built upon these sources are rich of argumentatively relevant content and structure, they also include text that is irrelevant, or even detrimental, to their purpose. In this paper, we present a precision-oriented approach to detecting such irrelevant text in a semi-supervised way. Given a few seed examples, the approach automatically learns basic lexical patterns of relevance and irrelevance and then incrementally bootstraps new patterns from sentences matching the patterns. In the existing args.me corpus with 400k argumentative texts, our approach detects almost 87k irrelevant sentences, at a precision of 0.97 according to manual evaluation. With low effort, the approach can be adapted to other web argument corpora, providing a generic way to improve corpus quality.
%U https://aclanthology.org/2020.argmining-1.3
%P 19-29
Markdown (Informal)
[Semi-Supervised Cleansing of Web Argument Corpora](https://aclanthology.org/2020.argmining-1.3) (Dorsch & Wachsmuth, ArgMining 2020)
ACL