@inproceedings{bikaun-etal-2024-maintnorm,
title = "{M}aint{N}orm: A corpus and benchmark model for lexical normalisation and masking of industrial maintenance short text",
author = "Bikaun, Tyler and
Hodkiewicz, Melinda and
Liu, Wei",
editor = {van der Goot, Rob and
Bak, JinYeong and
M{\"u}ller-Eberstein, Max and
Xu, Wei and
Ritter, Alan and
Baldwin, Tim},
booktitle = "Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)",
month = mar,
year = "2024",
address = "San {\.G}iljan, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wnut-1.7",
pages = "68--78",
abstract = "Maintenance short texts are invaluable unstructured data sources, serving as a diagnostic and prognostic window into the operational health and status of physical assets. These user-generated texts, created during routine or ad-hoc maintenance activities, offer insights into equipment performance, potential failure points, and maintenance needs. However, the use of information captured in these texts is hindered by inherent challenges: the prevalence of engineering jargon, domain-specific vernacular, random spelling errors without identifiable patterns, and the absence of standard grammatical structures. To transform these texts into accessible and analysable data, we introduce the MaintNorm dataset, the first resource specifically tailored for the lexical normalisation task of maintenance short texts. Comprising 12,000 examples, this dataset enables the efficient processing and interpretation of these texts. We demonstrate the utility of MaintNorm by training a lexical normalisation model as a sequence-to-sequence learning task with two learning objectives, namely, enhancing the quality of the texts and masking segments to obscure sensitive information to anonymise data. Our benchmark model demonstrates a universal error reduction rate of 95.8{\%}. The dataset and benchmark outcomes are available to the public.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bikaun-etal-2024-maintnorm">
<titleInfo>
<title>MaintNorm: A corpus and benchmark model for lexical normalisation and masking of industrial maintenance short text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tyler</namePart>
<namePart type="family">Bikaun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melinda</namePart>
<namePart type="family">Hodkiewicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">JinYeong</namePart>
<namePart type="family">Bak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Müller-Eberstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Ġiljan, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Maintenance short texts are invaluable unstructured data sources, serving as a diagnostic and prognostic window into the operational health and status of physical assets. These user-generated texts, created during routine or ad-hoc maintenance activities, offer insights into equipment performance, potential failure points, and maintenance needs. However, the use of information captured in these texts is hindered by inherent challenges: the prevalence of engineering jargon, domain-specific vernacular, random spelling errors without identifiable patterns, and the absence of standard grammatical structures. To transform these texts into accessible and analysable data, we introduce the MaintNorm dataset, the first resource specifically tailored for the lexical normalisation task of maintenance short texts. Comprising 12,000 examples, this dataset enables the efficient processing and interpretation of these texts. We demonstrate the utility of MaintNorm by training a lexical normalisation model as a sequence-to-sequence learning task with two learning objectives, namely, enhancing the quality of the texts and masking segments to obscure sensitive information to anonymise data. Our benchmark model demonstrates a universal error reduction rate of 95.8%. The dataset and benchmark outcomes are available to the public.</abstract>
<identifier type="citekey">bikaun-etal-2024-maintnorm</identifier>
<location>
<url>https://aclanthology.org/2024.wnut-1.7</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>68</start>
<end>78</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MaintNorm: A corpus and benchmark model for lexical normalisation and masking of industrial maintenance short text
%A Bikaun, Tyler
%A Hodkiewicz, Melinda
%A Liu, Wei
%Y van der Goot, Rob
%Y Bak, JinYeong
%Y Müller-Eberstein, Max
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%S Proceedings of the Ninth Workshop on Noisy and User-generated Text (W-NUT 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C San Ġiljan, Malta
%F bikaun-etal-2024-maintnorm
%X Maintenance short texts are invaluable unstructured data sources, serving as a diagnostic and prognostic window into the operational health and status of physical assets. These user-generated texts, created during routine or ad-hoc maintenance activities, offer insights into equipment performance, potential failure points, and maintenance needs. However, the use of information captured in these texts is hindered by inherent challenges: the prevalence of engineering jargon, domain-specific vernacular, random spelling errors without identifiable patterns, and the absence of standard grammatical structures. To transform these texts into accessible and analysable data, we introduce the MaintNorm dataset, the first resource specifically tailored for the lexical normalisation task of maintenance short texts. Comprising 12,000 examples, this dataset enables the efficient processing and interpretation of these texts. We demonstrate the utility of MaintNorm by training a lexical normalisation model as a sequence-to-sequence learning task with two learning objectives, namely, enhancing the quality of the texts and masking segments to obscure sensitive information to anonymise data. Our benchmark model demonstrates a universal error reduction rate of 95.8%. The dataset and benchmark outcomes are available to the public.
%U https://aclanthology.org/2024.wnut-1.7
%P 68-78
Markdown (Informal)
[MaintNorm: A corpus and benchmark model for lexical normalisation and masking of industrial maintenance short text](https://aclanthology.org/2024.wnut-1.7) (Bikaun et al., WNUT-WS 2024)
ACL