@inproceedings{park-etal-2025-nota,
title = "Nota {AI} at {G}en{AI} Detection Task 1: Unseen Language-Aware Detection System for Multilingual Machine-Generated Text",
author = "Park, Hancheol and
Kim, Jaeyeon and
Kim, Geonmin and
Kim, Tae-Ho",
editor = "Alam, Firoj and
Nakov, Preslav and
Habash, Nizar and
Gurevych, Iryna and
Chowdhury, Shammur and
Shelmanov, Artem and
Wang, Yuxia and
Artemova, Ekaterina and
Kutlu, Mucahid and
Mikros, George",
booktitle = "Proceedings of the 1st Workshop on GenAI Content Detection (GenAIDetect)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2025.genaidetect-1.19/",
pages = "191--196",
abstract = "Recently, large language models (LLMs) have demonstrated unprecedented capabilities in language generation, yet they still often produce incorrect information. Therefore, determining whether a text was generated by an LLM has become one of the factors that must be considered when evaluating its reliability. In this paper, we discuss methods to determine whether texts written in various languages were authored by humans or generated by LLMs. We have discovered that the classification accuracy significantly decreases for texts written in languages not observed during the training process, and we aim to address this issue. We propose a method to improve performance for unseen languages by using token-level predictive distributions extracted from various LLMs and text embeddings from a multilingual pre-trained language model. With the proposed method, we achieved third place out of 25 teams in Subtask B (binary multilingual machine-generated text detection) of Shared Task 1, with an F1 macro score of 0.7532."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="park-etal-2025-nota">
<titleInfo>
<title>Nota AI at GenAI Detection Task 1: Unseen Language-Aware Detection System for Multilingual Machine-Generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hancheol</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaeyeon</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Geonmin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tae-Ho</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on GenAI Content Detection (GenAIDetect)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artem</namePart>
<namePart type="family">Shelmanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxia</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Artemova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mucahid</namePart>
<namePart type="family">Kutlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Mikros</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, large language models (LLMs) have demonstrated unprecedented capabilities in language generation, yet they still often produce incorrect information. Therefore, determining whether a text was generated by an LLM has become one of the factors that must be considered when evaluating its reliability. In this paper, we discuss methods to determine whether texts written in various languages were authored by humans or generated by LLMs. We have discovered that the classification accuracy significantly decreases for texts written in languages not observed during the training process, and we aim to address this issue. We propose a method to improve performance for unseen languages by using token-level predictive distributions extracted from various LLMs and text embeddings from a multilingual pre-trained language model. With the proposed method, we achieved third place out of 25 teams in Subtask B (binary multilingual machine-generated text detection) of Shared Task 1, with an F1 macro score of 0.7532.</abstract>
<identifier type="citekey">park-etal-2025-nota</identifier>
<location>
<url>https://aclanthology.org/2025.genaidetect-1.19/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>191</start>
<end>196</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Nota AI at GenAI Detection Task 1: Unseen Language-Aware Detection System for Multilingual Machine-Generated Text
%A Park, Hancheol
%A Kim, Jaeyeon
%A Kim, Geonmin
%A Kim, Tae-Ho
%Y Alam, Firoj
%Y Nakov, Preslav
%Y Habash, Nizar
%Y Gurevych, Iryna
%Y Chowdhury, Shammur
%Y Shelmanov, Artem
%Y Wang, Yuxia
%Y Artemova, Ekaterina
%Y Kutlu, Mucahid
%Y Mikros, George
%S Proceedings of the 1st Workshop on GenAI Content Detection (GenAIDetect)
%D 2025
%8 January
%I International Conference on Computational Linguistics
%C Abu Dhabi, UAE
%F park-etal-2025-nota
%X Recently, large language models (LLMs) have demonstrated unprecedented capabilities in language generation, yet they still often produce incorrect information. Therefore, determining whether a text was generated by an LLM has become one of the factors that must be considered when evaluating its reliability. In this paper, we discuss methods to determine whether texts written in various languages were authored by humans or generated by LLMs. We have discovered that the classification accuracy significantly decreases for texts written in languages not observed during the training process, and we aim to address this issue. We propose a method to improve performance for unseen languages by using token-level predictive distributions extracted from various LLMs and text embeddings from a multilingual pre-trained language model. With the proposed method, we achieved third place out of 25 teams in Subtask B (binary multilingual machine-generated text detection) of Shared Task 1, with an F1 macro score of 0.7532.
%U https://aclanthology.org/2025.genaidetect-1.19/
%P 191-196
Markdown (Informal)
[Nota AI at GenAI Detection Task 1: Unseen Language-Aware Detection System for Multilingual Machine-Generated Text](https://aclanthology.org/2025.genaidetect-1.19/) (Park et al., GenAIDetect 2025)
ACL