@inproceedings{merx-etal-2024-generating,
title = "Generating bilingual example sentences with large language models as lexicography assistants",
author = "Merx, Raphael and
Vylomova, Ekaterina and
Kurniawan, Kemal",
editor = "Baldwin, Tim and
Rodr{\'i}guez M{\'e}ndez, Sergio Jos{\'e} and
Kuo, Nicholas",
booktitle = "Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association",
month = dec,
year = "2024",
address = "Canberra, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.alta-1.5/",
pages = "64--74",
abstract = "We present a study of LLMs' performance in generating and rating example sentences for bilingual dictionaries across languages with varying resource levels: French (high-resource), Indonesian (mid-resource), and Tetun (low-resource), with English as the target language. We evaluate the quality of LLMgenerated examples against the GDEX (Good Dictionary EXample) criteria: typicality, informativeness, and intelligibility (Kilgarriff et al., 2008). Our findings reveal that while LLMs can generate reasonably good dictionary examples, their performance degrades significantly for lower-resourced languages. We also observe high variability in human preferences for example quality, reflected in low interannotator agreement rates. To address this, we demonstrate that in-context learning can successfully align LLMs with individual annotator preferences. Additionally, we explore the use of pre-trained language models for automated rating of examples, finding that sentence perplexity serves as a good proxy for {\textquotedblleft}typicality{\textquotedblright} and {\textquotedblleft}intelligibility{\textquotedblright} in higher-resourced languages. Our study also contributes a novel dataset of 600 ratings for LLM-generated sentence pairs, and provides insights into the potential of LLMs in reducing the cost of lexicographic work, particularly for low-resource languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="merx-etal-2024-generating">
<titleInfo>
<title>Generating bilingual example sentences with large language models as lexicography assistants</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Merx</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kemal</namePart>
<namePart type="family">Kurniawan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="given">José</namePart>
<namePart type="family">Rodríguez Méndez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Kuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Canberra, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a study of LLMs’ performance in generating and rating example sentences for bilingual dictionaries across languages with varying resource levels: French (high-resource), Indonesian (mid-resource), and Tetun (low-resource), with English as the target language. We evaluate the quality of LLMgenerated examples against the GDEX (Good Dictionary EXample) criteria: typicality, informativeness, and intelligibility (Kilgarriff et al., 2008). Our findings reveal that while LLMs can generate reasonably good dictionary examples, their performance degrades significantly for lower-resourced languages. We also observe high variability in human preferences for example quality, reflected in low interannotator agreement rates. To address this, we demonstrate that in-context learning can successfully align LLMs with individual annotator preferences. Additionally, we explore the use of pre-trained language models for automated rating of examples, finding that sentence perplexity serves as a good proxy for “typicality” and “intelligibility” in higher-resourced languages. Our study also contributes a novel dataset of 600 ratings for LLM-generated sentence pairs, and provides insights into the potential of LLMs in reducing the cost of lexicographic work, particularly for low-resource languages.</abstract>
<identifier type="citekey">merx-etal-2024-generating</identifier>
<location>
<url>https://aclanthology.org/2024.alta-1.5/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>64</start>
<end>74</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating bilingual example sentences with large language models as lexicography assistants
%A Merx, Raphael
%A Vylomova, Ekaterina
%A Kurniawan, Kemal
%Y Baldwin, Tim
%Y Rodríguez Méndez, Sergio José
%Y Kuo, Nicholas
%S Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association
%D 2024
%8 December
%I Association for Computational Linguistics
%C Canberra, Australia
%F merx-etal-2024-generating
%X We present a study of LLMs’ performance in generating and rating example sentences for bilingual dictionaries across languages with varying resource levels: French (high-resource), Indonesian (mid-resource), and Tetun (low-resource), with English as the target language. We evaluate the quality of LLMgenerated examples against the GDEX (Good Dictionary EXample) criteria: typicality, informativeness, and intelligibility (Kilgarriff et al., 2008). Our findings reveal that while LLMs can generate reasonably good dictionary examples, their performance degrades significantly for lower-resourced languages. We also observe high variability in human preferences for example quality, reflected in low interannotator agreement rates. To address this, we demonstrate that in-context learning can successfully align LLMs with individual annotator preferences. Additionally, we explore the use of pre-trained language models for automated rating of examples, finding that sentence perplexity serves as a good proxy for “typicality” and “intelligibility” in higher-resourced languages. Our study also contributes a novel dataset of 600 ratings for LLM-generated sentence pairs, and provides insights into the potential of LLMs in reducing the cost of lexicographic work, particularly for low-resource languages.
%U https://aclanthology.org/2024.alta-1.5/
%P 64-74
Markdown (Informal)
[Generating bilingual example sentences with large language models as lexicography assistants](https://aclanthology.org/2024.alta-1.5/) (Merx et al., ALTA 2024)
ACL