@inproceedings{arora-etal-2025-calmqa,
title = "{C}a{LMQA}: Exploring culturally specific long-form question answering across 23 languages",
author = "Arora, Shane and
Karpinska, Marzena and
Chen, Hung-Ting and
Bhattacharjee, Ipsita and
Iyyer, Mohit and
Choi, Eunsol",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.578/",
doi = "10.18653/v1/2025.acl-long.578",
pages = "11772--11817",
ISBN = "979-8-89176-251-0",
abstract = "Despite rising global usage of large language models (LLMs), their ability to generate *long-form* answers to *culturally specific* questions remains unexplored in many languages. To fill this gap, we perform the first study of textual multilingual long-form QA by creating CaLMQA, a dataset of **51.7K** culturally specific questions across **23** different languages. We define culturally specific questions as those that refer to concepts unique to one or a few cultures, or have different answers depending on the cultural or regional context. We obtain these questions by crawling naturally-occurring questions from community web forums in high-resource languages, and by hiring native speakers to write questions in under-resourced, rarely-studied languages such as Fijian and Kirundi. Our data collection methodologies are translation-free, enabling the collection of culturally unique questions like ``Kuber iki umwami wa mbere w{'}uburundi yitwa Ntare?'' (Kirundi; English translation: ``Why was the first king of Burundi called Ntare (Lion)?''). We evaluate factuality, relevance and surface-level quality of LLM-generated long-form answers, finding that (1) for many languages, even the best models make critical surface-level errors (e.g., answering in the wrong language, repetition), especially for low-resource languages; and (2) answers to culturally specific questions contain more factual errors than answers to culturally agnostic questions {--} questions that have consistent meaning and answer across many cultures. We release CaLMQA to facilitate future research in cultural and multilingual long-form QA."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arora-etal-2025-calmqa">
<titleInfo>
<title>CaLMQA: Exploring culturally specific long-form question answering across 23 languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shane</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzena</namePart>
<namePart type="family">Karpinska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hung-Ting</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ipsita</namePart>
<namePart type="family">Bhattacharjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Iyyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eunsol</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Despite rising global usage of large language models (LLMs), their ability to generate *long-form* answers to *culturally specific* questions remains unexplored in many languages. To fill this gap, we perform the first study of textual multilingual long-form QA by creating CaLMQA, a dataset of **51.7K** culturally specific questions across **23** different languages. We define culturally specific questions as those that refer to concepts unique to one or a few cultures, or have different answers depending on the cultural or regional context. We obtain these questions by crawling naturally-occurring questions from community web forums in high-resource languages, and by hiring native speakers to write questions in under-resourced, rarely-studied languages such as Fijian and Kirundi. Our data collection methodologies are translation-free, enabling the collection of culturally unique questions like “Kuber iki umwami wa mbere w’uburundi yitwa Ntare?” (Kirundi; English translation: “Why was the first king of Burundi called Ntare (Lion)?”). We evaluate factuality, relevance and surface-level quality of LLM-generated long-form answers, finding that (1) for many languages, even the best models make critical surface-level errors (e.g., answering in the wrong language, repetition), especially for low-resource languages; and (2) answers to culturally specific questions contain more factual errors than answers to culturally agnostic questions – questions that have consistent meaning and answer across many cultures. We release CaLMQA to facilitate future research in cultural and multilingual long-form QA.</abstract>
<identifier type="citekey">arora-etal-2025-calmqa</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.578</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.578/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>11772</start>
<end>11817</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T CaLMQA: Exploring culturally specific long-form question answering across 23 languages
%A Arora, Shane
%A Karpinska, Marzena
%A Chen, Hung-Ting
%A Bhattacharjee, Ipsita
%A Iyyer, Mohit
%A Choi, Eunsol
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F arora-etal-2025-calmqa
%X Despite rising global usage of large language models (LLMs), their ability to generate *long-form* answers to *culturally specific* questions remains unexplored in many languages. To fill this gap, we perform the first study of textual multilingual long-form QA by creating CaLMQA, a dataset of **51.7K** culturally specific questions across **23** different languages. We define culturally specific questions as those that refer to concepts unique to one or a few cultures, or have different answers depending on the cultural or regional context. We obtain these questions by crawling naturally-occurring questions from community web forums in high-resource languages, and by hiring native speakers to write questions in under-resourced, rarely-studied languages such as Fijian and Kirundi. Our data collection methodologies are translation-free, enabling the collection of culturally unique questions like “Kuber iki umwami wa mbere w’uburundi yitwa Ntare?” (Kirundi; English translation: “Why was the first king of Burundi called Ntare (Lion)?”). We evaluate factuality, relevance and surface-level quality of LLM-generated long-form answers, finding that (1) for many languages, even the best models make critical surface-level errors (e.g., answering in the wrong language, repetition), especially for low-resource languages; and (2) answers to culturally specific questions contain more factual errors than answers to culturally agnostic questions – questions that have consistent meaning and answer across many cultures. We release CaLMQA to facilitate future research in cultural and multilingual long-form QA.
%R 10.18653/v1/2025.acl-long.578
%U https://aclanthology.org/2025.acl-long.578/
%U https://doi.org/10.18653/v1/2025.acl-long.578
%P 11772-11817

Markdown (Informal)
[CaLMQA: Exploring culturally specific long-form question answering across 23 languages](https://aclanthology.org/2025.acl-long.578/) (Arora et al., ACL 2025)

ACL
Shane Arora, Marzena Karpinska, Hung-Ting Chen, Ipsita Bhattacharjee, Mohit Iyyer, and Eunsol Choi. 2025. CaLMQA: Exploring culturally specific long-form question answering across 23 languages. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 11772–11817, Vienna, Austria. Association for Computational Linguistics.
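
Usage note: a minimal LaTeX sketch of how the BibTeX record at the top of this page might be cited. The file name references.bib and the plain bibliography style are assumptions for illustration; ACL submissions normally use the .bst file bundled with the ACL template.

% Minimal example: cite the entry via the citekey from the BibTeX record above.
% Assumes that record was saved to a hypothetical file named references.bib.
\documentclass{article}
\begin{document}
CaLMQA \cite{arora-etal-2025-calmqa} studies culturally specific long-form QA across 23 languages.
\bibliographystyle{plain}   % placeholder style; swap in the ACL template's .bst as needed
\bibliography{references}
\end{document}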