BibTeX

@inproceedings{rachmat-etal-2025-qa,
title = "{QA} Analysis in Medical and Legal Domains: A Survey of Data Augmentation in Low-Resource Settings",
author = "Rachmat, Benedictus Kent and
Gerald, Thomas and
Zhang, Zheng and
Grouin, Cyril",
editor = "Zhao, Jin and
Wang, Mingyang and
Liu, Zhu",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-srw.89/",
doi = "10.18653/v1/2025.acl-srw.89",
pages = "1132--1144",
ISBN = "979-8-89176-254-1",
abstract = "Large Language Models (LLMs) have revolutionized Natural Language Processing (NLP), but their success remains largely confined to high-resource, general-purpose domains. In contrast, applying LLMs to low-resource domains poses significant challenges due to limited training data, domain drift, and strict terminology constraints. This survey provides an overview of the current landscape in domain-specific, low-resource QA with LLMs. We begin by analyzing the coverage and representativeness of specialized-domain QA datasets against large-scale reference datasets what we refer to as \textit{ParentQA}. Building on this analysis, we survey data-centric strategies to enhance input diversity, including data augmentation techniques. We further discuss evaluation metrics for specialized tasks and consider ethical concerns. By mapping current methodologies and outlining open research questions, this survey aims to guide future efforts in adapting LLMs for robust and responsible use in resource-constrained, domain-specific environments. To facilitate reproducibility, we make our code available at https://github.com/kentrachmat/survey-da."
}

MODS XML

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rachmat-etal-2025-qa">
<titleInfo>
<title>QA Analysis in Medical and Legal Domains: A Survey of Data Augmentation in Low-Resource Settings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Benedictus</namePart>
<namePart type="given">Kent</namePart>
<namePart type="family">Rachmat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Gerald</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cyril</namePart>
<namePart type="family">Grouin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jin</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-254-1</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) have revolutionized Natural Language Processing (NLP), but their success remains largely confined to high-resource, general-purpose domains. In contrast, applying LLMs to low-resource domains poses significant challenges due to limited training data, domain drift, and strict terminology constraints. This survey provides an overview of the current landscape in domain-specific, low-resource QA with LLMs. We begin by analyzing the coverage and representativeness of specialized-domain QA datasets against large-scale reference datasets, which we refer to as ParentQA. Building on this analysis, we survey data-centric strategies to enhance input diversity, including data augmentation techniques. We further discuss evaluation metrics for specialized tasks and consider ethical concerns. By mapping current methodologies and outlining open research questions, this survey aims to guide future efforts in adapting LLMs for robust and responsible use in resource-constrained, domain-specific environments. To facilitate reproducibility, we make our code available at https://github.com/kentrachmat/survey-da.</abstract>
<identifier type="citekey">rachmat-etal-2025-qa</identifier>
<identifier type="doi">10.18653/v1/2025.acl-srw.89</identifier>
<location>
<url>https://aclanthology.org/2025.acl-srw.89/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1132</start>
<end>1144</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote

%0 Conference Proceedings
%T QA Analysis in Medical and Legal Domains: A Survey of Data Augmentation in Low-Resource Settings
%A Rachmat, Benedictus Kent
%A Gerald, Thomas
%A Zhang, Zheng
%A Grouin, Cyril
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F rachmat-etal-2025-qa
%X Large Language Models (LLMs) have revolutionized Natural Language Processing (NLP), but their success remains largely confined to high-resource, general-purpose domains. In contrast, applying LLMs to low-resource domains poses significant challenges due to limited training data, domain drift, and strict terminology constraints. This survey provides an overview of the current landscape in domain-specific, low-resource QA with LLMs. We begin by analyzing the coverage and representativeness of specialized-domain QA datasets against large-scale reference datasets, which we refer to as ParentQA. Building on this analysis, we survey data-centric strategies to enhance input diversity, including data augmentation techniques. We further discuss evaluation metrics for specialized tasks and consider ethical concerns. By mapping current methodologies and outlining open research questions, this survey aims to guide future efforts in adapting LLMs for robust and responsible use in resource-constrained, domain-specific environments. To facilitate reproducibility, we make our code available at https://github.com/kentrachmat/survey-da.
%R 10.18653/v1/2025.acl-srw.89
%U https://aclanthology.org/2025.acl-srw.89/
%U https://doi.org/10.18653/v1/2025.acl-srw.89
%P 1132-1144

Markdown (Informal)

[QA Analysis in Medical and Legal Domains: A Survey of Data Augmentation in Low-Resource Settings](https://aclanthology.org/2025.acl-srw.89/) (Rachmat et al., ACL 2025)

ACL

Benedictus Kent Rachmat, Thomas Gerald, Zheng Zhang, and Cyril Grouin. 2025. [QA Analysis in Medical and Legal Domains: A Survey of Data Augmentation in Low-Resource Settings](https://aclanthology.org/2025.acl-srw.89/). In *Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)*, pages 1132–1144, Vienna, Austria. Association for Computational Linguistics.