@inproceedings{basharat-etal-2025-variantbench,
title = "{V}ariant{B}ench: A Framework for Evaluating {LLM}s on Justifications for Genetic Variant Interpretation",
author = "Basharat, Humair and
Plotkin, Simon and
Le, Charlotte and
Zhu, Kevin and
Pink, Michael and
Alfaro, Isabella",
editor = "T.y.s.s, Santosh and
Shimizu, Shuichiro and
Gong, Yifan",
booktitle = "The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-srw.26/",
pages = "314--321",
ISBN = "979-8-89176-304-3",
abstract = "Accurate classification in high-stakes domains requires not only correct predictions but transparent, traceable reasoning. We instantiate this need in clinical genomics and present VariantBench, a reproducible benchmark and scoring harness that evaluates both the final American College of Medical Genetics and Genomics/Association for Molecular Pathology (ACMG/AMP) labels and criterion-level reasoning fidelity for missense single-nucleotide variants (SNVs). Each case pairs a variant with deterministic, machine-readable evidence aligned to five commonly used criteria (PM2, PP3, PS1, BS1, BA1), enabling consistent evaluation of large language models (LLMs). Unlike prior work that reports only final labels, our framework scores the correctness and faithfulness of per-criterion justifications against numeric evidence. On a balanced 100-variant freeze, Gemini 2.5 Flash and GPT-4o outperform Claude 3 Opus on label accuracy and criterion detection, and both improve materially when the decisive PS1 cue is provided explicitly. Error analyses show models master population-frequency cues yet underuse high-impact rules unless evidence is unambiguous. VariantBench delivers a substrate to track such improvements and compare prompting, calibration, and aggregation strategies in genomics and other rule-governed, safety-critical settings."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="basharat-etal-2025-variantbench">
<titleInfo>
<title>VariantBench: A Framework for Evaluating LLMs on Justifications for Genetic Variant Interpretation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Humair</namePart>
<namePart type="family">Basharat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Plotkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Le</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Pink</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabella</namePart>
<namePart type="family">Alfaro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.y.s.s</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuichiro</namePart>
<namePart type="family">Shimizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Gong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-304-3</identifier>
</relatedItem>
<abstract>Accurate classification in high-stakes domains requires not only correct predictions but transparent, traceable reasoning. We instantiate this need in clinical genomics and present VariantBench, a reproducible benchmark and scoring harness that evaluates both the final American College of Medical Genetics and Genomics/Association for Molecular Pathology (ACMG/AMP) labels and criterion-level reasoning fidelity for missense single-nucleotide variants (SNVs). Each case pairs a variant with deterministic, machine-readable evidence aligned to five commonly used criteria (PM2, PP3, PS1, BS1, BA1), enabling consistent evaluation of large language models (LLMs). Unlike prior work that reports only final labels, our framework scores the correctness and faithfulness of per-criterion justifications against numeric evidence. On a balanced 100-variant freeze, Gemini 2.5 Flash and GPT-4o outperform Claude 3 Opus on label accuracy and criterion detection, and both improve materially when the decisive PS1 cue is provided explicitly. Error analyses show models master population-frequency cues yet underuse high-impact rules unless evidence is unambiguous. VariantBench delivers a substrate to track such improvements and compare prompting, calibration, and aggregation strategies in genomics and other rule-governed, safety-critical settings.</abstract>
<identifier type="citekey">basharat-etal-2025-variantbench</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-srw.26/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>314</start>
<end>321</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VariantBench: A Framework for Evaluating LLMs on Justifications for Genetic Variant Interpretation
%A Basharat, Humair
%A Plotkin, Simon
%A Le, Charlotte
%A Zhu, Kevin
%A Pink, Michael
%A Alfaro, Isabella
%Y T.y.s.s, Santosh
%Y Shimizu, Shuichiro
%Y Gong, Yifan
%S The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-304-3
%F basharat-etal-2025-variantbench
%X Accurate classification in high-stakes domains requires not only correct predictions but transparent, traceable reasoning. We instantiate this need in clinical genomics and present VariantBench, a reproducible benchmark and scoring harness that evaluates both the final American College of Medical Genetics and Genomics/Association for Molecular Pathology (ACMG/AMP) labels and criterion-level reasoning fidelity for missense single-nucleotide variants (SNVs). Each case pairs a variant with deterministic, machine-readable evidence aligned to five commonly used criteria (PM2, PP3, PS1, BS1, BA1), enabling consistent evaluation of large language models (LLMs). Unlike prior work that reports only final labels, our framework scores the correctness and faithfulness of per-criterion justifications against numeric evidence. On a balanced 100-variant freeze, Gemini 2.5 Flash and GPT-4o outperform Claude 3 Opus on label accuracy and criterion detection, and both improve materially when the decisive PS1 cue is provided explicitly. Error analyses show models master population-frequency cues yet underuse high-impact rules unless evidence is unambiguous. VariantBench delivers a substrate to track such improvements and compare prompting, calibration, and aggregation strategies in genomics and other rule-governed, safety-critical settings.
%U https://aclanthology.org/2025.ijcnlp-srw.26/
%P 314-321
Markdown (Informal)
[VariantBench: A Framework for Evaluating LLMs on Justifications for Genetic Variant Interpretation](https://aclanthology.org/2025.ijcnlp-srw.26/) (Basharat et al., IJCNLP 2025)
ACL