@inproceedings{yin-etal-2025-rethinking,
title = "Rethinking Cross-Subject Data Splitting for Brain-to-Text Decoding",
author = "Yin, Congchi and
Yu, Qian and
Fang, Zhiwei and
Peng, Changping and
Li, Piji",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.289/",
pages = "5686--5700",
ISBN = "979-8-89176-332-6",
abstract = "Recent major milestones have successfully reconstructed natural language from non-invasive brain signals (e.g. functional Magnetic Resonance Imaging (fMRI) and Electroencephalogram (EEG)) across subjects. However, we find current dataset splitting strategies for cross-subject brain-to-text decoding are wrong. Specifically, we first demonstrate that all current splitting methods suffer from data leakage problem, which refers to the leakage of validation and test data into training set, resulting in significant overfitting and overestimation of decoding models. In this study, we develop a right cross-subject data splitting criterion without data leakage for decoding fMRI and EEG signal to text. Some SOTA brain-to-text decoding models are re-evaluated correctly with the proposed criterion for further research."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yin-etal-2025-rethinking">
<titleInfo>
<title>Rethinking Cross-Subject Data Splitting for Brain-to-Text Decoding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Congchi</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiwei</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changping</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piji</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Recent major milestones have successfully reconstructed natural language from non-invasive brain signals (e.g. functional Magnetic Resonance Imaging (fMRI) and Electroencephalogram (EEG)) across subjects. However, we find current dataset splitting strategies for cross-subject brain-to-text decoding are wrong. Specifically, we first demonstrate that all current splitting methods suffer from data leakage problem, which refers to the leakage of validation and test data into training set, resulting in significant overfitting and overestimation of decoding models. In this study, we develop a right cross-subject data splitting criterion without data leakage for decoding fMRI and EEG signal to text. Some SOTA brain-to-text decoding models are re-evaluated correctly with the proposed criterion for further research.</abstract>
<identifier type="citekey">yin-etal-2025-rethinking</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.289/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>5686</start>
<end>5700</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking Cross-Subject Data Splitting for Brain-to-Text Decoding
%A Yin, Congchi
%A Yu, Qian
%A Fang, Zhiwei
%A Peng, Changping
%A Li, Piji
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F yin-etal-2025-rethinking
%X Recent major milestones have successfully reconstructed natural language from non-invasive brain signals (e.g. functional Magnetic Resonance Imaging (fMRI) and Electroencephalogram (EEG)) across subjects. However, we find current dataset splitting strategies for cross-subject brain-to-text decoding are wrong. Specifically, we first demonstrate that all current splitting methods suffer from data leakage problem, which refers to the leakage of validation and test data into training set, resulting in significant overfitting and overestimation of decoding models. In this study, we develop a right cross-subject data splitting criterion without data leakage for decoding fMRI and EEG signal to text. Some SOTA brain-to-text decoding models are re-evaluated correctly with the proposed criterion for further research.
%U https://aclanthology.org/2025.emnlp-main.289/
%P 5686-5700
Markdown (Informal)
[Rethinking Cross-Subject Data Splitting for Brain-to-Text Decoding](https://aclanthology.org/2025.emnlp-main.289/) (Yin et al., EMNLP 2025)
ACL