@inproceedings{goto-etal-2025-acquiring,
title = "Acquiring Bidirectionality via Large and Small Language Models",
author = "Goto, Takumi and
Nagao, Hiroyoshi and
Koreeda, Yuta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.116/",
pages = "1711--1717",
abstract = "Using token representation from bidirectional language models (LMs) such as BERT is still a widely used approach for token-classification tasks. Even though there exist much larger unidirectional LMs such as Llama-2, they are rarely used to replace the token representation of bidirectional LMs. In this work, we hypothesize that their lack of bidirectionality is what is keeping unidirectional LMs behind. To that end, we propose to newly train a small backward LM and concatenate its representations to those of an existing LM for downstream tasks. Through experiments in token-classification tasks, we demonstrate that introducing backward model can improve the benchmark performance by more than 10 points. Furthermore, we show that the proposed method is especially effective for rare domains and in few-shot learning settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="goto-etal-2025-acquiring">
    <titleInfo>
      <title>Acquiring Bidirectionality via Large and Small Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Takumi</namePart>
      <namePart type="family">Goto</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hiroyoshi</namePart>
      <namePart type="family">Nagao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuta</namePart>
      <namePart type="family">Koreeda</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="given">Di</namePart>
        <namePart type="family">Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Using token representation from bidirectional language models (LMs) such as BERT is still a widely used approach for token-classification tasks. Even though there exist much larger unidirectional LMs such as Llama-2, they are rarely used to replace the token representation of bidirectional LMs. In this work, we hypothesize that their lack of bidirectionality is what is keeping unidirectional LMs behind. To that end, we propose to newly train a small backward LM and concatenate its representations to those of an existing LM for downstream tasks. Through experiments in token-classification tasks, we demonstrate that introducing a backward model can improve the benchmark performance by more than 10 points. Furthermore, we show that the proposed method is especially effective for rare domains and in few-shot learning settings.</abstract>
    <identifier type="citekey">goto-etal-2025-acquiring</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.116/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>1711</start>
        <end>1717</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Acquiring Bidirectionality via Large and Small Language Models
%A Goto, Takumi
%A Nagao, Hiroyoshi
%A Koreeda, Yuta
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F goto-etal-2025-acquiring
%X Using token representation from bidirectional language models (LMs) such as BERT is still a widely used approach for token-classification tasks. Even though there exist much larger unidirectional LMs such as Llama-2, they are rarely used to replace the token representation of bidirectional LMs. In this work, we hypothesize that their lack of bidirectionality is what is keeping unidirectional LMs behind. To that end, we propose to newly train a small backward LM and concatenate its representations to those of an existing LM for downstream tasks. Through experiments in token-classification tasks, we demonstrate that introducing a backward model can improve the benchmark performance by more than 10 points. Furthermore, we show that the proposed method is especially effective for rare domains and in few-shot learning settings.
%U https://aclanthology.org/2025.coling-main.116/
%P 1711-1717
Markdown (Informal)
[Acquiring Bidirectionality via Large and Small Language Models](https://aclanthology.org/2025.coling-main.116/) (Goto et al., COLING 2025)
ACL
Takumi Goto, Hiroyoshi Nagao, and Yuta Koreeda. 2025. Acquiring Bidirectionality via Large and Small Language Models. In Proceedings of the 31st International Conference on Computational Linguistics, pages 1711–1717, Abu Dhabi, UAE. Association for Computational Linguistics.
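
The abstract above outlines the paper's core idea: train a small backward LM (one that reads text right-to-left) and concatenate its token representations with those of an existing LM before the token-classification head. As a rough illustration only, the PyTorch sketch below shows one way such a concatenation could look; the class name, the parameterized model identifiers, and the simple flip-based alignment are assumptions for illustration, not the authors' implementation.

# Minimal sketch (not the paper's code): concatenate per-token hidden states
# from an existing forward LM and a small backward LM, then classify each token.
# Assumes both models share a tokenizer and sequences are unpadded, so reversing
# the backward LM's outputs re-aligns them with the original token order.
import torch
import torch.nn as nn
from transformers import AutoModel

class ConcatTokenClassifier(nn.Module):
    def __init__(self, forward_name: str, backward_name: str, num_labels: int):
        super().__init__()
        self.forward_lm = AutoModel.from_pretrained(forward_name)    # existing LM
        self.backward_lm = AutoModel.from_pretrained(backward_name)  # small LM trained on reversed text
        hidden = self.forward_lm.config.hidden_size + self.backward_lm.config.hidden_size
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask):
        # Forward LM reads the sequence left-to-right.
        fwd = self.forward_lm(input_ids=input_ids,
                              attention_mask=attention_mask).last_hidden_state
        # Backward LM sees the reversed sequence; flip its outputs back so that
        # position i again corresponds to the i-th original token.
        rev_ids = torch.flip(input_ids, dims=[1])
        rev_mask = torch.flip(attention_mask, dims=[1])
        bwd = self.backward_lm(input_ids=rev_ids,
                               attention_mask=rev_mask).last_hidden_state
        bwd = torch.flip(bwd, dims=[1])
        # Concatenate the two views of each token and classify.
        return self.classifier(torch.cat([fwd, bwd], dim=-1))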