@inproceedings{isbarov-etal-2024-open,
title = "Open foundation models for {A}zerbaijani language",
author = "Isbarov, Jafar and
Huseynova, Kavsar and
Mammadov, Elvin and
Hajili, Mammad and
Ataman, Duygu",
editor = {Ataman, Duygu and
Derin, Mehmet Oguz and
Ivanova, Sardana and
K{\"o}ksal, Abdullatif and
S{\"a}lev{\"a}, Jonne and
Zeyrek, Deniz},
booktitle = "Proceedings of the First Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand and Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sigturk-1.2",
pages = "18--28",
abstract = "The emergence of multilingual large language models has enabled the development of language understanding and generation systems in Azerbaijani. However, most of the production-grade systems rely on cloud solutions, such as GPT-4. While there have been several attempts to develop open foundation models for Azerbaijani, these works have not found their way into common use due to a lack of systemic benchmarking. This paper encompasses several lines of work that promote open-source foundation models for Azerbaijani. We introduce (1) a large text corpus for Azerbaijani, (2) a family of encoder-only language models trained on this dataset, (3) labeled datasets for evaluating these models, and (4) extensive evaluation that covers all major open-source models with Azerbaijani support.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="isbarov-etal-2024-open">
<titleInfo>
<title>Open foundation models for Azerbaijani language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jafar</namePart>
<namePart type="family">Isbarov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kavsar</namePart>
<namePart type="family">Huseynova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elvin</namePart>
<namePart type="family">Mammadov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mammad</namePart>
<namePart type="family">Hajili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Duygu</namePart>
<namePart type="family">Ataman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Duygu</namePart>
<namePart type="family">Ataman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Oguz</namePart>
<namePart type="family">Derin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sardana</namePart>
<namePart type="family">Ivanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullatif</namePart>
<namePart type="family">Köksal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonne</namePart>
<namePart type="family">Sälevä</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deniz</namePart>
<namePart type="family">Zeyrek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand and Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The emergence of multilingual large language models has enabled the development of language understanding and generation systems in Azerbaijani. However, most of the production-grade systems rely on cloud solutions, such as GPT-4. While there have been several attempts to develop open foundation models for Azerbaijani, these works have not found their way into common use due to a lack of systemic benchmarking. This paper encompasses several lines of work that promote open-source foundation models for Azerbaijani. We introduce (1) a large text corpus for Azerbaijani, (2) a family of encoder-only language models trained on this dataset, (3) labeled datasets for evaluating these models, and (4) extensive evaluation that covers all major open-source models with Azerbaijani support.</abstract>
<identifier type="citekey">isbarov-etal-2024-open</identifier>
<location>
<url>https://aclanthology.org/2024.sigturk-1.2</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>18</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Open foundation models for Azerbaijani language
%A Isbarov, Jafar
%A Huseynova, Kavsar
%A Mammadov, Elvin
%A Hajili, Mammad
%A Ataman, Duygu
%Y Ataman, Duygu
%Y Derin, Mehmet Oguz
%Y Ivanova, Sardana
%Y Köksal, Abdullatif
%Y Sälevä, Jonne
%Y Zeyrek, Deniz
%S Proceedings of the First Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2024)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand and Online
%F isbarov-etal-2024-open
%X The emergence of multilingual large language models has enabled the development of language understanding and generation systems in Azerbaijani. However, most of the production-grade systems rely on cloud solutions, such as GPT-4. While there have been several attempts to develop open foundation models for Azerbaijani, these works have not found their way into common use due to a lack of systemic benchmarking. This paper encompasses several lines of work that promote open-source foundation models for Azerbaijani. We introduce (1) a large text corpus for Azerbaijani, (2) a family of encoder-only language models trained on this dataset, (3) labeled datasets for evaluating these models, and (4) extensive evaluation that covers all major open-source models with Azerbaijani support.
%U https://aclanthology.org/2024.sigturk-1.2
%P 18-28
Markdown (Informal)
[Open foundation models for Azerbaijani language](https://aclanthology.org/2024.sigturk-1.2) (Isbarov et al., SIGTURK-WS 2024)
ACL
- Jafar Isbarov, Kavsar Huseynova, Elvin Mammadov, Mammad Hajili, and Duygu Ataman. 2024. Open foundation models for Azerbaijani language. In Proceedings of the First Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2024), pages 18–28, Bangkok, Thailand and Online. Association for Computational Linguistics.