@inproceedings{nakov-2025-towards,
title = "Towards Truly Open, Language-Specific, Safe, Factual, and Specialized Large Language Models",
author = "Nakov, Preslav",
editor = "Sharoff, Serge and
Terryn, Ayla Rigouts and
Zweigenbaum, Pierre and
Rapp, Reinhard",
booktitle = "Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bucc-1.3/",
pages = "18",
abstract = "First, we will argue for the need for fully transparent open-source large language models (LLMs), and we will describe the efforts of MBZUAI`s Institute on Foundation Models (IFM) towards that based on the LLM360 initiative. Second, we will argue for the need for language-specific LLMs, and we will share our experience from building Jais, the world`s leading open Arabic-centric foundation and instruction-tuned large language model, Nanda, our recently released open Hindi LLM, and some other models. Third, we will argue for the need for safe LLMs, and we will present Do-Not-Answer, a dataset for evaluating the guardrails of LLMs, which is at the core of the safety mechanisms of our LLMs. Forth, we will argue for the need for factual LLMs, we will discuss the factuality challenges that LLMs pose. We will then present some recent relevant tools for addressing these challenges developed at MBZUAI: (i) OpenFactCheck, a framework for fact-checking LLM output, for building customized fact-checking systems, and for benchmarking LLMs for factuality, (ii) LM-Polygraph, a tool for predicting an LLM`s uncertainty in its output using cheap and fast uncertainty quantification techniques, and (iii) LLM-DetectAIve, a tool for machine-generated text detection. Finally, we will argue for the need for specialized models, and we will present the zoo of LLMs currently being developed at MBZUAI`s IFM."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nakov-2025-towards">
<titleInfo>
<title>Towards Truly Open, Language-Specific, Safe, Factual, and Specialized Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayla</namePart>
<namePart type="given">Rigouts</namePart>
<namePart type="family">Terryn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>First, we will argue for the need for fully transparent open-source large language models (LLMs), and we will describe the efforts of MBZUAI's Institute on Foundation Models (IFM) towards that goal, based on the LLM360 initiative. Second, we will argue for the need for language-specific LLMs, and we will share our experience from building Jais, the world's leading open Arabic-centric foundation and instruction-tuned large language model, Nanda, our recently released open Hindi LLM, and some other models. Third, we will argue for the need for safe LLMs, and we will present Do-Not-Answer, a dataset for evaluating the guardrails of LLMs, which is at the core of the safety mechanisms of our LLMs. Fourth, we will argue for the need for factual LLMs, and we will discuss the factuality challenges that LLMs pose. We will then present some recent relevant tools developed at MBZUAI for addressing these challenges: (i) OpenFactCheck, a framework for fact-checking LLM output, for building customized fact-checking systems, and for benchmarking LLMs for factuality, (ii) LM-Polygraph, a tool for predicting an LLM's uncertainty in its output using cheap and fast uncertainty quantification techniques, and (iii) LLM-DetectAIve, a tool for machine-generated text detection. Finally, we will argue for the need for specialized models, and we will present the zoo of LLMs currently being developed at MBZUAI's IFM.</abstract>
<identifier type="citekey">nakov-2025-towards</identifier>
<location>
<url>https://aclanthology.org/2025.bucc-1.3/</url>
</location>
<part>
<date>2025-01</date>
<detail type="page"><number>18</number></detail>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Truly Open, Language-Specific, Safe, Factual, and Specialized Large Language Models
%A Nakov, Preslav
%Y Sharoff, Serge
%Y Terryn, Ayla Rigouts
%Y Zweigenbaum, Pierre
%Y Rapp, Reinhard
%S Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F nakov-2025-towards
%X First, we will argue for the need for fully transparent open-source large language models (LLMs), and we will describe the efforts of MBZUAI's Institute on Foundation Models (IFM) towards that goal, based on the LLM360 initiative. Second, we will argue for the need for language-specific LLMs, and we will share our experience from building Jais, the world's leading open Arabic-centric foundation and instruction-tuned large language model, Nanda, our recently released open Hindi LLM, and some other models. Third, we will argue for the need for safe LLMs, and we will present Do-Not-Answer, a dataset for evaluating the guardrails of LLMs, which is at the core of the safety mechanisms of our LLMs. Fourth, we will argue for the need for factual LLMs, and we will discuss the factuality challenges that LLMs pose. We will then present some recent relevant tools developed at MBZUAI for addressing these challenges: (i) OpenFactCheck, a framework for fact-checking LLM output, for building customized fact-checking systems, and for benchmarking LLMs for factuality, (ii) LM-Polygraph, a tool for predicting an LLM's uncertainty in its output using cheap and fast uncertainty quantification techniques, and (iii) LLM-DetectAIve, a tool for machine-generated text detection. Finally, we will argue for the need for specialized models, and we will present the zoo of LLMs currently being developed at MBZUAI's IFM.
%U https://aclanthology.org/2025.bucc-1.3/
%P 18
Markdown (Informal)
[Towards Truly Open, Language-Specific, Safe, Factual, and Specialized Large Language Models](https://aclanthology.org/2025.bucc-1.3/) (Nakov, BUCC 2025)
ACL
Preslav Nakov. 2025. Towards Truly Open, Language-Specific, Safe, Factual, and Specialized Large Language Models. In Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC), page 18, Abu Dhabi, UAE. Association for Computational Linguistics.