@inproceedings{ethiraj-etal-2025-vec,
title = "{T}-{VEC}: A Telecom-Specific Vectorization Model with Enhanced Semantic Understanding via Deep Triplet Loss Fine-Tuning",
author = "Ethiraj, Vignesh and
D, Ashwath and
Menon, Sidhanth and
Vijay, Divya and
Kannan, Vidhyakshaya",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.168/",
doi = "10.18653/v1/2025.emnlp-industry.168",
pages = "2449--2460",
ISBN = "979-8-89176-333-3",
abstract = "The specialized vocabulary and nuanced concepts of the telecommunications industry pose persistent challenges for standard Natural Language Processing (NLP) models. Generic embedding models often struggle to represent telecom-specific semantics, limiting their utility in retrieval and downstream tasks. We present T-VEC (Telecom Vectorization Model), a domain-adapted embedding model fine-tuned from the gte-Qwen2-1.5B-instruct backbone using a triplet loss objective. Fine-tuning was performed on T-Embed, a high-quality, large-scale dataset covering diverse telecom concepts, standards, and operational scenarios. Although T-Embed contains some proprietary material and cannot be fully released, we open source 75{\%} of the dataset to support continued research in domain-specific representation learning. On a custom benchmark comprising 1500 query-passage pairs from IETF RFCs and vendor manuals, T-VEC surpasses MPNet, BGE, Jina and E5, demonstrating superior domain grounding and semantic precision in telecom-specific retrieval. Embedding visualizations further showcase tight clustering of telecom-relevant concepts. We release T-VEC and its tokenizer to support semantically faithful NLP applications within the telecom domain."
}
Markdown (Informal)
[T-VEC: A Telecom-Specific Vectorization Model with Enhanced Semantic Understanding via Deep Triplet Loss Fine-Tuning](https://aclanthology.org/2025.emnlp-industry.168/) (Ethiraj et al., EMNLP 2025)
ACL
Vignesh Ethiraj, Ashwath D, Sidhanth Menon, Divya Vijay, and Vidhyakshaya Kannan. 2025. [T-VEC: A Telecom-Specific Vectorization Model with Enhanced Semantic Understanding via Deep Triplet Loss Fine-Tuning](https://aclanthology.org/2025.emnlp-industry.168/). In *Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track*, pages 2449–2460, Suzhou (China). Association for Computational Linguistics.
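
For readers who want a concrete picture of the approach the abstract describes, below is a minimal sketch of triplet-loss fine-tuning on the gte-Qwen2-1.5B-instruct backbone using the sentence-transformers library. The telecom triplets, margin, batch size, and output path are illustrative assumptions, not the authors' T-Embed data or actual training configuration.

```python
# Minimal sketch of triplet-loss fine-tuning for a telecom embedding model,
# in the spirit of T-VEC. The triplets and hyperparameters below are
# illustrative placeholders, NOT the authors' T-Embed dataset or setup.
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# Backbone named in the abstract; gte-Qwen2 checkpoints require
# trust_remote_code=True to load their custom modeling code.
model = SentenceTransformer(
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct", trust_remote_code=True
)

# Each training example is an (anchor, positive, negative) triplet:
# a query, a relevant telecom passage, and an irrelevant or confusable one.
train_examples = [
    InputExample(texts=[
        "What does AMF stand for in the 5G core?",
        "The Access and Mobility Management Function (AMF) handles "
        "registration and mobility management in the 5G core network.",
        "RFC 791 specifies the Internet Protocol (IPv4) header format.",
    ]),
    # ... many more domain triplets would go here ...
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

# TripletLoss pulls anchors toward positives and pushes negatives away
# until negatives are at least `triplet_margin` farther than positives.
train_loss = losses.TripletLoss(model=model, triplet_margin=1.0)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=10,
)
model.save("telecom-triplet-embedder")  # hypothetical output path
```

After training, retrieval quality of the kind the abstract benchmarks can be checked by encoding queries and passages with `model.encode(...)` and ranking passages by cosine similarity.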