@inproceedings{ting-etal-2025-astromlab,
title = "{A}stro{ML}ab 5: Structured Summaries and Concept Extraction for 400,000 Astrophysics Papers",
author = "Ting, Yuan-Sen and
Accomazzi, Alberto and
Ghosal, Tirthankar and
Nguyen, Tuan Dung and
Pan, Rui and
Sun, Zechang and
de Haan, Tijmen",
editor = "Accomazzi, Alberto and
Ghosal, Tirthankar and
Grezes, Felix and
Lockhart, Kelly",
booktitle = "Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications",
month = dec,
year = "2025",
address = "Mumbai, India and virtual",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wasp-main.19/",
pages = "170--185",
ISBN = "979-8-89176-310-4",
abstract = "We present a dataset of 408,590 astrophysics papers from arXiv (astro-ph), spanning 1992 through July 2025. Each paper has been processed through a multi-stage pipeline to produce: (1) structured summaries organized into six semantic sections (Background, Motivation, Methodology, Results, Interpretation, Implication), and (2) concept extraction yielding 9,999 unique concepts with detailed descriptions. The dataset contains 3.8 million paper-concept associations and includes semantic embeddings for all concepts. Comparison with traditional ADS keywords reveals that the concepts provide denser coverage and more uniform distribution, while analysis of embedding space structure demonstrates that concepts are semantically dispersed within papers{---}enabling discovery through multiple diverse entry points. Concept vocabulary and embeddings are publicly released at https://github.com/tingyuansen/astro-ph{\_}knowledge{\_}graph."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ting-etal-2025-astromlab">
<titleInfo>
<title>AstroMLab 5: Structured Summaries and Concept Extraction for 400,000 Astrophysics Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuan-Sen</namePart>
<namePart type="family">Ting</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Accomazzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuan</namePart>
<namePart type="given">Dung</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zechang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tijmen</namePart>
<namePart type="family">de Haan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Accomazzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Grezes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Lockhart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India and virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-310-4</identifier>
</relatedItem>
<abstract>We present a dataset of 408,590 astrophysics papers from arXiv (astro-ph), spanning 1992 through July 2025. Each paper has been processed through a multi-stage pipeline to produce: (1) structured summaries organized into six semantic sections (Background, Motivation, Methodology, Results, Interpretation, Implication), and (2) concept extraction yielding 9,999 unique concepts with detailed descriptions. The dataset contains 3.8 million paper-concept associations and includes semantic embeddings for all concepts. Comparison with traditional ADS keywords reveals that the concepts provide denser coverage and more uniform distribution, while analysis of embedding space structure demonstrates that concepts are semantically dispersed within papers—enabling discovery through multiple diverse entry points. Concept vocabulary and embeddings are publicly released at https://github.com/tingyuansen/astro-ph_knowledge_graph.</abstract>
<identifier type="citekey">ting-etal-2025-astromlab</identifier>
<location>
<url>https://aclanthology.org/2025.wasp-main.19/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>170</start>
<end>185</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AstroMLab 5: Structured Summaries and Concept Extraction for 400,000 Astrophysics Papers
%A Ting, Yuan-Sen
%A Accomazzi, Alberto
%A Ghosal, Tirthankar
%A Nguyen, Tuan Dung
%A Pan, Rui
%A Sun, Zechang
%A de Haan, Tijmen
%Y Accomazzi, Alberto
%Y Ghosal, Tirthankar
%Y Grezes, Felix
%Y Lockhart, Kelly
%S Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India and virtual
%@ 979-8-89176-310-4
%F ting-etal-2025-astromlab
%X We present a dataset of 408,590 astrophysics papers from arXiv (astro-ph), spanning 1992 through July 2025. Each paper has been processed through a multi-stage pipeline to produce: (1) structured summaries organized into six semantic sections (Background, Motivation, Methodology, Results, Interpretation, Implication), and (2) concept extraction yielding 9,999 unique concepts with detailed descriptions. The dataset contains 3.8 million paper-concept associations and includes semantic embeddings for all concepts. Comparison with traditional ADS keywords reveals that the concepts provide denser coverage and more uniform distribution, while analysis of embedding space structure demonstrates that concepts are semantically dispersed within papers—enabling discovery through multiple diverse entry points. Concept vocabulary and embeddings are publicly released at https://github.com/tingyuansen/astro-ph_knowledge_graph.
%U https://aclanthology.org/2025.wasp-main.19/
%P 170-185
Markdown (Informal)
[AstroMLab 5: Structured Summaries and Concept Extraction for 400,000 Astrophysics Papers](https://aclanthology.org/2025.wasp-main.19/) (Ting et al., WASP 2025)
ACL
- Yuan-Sen Ting, Alberto Accomazzi, Tirthankar Ghosal, Tuan Dung Nguyen, Rui Pan, Zechang Sun, and Tijmen de Haan. 2025. AstroMLab 5: Structured Summaries and Concept Extraction for 400,000 Astrophysics Papers. In Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications, pages 170–185, Mumbai, India and virtual. Association for Computational Linguistics.