@inproceedings{fu-etal-2025-text,
title = "Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading {\&} Writing Tests",
author = "Fu, Yanbin and
Jiao, Hong and
Zhou, Tianyi and
Zhang, Nan and
Li, Ming and
Xu, Qingshu and
Peters, Sydney and
Lissitz, Robert W",
editor = "Wilson, Joshua and
Ormerod, Christopher and
Beiting Parrish, Magdalen",
booktitle = "Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers",
month = oct,
year = "2025",
address = "Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States",
publisher = "National Council on Measurement in Education (NCME)",
url = "https://aclanthology.org/2025.aimecon-sessions.3/",
pages = "19--36",
ISBN = "979-8-218-84230-7",
abstract = "Aligning test items to content standards is a critical step in test development to collect validity evidence 3 based on content. Item alignment has typically been conducted by human experts, but this judgmental process can be subjective and time-consuming. This study investigated the performance of fine-tuned small language models (SLMs) for automated item alignment using data from a large-scale standardized reading and writing test for college admissions. Different SLMs were trained for both domain and skill alignment. The model performance was evaluated using precision, recall, accuracy, weighted F1 score, and Cohen{'}s kappa on two test sets. The impact of input data types and training sample sizes was also explored. Results showed that including more textual inputs led to better performance gains than increasing sample size. For comparison, classic supervised machine learning classifiers were trained on multilingual-E5 embedding. Fine-tuned SLMs consistently outperformed these models, particularly for fine-grained skill alignment. To better understand model classifications, semantic similarity analyses including cosine similarity, Kullback-Leibler divergence of embedding distributions, and two-dimension projections of item embedding revealed that certain skills in the two test datasets were semantically too close, providing evidence for the observed misclassification patterns."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fu-etal-2025-text">
<titleInfo>
<title>Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading &amp; Writing Tests</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yanbin</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Jiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyi</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingshu</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sydney</namePart>
<namePart type="family">Peters</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">W</namePart>
<namePart type="family">Lissitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joshua</namePart>
<namePart type="family">Wilson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Ormerod</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Magdalen</namePart>
<namePart type="family">Beiting Parrish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>National Council on Measurement in Education (NCME)</publisher>
<place>
<placeTerm type="text">Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-218-84230-7</identifier>
</relatedItem>
<abstract>Aligning test items to content standards is a critical step in test development to collect validity evidence based on content. Item alignment has typically been conducted by human experts, but this judgmental process can be subjective and time-consuming. This study investigated the performance of fine-tuned small language models (SLMs) for automated item alignment using data from a large-scale standardized reading and writing test for college admissions. Different SLMs were trained for both domain and skill alignment. The model performance was evaluated using precision, recall, accuracy, weighted F1 score, and Cohen’s kappa on two test sets. The impact of input data types and training sample sizes was also explored. Results showed that including more textual inputs led to better performance gains than increasing sample size. For comparison, classic supervised machine learning classifiers were trained on multilingual-E5 embeddings. Fine-tuned SLMs consistently outperformed these models, particularly for fine-grained skill alignment. To better understand model classifications, semantic similarity analyses including cosine similarity, Kullback-Leibler divergence of embedding distributions, and two-dimensional projections of item embeddings revealed that certain skills in the two test datasets were semantically too close, providing evidence for the observed misclassification patterns.</abstract>
<identifier type="citekey">fu-etal-2025-text</identifier>
<location>
<url>https://aclanthology.org/2025.aimecon-sessions.3/</url>
</location>
<part>
<date>2025-10</date>
<extent unit="page">
<start>19</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading & Writing Tests
%A Fu, Yanbin
%A Jiao, Hong
%A Zhou, Tianyi
%A Zhang, Nan
%A Li, Ming
%A Xu, Qingshu
%A Peters, Sydney
%A Lissitz, Robert W.
%Y Wilson, Joshua
%Y Ormerod, Christopher
%Y Beiting Parrish, Magdalen
%S Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers
%D 2025
%8 October
%I National Council on Measurement in Education (NCME)
%C Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States
%@ 979-8-218-84230-7
%F fu-etal-2025-text
%X Aligning test items to content standards is a critical step in test development to collect validity evidence based on content. Item alignment has typically been conducted by human experts, but this judgmental process can be subjective and time-consuming. This study investigated the performance of fine-tuned small language models (SLMs) for automated item alignment using data from a large-scale standardized reading and writing test for college admissions. Different SLMs were trained for both domain and skill alignment. The model performance was evaluated using precision, recall, accuracy, weighted F1 score, and Cohen’s kappa on two test sets. The impact of input data types and training sample sizes was also explored. Results showed that including more textual inputs led to better performance gains than increasing sample size. For comparison, classic supervised machine learning classifiers were trained on multilingual-E5 embeddings. Fine-tuned SLMs consistently outperformed these models, particularly for fine-grained skill alignment. To better understand model classifications, semantic similarity analyses including cosine similarity, Kullback-Leibler divergence of embedding distributions, and two-dimensional projections of item embeddings revealed that certain skills in the two test datasets were semantically too close, providing evidence for the observed misclassification patterns.
%U https://aclanthology.org/2025.aimecon-sessions.3/
%P 19-36
Markdown (Informal)
[Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading & Writing Tests](https://aclanthology.org/2025.aimecon-sessions.3/) (Fu et al., AIME-Con 2025)
ACL
- Yanbin Fu, Hong Jiao, Tianyi Zhou, Nan Zhang, Ming Li, Qingshu Xu, Sydney Peters, and Robert W. Lissitz. 2025. Text-Based Approaches to Item Alignment to Content Standards in Large-Scale Reading & Writing Tests. In Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers, pages 19–36, Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States. National Council on Measurement in Education (NCME).