@inproceedings{daudaravicius-2019-textual,
title = "Textual and Visual Characteristics of Mathematical Expressions in Scholar Documents",
author = "Daudaravicius, Vidas",
editor = "Nastase, Vivi and
Roth, Benjamin and
Dietz, Laura and
McCallum, Andrew",
booktitle = "Proceedings of the Workshop on Extracting Structured Knowledge from Scientific Publications",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-2610/",
doi = "10.18653/v1/W19-2610",
pages = "72--81",
abstract = "Mathematical expressions (ME) are widely used in scholar documents. In this paper we analyze characteristics of textual and visual MEs characteristics for the image-to-LaTeX translation task. While there are open data-sets of LaTeX files with MEs included it is very complicated to extract these MEs from a document and to compile the list of MEs. Therefore we release a corpus of open-access scholar documents with PDF and JATS-XML parallel files. The MEs in these documents are LaTeX encoded and are document independent. The data contains more than 1.2 million distinct annotated formulae and more than 80 million raw tokens of LaTeX MEs in more than 8 thousand documents. While the variety of textual lengths and visual sizes of MEs are not well defined we found that the task of analyzing MEs in scholar documents can be reduced to the subtask of a particular text length, image width and height bounds, and display MEs can be processed as arrays of partial MEs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="daudaravicius-2019-textual">
<titleInfo>
<title>Textual and Visual Characteristics of Mathematical Expressions in Scholar Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vidas</namePart>
<namePart type="family">Daudaravicius</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Extracting Structured Knowledge from Scientific Publications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vivi</namePart>
<namePart type="family">Nastase</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Roth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Dietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">McCallum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Minneapolis, Minnesota</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Mathematical expressions (ME) are widely used in scholar documents. In this paper we analyze characteristics of textual and visual MEs characteristics for the image-to-LaTeX translation task. While there are open data-sets of LaTeX files with MEs included it is very complicated to extract these MEs from a document and to compile the list of MEs. Therefore we release a corpus of open-access scholar documents with PDF and JATS-XML parallel files. The MEs in these documents are LaTeX encoded and are document independent. The data contains more than 1.2 million distinct annotated formulae and more than 80 million raw tokens of LaTeX MEs in more than 8 thousand documents. While the variety of textual lengths and visual sizes of MEs are not well defined we found that the task of analyzing MEs in scholar documents can be reduced to the subtask of a particular text length, image width and height bounds, and display MEs can be processed as arrays of partial MEs.</abstract>
<identifier type="citekey">daudaravicius-2019-textual</identifier>
<identifier type="doi">10.18653/v1/W19-2610</identifier>
<location>
<url>https://aclanthology.org/W19-2610/</url>
</location>
<part>
<date>2019-06</date>
<extent unit="page">
<start>72</start>
<end>81</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Textual and Visual Characteristics of Mathematical Expressions in Scholar Documents
%A Daudaravicius, Vidas
%Y Nastase, Vivi
%Y Roth, Benjamin
%Y Dietz, Laura
%Y McCallum, Andrew
%S Proceedings of the Workshop on Extracting Structured Knowledge from Scientific Publications
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F daudaravicius-2019-textual
%X Mathematical expressions (ME) are widely used in scholar documents. In this paper we analyze characteristics of textual and visual MEs characteristics for the image-to-LaTeX translation task. While there are open data-sets of LaTeX files with MEs included it is very complicated to extract these MEs from a document and to compile the list of MEs. Therefore we release a corpus of open-access scholar documents with PDF and JATS-XML parallel files. The MEs in these documents are LaTeX encoded and are document independent. The data contains more than 1.2 million distinct annotated formulae and more than 80 million raw tokens of LaTeX MEs in more than 8 thousand documents. While the variety of textual lengths and visual sizes of MEs are not well defined we found that the task of analyzing MEs in scholar documents can be reduced to the subtask of a particular text length, image width and height bounds, and display MEs can be processed as arrays of partial MEs.
%R 10.18653/v1/W19-2610
%U https://aclanthology.org/W19-2610/
%U https://doi.org/10.18653/v1/W19-2610
%P 72-81
Markdown (Informal)
[Textual and Visual Characteristics of Mathematical Expressions in Scholar Documents](https://aclanthology.org/W19-2610/) (Daudaravicius, NAACL 2019)
ACL