@inproceedings{patil-etal-2020-optimized,
title = "Optimized Web-Crawling of Conversational Data from Social Media and Context-Based Filtering",
author = "Patil, Annapurna P and
Subramanian, Rajarajeswari and
Karkal, Gaurav and
Purushotham, Keerthana and
Wadhwa, Jugal and
Reddy, K Dhanush and
Sawood, Meer",
editor = "S, Praveen Kumar G and
Mukherjee, Siddhartha and
Samal, Ranjan",
booktitle = "Proceedings of the Workshop on Joint NLP Modelling for Conversational AI @ ICON 2020",
month = dec,
year = "2020",
address = "Patna, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2020.icon-workshop.5",
pages = "33--39",
abstract = "Building Chabot{'}s requires a large amount of conversational data. In this paper, a web crawler is designed to fetch multi-turn dialogues from websites such as Twitter, YouTube and Reddit in the form of a JavaScript Object Notation (JSON) file. Tools like Twitter Application Programming Interface (API), LXML Library, and JSON library are used to crawl Twitter, YouTube and Reddit to collect conversational chat data. The data obtained in a raw form cannot be used directly as it will have only text metadata such as author or name, time to provide more information on the chat data being scraped. The data collected has to be formatted for proper use case and the JSON library of python allows us to format the data easily. The scraped dialogues are further filtered based on the context of a search keyword without introducing bias and with flexible strictness of classification.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="patil-etal-2020-optimized">
<titleInfo>
<title>Optimized Web-Crawling of Conversational Data from Social Media and Context-Based Filtering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Annapurna</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Patil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajarajeswari</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaurav</namePart>
<namePart type="family">Karkal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keerthana</namePart>
<namePart type="family">Purushotham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jugal</namePart>
<namePart type="family">Wadhwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">K</namePart>
<namePart type="given">Dhanush</namePart>
<namePart type="family">Reddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meer</namePart>
<namePart type="family">Sawood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Joint NLP Modelling for Conversational AI @ ICON 2020</title>
</titleInfo>
<name type="personal">
<namePart type="given">Praveen</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="given">G</namePart>
<namePart type="family">S</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siddhartha</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ranjan</namePart>
<namePart type="family">Samal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Patna, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Building chatbots requires a large amount of conversational data. In this paper, a web crawler is designed to fetch multi-turn dialogues from websites such as Twitter, YouTube, and Reddit in the form of a JavaScript Object Notation (JSON) file. Tools such as the Twitter Application Programming Interface (API), the LXML library, and the JSON library are used to crawl Twitter, YouTube, and Reddit and collect conversational chat data. The data obtained in raw form cannot be used directly, as it contains only the text along with metadata such as author name and time that describe the scraped chat data. The collected data has to be formatted for the intended use case, and the JSON library of Python allows us to format the data easily. The scraped dialogues are further filtered based on the context of a search keyword, without introducing bias and with flexible strictness of classification.</abstract>
<identifier type="citekey">patil-etal-2020-optimized</identifier>
<location>
<url>https://aclanthology.org/2020.icon-workshop.5</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>33</start>
<end>39</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Optimized Web-Crawling of Conversational Data from Social Media and Context-Based Filtering
%A Patil, Annapurna P.
%A Subramanian, Rajarajeswari
%A Karkal, Gaurav
%A Purushotham, Keerthana
%A Wadhwa, Jugal
%A Reddy, K. Dhanush
%A Sawood, Meer
%Y S, Praveen Kumar G.
%Y Mukherjee, Siddhartha
%Y Samal, Ranjan
%S Proceedings of the Workshop on Joint NLP Modelling for Conversational AI @ ICON 2020
%D 2020
%8 December
%I NLP Association of India (NLPAI)
%C Patna, India
%F patil-etal-2020-optimized
%X Building chatbots requires a large amount of conversational data. In this paper, a web crawler is designed to fetch multi-turn dialogues from websites such as Twitter, YouTube, and Reddit in the form of a JavaScript Object Notation (JSON) file. Tools such as the Twitter Application Programming Interface (API), the LXML library, and the JSON library are used to crawl Twitter, YouTube, and Reddit and collect conversational chat data. The data obtained in raw form cannot be used directly, as it contains only the text along with metadata such as author name and time that describe the scraped chat data. The collected data has to be formatted for the intended use case, and the JSON library of Python allows us to format the data easily. The scraped dialogues are further filtered based on the context of a search keyword, without introducing bias and with flexible strictness of classification.
%U https://aclanthology.org/2020.icon-workshop.5
%P 33-39
Markdown (Informal)
[Optimized Web-Crawling of Conversational Data from Social Media and Context-Based Filtering](https://aclanthology.org/2020.icon-workshop.5) (Patil et al., ICON 2020)
ACL
Annapurna P Patil, Rajarajeswari Subramanian, Gaurav Karkal, Keerthana Purushotham, Jugal Wadhwa, K Dhanush Reddy, and Meer Sawood. 2020. Optimized Web-Crawling of Conversational Data from Social Media and Context-Based Filtering. In Proceedings of the Workshop on Joint NLP Modelling for Conversational AI @ ICON 2020, pages 33–39, Patna, India. NLP Association of India (NLPAI).
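
The abstract above describes filtering scraped multi-turn dialogues by the context of a search keyword with adjustable strictness, and writing the results out with Python's json library. The sketch below is purely illustrative and is not the authors' implementation: the dialogue schema (a "turns" list with "author", "time", and "text" fields), the file handling, and the keyword-overlap scoring heuristic are all assumptions made for the example.

```python
# Illustrative sketch only, not the authors' code: context-based filtering of
# scraped multi-turn dialogues by a search keyword, with adjustable strictness.
# The dialogue schema ("turns" with "author", "time", "text") is an assumption.
import json


def keyword_context_score(dialogue, keyword):
    """Fraction of turns in the dialogue whose text mentions the keyword."""
    turns = dialogue.get("turns", [])
    if not turns:
        return 0.0
    hits = sum(1 for turn in turns if keyword.lower() in turn.get("text", "").lower())
    return hits / len(turns)


def filter_dialogues(dialogues, keyword, strictness=0.5):
    """Keep dialogues whose keyword-context score meets the strictness threshold (0.0-1.0)."""
    return [d for d in dialogues if keyword_context_score(d, keyword) >= strictness]


if __name__ == "__main__":
    # Toy stand-in for the JSON the crawler would emit.
    dialogues = [
        {"turns": [{"author": "a", "time": "2020-01-01", "text": "New phone battery drains fast"},
                   {"author": "b", "time": "2020-01-01", "text": "Try lowering the screen brightness"}]},
        {"turns": [{"author": "c", "time": "2020-01-02", "text": "Great match last night!"}]},
    ]
    kept = filter_dialogues(dialogues, keyword="battery", strictness=0.5)
    # The paper's pipeline stores conversations as JSON, so the filtered set is serialized the same way.
    print(json.dumps(kept, indent=2))
```

Lowering the strictness threshold keeps dialogues in which only a few turns touch on the keyword, while raising it toward 1.0 requires nearly every turn to do so; this is one plausible reading of the "flexible strictness of classification" mentioned in the abstract.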