% Conference paper in the NLPIR 2021 proceedings (ACM International Conference Proceeding Series).
% The acronyms STIF (title) and NLPIR (booktitle) are brace-protected so that styles which
% apply sentence casing keep them uppercase. Authors use the "Last, First" form so name
% parsing does not depend on capitalization heuristics.
@inproceedings{9a27598809e94e34bee2948afaeb4de4,
  author    = {Mousavi, Maryam and Steiner, Elena and Corman, Steven and Ruston, Scott and Weber, Dylan and Davulcu, Hasan},
  title     = {{STIF}: Semi-Supervised Taxonomy Induction using Term Embeddings and Clustering},
  booktitle = {2021 5th International Conference on Natural Language Processing and Information Retrieval, {NLPIR} 2021},
  series    = {ACM International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  year      = {2021},
  month     = dec,
  day       = {17},
  pages     = {115--123},
  doi       = {10.1145/3508230.3508247},
  language  = {English (US)},
  keywords  = {Taxonomy induction, Text categorization, Topic detection},
  abstract  = {In this paper, we developed a semi-supervised taxonomy induction framework using term embedding and clustering methods for a blog corpus comprising 145,000 posts from 650 Ukraine-related blog domains dated between 2010-2020. We extracted 32,429 noun phrases (NPs) and proceeded to split these NPs into a pair of categories: General/Ambiguous phrases, which might appear under any topic vs. Topical/Non-Ambiguous phrases, which pertain to a topic's specifics. We used term representation and clustering methods to partition the topical/non-ambiguous phrases into 90 groups using the Silhouette method. Next, a team of 10 communications scientists analyzed the NP clusters and inducted a two-level taxonomy alongside its codebook. Upon achieving intercoder reliability of 94\%, coders proceeded to map all topical/non-ambiguous phrases into a gold-standard taxonomy. We evaluated a range of term representation and clustering methods using extrinsic and intrinsic measures. We determined that GloVe embeddings with K-Means achieved the highest performance (i.e. 74\% purity) for this real-world dataset.},
  note      = {Publisher Copyright: {\textcopyright} 2021 ACM.; 5th International Conference on Natural Language Processing and Information Retrieval, NLPIR 2021 ; Conference date: 17-12-2021 Through 20-12-2021},
}