% Workshop paper: Ruis et al., ICDMW 2020 (medication-name extraction from
% free-text electronic health records). The citation key is kept as-is so
% existing citations keep resolving.
% - Percent signs in the abstract are escaped because a bare percent sign is
%   the LaTeX comment character.
% - Authors use the "von Last, First" form so name particles and initials
%   are parsed by the style instead of being frozen by braces.
% - The former series field only repeated the booktitle and was dropped.
% NOTE(review): address holds a country rather than a publisher city;
% the city is not recorded in this entry -- confirm and replace if needed.
@inproceedings{2e5fb282b4cd453fa9a9c11e4a020dd4,
  author    = {Ruis, Frank and Pathak, Shreyasi and Geerdink, Jeroen and Hegeman, Johannes H. and Seifert, Christin and van Keulen, Maurice},
  title     = {Human-in-the-loop Language-agnostic Extraction of Medication Data from Highly Unstructured Electronic Health Records},
  booktitle = {2020 International Conference on Data Mining Workshops ({ICDMW})},
  publisher = {IEEE},
  address   = {United States},
  pages     = {644--650},
  year      = {2020},
  month     = nov,
  day       = {20},
  doi       = {10.1109/ICDMW51313.2020.00091},
  isbn      = {978-1-7281-9013-6},
  language  = {English},
  keywords  = {Dictionaries, Annotations, Data models, Natural language processing, Data mining, Electronic medical records, Task analysis},
  abstract  = {Electronic health records contain important information written in free-form text. They are often highly unstructured and ungrammatical and contain misspellings and abbreviations, making it difficult to apply traditional natural language processing techniques. Annotated data is hard to come by due to restricted access, and supervised models often don't generalize well to other datasets. We propose a language-agnostic human-in-the-loop approach for extracting medication names from a large set of highly unstructured electronic health records, where we reach almost 97\% recall on our test set after the second iteration while maintaining 100\% precision. Starting with a bootstrap lexicon we perform a context based dictionary expansion curated by a human reviewer. The method can handle ambiguous lexicon entries and efficiently find fuzzy matches without producing false positives. The human review step ensures a high precision, which is especially important in healthcare, and is not subject to disagreements with annotations from an external source. The code is available online at https://github.com/FrankRuis/medical_concept_extraction.},
  note      = {Conference dates: 17--20 November 2020},
}