@comment{
  Conference paper in the EUSFLAT 2013 proceedings (Atlantis Press).
  NOTE(review): pages holds a single number (39) rather than a range, and
  address holds a country rather than a publisher city. Both are kept as
  found; confirm against the published proceedings before relying on them.
}
@inproceedings{f8d44657781c4efe9052147e390c00f6,
  author    = {Jundt, Oliver and {van Keulen}, Maurice},
  title     = {Sample-based {XPath} Ranking for Web Information Extraction},
  booktitle = {Proceedings of the 8th Conference of the European Society for Fuzzy Logic and Technology (EUSFLAT 2013)},
  series    = {Advances in Intelligent Systems Research},
  publisher = {Atlantis Press},
  address   = {Netherlands},
  year      = {2013},
  month     = sep,
  pages     = {39},
  isbn      = {978-90-78677-78-9},
  doi       = {10.2991/eusflat.2013.27},
  language  = {english},
  keywords  = {EWI-23413, IR-86350, METIS-297686},
  note      = {8th Conference of the European Society for Fuzzy Logic and Technology, EUSFLAT 2013; conference held 11--13 September 2013},
  abstract  = {Web information extraction typically relies on a wrapper, i.e., program code or a configuration that specifies how to extract some information from web pages at a specific website. Manually creating and maintaining wrappers is a cumbersome and error-prone task. It may even be prohibitive as some applications require information extraction from previously unseen websites. This paper approaches the problem of automatic on-the-fly wrapper creation for websites that provide attribute data for objects in a {\textquoteleft}search -- search result page -- detail page{\textquoteright} setup. The approach is a wrapper induction approach which uses a small and easily obtainable set of sample data for ranking XPaths on their suitability for extracting the wanted attribute data. Experiments show that the automatically generated top-ranked XPaths indeed extract the wanted data. Moreover, it appears that 20 to 25 input samples suffice for finding a suitable XPath for an attribute.},
}