% Technical report TR-CTIT-12-08. Typed as a technical report (not a book) so that
% styles print the report number and issuing institution. The citation key is kept
% unchanged so existing citations still resolve. The fields abstract, keywords, day,
% language and series are not printed by the standard BibTeX styles; they are
% retained as metadata.
@techreport{71843fb52f1149509d20d946702326cc,
  author      = {Trieschnigg, Rudolf Berend and Tjin-Kam-Jet, Kien and Hiemstra, Djoerd},
  title       = {Ranking {XPaths} for extracting search result records},
  institution = {Centre for Telematics and Information Technology (CTIT)},
  series      = {CTIT Technical Report Series},
  number      = {TR-CTIT-12-08},
  address     = {Netherlands},
  year        = 2012,
  month       = mar,
  day         = {8},
  language    = {english},
  keywords    = {DB-DM: DATA MINING, EWI-21640, IR-79917, DB-IR: INFORMATION RETRIEVAL, Scraper, Wrapper, Web extraction, Search result extraction, METIS-285252},
  abstract    = {Extracting search result records (SRRs) from webpages is useful for building an aggregated search engine which combines search results from a variety of search engines. Most automatic approaches to search result extraction are not portable: the complete process has to be rerun on a new search result page. In this paper we describe an algorithm to automatically determine XPath expressions to extract SRRs from webpages. Based on a single search result page, an XPath expression is determined which can be reused to extract SRRs from pages based on the same template. The algorithm is evaluated on a six datasets, including two new datasets containing a variety of web, image, video, shopping and news search results. The evaluation shows that for 85% of the tested search result pages, a useful XPath is determined. The algorithm is implemented as a browser plugin and as a standalone application which are available as open source software.},
}