BIS 2008

BIS record '2007/1/451'


  author        = {Flejter, Dominik and
                   Hryniewiecki, Roman},
  title         = {Bottom-up Discovery of Clusters of Maximal Ranges in {HTML} Trees for Search Engines Results Extraction},
  year          = {2007},
  session       = {Information Retrieval},
  pages         = {401--414},
  crossref      = {BIS:2007/1},
  bibsource     = {BIS,},
  abstract      = {Unsupervised HTML records detection is an important step in many Web content mining applications. In this paper we propose a method of bottom-up discovery of clusters of maximal, non-agglomerative similar HTML ranges in nested set HTML tree representation and demonstrate its applicability to search engine results records detection. For performance measurement several distance assessment strategies were evaluated and two test collections were prepared containing results pages from almost 60 global and country-specific search engines and almost 100 methodically generated complex HTML trees with pre-set properties respectively. Empirical study shows that our method performs well and can detect successfully most of search results substructures clusters.},

  editor        = {Abramowicz, Witold},
  booktitle     = {Business Information Systems, 10th International Conference, {BIS} 2007, Pozna{\'n}, Poland, April 2007},
  title         = {Business Information Systems, 10th International Conference, {BIS} 2007, Pozna{\'n}, Poland, April 2007},
  publisher     = {Springer-Verlag},
  isbn          = {978-3-540-72034-8},
  year          = {2007},
  bibsource     = {BIS,},

BIS index BIS 2007
Dept. of Information Systems at Poznan University of Economics, Dominik Flejter