XQuery/Wikipedia Page scraping

Page scraping allows any web page to be the source of raw data suitable for transformation. This example takes the data on the Wikipedia current events page for 24 September 2007 and transforms it into a simple HTML page.

The key components of an XQuery page scraper are:

  1. The fn:doc function, which accepts a URL and retrieves the page as XML (a minimal sketch combining components 1–3 follows this list). Many web pages are not well-formed XML, but the Wikipedia pages are.
  2. Setting a namespace if the page has a default namespace. This page has a default namespace of "http://www.w3.org/1999/xhtml", so a namespace must be declared and its prefix used in the path expressions that access the page's XML.
  3. Identification of a path to the selected content. In this case, the content is located in a td element with a class of 'description'.
  4. Re-basing any relative URLs. Here the links to Wikipedia articles have relative URLs. To re-base these, the XML is serialized to a string with util:serialize(), the relative URLs are edited with replace(), and the string is converted back to XML using util:parse().
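
The following minimal sketch, assuming an eXist-db environment in which fn:doc can retrieve a remote URL, shows the first three components together: fetching a page, declaring the XHTML namespace, and selecting the description cells with a prefixed path expression.

declare namespace h = "http://www.w3.org/1999/xhtml";

(: fetch a fixed day's page and select its description cells :)
let $url := "http://en.wikipedia.org/wiki/Portal:Current_events/2007_September_24"
let $page := doc($url)
return $page//h:td[@class = "description"]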

Sample XQuery to Extract Data from Wikipedia Current Events Page

In this example, there is some date re-formatting to do, since the date format in the page's URL is not the XML-formatted (xs:date) date. Links to the previous and next days are included, making use of XQuery date arithmetic.
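
For example, the helper functions defined below convert xs:date("2007-09-24") to "2007_September_24" (the form used in the page URL) via local:wikidate(), and to the display form "24 September, 2007" via local:displaydate().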

declare namespace h = "http://www.w3.org/1999/xhtml";
declare option exist:serialize "method=xhtml media-type=text/html indent=yes";
declare variable $months := 
 ("January","February","March","April","May","June","July","August","September","October","November","December")
;
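
(: convert an xs:date such as 2007-09-24 to the form used in
   Wikipedia URLs, e.g. 2007_September_24 :)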
declare function local:wikidate($date as xs:date) as xs:string {
    concat(year-from-date($date), "_",
           $months[month-from-date($date)], "_",
           day-from-date($date))
};
declare function local:displaydate($date as xs:date) as xs:string {
    concat(day-from-date($date), " ",
           $months[month-from-date($date)], ", ",
           year-from-date($date))
};
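(: convert an xs:date to a human-readable form, e.g. 24 September, 2007 :)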

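(: re-base relative hrefs by serializing the element to a string,
   rewriting the URLs with replace(), and parsing the string back to XML;
   $delimiter is the quote character used around href values :)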
declare function local:add-base($element, $base as xs:string, $delimiter as xs:string) {
    let $evtext := util:serialize($element, ())
    let $evtext := replace($evtext,
                           concat("href=", $delimiter, "/"),
                           concat("href=", $delimiter, $base, "/"))
    return util:parse($evtext)
};

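(: main query: read the date parameter, fetch that day's page,
   and render it with links to the previous and next days :)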
let $date := xs:date(request:get-parameter("date", "2007-09-24")) (: default to the example date if no parameter is supplied :)
let $wikidate := local:wikidate($date)
let $url := concat("http://en.wikipedia.org/wiki/Portal:Current_events/",$wikidate)
let $wikipage := doc($url)
let $desc := $wikipage//h:td[@class="description"]
let $nextDay := $date + xs:dayTimeDuration("P1D")
let $previousDay := $date - xs:dayTimeDuration("P1D")
return
<html>
   <body>
    <h1>Current events from <a href="{$url}">Wikipedia</a></h1>
      <h2>Wiki Events for        
                <span style="font-size:12px;"><a href="wikidate.xq?date={$previousDay}">{local:displaydate($previousDay)}</a></span>&#160;
               {local:displaydate($date)}
               <span style="font-size:12px;"><a href="wikidate.xq?date={$nextDay}">{local:displaydate($nextDay)}</a></span>&#160;
      </h2>
            {
            local:add-base($desc/*,"http://en.wikipedia.org",'"')
            }
   </body>
</html>
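
To run the query, save it in an eXist-db collection as wikidate.xq (the name assumed by the previous/next day links) and request it with a date parameter in xs:date format, for example:

wikidate.xq?date=2007-09-24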