XQuery/Get zipped XML file

Motivation

edit

You want to process XML documents from the web which are contained in a zip file.

Implementation

edit

This script uses the unzip function in the eXist compression module. This function uses higher order functions to filter the required components of the zipped file and to process each component.

The Unzip Function

edit

The unzip function has five input parameters, two of which are XQuery functions that are passed to the unzip function. Each of these functions in turn has its own parameters.

Here is the general layout of the compression function:

compression:unzip(
  $zip-data as xs:base64Binary, 
  $entry-filter as function, 
  $entry-filter-param as xs:anyType*, 
  $entry-data as function, 
  $entry-data-param as xs:anyType*) as item()*

UnZip all the resources/folders from the provided data by calling user defined functions to determine what and how to store the resources/folders

  • $zip-data The zip file data
  • $entry-filter A user defined function for filtering resources from the zip file. The function takes 3 parameters, e.g. user:unzip-entry-filter($path as xs:string, $data-type as xs:string, $param as item()*) as xs:boolean. $data-type may be 'resource' or 'folder'. $param is a sequence with any additional parameters, for example a list of extracted files. If the return value is true(), the entry is processed and passed to the entry-data function; otherwise the resource is skipped.
  • $entry-filter-param A sequence of additional parameters for the filtering function.
  • $entry-data A user defined function for storing an extracted resource from the zip file. The function takes 4 parameters, e.g. user:unzip-entry-data($path as xs:string, $data-type as xs:string, $data as item()?, $param as item()*). $data-type may be 'resource' or 'folder'. $param is a sequence with any additional parameters.
  • $entry-data-param A sequence of additional parameters for the storing function.

In the first example, we know that there is only one XML file and we intend to process the XML in the script. Later examples store the file or files for later processing.

Extracting a single zipped file

edit
declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";

(:~
 : Entry filter callback for compression:unzip().
 : Accepts every entry of the archive, whatever its path or type.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip()
 : @param $param extra filter parameters (not used)
 : @return always true()
 :)
declare function fw:filter(
    $path as xs:string,
    $type as xs:string,
    $param as item()*
) as xs:boolean {
    fn:true()
};

(:~
 : Entry data callback for compression:unzip().
 : Hands the extracted content straight back to the caller.
 :
 : @param $path  path of the entry inside the zip file (not used)
 : @param $type  entry type as reported by compression:unzip() (not used)
 : @param $data  the extracted content of the entry, if any
 : @param $param extra parameters (not used)
 : @return $data, unchanged
 :)
declare function fw:process(
    $path as xs:string,
    $type as xs:string,
    $data as item()?,
    $param as item()*
) {
    $data
};

(: Download the zip named by the "uri" request parameter and return whatever
   the fw:process callback yields for each entry accepted by fw:filter. :)
let $callbackNs := "http://www.cems.uwe.ac.uk/xmlwiki/fw"
let $source := request:get-parameter("uri", "http://www.iso.org/iso/iso_3166-1_list_en.zip")
let $response := httpclient:get(xs:anyURI($source), true(), ())
let $archive := $response/httpclient:body/text()
let $accept := util:function(QName($callbackNs, "fw:filter"), 3)
let $handle := util:function(QName($callbackNs, "fw:process"), 4)
return
    compression:unzip($archive, $accept, (), $handle, ())

Execute

Sample XML Output

edit
<ISO_3166-1_List_en xml:lang="en">
   <ISO_3166-1_Entry>
      <ISO_3166-1_Country_name>AFGHANISTAN</ISO_3166-1_Country_name>
      <ISO_3166-1_Alpha-2_Code_element>AF</ISO_3166-1_Alpha-2_Code_element>
   </ISO_3166-1_Entry>
   <ISO_3166-1_Entry>
      <ISO_3166-1_Country_name>ÅLAND ISLANDS</ISO_3166-1_Country_name>
      <ISO_3166-1_Alpha-2_Code_element>AX</ISO_3166-1_Alpha-2_Code_element>
   </ISO_3166-1_Entry>
   ...
</ISO_3166-1_List_en>

How the Process Function Works

edit

The compression:unzip() function calls the process function for each component it finds in the zip archive. This is known as a callback function. You can place any valid XQuery code in the process function to do what you would like with each input file, such as listing or storing it.

For example the following process function will list all the items in a zip file, their path, their type and the root node if the item is an XML file.

(:~
 : Entry data callback that describes one entry of the zip file.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip()
 : @param $data  the extracted content of the entry, if any
 : @param $param extra parameters (not used)
 : @return an <item> element carrying the path and type; its text is the name
 :         of the root element when $data is a node, otherwise it is empty
 :)
declare function t:process($path as xs:string, $type as xs:string, $data as item()? , $param as item()*) {
    (: $data is declared item()?, so it may be an atomic value or empty.
       A path step on an atomic value is a type error, so only look for a
       root element when $data really is a node. :)
    let $rootName :=
        if ($data instance of node()) then name($data/*) else ()
    return
        <item path="{$path}" type="{$type}">{$rootName}</item>
};

Running this on an Office Open XML file returns the following:

<item path="[Content_Types].xml" type="resource">Types</item>
<item path="_rels/.rels" type="resource">Relationships</item>
<item path="word/_rels/document.xml.rels" type="resource">Relationships</item>
<item path="word/document.xml" type="resource">w:document</item>
<item path="word/theme/theme1.xml" type="resource">a:theme</item>
<item path="word/settings.xml" type="resource">w:settings</item>
<item path="word/fontTable.xml" type="resource">w:fonts</item>
<item path="word/webSettings.xml" type="resource">w:webSettings</item>
<item path="docProps/app.xml" type="resource">Properties</item>
<item path="docProps/core.xml" type="resource">cp:coreProperties</item>
<item path="word/styles.xml" type="resource">w:styles</item>

Storing the unzipped File

edit

You probably want to store the unzipped documents in the database. We can modify the process function to do this. We can use the $param argument (the fifth parameter of compression:unzip, which is passed on as the last parameter of the process function) to pass in the directory in which to store each file. In addition we need to create a collection to hold the unzipped files.


declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";

(:~
 : Entry filter callback for compression:unzip().
 : Lets every entry through so that all of them reach the process function.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip()
 : @param $param extra filter parameters (not used)
 : @return always true()
 :)
declare function fw:filter(
    $path as xs:string,
    $type as xs:string,
    $param as item()*
) as xs:boolean {
    fn:true()
};

(:~
 : Entry data callback that stores the extracted content in the database.
 :
 : @param $path  path of the entry inside the zip file; used as the resource name
 : @param $type  entry type as reported by compression:unzip() (not used)
 : @param $data  the extracted content of the entry
 : @param $param node(s) whose @directory attribute names the target collection
 : @return the result of xmldb:store()
 :)
declare function fw:process(
    $path as xs:string,
    $type as xs:string,
    $data as item()?,
    $param as item()*
) {
    let $targetCollection := $param/@directory
    return
        xmldb:store($targetCollection, $path, $data)
};

(: Download the zip named by the "uri" request parameter and store its entries
   in the collection named by the "dir" request parameter (default "temp")
   below /db/apps/zip/data/. The collection is created when it does not exist.
   NOTE: the admin credentials are hard-coded for the sake of the example. :)
let $dataRoot := "/db/apps/zip/data/"
let $source := request:get-parameter("uri", "http://www.iso.org/iso/iso_3166-1_list_en.zip")
let $targetName := request:get-parameter("dir", "temp")
let $archive := httpclient:get(xs:anyURI($source), true(), ())/httpclient:body/text()
let $callbackNs := "http://www.cems.uwe.ac.uk/xmlwiki/fw"
let $accept := util:function(QName($callbackNs, "fw:filter"), 3)
let $handle := util:function(QName($callbackNs, "fw:process"), 4)

let $session := xmldb:login("/db", "admin", "password")
let $target := concat($dataRoot, $targetName)
let $ensureTarget :=
    if (not(xmldb:collection-available($target)))
    then xmldb:create-collection($dataRoot, $targetName)
    else ()
return
    compression:unzip($archive, $accept, (), $handle, <param directory="{$target}"/>)

Unzipping a zip archive

edit

Zip files commonly contain multiple files. In particular Microsoft Word .docx and Excel .xlsx files are zipped collections of XML files which together define the document or spreadsheet.

When documents are stored in the eXist database, the mime type (media type) is inferred from the file suffix using the mime-types.xml file. Alternatively the mime type can be set explicitly when the document is stored.

We assume here that filenames in the zip file are simple. If there is a directory structure, this needs additional coding.

declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";

(:~
 : Entry filter callback for compression:unzip().
 : No entry is excluded: the function answers true() for each one.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip()
 : @param $param extra filter parameters (not used)
 : @return always true()
 :)
declare function fw:filter(
    $path as xs:string,
    $type as xs:string,
    $param as item()*
) as xs:boolean {
    fn:true()
};

(:~
 : Entry data callback that stores the extracted content in the database.
 :
 : The entry path is passed through xmldb:encode() first, so that names with
 : characters that are illegal in resource names (such as [Content_Types].xml)
 : can be stored. Entries whose encoded name ends in '.rels' are XML, so they
 : are stored with an explicit 'application/xml' mime type. Alternatively this
 : mime type could be added to the mime-types.xml configuration file.
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip() (not used)
 : @param $data  the extracted content of the entry
 : @param $param node(s) whose @directory attribute names the target collection
 : @return the result of xmldb:store()
 :)
declare function fw:process(
    $path as xs:string,
    $type as xs:string,
    $data as item()?,
    $param as item()*
) {
    let $targetCollection := $param/@directory
    let $resourceName := xmldb:encode($path)
    let $isRels := ends-with($resourceName, '.rels')
    return
        if (not($isRels))
        then xmldb:store($targetCollection, $resourceName, $data)
        else xmldb:store($targetCollection, $resourceName, $data, 'application/xml')
};

(: Download the zip named by the "uri" request parameter, store every entry in
   the collection named by the "dir" request parameter (default "temp") below
   /db/apps/zip/data/, and report each value returned by the process callback
   in a <file> element.
   NOTE: the admin credentials are hard-coded for the sake of the example. :)
let $dataRoot := "/db/apps/zip/data/"
let $source := request:get-parameter("uri", "http://www.iso.org/iso/iso_3166-1_list_en.zip")
let $targetName := request:get-parameter("dir", "temp")
let $archive := httpclient:get(xs:anyURI($source), true(), ())/httpclient:body/text()
let $callbackNs := "http://www.cems.uwe.ac.uk/xmlwiki/fw"
let $accept := util:function(QName($callbackNs, "fw:filter"), 3)
let $handle := util:function(QName($callbackNs, "fw:process"), 4)

let $session := xmldb:login("/db", "admin", "password")
let $target := concat($dataRoot, $targetName)
let $ensureTarget :=
    if (not(xmldb:collection-available($target)))
    then xmldb:create-collection($dataRoot, $targetName)
    else ()

let $stored := compression:unzip($archive, $accept, (), $handle, <param directory="{$target}"/>)
return
    element result {
        for $entry in $stored
        return element file { $entry }
    }

Zips with a directory structure

edit

Most zip files contain a directory tree of files. This directory structure needs to be recreated in the database as the files are unzipped. We can modify the process function to create database collections as necessary, assuming that higher directories are referenced before sub directories.

declare namespace fw = "http://www.cems.uwe.ac.uk/xmlwiki/fw";

(:~
 : Entry filter callback for compression:unzip().
 : Rejects entries that are not required: those whose path ends in ".bin".
 :
 : @param $path  path of the entry inside the zip file
 : @param $type  entry type as reported by compression:unzip() (not used)
 : @param $param extra filter parameters (not used)
 : @return false() for paths ending in ".bin", true() otherwise
 :)
declare function fw:filter(
    $path as xs:string,
    $type as xs:string,
    $param as item()*
) as xs:boolean {
    not(ends-with($path, ".bin"))
};

(:~
 : Entry data callback that recreates the directory structure of the zip file
 : below a base collection and stores each file in its matching collection.
 :
 : Folder entries only cause their collection to be created; nothing is stored
 : for them, because they have no file name and no content. Files at the top
 : level of the zip are stored directly in the base collection.
 :
 : @param $path  path of the entry inside the zip file, steps separated by "/"
 : @param $type  'folder' for directory entries; anything else is stored as a file
 : @param $data  the extracted content of the entry (empty for folders)
 : @param $param node(s) whose @collection attribute names the base collection
 : @return the result of xmldb:store() for files, the empty sequence for folders
 :)
declare function fw:process($path as xs:string,$type as xs:string, $data as item()? , $param as item()*) {
    let $baseCollection := string($param/@collection)
    let $isFolder := $type eq "folder"

    (: Drop empty steps, so that a folder path such as "word/" yields ("word")
       instead of ("word", ""). :)
    let $steps := tokenize($path, "/")[. ne ""]
    let $nsteps := count($steps)

    (: For a folder every step is part of the collection path; for a file the
       last step is the file name. :)
    let $collectionSteps :=
        if ($isFolder) then $steps else subsequence($steps, 1, $nsteps - 1)
    let $collection := string-join($collectionSteps, "/")

    (: A top-level file has no sub-collection; avoid a trailing "/" and avoid
       asking for a collection with an empty name to be created. :)
    let $fullCollection :=
        if ($collection eq "") then $baseCollection
        else concat($baseCollection, "/", $collection)
    let $mkdir :=
        if ($collection eq "" or xmldb:collection-available($fullCollection)) then ()
        else xmldb:create-collection($baseCollection, $collection)

    return
        if ($isFolder) then ()
        else xmldb:store($fullCollection, xmldb:encode($steps[$nsteps]), $data)
};

(: Download the zip named by the "path" request parameter, unzip it below the
   collection named by the "dir" request parameter (default "temp") inside
   /db/apps/zip/data/, and report each value returned by the process callback
   in a <file> element.
   NOTE: the admin credentials are hard-coded for the sake of the example. :)
let $dataRoot := "/db/apps/zip/data/"
let $source := request:get-parameter("path", "http://www.iso.org/iso/iso_3166-1_list_en.zip")
let $targetName := request:get-parameter("dir", "temp")

let $archive := httpclient:get(xs:anyURI($source), true(), ())/httpclient:body/text()

let $callbackNs := "http://www.cems.uwe.ac.uk/xmlwiki/fw"
let $accept := util:function(QName($callbackNs, "fw:filter"), 3)
let $handle := util:function(QName($callbackNs, "fw:process"), 4)

let $session := xmldb:login("/db", "admin", "password")
let $target := concat($dataRoot, $targetName)
let $ensureTarget :=
    if (not(xmldb:collection-available($target)))
    then xmldb:create-collection($dataRoot, $targetName)
    else ()

let $stored := compression:unzip($archive, $accept, (), $handle, <param collection="{$target}"/>)
return
    element result {
        for $entry in $stored
        return element file { $entry }
    }

Processing stored zip files

edit

It may be desirable to store the zip files in the database as binary resources before they are unzipped. By default files with a .zip suffix are stored as binary data. To store .docx and .xlsx files in eXist, you will need to add these suffixes to the entry in the $EXIST_HOME/mime-types.xml configuration file.

Change

       
    <mime-type name="application/zip" type="binary">
        <description>ZIP archive</description>
        <extensions>.zip</extensions>
    </mime-type>

to

       
    <mime-type name="application/zip" type="binary">
        <description>ZIP archive and Office Open XML</description>
        <extensions>.zip,.docx,.xlsx,.pptx</extensions>
    </mime-type>

You will need to reboot the server for this change to take effect.

The basic script remains the same with minor modifications

(: Location of the zip file, read from the "path" request parameter. :)
let $path := request:get-parameter("path","http://www.iso.org/iso/iso_3166-1_list_en.zip")
(: Value of the "dir" request parameter, "temp" when absent; presumably the
   target collection name, as in the earlier scripts. :)
let $unzipCollection := request:get-parameter("dir","temp")

(: A $path starting with "http" is fetched with httpclient:get();
   any other value is read with util:binary-doc(). :)
let $zip := 
   if (starts-with($path,"http"))
   then httpclient:get(xs:anyURI($path), true(), ())/httpclient:body/text()
   else util:binary-doc($path)