XQuery/Unzipping an Office Open XML docx file

Motivation edit

You want to uncompress a docx file.

Method edit

We will use the compression:unzip() function used in the prior example and pass it a local version of the function that handles the uncompression.

File Names edit

Some file names in docx files such as '[Content_Types].xml' are not valid URIs. So these must be renamed to files with valid URIs.

Here is a typical list of the path names in a docx file:

<item path="[Content_Types].xml" type="resource">Types</item>
<item path="_rels/.rels" type="resource">Relationships</item>
<item path="word/_rels/document.xml.rels" type="resource">Relationships</item>
<item path="word/document.xml" type="resource">w:document</item>
<item path="word/theme/theme1.xml" type="resource">a:theme</item>
<item path="word/settings.xml" type="resource">w:settings</item>
<item path="word/fontTable.xml" type="resource">w:fonts</item>
<item path="word/webSettings.xml" type="resource">w:webSettings</item>
<item path="docProps/app.xml" type="resource">Properties</item>
<item path="docProps/core.xml" type="resource">cp:coreProperties</item>
<item path="word/styles.xml" type="resource">w:styles</item>

Note that there are three top-level subfolders created (_rels, word and docProps), and that word contains further subfolders of its own (word/_rels and word/theme). The XML files are stored in these folders.

unzip-docx function edit

The following function is used to unzip a docx file. This function's name must be passed as a parameter to the unzip function to tell it what to do with each entry in the docx file.

Note that you must pass in parameters to this function from the calling function.

unzip-docx function:

(:~
 : Entry-data callback for compression:unzip(): lists or stores one entry of a docx archive.
 :
 : In list mode (a param named 'list' with value 'true') nothing is written and an
 : <item/> describing the entry is returned. Otherwise the entry is stored under
 :   {base-collection}{zip name}_{zip extension}_parts/{folder of the entry}
 : and a <result/> element reports where it went.
 :
 : @param $path       path of the entry inside the archive, e.g. 'word/document.xml'
 : @param $data-type  entry type reported by compression:unzip() ('resource' or 'folder')
 : @param $data       content of the entry; not used in list mode
 : @param $param      <param name="..." value="..."/> elements supplied by the caller:
 :                    'list', 'base-collection' (expected to end with '/') and 'zip-filename'
 : @return an <item/> element in list mode, otherwise a <result/> element
 :)
declare function local:unzip-docx($path as xs:string, $data-type as xs:string, $data as item()?, $param as item()*) {
    if ($param[@name eq 'list']/@value eq 'true') then
        <item path="{$path}" data-type="{$data-type}"/>
    else 
        let $base-collection := $param[@name = "base-collection"]/@value/string()
        let $zip-filename := $param[@name = "zip-filename"]/@value/string()
        (: e.g. 'hello-world.docx' becomes 'hello-world_docx_parts' :)
        let $zip-collection := 
            concat(
                functx:substring-before-last($zip-filename, '.'),
                '_',
                functx:substring-after-last($zip-filename, '.'),
                '_parts'
            )
        (: empty string for entries that sit at the root of the archive :)
        let $inner-collection := functx:substring-before-last($path, '/')
        (: join only the non-empty parts so that root-level entries do not produce a
           trailing '/' on the collection and a '//' in the reported destination :)
        let $relative-collection := string-join(($zip-collection, $inner-collection[. ne '']), '/')
        let $target-collection := concat($base-collection, $relative-collection)
        let $raw-filename := if (contains($path, '/')) then functx:substring-after-last($path, '/') else $path
        (: we need to encode the filename to account for filenames with illegal characters like [Content_Types].xml :)
        let $filename := xmldb:encode($raw-filename)
        (: folder entries have no name or content to store; only their collection is created :)
        let $is-folder := $data-type eq 'folder'
        let $mkdir := 
            if (xmldb:collection-available($target-collection)) then () 
            else xmldb:create-collection($base-collection, $relative-collection)
        let $store := 
            if ($is-folder) then
                ()
            (: ensure mimetype is set properly for .docx rels files :)
            else if (ends-with($filename, '.rels')) then 
                xmldb:store($target-collection, $filename, $data, 'application/xml')
            else
                xmldb:store($target-collection, $filename, $data)
        let $destination :=
            if ($is-folder) then $target-collection
            else concat($target-collection, '/', $filename)
        return 
            <result object="{$path}" destination="{$destination}"/>
};

unzip function edit

(:~
 : Lists or unpacks a docx (zip) file stored in the database.
 :
 : @param $base-collection  collection holding the zip file; it is concatenated directly
 :                          with $zip-filename, so it must end with '/'
 : @param $zip-filename     name of the zip resource inside $base-collection
 : @param $action           'list' to only list the entries, 'unzip' to store them
 : @return a <results/> element wrapping whatever local:unzip-docx returned for each entry,
 :         or an <error/> element if the action is invalid or the file cannot be read
 :)
declare function local:unzip($base-collection as xs:string, $zip-filename as xs:string, $action as xs:string) {
    if (not($action = ('list', 'unzip'))) then
        <error>Invalid action</error>
    else
        (: Log in before touching the database so that reading the archive also runs with
           these credentials.
           NOTE(review): hard-coded admin account with an empty password; replace with
           real credentials before using this outside of a test database. :)
        let $login := xmldb:login('/db', 'admin', '')
        let $zip-path := concat($base-collection, $zip-filename)
        let $file := util:binary-doc($zip-path)
        return
            if (empty($file)) then
                (: nothing was read, so there is nothing to hand to compression:unzip() :)
                <error>File not found: {$zip-path}</error>
            else
                let $entry-filter := util:function(QName("local", "local:unzip-entry-filter"), 3)
                let $entry-filter-params := ()
                let $entry-data := util:function(QName("local", "local:unzip-docx"), 4)
                let $entry-data-params := 
                    (
                    if ($action eq 'list') then <param name="list" value="true"/> else (), 
                    <param name="base-collection" value="{$base-collection}"/>,
                    <param name="zip-filename" value="{$zip-filename}"/>
                    )
                (: the filter is consulted for each entry; accepted entries go to local:unzip-docx :)
                let $unzip := compression:unzip($file, $entry-filter, $entry-filter-params, $entry-data, $entry-data-params)
                return 
                    <results action="{$action}">{$unzip}</results>
};

Sample Driver edit

(: Sample invocation: process hello-world.docx stored in /db/test/.
   The collection path must end with '/' because it is concatenated with the file name. :)
let $docx-collection := '/db/test/'
let $docx-name := 'hello-world.docx'
(: valid actions: 'list', 'unzip' :)
let $mode := 'unzip'
return
    local:unzip($docx-collection, $docx-name, $mode)