XQuery/Unzipping an Office Open XML docx file
Motivation
You want to uncompress a docx file.
Method
We will use the compression:unzip() function used in the prior example and pass it a local function that handles the extraction of each entry.
File Names
Some file names in docx files, such as '[Content_Types].xml', are not valid URIs, so these files must be renamed to names that are valid URIs.
Here is a typical list of the path names in a docx file:
<item path="[Content_Types].xml" type="resource">Types</item>
<item path="_rels/.rels" type="resource">Relationships</item>
<item path="word/_rels/document.xml.rels" type="resource">Relationships</item>
<item path="word/document.xml" type="resource">w:document</item>
<item path="word/theme/theme1.xml" type="resource">a:theme</item>
<item path="word/settings.xml" type="resource">w:settings</item>
<item path="word/fontTable.xml" type="resource">w:fonts</item>
<item path="word/webSettings.xml" type="resource">w:webSettings</item>
<item path="docProps/app.xml" type="resource">Properties</item>
<item path="docProps/core.xml" type="resource">cp:coreProperties</item>
<item path="word/styles.xml" type="resource">w:styles</item>
Note that three top-level subfolders are created (_rels, word and docProps), and that the word folder has nested subfolders of its own (word/_rels and word/theme). The XML files are stored in these folders.
unzip-docx function
The following function is used to unzip a docx file. This function name must be passed as a parameter to the unzip function to tell it what to do with each entry in the docx file.
Note that you must pass in parameters to this function from the calling function.
unzip-docx function:
(:~
 : Entry-data callback handed to compression:unzip() for a docx archive.
 :
 : In "list" mode (a param element with name="list" and value="true") nothing
 : is stored; the entry is only described. Otherwise the entry is stored under
 :   {base-collection}{zip-name}_{zip-extension}_parts/{folder of the entry}
 : and the file name is passed through xmldb:encode() so that names which are
 : not valid URIs (e.g. [Content_Types].xml) can be stored.
 :
 : @param $path      path of the entry inside the archive, e.g. word/document.xml
 : @param $data-type entry type as reported by the unzip function
 : @param $data      content of the entry
 : @param $param     param elements carrying @name/@value pairs; the names read
 :                   here are "list", "base-collection" and "zip-filename".
 :                   The base collection is concatenated as-is, so it is
 :                   expected to end with "/".
 : @return an item element in list mode, otherwise a result element giving the
 :         entry path and the database path it was stored to
 :)
declare function local:unzip-docx($path as xs:string, $data-type as xs:string, $data as item()?, $param as item()*) {
    if ($param[@name eq 'list']/@value eq 'true') then
        <item path="{$path}" data-type="{$data-type}"/>
    else
        let $base-collection := $param[@name = "base-collection"]/@value/string()
        let $zip-filename := $param[@name = "zip-filename"]/@value/string()
        (: e.g. hello-world.docx becomes hello-world_docx_parts :)
        let $zip-collection :=
            concat(
                functx:substring-before-last($zip-filename, '.'),
                '_',
                functx:substring-after-last($zip-filename, '.'),
                '_parts'
            )
        (: empty string for entries that sit at the root of the archive :)
        let $inner-collection := functx:substring-before-last($path, '/')
        (: join only the non-empty segments so that root-level entries do not
           produce a trailing slash here and a double slash in the destination :)
        let $relative-collection := string-join(($zip-collection, $inner-collection[. ne '']), '/')
        let $target-collection := concat($base-collection, $relative-collection)
        (: we need to encode the filename to account for filenames with illegal
           characters like [Content_Types].xml :)
        let $filename :=
            xmldb:encode(
                if (contains($path, '/')) then functx:substring-after-last($path, '/') else $path
            )
        let $mkdir :=
            if (xmldb:collection-available($target-collection)) then ()
            else xmldb:create-collection($base-collection, $relative-collection)
        let $store :=
            (: ensure mimetype is set properly for .docx rels files :)
            if (ends-with($filename, '.rels')) then
                xmldb:store($target-collection, $filename, $data, 'application/xml')
            else
                xmldb:store($target-collection, $filename, $data)
        return
            <result object="{$path}" destination="{concat($target-collection, '/', $filename)}"/>
};
unzip function
(:~
 : Lists or extracts the entries of a zip (docx) file stored in the database.
 :
 : @param $base-collection collection holding the zip file; it is concatenated
 :                         directly with $zip-filename, so it must end with "/"
 : @param $zip-filename    name of the zip resource inside $base-collection
 : @param $action          'list' to describe the entries only, 'unzip' to store them
 : @return an error element for an unknown action, otherwise a results element
 :         wrapping whatever compression:unzip() returned
 :)
declare function local:unzip($base-collection as xs:string, $zip-filename as xs:string, $action as xs:string) {
    if (not($action = ('list', 'unzip'))) then
        <error>Invalid action</error>
    else
        (: Log in before touching the archive so that the read below runs with
           the same credentials as the stores performed during extraction.
           NOTE(review): hard-coded admin account with an empty password;
           replace with real credentials before using outside a sandbox. :)
        let $login := xmldb:login('/db', 'admin', '')
        let $file := util:binary-doc(concat($base-collection, $zip-filename))
        (: local:unzip-entry-filter is not defined in this section; it must be
           declared elsewhere in the same query :)
        let $entry-filter := util:function(QName("local", "local:unzip-entry-filter"), 3)
        let $entry-filter-params := ()
        let $entry-data := util:function(QName("local", "local:unzip-docx"), 4)
        (: parameters handed through to local:unzip-docx for every entry :)
        let $entry-data-params :=
            (
                if ($action eq 'list') then <param name="list" value="true"/> else (),
                <param name="base-collection" value="{$base-collection}"/>,
                <param name="zip-filename" value="{$zip-filename}"/>
            )
        let $unzip := compression:unzip($file, $entry-filter, $entry-filter-params, $entry-data, $entry-data-params)
        return
            <results action="{$action}">{$unzip}</results>
};
Sample Driver
(: Sample driver: extracts /db/test/hello-world.docx.
   The collection path must end with "/" because local:unzip concatenates it
   directly with the file name. :)
let $collection := '/db/test/'
let $zip-filename := 'hello-world.docx'
let $action := 'unzip' (: valid actions: 'list', 'unzip' :)
return
    local:unzip($collection, $zip-filename, $action)