XQuery/TEI Concordance

Motivation

You want to build a multilingual concordance from parallel texts that are already in TEI format (see http://www.tei-c.org/).

Architecture

There are three steps in this example:

  1. preprocessing the texts to enable easier indexing, which is done in XSLT 2.0;
  2. querying the text to return a tei:entry (see http://www.tei-c.org/release/doc/tei-p5-doc/en/html/DI.html), which is done in XQuery;
  3. processing the tei:entry into HTML, which is done in XSLT 1.0 in the browser.

In this particular example the languages in use are English and te reo Māori. The example assumes that structural elements carry 'n' attributes whose values are URLs pointing to the original source of the data.

Preprocessing the text

This stylesheet splits the text into words (tei:w, see http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-w.html) and records on each word a 'lemma' (a normalised form of the word) and the language it is in. These values are pre-calculated so that indexes can be built on them.

<?xml version="1.0"?>
<xsl:stylesheet version="2.0"
		xmlns="http://www.tei-c.org/ns/1.0"
		xmlns:tei="http://www.tei-c.org/ns/1.0"
		xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output indent="yes"/>

  <!-- This is a simple stylesheet that inserts word tags around words
  (and implicitly defines what those words are) -->

  <!-- Character maps for translate(): each character of $upper is replaced by
       the character at the same position in $lowernormal. Upper-case letters
       are lower-cased and macronised vowels (either case) become plain
       lower-case vowels. Both strings are 36 characters long and must stay
       aligned position for position. -->
  <xsl:variable name="lowernormal" select="'qwertyuiopasdfghjklzxcvbnmaeiouaeiou'"/>
  <xsl:variable name="upper"       select="'QWERTYUIOPASDFGHJKLZXCVBNMĀĒĪŌŪāēīōū'"/>

  <!-- Character sets that are not referenced by any template in this
       stylesheet; kept for stylesheets that import or include this one. -->
  <xsl:variable name="drop" select="'{}()*'"/>
  <xsl:variable name="punctuation" select="'.:;,!?'"/>
  <!-- A select value has to be a quoted XPath string literal: written
       unquoted as ('.:;,!?')*() it is parsed as a multiplication by the
       empty sequence. NOTE(review): the intended pattern is presumably
       "a run of the punctuation characters above"; confirm before use. -->
  <xsl:variable name="regexp" select="'[.:;,!?]*'"/>


  <!-- Lowest-priority fallback: copy any node or attribute that no other
       rule claims, and keep recursing into its attributes and children. -->
  <xsl:template match="node() | @*" priority="-1">
    <xsl:copy>
      <xsl:apply-templates select="node() | @*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Tokenise every non-blank text node. Each maximal run of letters and
       digits is wrapped in a tei:w element carrying:
         xml:id   - a generated identifier, unique within the transformed
                    document, so that a single word can be pointed at
                    (for example from a corresp attribute);
         xml:lang - the language inherited from the nearest ancestor that
                    has a non-blank xml:lang;
         lemma    - the normalised form of the word.
       Everything between words (spaces, punctuation) is copied unchanged. -->
  <xsl:template match="text()[normalize-space()]">
    <xsl:variable name="orig" select="."/>
    <!-- ancestor:: is a reverse axis, so [1] is the closest ancestor. -->
    <xsl:variable name="lang" select="$orig/ancestor::*[normalize-space(@xml:lang)][1]/@xml:lang"/>

    <!-- regex is an attribute value template, so literal braces are doubled;
         the effective expression is [\p{L}\p{N}]+ -->
    <xsl:analyze-string select="." regex="[\p{{L}}\p{{N}}]+">
      <xsl:matching-substring>

        <!-- Fold case and macrons with translate(), then let the "normal"
             template collapse runs of repeated letters. -->
        <xsl:variable name="normalised">
          <xsl:call-template name="normal">
            <xsl:with-param name="string" select="translate(., $upper, $lowernormal)"/>
          </xsl:call-template>
        </xsl:variable>

        <xsl:element name="w" namespace="http://www.tei-c.org/ns/1.0">
          <!-- generate-id() identifies the source text node; position() is
               the index of this substring among all matching and
               non-matching substrings of that node, so the pair is unique. -->
          <xsl:attribute name="xml:id">
            <xsl:value-of select="concat('w.', generate-id($orig), '.', position())"/>
          </xsl:attribute>
          <xsl:attribute name="xml:lang"><xsl:value-of select="$lang"/></xsl:attribute>
          <xsl:attribute name="lemma"><xsl:value-of select="$normalised"/></xsl:attribute>
          <xsl:value-of select="."/>
        </xsl:element>

      </xsl:matching-substring>
      <xsl:non-matching-substring>
        <xsl:value-of select="."/>
      </xsl:non-matching-substring>
    </xsl:analyze-string>
  </xsl:template>

  <!-- Named template "normal".
       Parameter: string - the text to normalise.
       Output: the characters of $string in order, except that a character
       is skipped whenever the character immediately after it compares equal;
       a run of identical characters therefore yields one character.
       Works one character at a time by calling itself on the remainder. -->
  <xsl:template name="normal">
    <xsl:param name="string"/>

    <xsl:if test="string-length($string) != 0">
      <xsl:variable name="head" select="substring($string, 1, 1)"/>
      <!-- Empty string when $head is the last character, which never
           compares equal to $head, so the last character is always kept. -->
      <xsl:variable name="next" select="substring($string, 2, 1)"/>
      <xsl:if test="compare($head, $next) != 0">
        <xsl:value-of select="$head"/>
      </xsl:if>
      <xsl:call-template name="normal">
        <xsl:with-param name="string" select="substring($string, 2)"/>
      </xsl:call-template>
    </xsl:if>
  </xsl:template>
  
</xsl:stylesheet>

Querying the text

The query builds a single <tei:entry> element containing multiple <tei:cit> elements, one for each hit. A processing instruction is used to associate the resulting TEI with a stylesheet.

xquery version "1.0";

(:
 : Concordance query.
 :
 : Request parameters (read with request:get-parameter):
 :   kupu   - lemma to look up                 (default 'mohio')
 :   reo    - xml:lang of the words to match   (default 'mi')
 :   kotahi - 1-based position of the first hit to return (default 1)
 :
 : Result: an xml-stylesheet processing instruction followed by a TEI
 : document whose single <entry> holds one <cit> per hit. The entry's n
 : attribute is the position at which the following page of hits starts.
 :)
declare default element namespace "http://www.tei-c.org/ns/1.0";
declare option exist:serialize "method=xml media-type=application/xml process-xsl-pi=yes indent=yes";

let $target := 'xml-stylesheet',
    $content := 'href="teiresults2htmlresults.xsl" type="text/xsl" '

(: The parentheses make explicit that both items form the returned sequence. :)
return (
  processing-instruction {$target} {$content},
  document {
<TEI>
  <teiHeader>
     <!-- substantial header information needs to go here to be well formed TEI -->
  </teiHeader>
  <text>
    <body>
      <div> {
        let $collection := '/db/kupu/korero',
            $q := request:get-parameter('kupu', 'mohio'),
            $lang := request:get-parameter('reo', 'mi'),
            $requested := request:get-parameter('kotahi', 1),
            (: Fall back to the first hit when kotahi is not a single number
               instead of failing the whole request with a cast error. :)
            $first := if ($requested castable as xs:decimal)
                      then xs:decimal($requested)
                      else 1,
            $pageSize := 25,
            $last := $first + $pageSize
        return
          <entry xml:lang="{$lang}" n="{$last}">
            <form>
              <orth>{$q}</orth>
            </form>{
              (: The third argument of subsequence() is a length, not an end
                 position, so the page size is passed rather than $last. :)
              for $word in subsequence(collection($collection)//w[@lemma=$q][@xml:lang=$lang], $first, $pageSize)
              (: Nearest enclosing element that carries an n attribute. :)
              let $this := $word/ancestor::*[@n][1]
              let $url := $this/@n
              (: Element whose xml:id is referenced by $this/@corresp; the
                 literal text "no corresp" is emitted when there is no such
                 attribute. :)
              let $that :=
                if ($this/@corresp)
                then $this/../../*/*[concat('#', @xml:id) = $this/@corresp]
                else "no corresp"
              return
                <cit n="{$url}" corresp="#{$word/@xml:id}">
                  {$this}

                  {$that}
                </cit>
            }</entry>
      }
      </div>
    </body>
  </text>
</TEI>
  }
)


Transformation to HTML

The TEI is transformed into HTML in the browser following the processing instruction:

<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
		xmlns:html="http://www.w3.org/1999/xhtml"
		xmlns:tei="http://www.tei-c.org/ns/1.0"
		xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  
  <xsl:output indent="yes"/>

    <!-- Values taken once from the result document: the headword (text of the
         first tei:orth) and the language of the first tei:entry. -->
    <xsl:variable name="title" select="string(//tei:orth/text())"/>
    <xsl:variable name="lang" select="string(//tei:entry/@xml:lang)"/>

  <!-- Lowest-priority fallback: anything without a dedicated rule is copied
       to the output as it stands, attributes and children included. -->
  <xsl:template match="node() | @*" priority="-1">
    <xsl:copy>
      <xsl:apply-templates select="node() | @*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Document rule: builds the HTML page skeleton, titles it with the
       headword and renders the entry found in the TEI body. -->
  <xsl:template match="/">
    <html:html>
      <xsl:attribute name="xml:lang"><xsl:value-of select="$lang"/></xsl:attribute>
      <html:head>
        <html:title><xsl:value-of select="$title"/></html:title>
        <html:meta property="dc:title" xml:lang="{$lang}" content="{$title}"/>
      </html:head>
      <html:body>
        <xsl:attribute name="xml:lang"><xsl:value-of select="$lang"/></xsl:attribute>
        <!-- The context node is the document root, so a relative path
             reaches the same entry as an absolute one. -->
        <xsl:apply-templates select="tei:TEI/tei:text/tei:body/tei:div/tei:entry"/>
      </html:body>
    </html:html>
  </xsl:template>

  <!-- Produce nothing for xml:id attributes, so the copy-everything fallback
       never carries them over into the HTML. -->
  <xsl:template match="@xml:id"></xsl:template>

  <!-- Renders one result entry: page headings, the list of citations and a
       "Panuku" link that requests further hits. -->
  <xsl:template match="tei:entry">
    <!-- Link target for the "Panuku" link: same language and headword, with
         the entry's n attribute passed back as the kotahi parameter. -->
    <xsl:variable name="url" select="concat('kupu.xql?reo=', @xml:lang, '&amp;kupu=', tei:form/tei:orth/text(), '&amp;kotahi=', @n)"/>

    <html:h2 xml:lang="mi">He Kupu Tawhito</html:h2>
    <html:h1 xml:lang="mi">Kupu matua: <html:span class="hit-word" style="font-style: italic" xml:lang="{$lang}"><xsl:value-of select="$title"/></html:span></html:h1>
    <html:div>
      <xsl:apply-templates select="tei:cit"/>
    </html:div>

    <html:div>
      <html:p>
        <html:a href="{$url}" style="font-style: italic">Panuku</html:a>
      </html:p>
    </html:div>
  </xsl:template>
  

  <!-- Each citation becomes its own block, closed off by a horizontal rule. -->
  <xsl:template match="tei:cit">
    <html:div>
      <!-- No select: processes all child nodes, attributes excluded. -->
      <xsl:apply-templates/>
    </html:div>
    <html:hr/>
  </xsl:template>

  <!-- Renders a TEI paragraph as a block followed by a link to its source.
       The n attribute is used as the link target. The link carries visible
       text (an empty anchor renders nothing, leaving the source unreachable)
       and is left out when the paragraph has no usable n attribute, which
       would otherwise yield a link back to the current page. -->
  <xsl:template match="tei:p">
    <html:div>
      <xsl:apply-templates select="node()"/>
      <xsl:if test="normalize-space(@n)">
        <!-- Keep the link visually separate from the paragraph text. -->
        <xsl:text> </xsl:text>
        <!-- title replaces alt, which is not an attribute of the a element. -->
        <html:a href="{@n}" title="ko te tohutoro" style="font-style: italic">ko te tohutoro</html:a>
      </xsl:if>
    </html:div>
  </xsl:template>

  <!-- Turns every word into a link that looks up the word's own lemma in the
       word's own language. The word whose id matches the corresp attribute
       of its grandparent element is additionally wrapped in an italic
       "hit-word" span. -->
  <xsl:template match="tei:w">
    <xsl:variable name="url" select="concat('kupu.xql?reo=', @xml:lang, '&amp;kupu=', @lemma)"/>
    <xsl:variable name="isHit" select="concat('#', @xml:id) = ../../@corresp"/>
    <xsl:choose>
      <xsl:when test="$isHit">
        <html:span class="hit-word" style="font-style: italic">
          <html:a href="{$url}" alt=""><xsl:apply-templates/></html:a>
        </html:span>
      </xsl:when>
      <xsl:otherwise>
        <html:a href="{$url}"><xsl:apply-templates/></html:a>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
  
</xsl:stylesheet>