#!/usr/bin/perl
# extract rhetoricaltext environment from XML
# THIS FILE NEEDS TO BE SAVED AS UNICODE!!! (UTF-8)
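# usage: extractdoc xmlfile [docid]
#   With docid:    print that document's segments as a LaTeX rhetoricaltext environment.
#   Without docid: print the ids of all documents found in xmlfile.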
use strict;
use warnings;

# Extract one <document> from an XML file and print its <segment>s as a LaTeX
# "rhetoricaltext" environment, or list all document ids when no docid is given.
#
# NOTE(review): the opening-tag portions of the three tag regexes below were
# reconstructed from the capture-group numbers the code relied on; the
# attribute name "id" is inferred from the variable names -- confirm against
# a real input file.
#
# Deliberately no "use utf8" and no I/O encoding layers: the umlaut literals
# in this file and the slurped XML are both handled as raw UTF-8 bytes, so
# they compare byte-for-byte and all other bytes pass through unchanged.

my ($fn, $docid) = @ARGV;
die "usage: $0 xmlfile [docid]\n" unless defined $fn;

warn "loading $fn\n";

# Slurp the whole file; the lexical handle is closed when the block ends.
my $xml = do {
    open my $in, '<', $fn or die "can't open $fn: $!";
    local $/;
    <$in>;
};
$xml = '' unless defined $xml;

# Ordered (pattern => LaTeX replacement) pairs.  Each pattern accepts the
# literal character or its HTML entity.
my @latex_for = (
    [ qr/(?:Ä|&Auml;)/  => '\\"{A}' ],
    [ qr/(?:Ö|&Ouml;)/  => '\\"{O}' ],
    [ qr/(?:Ü|&Uuml;)/  => '\\"{U}' ],
    [ qr/(?:ä|&auml;)/  => '\\"{a}' ],
    [ qr/(?:ö|&ouml;)/  => '\\"{o}' ],
    [ qr/(?:ü|&uuml;)/  => '\\"{u}' ],
    # Sharp s is a letter of its own in LaTeX, not an accented "ss".
    [ qr/(?:ß|&szlig;)/ => '{\\ss}' ],
);

if (!defined $docid || $docid eq '') {
    print "printing all document ids: \n";
    while ($xml =~ /<document\s+id\s*=\s*"?([^">]+)"?\s*>/gi) {
        print "$1\n";
    }
}
else {
    # \Q..\E: the id is user input and must match literally.
    my ($doc) = $xml =~ m{
        <document\s+id\s*=\s*"?\Q$docid\E"?\s*>
        (.*?)
        </\s*document\s*>
    }xis;

    if (!defined $doc) {
        # Keep emitting the (empty) environment as before, but say why.
        warn "document '$docid' not found in $fn\n";
        $doc = '';
    }

    print '\begin{rhetoricaltext}\source{' . "$docid}\n";

    while ($doc =~ m{
            <segment\s+id\s*=\s*"?([^">]+)"?[^>]*>
            (.*?)
            </\s*segment\s*>
        }xgis)
    {
        # Copy the captures at once; the substitutions below reset $1/$2.
        my ($segid, $seg) = ($1, $2);

        # Drop the "docid." prefix so only the local segment id remains.
        $segid =~ s/\Q$docid\E\.?//;

        # Remove <sign ...> / </sign> markup, keeping the enclosed text.
        $seg =~ s{</?sign[^>]*>}{}gi;

        for my $pair (@latex_for) {
            my ($pattern, $latex) = @{$pair};
            $seg =~ s/$pattern/$latex/g;
        }

        # Skip empty segments only; a segment whose text is "0" is kept.
        next unless length $seg;

        print '\unit[' . $segid . ']{' . $seg . "}\n";
    }

    print '\end{rhetoricaltext}' . "\n";
}