#!/usr/bin/perl
# Extract one document from an XML file and print it as a LaTeX
# rhetoricaltext environment.
#
# usage: extractdoc xmlfile [docid]
#
# Without a docid, the ids of all documents in the file are listed instead.
#
# The input is processed as raw bytes. German umlauts and sharp s are matched
# through explicit UTF-8 byte escapes, so the script works regardless of the
# encoding this source file itself is saved in.
#
# NOTE(review): the opening-tag portion of the three XML patterns (and the
# file read operator) were unreadable in the copy this revision was made
# from. They were reconstructed from the capture-group numbers and the
# closing tags; the attribute name "id" in particular is an assumption --
# confirm against the real corpus format.

use strict;
use warnings;

# [ pattern, LaTeX replacement ] pairs, applied in order to every segment.
# NOTE(review): the "&Xuml;" entity alternatives are reconstructed (the
# reviewed copy showed the character itself twice) -- confirm.
my @LATEX_FOR = (
    [ qr/\xC3\x84|&Auml;/  => '\\"{A}' ],    # A-umlaut
    [ qr/\xC3\x96|&Ouml;/  => '\\"{O}' ],    # O-umlaut
    [ qr/\xC3\x9C|&Uuml;/  => '\\"{U}' ],    # U-umlaut
    [ qr/\xC3\xA4|&auml;/  => '\\"{a}' ],    # a-umlaut
    [ qr/\xC3\xB6|&ouml;/  => '\\"{o}' ],    # o-umlaut
    [ qr/\xC3\xBC|&uuml;/  => '\\"{u}' ],    # u-umlaut
    # Sharp s has its own LaTeX command; the previous \"{ss} placed an
    # umlaut accent over "ss".
    [ qr/\xC3\x9F|&szlig;/ => '{\\ss}' ],
);

# Return the whole content of file $path as one string; dies if the file
# cannot be opened.
sub slurp_file {
    my ($path) = @_;

    # Three-argument open: the file name can never be read as a mode or pipe.
    open my $in, '<', $path or die "can't open $path: $!";
    my $content = do { local $/; <$in> };
    close $in;

    return defined $content ? $content : '';
}

# Return the ids of all <document id=...> opening tags in $xml, in file order.
sub document_ids {
    my ($xml) = @_;

    my @ids;
    while ($xml =~ /<\s*document\s+id\s*=\s*"?([^">]+)"?\s*>/isg) {
        push @ids, $1;
    }
    return @ids;
}

# Return the text between the opening tag of document $docid and the next
# </document>, or undef when $xml holds no such document.
sub document_body {
    my ($xml, $docid) = @_;

    # \Q..\E: the id is matched literally, not as a regex fragment.
    if ($xml =~ /<\s*document\s+id\s*=\s*"?\Q$docid\E"?\s*>(.*?)<\/\s*document\s*>/is) {
        return $1;
    }
    return undef;
}

# Return $text with every pattern in @LATEX_FOR replaced by its LaTeX form.
sub latex_escape {
    my ($text) = @_;

    for my $rule (@LATEX_FOR) {
        my ($pattern, $latex) = @{$rule};
        $text =~ s/$pattern/$latex/g;
    }
    return $text;
}

# Return the complete rhetoricaltext environment for document $docid as a
# string, or undef when the document is not present in $xml.
sub render_document {
    my ($xml, $docid) = @_;

    my $doc = document_body($xml, $docid);
    return undef unless defined $doc;

    my $out = '\begin{rhetoricaltext}\source{' . "$docid}\n";

    while ($doc =~ /<\s*segment\s+id\s*=\s*"?([^">]+)"?[^>]*>(.*?)<\/\s*segment\s*>/isg) {
        my ($segid, $seg) = ($1, $2);

        # Shorten "<docid>.<n>" style segment ids to "<n>".
        $segid =~ s/\Q$docid\E\.?//;

        # Drop <sign ...> / </sign> markup but keep the enclosed text.
        $seg =~ s/<\/?sign[^>]*>//isg;
        $seg = latex_escape($seg);

        # Test the length, not the truth value, so a segment whose whole
        # text is "0" is still emitted.
        next unless length $seg;
        $out .= '\unit[' . $segid . ']{' . $seg . "}\n";
    }

    $out .= '\end{rhetoricaltext}' . "\n";
    return $out;
}

# Command-line entry point; returns the process exit status.
sub main {
    my ($fn, $docid) = @_;

    die "usage: extractdoc xmlfile [docid]\n"
        unless defined $fn && length $fn;

    warn "loading $fn\n";
    my $xml = slurp_file($fn);

    if (!defined $docid || $docid eq '') {
        print "printing all document ids: \n";
        print "$_\n" for document_ids($xml);
        return 0;
    }

    my $latex = render_document($xml, $docid);
    die "document '$docid' not found in $fn\n" unless defined $latex;

    print $latex;
    return 0;
}

# Run only when executed as a script, not when loaded with require/do.
exit main(@ARGV) unless caller;

1;