#!/usr/bin/perl #330 $skip = " mwsj_0681 mwsj_1377 mwsj_1362 mwsj_1355 mwsj_1322 mwsj_1318 mwsj_1192 mwsj_1170 mwsj_1138 "; $direct = '/Volumes/dr/Documents/RST/corpus/marcu/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/ALL/*.dis'; $direct = '../../corpus/marcu/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/ALL/*.dis'; $doccount=0; %relations=(); open TEMPFILE, ">/tmp/conv.tmp"; @f = @ARGV; unless (@f) {@f = glob($direct); } warn @f." documents to be read."; foreach(@f) { warn $_; &readfile($_); print TEMPFILE $doctext; $doctext = ""; } close TEMPFILE; &print_header; system 'cat /tmp/conv.tmp'; print ""; warn $doccount. " docs converted."; sub print_header { print '
'; foreach(keys %relations) { if ($relations{$_} == -1) { print "\n"; } else { print "\n"; } } print "
\n"; } sub readfile { my $file = $_[0]; my $docid; if ($file =~ /\/(wsj_[0-9]+)\./i) { $docid = $1; } else { $docid = $doccount; } $doccount++; $idprefix = $docid.'_'; if (index($skip, " m$docid ")>0) { warn "skipped $docid"; $doctext = ''; } else { my $txt = ''; open IN, "<$file" || die("file error"); while() { chop $_; $txt .= $_; } close IN; $indent=0; $idcount = 1; $level=0; $relationtext = ''; $segmenttext = ''; parseroot($txt); $segmenttext =~ s/

/

/isg; $doctext.= "\n$segmenttext\\n\n$relationtext\n"; } } sub parse { my $t = $_[0]; # print "tok: $_ \n"; my @list = &tokenizelisplist($t); if (scalar(@list) == 0) { print " " x $indent . "T: ".$terminal."\n"; } else { $indent +=4; evaluate_rst_list(@list); $indent -=4; } } sub parseroot { my $t = $_[0]; # print "tok: $_ \n"; parserel($t); return; my @list = &tokenizelisplist($t); my $pred = shift @list; if ($pred =~ /^\s*Root\s*$/is) { shift @list; foreach (@list) { parserel($_); } } else { warn "cannot parse root. Pred=".$pred; } } sub parserel { my @list = tokenizelisplist($_[0]); my $pred = shift @list; if ($pred =~ /^\s*Nucleus\s*$/is || $pred =~ /^\s*Satellite\s*$/is || $pred =~ /^\s*Root\s*$/is ) { # ignore shift @list; my @rp ; # the next item should be a rel2par if ($list[0] =~ /^\s*\(\s*rel2par/is) { @rp = tokenizelisplist( shift @list); } else { @rp = tokenizelisplist( @list); } $idcount++; my $newid = $idprefix.$idcount; my $el = ''; my $binarize = 0; my $the_nuc = ''; my $the_nuc_id; my @the_sats = (); my $rel_type = ''; my $num_segments=0; my $id_generated; my $rel_assigned; my $nuclearity=0; # -1 is multinuc, 1 is nuc-sat, 0 is unknown my $nucfound=0; foreach (@list) { ($id_generated, $rel_assigned, $role_assigned) = parserel($_); if ($rel_type) { # must be 'span' or match the rel_type if ($rel_type ne '' && $rel_assigned ne 'span' && $rel_assigned ne $rel_type) { warn "Multiple satellites with different relation types under one parent, parent = $newid, offending satellite = $id_generated, types $rel_type, $rel_assigned"; $binarize = 1; } } elsif ($rel_assigned ne 'span') { $rel_type = $rel_assigned ; } # check roles if ($role_assigned eq 'satellite') { if ($nuclearity == -1) { die "found satellite in multinuc relation! id=m$newid"; } $nuclearity = 1; } elsif ($role_assigned eq 'nucleus') { $nucfound++; if ($nucfound>1 ) { if ($nuclearity == 1) { die "found multiple nuclei in nuc-sat relation! id=m$newid"; } $nuclearity = -1; # is multinuc! } } my $element = ' <'.$role_assigned.' id="m'.$id_generated.'" />'."\n"; # if ($role_assigned eq "nucleus") # { # $the_nuc = $element; # $the_nuc_id = $id_generated; # } else # { # push @the_sats, $element # } $el .= $element; $num_segments++; } if ($num_segments==1 && $rel_type eq '') { # if only one element in there and there is no type, it's a # terminal segment. $el = ''; $newid = $id_generated; } else { my $tag; if ($nuclearity == -1) { $tag = 'parRelation'; } elsif ($nuclearity == 1) { $tag = 'hypRelation'; } else { $tag = 'relation'; } $el = "<$tag type=\"$rel_type\" id=\"m$newid\">\n$el\n"; if (exists($relations{$rel_type}) && $relations{$rel_type} != $nuclearity) { warn "rel type $rel_type both defined as multinuc and as nuc-sat! id=m$newid"; } $relations{$rel_type} = $nuclearity; if ($nuclearity >-1 && $num_segments>2) { warn "$num_segments found as arguments to relation $newid: $el"; } } # if ($binarize) # { # $el = &binarize; # } $relationtext .= $el; return ($newid, $rp[1], lc($pred)); } elsif ($pred =~ /^\s*text\s*$/is) { $idcount++; my $newid = $idprefix.$idcount; my $el = "$list[0]"; $segmenttext .= $el; return ($newid, '', ''); } else { warn "cannot parse Pred=".$pred." whole text=".$_; } } # sub binarize # { # # my $l; # # foreach (@the_sats) # { # $idcount++; # my $newid = $idprefix.$idcount; # # # # $l = "$el"; # # $the_nuc = # } # # # # } # shifts the first lisp list from a text # args: text # return: tokens sub tokenizelisplist { my $in = $_[0]; #print substr($in,0,20)."\n"; # remove outer parantheses unless ($in =~ /[\(\)]/) { $terminal=$in; return (); } unless ($in =~ s/^\s*\(//is) { # $terminal = $in; # return (); die "tokenizelisplist: left paren not found. ".$in; } unless ($in =~ s/\)\s*$//is) { die "tokenizelisplist: right paren not found. ".$in; } $terminal = ''; #print $in; $in =~ s/\n//isg; $in =~ s/\r//isg; my @list = (); my $level=0; my $string =0; my $stack = ''; my $searchcode = '^(.*?)([\(\)\ ]|\_\!)'; while($in =~ s/$searchcode//is) { if ($2 eq '_!' || $string>0) { if ($string>0 && ($2 eq '_!')) { $string = 0; $searchcode = '^(.*?)([\(\)\ ]|\_\!)'; } else { $string = 1; $searchcode = '^(.*?)(\_\!)'; } if ($level==0 && $2 eq '_!') { $stack .= $1; $stack =~ s/\&/&/isg; $stack =~ s/\/>/sg; } else { $stack .= $1.$2; } } else { if ($level>0) # shift please { $stack .= $1.$2; } else { $stack .= $1; my $sign = $2; push @list, $stack unless ($stack =~ /^\s*$/); $stack = ''; if ($sign eq '(') { $stack = $sign; # should be opening paren or space } elsif ($sign eq ')') { die "clos paren on level 0: ".$in; } } if ($2 eq '(' ) { $level++; } elsif ($2 eq ')' ) { $level--; } } } if ($stack || $in) { $stack .= $in; push @list, $stack; $stack = ''; $in = ''; } return @list; }