-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprefixcounts
executable file
·52 lines (43 loc) · 1.35 KB
/
prefixcounts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/local/bin/perl -w
# Author: Jason Eisner, University of Pennsylvania
# Usage: prefixcounts [files ...]
#
# Input is in the format produced by rules2frames. We collapse tokens
# of the same (word, frame) type. Each line is therefore unique, and
# is preceded by
# count TAB totalcount TAB
# where count is the number of tokens of this (word, frame),
# and totalcount is the number of tokens of this word.
#
# Leading comments are stripped, other comments discarded.
# Locations are retained; when several lines are combined into
# one, a location is kept from one arbitrarily.
#
# We also delete all ~ arg marks in the frames.
use strict;

# NOTE(review): stamp.inc is a project-local file that must be reachable
# through @INC; its behavior is known here only from the comment below.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
die "$0: bad command line flags" if @ARGV && $ARGV[0] =~ /^-./;

# Tallies built while reading the input.  A "type" key is the whole body
# line (text before the first TAB = word, rest = frame) after the optional
# location prefix and all ~ marks have been removed.
my %totalcount;          # word => number of tokens of that word
my %count;               # type => number of tokens of that (word, frame)
my %loc;                 # type => location prefix of the last token seen,
                         #         or '' if that token carried none
my $bodylinesin  = 0;    # tokens read; starts at 0 so that empty input
my $bodylinesout = 0;    # types written; still reports "0 ... 0" cleanly
my $inlead       = 1;    # true until the first non-comment line is seen

while (<>) {
    if (/^\#/) {
        # Leading comments are echoed as-is; later comments are dropped.
        print if $inlead;
        next;
    }
    $inlead = 0;

    # chomp, not chop: a final line without a trailing newline must not
    # lose the last character of its frame.
    chomp;

    # Strip an optional "file:line:TAB" location prefix, remembering it.
    my $location = s/^(\S+:[0-9]+:\t)// ? $1 : '';

    # A comment that was hidden behind a location prefix is discarded too.
    next if /^#/;

    tr/~//d;             # delete all ~ arg marks

    my $tab = index($_, "\t");
    die "$0: bad format line" if $tab < 0;

    $totalcount{ substr($_, 0, $tab) }++;    # the word precedes the first TAB
    $count{$_}++;
    $loc{$_} = $location;
    $bodylinesin++;
}

# Emit one line per distinct type:
#     count TAB totalcount TAB [location]word TAB frame
# Keys are sorted so that the output is reproducible from run to run
# (plain hash iteration order is not).
for my $line (sort keys %count) {
    my $word = substr($line, 0, index($line, "\t"));
    print "$count{$line}\t$totalcount{$word}\t$loc{$line}$line\n";
    $bodylinesout++;
}

print STDERR "$0: $bodylinesin tokens in, $bodylinesout types out\n";