#!/usr/local/bin/perl # # wordcount # # A script to generate word-frequency data from an arbitrary corpus of # source text. Configurable for arbitrary processing rules. # # by Michael J. Fromberger & Joshua Macdonald Alden # # Copyright (C) 1996 Michael J. Fromberger & Joshua Macdonald Alden # All rights reserved # # $Id: wordcount,v 1.3 1997/11/02 23:42:55 sting Exp $ # # Wish list: # - Keep certain word-pairs together (in particular, a', an, na, ag and # whatever follows them. This may require some supporting machinery # in the analysis automaton. # # - Find a way to keep multi-word kennings and collocations together # and have these specified in a file. I'm guessing this would have # to be done after killing punctuation, and might require introducing # some bracketing-syntax to get around the word-breaker. # # - Add support for 7-bit-only output. 8-bit output should be the # default. # # Bugs: # # -------------------------------------------------------------------------- require 'getopts.pl'; chomp($prog = `basename $0`); # # Global variables # $DEBUG = 1; # set to > 0 to get some debugging output $WORKDIR = "/tmp"; # directory to store working files in $GRESDIR = "/"; # global resource directory $PREPROC = "$prog.pp"; # preprocessor temp file name prefix $EXCLUDE = "wc-exclusion"; # filename for word exclusion list $PREFEX = "wc-prefix"; # filename for line prefix exclusion list $CONFIG = ".wc-default"; # filename for configuration file $RULESET = "generic"; # name of default configuration set $SAVEPP = 0; # save preprocessor file at exit? $ONLYPP = 0; # only run the preprocessor? $SILENT = 0; # run with no console output? $PROCEX = 0; # resort exclusion lists? $PERMIT = 0; # permit empty command line options? $NOPROC = 0; # do no processing at all? $DISASM = 0; # display disassembled rule set? $CUSTWB = 0; # custom wordbreak function defined? $COLLECT = ""; # if non-empty, append pp output to file $VERSION = "2.3.3"; # version string $LANG = "Generic"; # what language this is configured for @OUTFORM = ( "W", "F", "+", "\t", "\n" ); # see below... $OPTIONS = "adc:DefFghko:pqr:s:t:vwx"; # valid command line options $OUTFILE = "-"; # output file ('-' indicates standard out) $USAGE = "$prog: run '$prog -h' for help with options.\n"; %RULESETS = (); # hash mapping language names to files # # Define the directory to search for resource files (exclusion lists, etc.) # If the environment variable 'WCRESDIR' is set, use that; otherwise use the # user's home directory (which is hopefully defined) # $RCDIR = $ENV{'WCRESDIR'} || $ENV{'HOME'}; # Indices into the @OUTFORM array... $outorder = 0; # output field ordering $sortkey = 1; # sort key $sortsense = 2; # sort direction (up or down) $fieldsep = 3; # output field separator $recsep = 4; # output record separator # Other random constants... $feedint = 25; # generate feedback every $feedint lines $feedmul = 5; # ..or every ($feedmul * $feedint) words # # PROCESSING RULES # # Processing is done by a simple finite-state automaton with a simple # instruction set. The instructions for this machine are stored in a # list called @RULES and take the general form: # ::::[::] # # The field is an alphanumeric string that should uniquely identify # the rule. The field is a regular expression (including leading # and trailing slashes) that is used to pattern-match the input. The # field is optional, and is used by some type codes. # # The field is one of: # S - Input is "saved" if it matches the regular expression. # (terminates processing and keeps the input) # # K - Input is "killed" if it matches the regular expression. # (terminates processing and discards the input) # # C - Conditional; parameter is # The parameter is the name of the rule to be used # if the match succeeds. Otherwise execution continues. # # F - Filter; the should be a substitution (s//) or # translation (tr//) expression, and the input will be # transformed appropriately. # # J - Jump; the field should be null, and the first # parameter is taken as the name of the rule to execute next # # JS - Jump to Subroutine; the field should be null, and # the first parameter is taken as the name of the rule to # execute next. The address AFTER the current one is pushed # onto a return stack, and the RS instruction causes control # to resume at the instruction atop the stack. # # D - Do; the field should be null, and the first # parameter is taken as the name of a subroutine to execute on # the input. The input is not affected by the return value, # and execution continues in sequence (no jump). # # V - Evaluate; the field should be null, and the first # parameter is treated as a block of Perl code to be evaluated # in the context of the running program. # # E - Execute; the field should be null, and the first # parameter is taken as the name of a subroutine to execute # on the input. The input is replaced by the return value. # # P - Predicate; like conditional, but calls a subroutine instead # of matching a regexp. # # H - Halt; end processing of input (input is accepted) # # N - Null; don't do anything, skip to next instruction. # # R - Reject; end processing of input (input is rejected) # # RS - Return from Subroutine; pop the instruction address from the # top of the call stack and execute that instruction next. # If the stack is empty, this is a null operation and control # resumes with the next instruction in sequence (which may not # be what you had intended). # # Other types may be defined in the future. The namespaces for rules and # for types is guaranteed to be independent, however. Rules are executed # in sequential order except where C, E, P or J type rules are found. # # When constructing substitution expressions inside the 'F' instruction, or # pattern-matches inside 'K', 'S', etc., be careful about how you escape # things. Ordinarily, special characters need to be backslash-escaped in # order to get Perl to interpret them correctly. But, since the operands # of these instructions are evaluated inside an 'eval' construct, we want # the escaping to happen THEN, not during the initial compilation. Thus, # you should use '\\' as the escape sequence instead of '\' when escaping # patterns inside an instruction. # # Replacing backslashes can be difficult in this context; Perl is sometimes # pretty picky about context inside these expressions. My advice for doing # anything complex is to use an 'E' instruction instead of 'F'. # # -------------------------------------------------------------------------- # OPTIONS AND SETUP # # Setup consists of parsing the command line options, setting up some # internal state variables, and finding the input sources. # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- # LOAD CONFIGURATION FILE # # The configuration file is just going to be evaluated as Perl code, so # it must be valid in that respect; however, it is otherwise unrestricted. # It can change almost anything, though it will be overridden by command # line options. # -------------------------------------------------------------------------- if(-f "$RCDIR/$CONFIG") { # read configuration if it exists print STDERR "$prog: loading configuration from '$CONFIG'\n" unless $SILENT; open(IFP, "<$RCDIR/$CONFIG") || die "$prog: $CONFIG: $!\n"; $oldfs = select(IFP); $osep = $/; undef $/; $suck = ; # suck in the whole file $/ = $osep; select($oldfs); close(IFP); eval $suck; if($@) { die "$prog: error reading configuration file: $@\n"; } undef $suck; } # -------------------------------------------------------------------------- # Parse out command-line options # -------------------------------------------------------------------------- &Getopts($OPTIONS) || die "$USAGE\n"; if($opt_a) { die "$prog: the -a and -d options are mutually exclusive.\n" if $opt_d; $OUTFORM[$sortsense] = "+"; } if($opt_c) { $COLLECT = $opt_c; } if($opt_d) { $OUTFORM[$sortsense] = "-"; } if($opt_D) { ++$DEBUG; } if($opt_e) { $PROCEX = 1; $PERMIT = 1; } if($opt_f) { die "$prog: the -f and -w options are mutually exclusive.\n" if $opt_w; die "$prog: the -F and -f options are mutually exclusive.\n" if $opt_F; $OUTFORM[$sortkey] = "F"; } if($opt_F) { die "$prog: the -F and -w options are mutually exclusive.\n" if $opt_w; $OUTFORM[$sortkey] = "FW"; } if($opt_g) { $DISASM = 1; $PERMIT = 1; } if($opt_h) { &help; exit 0; } if($opt_k) { $SAVEPP = 1; } if($opt_o) { $OUTFILE = $opt_o; } if($opt_p) { $ONLYPP = 1; } if($opt_q) { $SILENT = 1; } if($opt_r) { $OUTFORM[$recsep] = $opt_r; } if($opt_s) { $RULESET = $opt_s; } if($opt_t) { $OUTFORM[$fieldsep] = $opt_t; } if($opt_v) { print STDERR "$prog version $VERSION, by Michael J. Fromberger\n"; print STDERR "Copyright (C) 1996 by Michael J. Fromberger & ". "Joshua Macdonald Alden\n\n"; exit 0; } if($opt_w) { $OUTFORM[$sortkey] = "W"; } if($opt_x) { $OUTFORM[$outorder] = "F"; } if($#ARGV < 0) { die "$prog: usage is '$prog [ options ] [ ...]'\n" unless $PERMIT; $NOPROC = 1; } if($ONLYPP && $COLLECT) { print STDERR "$prog: WARNING: stopping after preprocessor, no " . "collection will occur\n"; } # -------------------------------------------------------------------------- # LANGUAGE DEFINITIONS # # To process any given language, it is necessary to define four variables: # $LANG - a string denoting the language being processed # @PPRULES - a list of strings containing instructions for preprocessing # @ANRULES - a list of strings containing instructions for analysis # @OPRULES - a list of strings containing instructions for postprocessing # # Note: These lists are permitted to be empty. However, if they are, the # ----- analysis automaton will implicitly accept ALL strings without # modification. If this is what you want, great, but be warned. # # These may be stored in a separate file, which can be loaded by wordcount # at runtime. The default values are entirely overridden by the rules given # in the language file. A command-line option can be used to specify the # ruleset file that will be loaded in this manner -- by default the file # named in the $RULESET variable will be loaded. # # The language file should also define any language-specific subroutines # used by the rulesets the file defines. This whole file will be eval'd # in at runtime, so it must be valid Perl code. # -------------------------------------------------------------------------- $RULESET =~ tr/A-Z/a-z/; $RULEPATH = $RULESETS{$RULESET}; $RULEPATH =~ s/^~/$ENV{'HOME'}/; $RULEPATH =~ s/^=/$GRESDIR\//; $RULEPATH =~ s/\/\//\//g; print STDERR "RULEPATH is '$RULEPATH'\n" if $DEBUG; if(-f "$RULEPATH") { # read ruleset if it exists print STDERR "$prog: loading ruleset from '$RULEPATH'\n" unless $SILENT; open(IFP, "<$RULEPATH") || die "$prog: $RULEPATH: $!\n"; $oldfs = select(IFP); $osep = $/; undef $/; $suck = ; # suck in the whole file $/ = $osep; select($oldfs); close(IFP); eval $suck; if($@) { die "$prog: error reading language file: $@\n"; } undef $suck; } elsif($opt_s) { # if user specified this file... die "$prog: couldn't find language file for '$RULESET'.\n"; } else { # otherwise, set up some defaults print STDERR "$prog: loading generic ruleset\n" unless $SILENT; @PPRULES = ( "::K::/^\\s*\$/", # eliminate blank or all-white lines "::F::tr/A-Z/a-z/", # canonicalize to lower-case "::H", # accept this line ); @ANRULES = ( "::F::s/[^\\w]/ /g", # convert non-alphanumerics to spaces "::H", # accept this word ); } print STDERR "$prog: current language is '$LANG'\n" unless $SILENT; if(defined(&breakline)) { $CUSTWB = 1; } # # Read in the exclusion lists... # if($PROCEX) { print "$prog: reorganizing exclusion lists\n" unless $SILENT; pprocex("$RCDIR/$EXCLUDE", 1); } %EXCL = readex("$RCDIR/$EXCLUDE"); @PREX = readlex("$RCDIR/$PREFEX"); if($DISASM) { print STDERR "\nRule set currently loaded:\n\n"; print STDERR "Language: $LANG\n\n"; print STDERR "Custom word-break function defined.\n\n" if $CUSTWB; print STDERR "Preprocessor program:\n"; for($i = 0; $i <= $#PPRULES; $i++) { printf STDERR ( "%2d: %s\n", $i, disasm($PPRULES[$i])); } print "\nAnalysis program:\n"; for($i = 0; $i <= $#ANRULES; $i++) { printf STDERR ( "%2d: %s\n", $i, disasm($ANRULES[$i])); } print "\nPostprocessing program:\n"; for($i = 0; $i <= $#OPRULES; $i++) { print STDERR ( "%2d: %s\n", $i, disasm($OPRULES[$i])); } } # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- # # MAIN PROGRAM # # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- if($NOPROC) { exit 0; } # -------------------------------------------------------------------------- # PREPROCESSING # # Preprocessing consists of reading each input file a line at a time, # chopping off the carriage return, and then feeding it into the analysis # automaton. The result, if not empty, is appended to a running temporary # file. This file will be used as the input source for the analysis phase # -------------------------------------------------------------------------- @RULES = @PPRULES; # select ruleset for preprocessing $PPFILE = "$WORKDIR/$PREPROC.$$"; open(PPOF, ">$PPFILE") || die "$prog: $PPFILE: $!\n"; foreach $file (@ARGV) { print STDERR "$prog: preprocessing '$file'\n" unless $SILENT; open(IFP, "<$file") || die "$prog: $file: $!\n"; $nlines = 0; while($line = ) { unless($SILENT) { print STDERR "." if (!($nlines % $feedint)); } chomp($line); $line = analyze($line); if($line ne "") { print PPOF "$line\n"; } ++$nlines; } close(IFP); } print STDERR "\n" unless ($SILENT); close(PPOF); if($ONLYPP) { print STDERR "$prog: preprocessing complete.\n" unless $SILENT; print STDERR "$prog: output saved in $PPFILE.\n" unless $SILENT; exit 0; } # -------------------------------------------------------------------------- # ANALYSIS # # In the analysis phase, we rewind to the beginning of the # preprocessor output file, and read in each line again. This time, # the lines are broken up into words (using whitespace as delimiter) # and each word is fed to the analysis automaton. The output of the # automaton, if not empty, is counted in the frequency hash. # -------------------------------------------------------------------------- @RULES = @ANRULES; # select ruleset for analysis open(PPIF, "<$PPFILE") || die "$prog: $PPFILE: $!\n"; print STDERR "$prog: generating frequency data\n" unless $SILENT; if($COLLECT) { open(CPOF, ">>$COLLECT") || die "$prog: $COLLECT: $!\n"; } $nwords = 0; while($line = ) { chomp($line); if($CUSTWB) { @words = breakline($line); # custom word breaking } else { @words = split(/\s+/, $line); # default word breaking (whitespace) } foreach $word (@words) { # analyze each word $word = analyze($word); if($word ne "") { $FREQ{$word}++; # count words that come back intact if($COLLECT) { print CPOF " $word"; } } unless($SILENT) { print STDERR "." if(!($nwords % ($feedint * $feedmul))); } ++$nwords; } if($COLLECT && $line) { print CPOF "\n"; } } print STDERR "\n" unless ($SILENT); close(PPIF); if($COLLECT) { close(CPOF); } # -------------------------------------------------------------------------- # CLEANUP AND CLOSURE # # Here we sort and format the output, write it out to wherever it's going, # and then clean up working files and close down. Unless the user asked # for the preprocessor temp to be kept around, it will be unlinked. # # As the data are extracted from the frequency-count hash, they are passed # through a post-processing pass of the analysis automaton. This gives the # language file the ability to do any just-in-time conversions that are # needed to render the output a certain way. # # @OUTFORM is a list consisting of five members: # (1) The order of output (word-frequency 'W' or frequency-word 'F') # (2) Which item to sort on (word 'W', frequency 'F', or freq-word 'FW') # (3) Whether to sort ascending ('+') or descending ('-') # (4) The delimiter between fields (a string, default '\t') # (5) The delimiter between records (a string, default '\n') # -------------------------------------------------------------------------- # Convert the frequency table to a flat (ordered) list @RULES = @OPRULES; # select postprocessor ruleset while(($w, $f) = each(%FREQ)){ $w = analyze($w); # analyze each word push(@unsorted, [ $w, $f ]); # push an anonymous list reference } # Sort according to the specification in @OUTFORM print STDERR "$prog: sorting frequency table\n" unless $SILENT; if($OUTFORM[$sortkey] eq "W") { if($OUTFORM[$sortsense] eq "+") { # ascending by word print STDERR "(ascending by word)\n" if $DEBUG; @sorted = sort asc_word (@unsorted); } else { # descending by word print STDERR "(descending by word)\n" if $DEBUG; @sorted = sort des_word (@unsorted); } } elsif($OUTFORM[$sortkey] eq "FW") { if($OUTFORM[$sortsense] eq "+") { # ascending by freq, sub word print STDERR "(ascending by frequency, sub word)\n" if $DEBUG; @sorted = sort asc_freq_sub (@unsorted); } else { # descending by freq, sub word print STDERR "(descending by frequency, sub word)\n" if $DEBUG; @sorted = sort des_freq_sub (@unsorted); } } else { if($OUTFORM[$sortsense] eq "+") { # ascending by frequency print STDERR "(ascending by frequency)\n" if $DEBUG; @sorted = sort asc_freq (@unsorted); } else { # descending by frequency print STDERR "(descending by frequency)\n" if $DEBUG; @sorted = sort des_freq (@unsorted); } } undef @unsorted; # Write output in this sorted order # # Output defaults to standard output, unless the user has specified a # filename on the command line # open(OFP, ">$OUTFILE") || die "$prog: $OUTFILE: $!\n"; foreach $record (@sorted) { if($OUTFORM[$outorder] eq "W") { ($first, $second) = ( $$record[0], $$record[1] ); } else { ($first, $second) = ( $$record[1], $$record[0] ); } print OFP "$first" . $OUTFORM[$fieldsep] . "$second" . $OUTFORM[$recsep]; } if(!$SAVEPP) { print STDERR "$prog: removing preprocessor workfile $PPFILE\n" if $DEBUG; if(unlink($PPFILE) < 1) { warn "$prog: $!\n"; } } close(OFP); # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- # # END OF MAIN PROGRAM # # -------------------------------------------------------------------------- # -------------------------------------------------------------------------- ########################################################################## # -------------------------------------------------------------------------- # SUBROUTINE DEFINITIONS # -------------------------------------------------------------------------- # # analyze(input) # Subjects the input string to the analysis program specified in $RULES # This is basically a simple finite-state automaton with the instruction # set defined by the codes in the rule strings. # # If the input was accepted, the transformed version is returned to the # caller; otherwise the empty string is returned indicating the input # was rejected. # sub analyze { my($input) = @_; my($cur, $running) = (0, 1); my($name, $op, $regex, $target, @param); my(@calls) = (); print STDERR "--- entering analysis automaton ---\n" if ($DEBUG > 1); if($#RULES < 0) { return $input; } while($running) { if($cur < 0 || $cur > $#RULES) { print STDERR "$prog: >> ERROR << out of program range\n"; die "$prog: last OP=$op, last NAME=$name, POS=$cur\n"; } ($name, $op, @param) = split(/::/, $RULES[$cur]); ($regex, $target) = @param; if($op eq "S") { # SAVE if(match($input, $regex)) { $running = 0; } ++$cur; } elsif($op eq "K") { # KILL if(match($input, $regex)) { $input = ""; $running = 0; } ++$cur; } elsif($op eq "C") { # CONDITION if(match($input, $regex)) { $cur = findrule($target); } else { ++$cur; } } elsif($op eq "F") { # FILTER $input = filter($input, $regex); ++$cur; } elsif($op eq "J") { # JUMP $cur = findrule($regex); } elsif($op eq "JS") { # JUMP to SUBROUTINE push(@calls, $cur + 1); $cur = findrule($regex); } elsif($op eq "E") { # EXEC $input = &$regex($input); ++$cur; } elsif($op eq "D") { &$regex($input); # DO ++$cur; } elsif($op eq "V") { # EVAL eval $regex; ++$cur; } elsif($op eq "P") { # PREDICATE if(&$regex($input)) { $cur = findrule($target); } else { ++$cur; } } elsif($op eq "H") { # HALT (accept) $running = 0; } elsif($op eq "N") { # NOOP ++$cur; } elsif($op eq "R") { # REJECT $input = ""; $running = 0; } elsif($op eq "RS") { # RETURN from SUBROUTINE if($#calls < 0) { ++$cur; } else { $cur = pop(@calls); } } else { # ILLEGAL # This should not get executed under normal circumstances # Since the program is wired-in, a bad instruction aborts print STDERR "$prog: >> ERROR << bad instruction in " . " analysis program\n"; die "$prog: OP=$op, NAME=$name, POS=$cur DATA=$input\n"; } print STDERR "OP=$op / NAME=$name / NEXT=$cur / " . "DATA=$input\n" if ($DEBUG > 1); } # end of while($running) print STDERR "--- leaving analysis automaton ---\n" if ($DEBUG > 1); $input; } # end of analyze($string) # # findrule(name) # Find the index of the given rule in the rules list. Returns -1 if no # such rule is found # sub findrule { my($name) = @_; my($i); for($i = 0; $i <= $#RULES; $i++) { if($RULES[$i] =~ /^($name)::/) { return $i; } } -1; } # # filter(input, rule) # Apply the given filter rule to the input and return the result # sub filter { my($input, $rule) = @_; eval "\$input =~ $rule"; $input; } # # match(input, rule) # Apply the given match rule to the input and return the result # sub match { my($input, $rule) = @_; if(eval "\$input =~ $rule") { print STDERR "MATCH: $rule FOUND\n" if ($DEBUG > 1); return 1; } else { print STDERR "MATCH: $rule NOT FOUND\n" if ($DEBUG > 1); return 0; } } # # disasm(inst) # Disassemble an instruction string and print out a reasonable # facsimile of its operation # sub disasm { my($inst) = @_; my($name, $op, $regex, @param); ($name, $op, $regex, @param) = split(/::/, $inst); if($name) { $_ = sprintf("%10s: ", $name); } else { $_ = " " x 12; } if($op eq "S") { $_ .= "Save if input matches $regex"; } elsif($op eq "K") { $_ .= "Kill if input matches $regex"; } elsif($op eq "C") { $_ .= "If input matches $regex, then go to $param[0]"; } elsif($op eq "F") { $_ .= "Filter input by $regex"; } elsif($op eq "J") { $_ .= "Go to $regex"; } elsif($op eq "JS") { $_ .= "Jump to subroutine at $regex"; } elsif($op eq "E") { $_ .= "input = $regex(input)"; } elsif($op eq "D") { $_ .= "Invoke $regex(input)"; } elsif($op eq "V") { $_ .= "Evaluate " . $regex; } elsif($op eq "P") { $_ .= "If $regex(input), then go to $param[0]"; } elsif($op eq "H") { $_ .= "Halt and accept input"; } elsif($op eq "N") { $_ .= "Null operation"; } elsif($op eq "R") { $_ .= "Halt and reject input"; } elsif($op eq "RS") { $_ .= "Return from subroutine"; } else { $_ .= ">> ILLEGAL INSTRUCTION << Opcode is '$op'"; } $_; } # # readex(filename) # Read in the given file name into an exclusion list. This is basically # just a hash where the values are always true. Each line is taken liter- # ally as a key. Returns a hash of the values. An empty hash is returned # if the file was not found, or was empty. # # If the file name given is a directory instead of a file, all the plain # files in that directory are consumed. # # readlex(filename) # Just like readex(), except reads the list into a linear array instead of # a hash. Actually, readlex() just calls readex() and rearranges its # output... # sub readex { my($fname) = @_; my(%out, $line, @files); if(-d $fname) { @files = sort <$fname/*>; # for all files in this directory } else { @files = ( $fname ); # for just one plain file } foreach $fname (@files) { if(! -f $fname) { next; } # skip links and subdirectories open(RDX, "<$fname") || return (); while($line = ) { chomp($line); ++$out{$line}; } close(RDX); } %out; } sub readlex { my($fname) = @_; my(@out, %tmp); %tmp = readex($fname); foreach (sort keys(%tmp)) { push(@out, $_); } @out; } # # pprocex(filename[, dc]) # Preprocess an exclusion file. This consists of reading in the file, # canonicalizing it to lower-case, sorting it into forward alphabetical # order, and then writing it back out. Returns nothing. # # If is true, everything will be lower-cased. If it's false, that # step will be skipped. The parameter is optional, and will be assumed # false if not supplied. # # If names a directory instead of a file, all the files in # the directory so named are processed individually. Only plain files # are considered; links and subdirectories are ignored. # # Warning: This is a very time-consuming operation, relatively speaking. # ------- Each file is read in completely, processed, and then written # back out. Asymptotically, this is no worse than O(n log n) # but the constant factor is rather large (there are several # linear passes) # sub pprocex { my($fname, $dc) = @_; my(@hold, @shold, $i); my($reg, @files); if(-d $fname) { @files = <$fname/*>; } else { @files = ( $fname ); } foreach $fname (@files) { if(! -f $fname) { next; } # only deal with plain files open(RDX, "<$fname") || next; # skip files that fail to open print STDERR "PPROCEX: processing '$fname'\n" if $DEBUG; @hold = ; # suck in the complete file close(RDX); # Lowercase all items if it's called for (default doesn't) if($dc) { for($i = 0; $i <= $#hold; $i++) { $hold[$i] =~ tr/A-Z/a-z/; } } # Sort... @shold = sort @hold; # Filter out duplicates (done after sorting so it can be done # in one relatively quick pass) @hold = (); $reg = $shold[0]; for($i = 0; $i <= $#shold; $i++) { if($shold[$i] ne $reg) { push(@hold, $shold[$i]); $reg = $shold[$i]; } } # Write the processed file back over the original open(RDX, ">$fname") || die "$prog: pprocex: $fname: $!\n"; foreach $i (@hold) { print RDX "$i"; } close(RDX); } } sub help { print STDERR < [ ... ] The following command line options are available: Output sorting and formatting ----------------------------- -a : sort order is ascending (default) -d : sort order is descending -f : sort output by frequency (default) -F : sort output by frequency, subsort by word -r : set record-separator (default is carriage return) -t : set field-separator (default is tab) -w : sort output by word -x : exchange output field order (default is word, freq) File management --------------- -k : keep preprocessor output (deleted by default) -c : collect clean wordlist in file -o : specify output file name -s : specify language name (default: $RULESET) Function controls ----------------- -e : reorganize and clean up exclusion lists -g : disassemble and display analyzer programs -q : run in 'silent' mode (no console output) -p : stop after the preprocessing phase Miscellaneous ------------- -h : display this help message -v : display version information and exit ------------------------------------------------------------------------ EOHELP } # # Sorting subroutines (used for the 'sort' operator built into Perl) # asc_word - sort ascending by word # asc_freq - sort ascending by frequency # des_word - sort descending by word # des_freq - sort descending by frequency # asc_freq_sub - sort ascending by frequency, subsort ascending by word # des_freq_sub - sort descending by frequency, subsort ascending by word # # These routines make the bold assumption that the inputs will be # references to two-member lists where the first member is the word, # and the second member is the frequency (this is how they are called # in the code above) # sub asc_word { my($la, $lb); $la = $$a[0]; $lb = $$b[0]; $la cmp $lb; } sub des_word { my($la, $lb); $la = $$a[0]; $lb = $$b[0]; $lb cmp $la; } sub asc_freq { my($la, $lb); $la = $$a[1]; $lb = $$b[1]; $la <=> $lb; } sub des_freq { my($la, $lb); $la = $$a[1]; $lb = $$b[1]; $lb <=> $la; } sub asc_freq_sub { my($la, $lb, $lc, $ld, $tmp); $la = $$a[1]; $lb = $$b[1]; $lc = $$a[0]; $ld = $$b[0]; $tmp = ($la <=> $lb); if(!$tmp) { $lc cmp $ld; } else { $tmp; } } sub des_freq_sub { my($la, $lb, $lc, $ld, $tmp); $la = $$a[1]; $lb = $$b[1]; $lc = $$a[0]; $ld = $$b[0]; $tmp = ($lb <=> $la); if(!$tmp) { $lc cmp $ld; } else { $tmp; } } # -------------------------------------------------------------------------- # FILTER FUNCTIONS # # These are generally useful filter functions that can be used in a program # for the analysis automaton. # -------------------------------------------------------------------------- # # killurl(string) # Strip anything that looks like a URL from the input string, and return # the transformed input string # sub killurl { my($string) = @_; $string =~ s/(http|mailto|ftp|gopher|telnet):([^\s]+|$)//g; $string; } # # killmail(string) # Strip anything that looks like an e-mail address from the input string, # and return the transformed string # sub killmail { my($string) = @_; $string =~ s/?//g; $string; } # # unqp(string) # Remove quoted-printable encoding from a string, by converting all # quoted-printable characters back to their binary representation # sub unqp { my($string) = @_; $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr(hex($1))/eg; $string; } # # enqp(string) # Add quoted-printable encoding to a string, by converting all # strings outside of printable ASCII into the appropriate QP form # sub enqp { my($string) = @_; my(@ch, $i, $k); @ch = split(//, $string); for($i = 0; $i <= $#ch; $i++) { $k = $ch[$i]; $ch[$i] = sprintf("=%02X", ord($ch[$i])) unless($k eq "\t"||$k eq "\r"||$k eq "\n"||$k eq "\f" ||(ord($k) >= 32 && ord($k) < 127 && $k ne '=')); } return join('', @ch); } # dequote(string) # Find runs of non-whitespace text which occur between pairs of single # or double quotation marks, and eliminate those quotation pairs. Returns # the string with these modifications. # sub dequote { my($string) = @_; print STDERR "DEQUOTE: pre: $string\n" if ($DEBUG > 1); $string =~ s/\'([^\s]+)\'/\1/g; $string =~ s/\"([^\s]+)\"/\1/g; print STDERR "DEQUOTE: post: $string\n" if ($DEBUG > 1); $string; } # -------------------------------------------------------------------------- # PREDICATES # # These functions test input words/strings for conditions which may be # true or false. # -------------------------------------------------------------------------- # # isheader(line) # Test if the input line looks like a mail header (loosely defined) # sub isheader { my($line) = @_; if($line =~ /^([A-Z][-\w\d]+:)/) { print STDERR "ISHEADER: Found header '$1'\n" if ($DEBUG > 1); return 1; } else { print STDERR "ISHEADER: No match\n" if ($DEBUG > 1); return 0; } } # # isemail(word) # Test if input word looks like an e-mail address (loosely defined) # sub isemail { my($word) = @_; if($word =~ /^?$/) { return 1; } else { return 0; } } # # isexcluded(word) - is this word in the (global) exclusion list? # sub isexcluded { my($word) = @_; $word =~ s/^\'+//; # remove leading apostrophes $word =~ s/\'+$//; # remove trailing apostrophes return exists($EXCL{$word}); } # # isprefix(string) - does this string match an excluded prefix? # sub isprefix { my($string) = @_; my($key); foreach $key (@PREX) { if(eval "\$string =~ /^($key)/") { print STDERR "PREFIX match found for '$string'\n" if $DEBUG; return 1; } } return 0; } # # hasurl(string) - does this string contain a URL? # Returns a list of the form ( url, type ) where 'type' is the kind of # URL it is, and 'url' is the text that was matched. An empty list is # returned if no match is made. # sub hasurl { my($string) = @_; my($url, $type); if($string =~ /((http|mailto|ftp|gopher|telnet):[^\s]+)/) { return ( $1, $2 ); } else { return (); } } ########################################################################## # END