#!/usr/local/bin/perl -s
############################################################
#
# fsconcordance
#
# [con-cor-dance]
# (k@n-ko^r'dns)
#
# (NOUN).
# 1. A state of agreement; concord.
# 2. An alphabetical index of all the words in a
#    text or corpus of texts, showing every contextual
#    occurrence of a word.
#
# a standard concordance is simply a special case of a
# mega-concordance, a concordance of clauses, phrases, and
# sentences; each index of a mega-concordance can have any
# number of atoms; a regular concordance just happens to
# limit itself to indexes of size one atom.
#
# Method
#
# consume a text; break it into sentences; index each word
# with a unique numerical tag.  construct a concordance
# with indexes of size N, where N is passed on the command
# line.  create two output files, the marked up text, and
# the concordance thereof.
#
# an important consideration is that the memory complexity
# of this program MUST be O(1).  input is therefore read one
# line at a time; note that the phrase index itself is still
# held in memory until it is written out.
#
# Usage
#
# flags: -makehtml     creates the indexed html file
#                      ($file.html); leaves the original
#                      text alone
#                      args: file1 file2 filen
#        -makecomplex  creates the phrase concordance
#                      ($file.N.html), which is an html
#                      document pointing to the indexed
#                      html file
#                      args: N file1 file2 filen
#                      where 0 < N
#        -makemany     creates any missing indexed html
#                      files, then runs -makecomplex for
#                      every phrase length from 1 to N
#                      args: N file1 file2 filen
#        -debug        prints progress to stdout; with
#                      -makecomplex also writes $file.debug
#
# NOTE(review): this file was recovered from a copy in which
# everything that looked like an html tag had been stripped.
# the html literals below (anchors, <p>, <title>, the list of
# concordance entries) are reconstructions -- confirm them
# against a pristine copy if one exists.
#
############################################################

use strict;
use warnings;

use File::Basename qw(basename);

# these are set by perl's -s switch (see the #! line).
our ($makehtml, $makemany, $makecomplex, $debug);

# a sentence is merely the smallest logical semantic
# structure; i can envision analysis requiring alternative
# approaches.  a token ending in . ? or ! (optionally followed
# by closing quotes) marks the end of a sentence ...
my $SENTENCE_DELIMITER = qr/[.?!]'?"?\z/;

# ... unless it is one of these.  more to come, i'm sure.
my %IS_PERIOD_WORD = map { $_ => 1 } ('Mr.', 'Mrs.');

# only run when executed as a script, so the routines below
# can also be loaded and called from other code.
run() unless caller;

# dispatch on the -s switches.  the switches are checked
# independently and in this order, as they always have been.
sub run {
    if ($makehtml) {
        make_html($_) for @ARGV;
    }

    if ($makemany) {
        my $n = parse_phrase_length('makemany');
        for my $text (@ARGV) {
            next if -e "$text.html";
            warn "$text.html doesn't exist, creating it ...\n";
            make_html($text);
        }
        # called directly rather than re-running $0 through the
        # shell: file names with spaces or metacharacters are
        # safe, and -debug stays in effect.
        make_complex($_, @ARGV) for 1 .. $n;
    }

    if ($makecomplex) {
        my $n = parse_phrase_length('makecomplex');
        make_complex($n, @ARGV);
    }

    return;
}

# shift the phrase length N off @ARGV and return it.  dies with
# the usage message for $mode unless N is a positive integer
# (a fractional N could never equal a word count).
sub parse_phrase_length {
    my ($mode) = @_;

    my $n = shift @ARGV;
    die "$0: usage: -$mode N file file file\n"
        unless defined $n && $n =~ /\A[1-9][0-9]*\z/;

    return $n;
}

# escape the characters that are special in html text and
# attribute values.
sub html_escape {
    my ($string) = @_;

    $string =~ s/&/&amp;/g;
    $string =~ s/</&lt;/g;
    $string =~ s/>/&gt;/g;
    $string =~ s/"/&quot;/g;

    return $string;
}

# split one chomped line into its non-empty tokens.  both the
# html pass and the concordance pass use this, so that they
# number the words identically.
sub tokens_of {
    my ($line) = @_;
    return grep { length } split /[ \t]+/, $line;
}

# write $text.html: every token containing a word character
# is wrapped in a numbered anchor, numbering from 0.  returns
# the number of words indexed.  dies if either file cannot be
# opened or the output cannot be closed.
sub make_html {
    my ($text) = @_;

    open my $in, '<', $text
        or die "$0: cannot read $text: $!\n";
    open my $out, '>', "$text.html"
        or die "$0: cannot write $text.html: $!\n";

    my $index = 0;
    my $blank = 0;

    while (my $line = <$in>) {
        chomp $line;

        if ($line !~ /\w/) {
            # totally non-word line: copy it through.
            print {$out} html_escape($line), "\n";

            # NOTE(review): $blank is never reset by a line that
            # has words, so this fires on every second blank line
            # seen, not on two consecutive ones.  kept as found.
            $blank++ if $line =~ /^\s*$/;
            if ($blank == 2) {
                print {$out} "<p>\n";
                $blank = 0;
            }
        }
        else {
            # NOTE(review): punctuation-only tokens on a line that
            # also has words are not copied to the output; kept as
            # found.
            for my $word (tokens_of($line)) {
                next if $word !~ /\w/;
                print {$out} qq{<a name="$index">}, html_escape($word),
                    "</a>\n";
                $index++;
            }
        }

        $debug && print "$text: index: $index\tword: $line\n";
    }

    close $in;
    close $out or die "$0: cannot close $text.html: $!\n";

    warn "$index words.\n";
    $debug && print "$text: $text.html done.\n";

    return $index;
}

# record every run of $n consecutive words of one sentence.
#   $positions_of  hash ref: phrase => [ word numbers ]
#   $words         array ref: the word tokens of the sentence
#   $start         word number of the first word in $words
# a sentence shorter than $n words contributes nothing.
sub index_sentence {
    my ($positions_of, $words, $start, $n) = @_;

    # lower-case, and trim punctuation from both ends.
    my @clean = map {
        my $word = lc $_;
        $word =~ s/\A\W+|\W+\z//g;
        $word;
    } @{$words};

    for my $first (0 .. scalar(@clean) - $n) {
        my $phrase = join ' ', @clean[ $first .. $first + $n - 1 ];
        push @{ $positions_of->{$phrase} }, $start + $first;
    }

    return;
}

# write $text.$n.html for each text: an alphabetical list of
# every $n-word phrase that occurs within a sentence, each with
# links to the anchors written by make_html.  dies if a file
# cannot be opened or the output cannot be closed.
sub make_complex {
    my ($n, @texts) = @_;

    for my $text (@texts) {
        open my $in, '<', $text
            or die "$0: cannot read $text: $!\n";
        $debug && print "$text: now processing.\n";

        my $debug_out;
        if ($debug) {
            open $debug_out, '>', "$text.debug"
                or die "$0: cannot write $text.debug: $!\n";
        }

        # the index is per text, so that one text's phrases
        # never leak into the next text's concordance.
        my %positions_of;
        my $index = 0;     # words consumed so far
        my $start = 0;     # word number of the sentence's first word
        my @sentence;      # word tokens of the current sentence

        while (my $line = <$in>) {
            chomp $line;

            for my $token (tokens_of($line)) {
                if ($token =~ /\w/) {
                    # same numbering as the anchors in $text.html
                    $debug && print {$debug_out} $index, "\t", $token, "\n";
                    push @sentence, $token;
                    $index++;
                }

                next unless $token =~ $SENTENCE_DELIMITER
                    && !$IS_PERIOD_WORD{$token};

                $debug && print "$text: sentence: @sentence\n";
                index_sentence(\%positions_of, \@sentence, $start, $n);
                @sentence = ();
                $start    = $index;
            }
        }

        # the text may end without closing punctuation; its last
        # words still have anchors, so index them too.
        if (@sentence) {
            $debug && print "$text: sentence: @sentence\n";
            index_sentence(\%positions_of, \@sentence, $start, $n);
        }

        close $in;
        if ($debug) {
            close $debug_out
                or die "$0: cannot close $text.debug: $!\n";
        }
        $debug && print "$text: concordance of phrase length $n done.\n";

        my $outfile = "$text.$n.html";
        open my $out, '>', $outfile
            or die "$0: cannot write $outfile: $!\n";

        my $heading = 'Concordance for ' . html_escape($text)
            . " with phrase length $n";
        print {$out} "<title>$heading</title>\n";
        print {$out} "<h1>$heading</h1>\n";

        # the concordance sits next to the indexed html file, so
        # link to it by its base name.
        my $href = html_escape(basename("$text.html"));

        # NOTE(review): the layout of the entries is a
        # reconstruction; only the title line survived.
        my ($maxword, $maxfrequency) = ('', 0);
        print {$out} "<ul>\n";
        for my $phrase (sort keys %positions_of) {
            my @positions = @{ $positions_of{$phrase} };

            # on a tie the alphabetically first phrase wins.
            if (@positions > $maxfrequency) {
                ($maxword, $maxfrequency) = ($phrase, scalar @positions);
            }

            my @links = map { qq{<a href="$href#$_">$_</a>} } @positions;
            print {$out} '<li>', html_escape($phrase), ': ',
                join(', ', @links), "\n";
        }
        print {$out} "</ul>\n";

        close $out or die "$0: cannot close $outfile: $!\n";

        warn "$text: length: $n. phrases: ", scalar(keys %positions_of),
            ". words: $index. $maxword appeared $maxfrequency times.\n";
    }

    return;
}