#!/usr/local/bin/perl ############################################################ # # # fsconcordance # # # # [con-cor-dance] # # (k@n-ko^r'dns) # # # # (NOUN). # # 1. A state of agreement; concord. # # 2. An alphabetical index of all the words in a # # text or corpus of texts, showing every contextual # # occurrence of a word. # # # # Considerations # # # # a standard concordance is simply a special # # case of a mega-concordance, a concordance of clauses, # # phrases, and sentences; each index of a mega-concordance # # can have any number of atoms; a regular concordance just # # happens to limit itself to indexes of size one atom. # # # # Method # # # # consume a text; break it into sentences; index each word # # with a unique numerical tag. construct a concordance # # with indexes of size N, where N is passed on the command # # line. create two output files, the marked up text, and # # the concordance thereof. # # # # an important consideration is that the memory complexity # # of this program should attempt to be O(1) at best and # # O(n) at worst. # # # # for concordances of more than one word, only those # # expressions that occur severally will appear. # # # # Usage # # # # flags: -makehtml creates the indexed html file # # args: master file1 file2 filen # # leaves the original text alone # # -makecomplex creates the phrase concordance # # which is an html document pointing # # to the indexed html file # # args: N master file1 file2 filen # # where 0= 5) ? $localtime=scalar(localtime) : chop($localtime=`date`)); $minmatch = 1; # minimum number of leading whitespace chars # to qualify as a poetic half-line $frequencytext = "Frequency"; # sorted by frequency $alltext = "All"; # extension for everything $othertext = "Other"; # stuff that's not a..z $uniquetext= "Unique"; # lone occurrences, mostly typos? @letters = ($alltext, $frequencytext, 'a'..'z', $othertext, $uniquetext); undef $/; # do delimiting ourselves $bigfile = 500000; # how big is big? warns users. $bigfronttext = "("; $bigbacktext = ")"; # our hostname chop ($hostname = (`uname -n 2> /dev/null` || `hostname`)); ($hostname,@junk) = gethostbyname ($hostname); if ($debug) { foreach $current ("makemany", "makecomplex", "makemaster", "makehtml") { # $myname = $_ if ($$_); } print STDERR "*** $myname: debug mode on\n"; } ($fscname = $0) =~ s/.*\/([^\/]+)$/$1/; ############################################################ # # # sub dealwithsentence # # # ############################################################ sub dealwithsentence { # input is a is a sentence # action is to add to the assoc array local (@sentence) = @_; local ($word); if (grep (/[a-zA-Z0-9]/, @sentence) < $N) { # warn "$text: sentence shorter than $N words, skipping.\n"; } else { # what's the last valid index position for this sentence? $lastvalidindex = $#sentence; $validwords = 0; do { $validwords++ if ($sentence[$lastvalidindex] =~ /[a-zA-Z0-9]/); $lastvalidindex--; } until ( $validwords == $N ); $lastvalidindex++; # loop within the sentence to get out all phrases. foreach $sentenceindex (0 .. $lastvalidindex) { $phrase = ""; $phrasecount = 0; $phraseindex = $sentenceindex; do { # this better terminate. $word = $sentence[$phraseindex]; # ARGH. we can't use \w because it works for _. bah. $word =~ s/^[^a-zA-Z0-9]+//g; $word =~ s/[^a-zA-Z0-9]+$//g; next if ($word =~ /^\s*$/); $phrase .= $word . " "; $phrasecount++; $phraseindex++; } until ( $phrasecount == $N ); chop $phrase; $phrase =~ tr/A-Z/a-z/; $INDEX{$phrase} .= "${text}.html\#" . ($sentenceindex + $startofsentence) . ","; } } shift @sentence; } ############################################################ # # # sub makemaster # # # ############################################################ sub makemaster { # creates master html file # that points to all subfiles local($N, $master, @ARGV) = (@_); open (MASTER, ">$master.html") || die "unable to write to $master: $!\n"; $plural = ((@ARGV > 1) ? "s" : ""); $intertextual = "Intertextual " if ($intertext); if (length("@ARGV") < 60) { # don't want obscenely long H1's. print MASTER "

${intertextual}Concordance$plural for"; for (@ARGV) { print MASTER " $_"; } print MASTER "

\n"; } else { print MASTER "

${intertextual}Concordance$plural for ...

\n\n"; for (@ARGV) { print MASTER "$_ "; } print MASTER "

\n"; } print MASTER "Generated by ", scalar(getpwuid($>)), "\@$hostname", " at ", $localtime, "
", " using fsconcordance, a program by Meng Weng Wong.\n"; if ($intertext) { print MASTER "

For phrases of two words or more, only those which occur in different source files are shown.\n"; } print MASTER "\n

\n"; foreach $number (1..$N) { print MASTER "
Phrases of length $number\n"; print MASTER "
\n"; foreach $letter (@letters) { if (-s "$master.$number.$letter.html" >= $bigfile) { $bigfront = $bigfronttext; $bigback = $bigbacktext; $bigpresent = 1; } else { $bigfront = ""; $bigback = ""; } print MASTER " $bigfront$letter$bigback\n" unless (($number>1) && ($letter eq $uniquetext)); } print MASTER "\n

\n"; } print MASTER "\n

\n"; if ($bigpresent) { print MASTER "Note - links that look like ${bigfronttext}this${bigbacktext} are bigger than $bigfile bytes. You may want to think twice about downloading them.\n"; } close MASTER; print STDERR "$fscname: @ARGV", " by ", scalar(getpwuid($>)), " at ", $localtime, "\n"; } ############################################################ # # # sub printoutput # # # ############################################################ sub printoutput { # cuz we need to do it for several output files local ($prettier, $toprint); print "
  • $_:\n"; # some mindless prettification for (@output) { ($prettier = $_) =~ s/\.html\#/ /g; $toprint .= " $prettier,\n"; } chop $toprint; chop $toprint; print $toprint, "\n"; } ############################################################ # # # makemany # # # # parent program to do everything in one easy step; calls # # makehtml and makecomplex # # # ############################################################ if ($makemany) { $N = shift @ARGV; if ($N <= 0) { die "$fscname: usage: -makemany N master file file file\n"; } $master = shift @ARGV; # children inherit options $options = "-poem " if $poem; $options .= "-byline " if $byline; $options .= "-debug " if $debug; $options .= "-bystanza " if $bystanza; $options .= "-intertext " if $intertext; for (@ARGV) { if (! -e "$_.html") { warn "$fscname: $_.html doesn't exist, creating it ...\n"; system ("$0 -makehtml $options $_"); } } for (1 .. $N) { # print "$fscname: $0 $options -makecomplex $_ @ARGV\n"; system ("$0 $options -makecomplex $_ $master @ARGV"); } if (! -e "$master.html") { warn "$fscname: $master.html doesn't exist, creating it ...\n"; system ("$0 $options -makemaster $N $master @ARGV"); } } ############################################################ # # # makemaster # # # # makes the master html file which points to everyone else # # # ############################################################ if ($makemaster) { # make html file. $N = shift @ARGV; if ($N <= 0) { die "$fscname: usage: -makemaster N master file file file\n"; } $master = shift @ARGV; &makemaster($N, $master, @ARGV); } ############################################################ # # # makehtml # # # # makes the html file, indexed by word number # # # ############################################################ if ($makehtml) { # make html file. foreach $text (@ARGV) { open (INPUT, $text); open (OUTPUT, ">$text.html"); $index = 0; $blank = 0; for (split (/\n/, )) { # for each line if (! /[a-zA-Z0-9]/) { # totally non-word print OUTPUT; if (/^\s*$/) { $blank++; } if ((!$poem) && $blank == 1) { print OUTPUT "

    \n"; $blank = 0; } } else { if ($poem && /^\s{$minmatch,}/) { # poetic half-line print OUTPUT "_" x $numofunderscores, "\n"; } for (split (/[ \t]+/, $_)) { # for each word ... next if (! /[a-zA-Z0-9]/); print OUTPUT "$_\n"; $index++; } } if ($poem) { print OUTPUT "
    \n"; } } close OUTPUT; warn "makehtml: $text: $index words.\n"; } } ############################################################ # # # makecomplex # # # # makes the concordance file for phrases of length N # # # ############################################################ if ($makecomplex) { $N = shift @ARGV; if ($N <= 0) { die "$fscname: usage: -makecomplex N file file file\n"; } $master = shift @ARGV; foreach $text (@ARGV) { open (INPUT, $text); if ($debug) { print STDERR "*** $myname: processing $text\n"; } $index = 0; if ($poem) { # dealing with pomes. happy happy pomes. # part 1: convert (line / half-line) --> line half-line # part 2: based on whether you want to break by lines # or by stanzas, snarf lines/stanzas into @sentence # deal with sentences in the standard way. i think this'll work. # since we want to deal with half-lines as part of the same # syntactical logical construction, we perform magic with s/ # and make half-lines magically append to the previous line. # stripping newlines from multiline strings works in perl4 and 5. # $/ was undefined, so $INPUT becomes all if . ($INPUT = ) =~ s/\n\s{$minmatch,}\s*/ /g; if ($bystanza) { @INPUT = (split (/\n\s*\n/, $INPUT)); } elsif ($byline) { @INPUT = (split (/\n/, $INPUT)); } undef $INPUT; # we're members of the RAM conservation league. while (@INPUT) { @sentence = (split (' ', shift @INPUT)); $startofsentence = $index; # scarf the next sentence for (@sentence) { $index++ if (/[a-zA-Z0-9]/); } &dealwithsentence(@sentence); } } else { # dealing with prose $sentence_delimiter = "[.?!]'?\"?\$"; @periodwords = ("Mr.", "Mrs."); # more to come, i'm sure. # a sentence is merely the smallest logical semantic # structure; i can envision analysis requiring alternative # approaches. for poetry, use -poem. # if true, sentence_delimiter marks end of sentence. # otherwise known as ([.?!]) @INPUT = (split (/[\n \t]+/, )); while (@INPUT) { @sentence = (); $startofsentence = $index; if ($debug) { print STDERR "*** $myname: @INPUT is now @INPUT\n"; } # scarf the next sentence while (@INPUT) { # for each word ... $word = shift @INPUT; next if ($word !~ /[a-zA-Z0-9]/); push (@sentence, $word); $index++; # increment global index counter last if (($word =~ /$sentence_delimiter$/) && (! grep ($word eq $_, @periodwords))); } if ($debug) { print STDERR "*** got a sentence: @sentence\n"; } &dealwithsentence(@sentence); } } } # by this point, all texts have been processed; their contents # are indexed into a huge array. it's time to write out that # index into chapters of the resulting concordance; their root filename # is going to be $master. # concordances can get really huge. solution: split into 27 chapters, # [^a-z] and [a-z] # NOTE: this opens something like 28 simultaneous file # descriptors. your operating system may not be able to # handle this. linux's max is 256; i'm not sure about # other unixes. foreach $letter (@letters) { eval " open (OUTPUT$letter, \">$master.$N.$letter.html\"); select OUTPUT$letter; " unless ($letter eq $uniquetext && ($N>1)); if ($letter eq $alltext && ($N==1)) { print "

    phrase length $N, complete concordance (a-z)

    \n"; } elsif ($letter eq $alltext && ($N>1)) { print "

    phrase length $N, repeats-only concordance (a-z)

    \n"; } elsif ($letter eq $uniquetext && ($N==1)) { print "

    phrase length $N, unique concordance (a-z)

    \n"; } elsif ($letter eq $othertext) { print "

    phrase length $N, non-alphabetic characters

    \n"; } elsif ($letter eq $frequencytext) { print "

    phrase length $N, sorted by frequency

    \n"; } else { print "

    phrase length $N, starting letter $letter

    \n"; } print "for $master\n"; print "\n$_ occurrence$plural.\n\nUnique words:\n\n"; } # let 'em know we're done print STDERR "$master: length: $N. phrases: ", scalar(keys %INDEX), ". "; if (! $maxfrequency) { print STDERR "no"; print STDERR " intertextual" if ($intertext); print STDERR " unique" if ($unique); print STDERR " repeat phrases.\n"; } else { print STDERR "$maxfrequency"; print STDERR " intertextual" if ($intertext); print STDERR " unique" if ($unique); print STDERR " occurrences of $maxword.\n"; } }