# terms.pl

#!/usr/bin/perl

# terms.pl <oid> <term file> <document file>

# This script expects three arguments: an integer corresponding to 
# the object identifier for a document stored in the database, a
# text file consisting of words to be used as terms in constructing 
# a term-frequency vector, and, finally, a simple text version of the 
# stored document with all tags, formatting commands, etc. removed.

use strict;
use warnings;

# Return the words found in one line of text, lower-cased.  A word is a
# maximal run of the ASCII letters a-z; every other character (digits,
# punctuation, whitespace, non-ASCII bytes) acts as a separator.
sub words_in {
    my ($line) = @_;

    # tr/// keeps the case folding ASCII-only, independent of any locale.
    (my $lowered = $line) =~ tr/A-Z/a-z/;

    # Matching the words directly, rather than splitting on the separators,
    # never yields an empty leading field when the line starts with a
    # separator, so the empty string can never be mistaken for a term.
    return $lowered =~ /[a-z]+/g;
}

# Load the terms into a hash.
#
# Takes the path of the term file and returns a reference to a hash whose
# keys are the terms (the values are always 1).  Dies if the file cannot
# be opened.
sub load_terms {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "terms.pl: cannot open term file '$path': $!\n";

    my %is_term;
    while (my $line = <$fh>) {
        $is_term{$_} = 1 for words_in($line);
    }
    close $fh;

    return \%is_term;
}

# Count the number of occurrences of each term.
#
# Takes the path of the plain-text document and the hash reference returned
# by load_terms; returns a reference to a hash mapping each term that occurs
# at least once in the document to its number of occurrences.  Dies if the
# file cannot be opened.
sub count_terms {
    my ($path, $is_term) = @_;

    open my $fh, '<', $path
        or die "terms.pl: cannot open document file '$path': $!\n";

    my %count_of;
    while (my $line = <$fh>) {
        for my $word (words_in($line)) {
            $count_of{$word}++ if $is_term->{$word};
        }
    }
    close $fh;

    return \%count_of;
}

# Print the term frequencies in the fields required for a SQL COPY
# command with ';' as a delimiter: <oid>;<term>;<count>
#
# Takes the three command-line arguments (<oid>, <term file>,
# <document file>); any further arguments are ignored.  Dies with a usage
# message when fewer than three are supplied.
sub main {
    my ($oid, $term_file, $document_file) = @_;

    die "usage: terms.pl <oid> <term file> <document file>\n"
        if @_ < 3;

    my $is_term  = load_terms($term_file);
    my $count_of = count_terms($document_file, $is_term);

    # Hash order is unspecified; sorting makes the output reproducible.
    for my $term (sort keys %$count_of) {
        printf "%d;%s;%d\n", $oid, $term, $count_of->{$term};
    }

    return;
}

# Run only when executed as a script, so the subs above can also be loaded
# from another file without side effects.
main(@ARGV) unless caller;