#!/usr/bin/perl
# terms.pl <oid> <term file> <document file>
# This script expects three arguments: an integer corresponding to
# the object identifier for a document stored in the database, a
# text file consisting of words to be used as terms in constructing
# a term-frequency vector, and, finally, a simple text version of the
# stored document with all tags, formatting commands, etc. removed.
use strict;
use warnings;

# Split one line of text into lower-cased words.
# A word is a maximal run of ASCII letters; every other character
# (digits, punctuation, whitespace) acts as a separator.  Matching the
# letter runs directly, instead of splitting on the separators, means a
# line that starts with a separator never produces an empty "word".
#   $line   - the text to tokenize
#   returns - the list of words, in order of appearance
sub words_in {
    my ($line) = @_;
    $line =~ tr/A-Z/a-z/;
    return $line =~ /[a-z]+/g;
}

# Read the term list from an open filehandle.
#   $fh     - filehandle positioned at the start of the term text
#   returns - reference to a hash whose keys are the accepted terms
sub read_terms {
    my ($fh) = @_;
    my %ok;
    while (my $line = <$fh>) {
        $ok{$_} = 1 for words_in($line);
    }
    return \%ok;
}

# Count the occurrences of each accepted term in a document.
#   $fh     - filehandle positioned at the start of the document text
#   $ok     - hash reference of accepted terms, as built by read_terms
#   returns - reference to a hash mapping term => number of occurrences;
#             terms that never occur in the document are absent
sub count_terms {
    my ($fh, $ok) = @_;
    my %count;
    while (my $line = <$fh>) {
        for my $word (words_in($line)) {
            $count{$word}++ if $ok->{$word};
        }
    }
    return \%count;
}

# Entry point: terms.pl <oid> <term file> <document file>
# Prints one line per term found in the document, in the form required
# by a SQL COPY command with ';' as the delimiter: <oid>;<term>;<count>
# Dies with a message on bad arguments or an unreadable file.
sub main {
    my @args = @_;
    die "usage: terms.pl <oid> <term file> <document file>\n"
        if @args < 3;
    my ($oid, $term_file, $doc_file) = @args;
    die "terms.pl: oid must be an integer, got '$oid'\n"
        unless $oid =~ /\A[+-]?\d+\z/;

    # Load the terms into a hash.
    open(my $terms_fh, '<', $term_file)
        or die "terms.pl: cannot open term file '$term_file': $!\n";
    my $ok = read_terms($terms_fh);
    close($terms_fh);

    # Count the number of occurrences of each term.
    open(my $text_fh, '<', $doc_file)
        or die "terms.pl: cannot open document file '$doc_file': $!\n";
    my $count = count_terms($text_fh, $ok);
    close($text_fh);

    # Sorted so that the output is reproducible from run to run.
    for my $word (sort keys %$count) {
        printf "%d;%s;%d\n", $oid, $word, $count->{$word};
    }
    return 0;
}

# Run only when executed as a script, not when loaded from other code.
main(@ARGV) unless caller;