#!/usr/bin/perl
# terms.pl <oid> <term file> <document file>
#
# This script expects three arguments: an integer corresponding to
# the object identifier for a document stored in the database, a
# text file consisting of words to be used as terms in constructing
# a term-frequency vector, and, finally, a simple text version of the
# stored document with all tags, formatting commands, etc. removed.
#
# Output goes to STDOUT, one line per term that occurs at least once in
# the document, in the form <oid>;<term>;<count>.

use strict;
use warnings;

# Return the words found in one line of input as a list.
#
# Letters A-Z are folded to lower case and every maximal run of a-z is
# one word; all other characters act as separators.  Matching the runs
# directly (instead of splitting on the separators) guarantees that no
# empty string is ever returned, even when the line begins with a
# separator.
sub words_in {
    my ($line) = @_;

    (my $lowered = $line) =~ tr/A-Z/a-z/;
    my @words = $lowered =~ /[a-z]+/g;

    return @words;
}

@ARGV >= 3
    or die "usage: $0 <oid> <term file> <document file>\n";

my ($oid, $term_file, $text_file) = @ARGV;

# Load the terms into a lookup hash.
my %is_term;

open(my $terms_fh, '<', $term_file)
    or die "$0: cannot open term file '$term_file': $!\n";
while (my $line = <$terms_fh>) {
    $is_term{$_} = 1 for words_in($line);
}
close($terms_fh);

# Count the number of occurrences of each term in the document.
my %count_of;

open(my $text_fh, '<', $text_file)
    or die "$0: cannot open document file '$text_file': $!\n";
while (my $line = <$text_fh>) {
    for my $word (words_in($line)) {
        $count_of{$word}++ if $is_term{$word};
    }
}
close($text_fh);

# Print the fields required for a SQL COPY command that uses ';' as
# the delimiter: <oid>;<term>;<count>
# Terms are emitted in sorted order so that repeated runs produce
# identical output.
for my $word (sort keys %count_of) {
    printf "%d;%s;%d\n", $oid, $word, $count_of{$word};
}