#!/bin/csh -f

# words.csh <text file> <oid> <user>

# This script expects three arguments: the first argument is a 
# text file, the second argument is an integer corresponding 
# to the file's oid (object identifier) in the database, and 
# the third argument is a string corresponding to a user id.
# Since the script changes the working directory, be sure to give a
# full path for the text file or put it in the user's directory.
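# Example invocation (the file name, oid, and user id below are
# hypothetical):
#
#   ./words.csh /tmp/report.txt 42 alice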

# Variables used in this script:

set file="$1"
set oid="$2"
set usr="$3"
set dms=~tld/Sites/dms
set sql="$dms/sql"
set dict="$dms/dict"
set dmsbin="$dms/bin"
set psqlbin=/usr/local/bin

# Change to the user's directory.

chdir "$dms/users/$usr/"
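# A defensive guard (assumption: a failed chdir should stop the run,
# so temporary files are not scattered in the wrong directory):
if ( $status != 0 ) exit 1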

# Data files used in this script:

# the list of words in Webster's 2nd edition
# /usr/share/dict/words
# lower-case version of /usr/share/dict/words
# $dict/lower-case-words.txt  
# list of stop words, e.g., "and", "the", etc.
# $dict/stop-words.txt        
# words found in the user's documents so far
# $dms/users/$usr/entered-words.txt     

# Temporary files used in this script:

# script for updating 'occurs' table 
# all-words.sql
# script for updating 'dictionary' table
# new-words.sql
# other temporary files 
# tmp-words.txt all-words.txt new-words.txt 

# 0. Delete files used in processing the last document.

if ( -r all-words.sql ) /bin/rm -f all-words.sql
if ( -r new-words.sql ) /bin/rm -f new-words.sql
if ( -r all-words.txt ) /bin/rm -f all-words.txt
if ( -r new-words.txt ) /bin/rm -f new-words.txt
if ( -r tmp-words.txt ) /bin/rm -f tmp-words.txt 

# 1. Find all words that are in the dictionary.

# convert all words to lower case
# place each word on a separate line (non-letter characters,
# including any html markup, become line breaks)
# sort and eliminate duplicate words
# eliminate all stop words 
# include only words from Webster's 2nd edition

/bin/cat "$file" | \
/usr/bin/tr 'A-Z' 'a-z' | /usr/bin/tr -cs 'a-z' '\n' | \
/usr/bin/sort | /usr/bin/uniq | \
/usr/bin/comm -23 - $dict/stop-words.txt | \
/usr/bin/comm -12 - $dict/lower-case-words.txt > tmp-words.txt
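# A note on comm, since it is used twice above and again in step 3:
# both inputs must be sorted; 'comm -23 a b' prints the lines found
# only in a, and 'comm -12 a b' prints the lines common to both.
# For example (hypothetical input), if stdin holds "cat" and "the"
# and stop-words.txt holds "the", the first comm emits only "cat".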

# 2. Add proper names and capitalized words.

# place each word on a separate line
# include only capitalized words 
# convert to lower case and sort
# eliminate words in Webster's 2nd edition

/bin/cat "$file" | \
/usr/bin/tr -cs 'A-Za-z' '\n' | \
/usr/bin/egrep '^[A-Z][[:alpha:]]+$' | \
/usr/bin/tr 'A-Z' 'a-z' | /usr/bin/sort | /usr/bin/uniq | \
/usr/bin/comm -23 - $dict/lower-case-words.txt >> tmp-words.txt
/usr/bin/sort tmp-words.txt > all-words.txt
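# (the final sort matters: comm in step 3 requires sorted input)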

# 3. Update the 'dictionary' table.

# initialize the SQL script 
echo "COPY dictionary (term) FROM stdin ;" > new-words.sql
# make sure the list of previously entered words exists (first run)
if ( ! -e entered-words.txt ) /usr/bin/touch entered-words.txt
# eliminate any words previously encountered
/bin/cat all-words.txt | \
/usr/bin/comm -23 - entered-words.txt > new-words.txt
# add the new words to the SQL script
/bin/cat new-words.txt >> new-words.sql
# terminate the SQL script
echo "\." >> new-words.sql
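# At this point new-words.sql looks roughly like this (the terms
# shown are illustrative):
#
#   COPY dictionary (term) FROM stdin ;
#   postgres
#   webster
#   \.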
# execute the SQL script
$psqlbin/psql "$usr" --user=postgres < new-words.sql

# 4. Update the 'occurs' table.

# initialize the SQL script 
echo "CREATE TEMPORARY TABLE tmp (id integer, term text, count integer) ;" > all-words.sql
echo "COPY tmp FROM stdin USING DELIMITERS ';' ;" >> all-words.sql
# compute term-frequencies and add them to the SQL script
$dmsbin/terms.pl $oid all-words.txt "$file" >> all-words.sql
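# terms.pl is assumed to emit one ';'-delimited row per term, in the
# column order of the COPY above, e.g. (illustrative): 42;database;3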
# terminate the SQL script
echo "\." >> all-words.sql
echo "SELECT insert_occurs (id, term_lookup ( term ), count ) FROM tmp ;" >> all-words.sql
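# (term_lookup presumably maps each term to its dictionary id, and
# insert_occurs records one (document, term, count) row in 'occurs')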
# execute the SQL script
$psqlbin/psql "$usr" --user=postgres < all-words.sql

# 5. Update the sorted list of previously seen words.

# add the new words to the list of seen words
/bin/cat new-words.txt >> entered-words.txt
# reuse the temporary file name by first deleting the old copy
/bin/rm tmp-words.txt
# sort the list of previously entered words
/usr/bin/sort entered-words.txt > tmp-words.txt
# write over the earlier list of entered words
/bin/mv -f tmp-words.txt entered-words.txt
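# (a plain sort suffices here: all-words.txt was de-duplicated in
# steps 1 and 2, so new-words.txt cannot introduce duplicates)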

# 6. Delete all of the temporary files.

/bin/rm all-words.sql new-words.sql all-words.txt new-words.txt