#!/bin/csh

# words.csh <text file> <oid> <user>

# This script expects three arguments: the first argument is a
# text file, the second argument is an integer corresponding
# to the file's oid (object identifier) in the database, and
# the third argument is a string corresponding to a user id.

# Since the shell changes directory, be sure to include a full
# path for the text file or put it in the user directory.

# Variables used in this script:

set file="$1"
set oid="$2"
set usr="$3"
set dms=~tld/Sites/dms
set sql="$dms/sql"
set dict="$dms/dict"
set dmsbin="$dms/bin"
set psqlbin=/usr/local/bin

# Change to the user's directory.

chdir "$dms/users/$usr/"

# Data files used in this script:

# the list of words in Webster's 2nd edition
#   /usr/share/dict/words
# lower-case version of /usr/share/dict/words
#   $dict/lower-case-words.txt
# list of stop words, e.g., "and", "the", etc.
#   $dict/stop-words.txt
# words found in the user's documents so far
#   $dms/users/$usr/entered-words.txt

# Temporary files used in this script:

# script for updating the 'occurs' table
#   all-words.sql
# script for updating the 'dictionary' table
#   new-words.sql
# other temporary files
#   tmp-words.txt all-words.txt new-words.txt

# 0. Delete files used in processing the last document.

if ( -r all-words.sql ) /bin/rm -f all-words.sql
if ( -r new-words.sql ) /bin/rm -f new-words.sql
if ( -r all-words.txt ) /bin/rm -f all-words.txt
if ( -r new-words.txt ) /bin/rm -f new-words.txt
if ( -r tmp-words.txt ) /bin/rm -f tmp-words.txt

# 1. Find all words that are in the dictionary.

# strip off html leaving just the text
# convert all words to lower case
# place each word on a separate line
# sort and eliminate duplicate words
# eliminate all stop words
# include only words from Webster's 2nd edition

/bin/cat "$file" | \
    /usr/bin/tr 'A-Z' 'a-z' | /usr/bin/tr -cs 'a-z' '\n' | \
    /usr/bin/sort | /usr/bin/uniq | \
    /usr/bin/comm -23 - $dict/stop-words.txt | \
    /usr/bin/comm -12 - $dict/lower-case-words.txt > tmp-words.txt

# 2. Add proper names and capitalized words.

# place each word on a separate line
# include only capitalized words
# convert to lower case and sort
# eliminate words in Webster's 2nd edition

/bin/cat "$file" | \
    /usr/bin/tr -cs 'A-Za-z' '\n' | \
    /usr/bin/egrep '[A-Z][[:alpha:]]+' | \
    /usr/bin/tr 'A-Z' 'a-z' | /usr/bin/sort | /usr/bin/uniq | \
    /usr/bin/comm -23 - $dict/lower-case-words.txt >> tmp-words.txt

/usr/bin/sort tmp-words.txt > all-words.txt

# 3. Update the 'dictionary' table.

# make sure the list of previously seen words exists; without
# this guard, comm fails on the first document a user enters
if ( ! -r entered-words.txt ) /usr/bin/touch entered-words.txt

# initialize the SQL script
echo "COPY dictionary (term) FROM stdin ;" > new-words.sql

# eliminate any words previously encountered
/bin/cat all-words.txt | \
    /usr/bin/comm -23 - entered-words.txt > new-words.txt

# add the new words to the SQL script
/bin/cat new-words.txt >> new-words.sql

# terminate the SQL script
echo "\." >> new-words.sql

# execute the SQL script
$psqlbin/psql "$usr" --user=postgres < new-words.sql

# 4. Update the 'occurs' table.

# initialize the SQL script
echo "CREATE TEMPORARY TABLE tmp (id integer, term text, count integer) ;" > all-words.sql
echo "COPY tmp FROM stdin USING DELIMITERS ';' ;" >> all-words.sql

# compute term frequencies and add them to the SQL script
$dmsbin/terms.pl $oid all-words.txt "$file" >> all-words.sql

# terminate the SQL script
echo "\." >> all-words.sql
echo "SELECT insert_occurs (id, term_lookup ( term ), count ) FROM tmp ;" >> all-words.sql

# execute the SQL script
$psqlbin/psql "$usr" --user=postgres < all-words.sql
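# For reference, a sketch of what all-words.sql contains at this
# point (the oid 42 and the two terms are made-up examples; terms.pl
# is assumed to emit one "oid;term;count" line per word, matching
# the three columns and the ';' delimiter declared for tmp above):
#
#   CREATE TEMPORARY TABLE tmp (id integer, term text, count integer) ;
#   COPY tmp FROM stdin USING DELIMITERS ';' ;
#   42;aardvark;3
#   42;zebra;1
#   \.
#   SELECT insert_occurs (id, term_lookup ( term ), count ) FROM tmp ;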
# 5. Update the sorted list of previously seen words.

# add the new words to the list of seen words
/bin/cat new-words.txt >> entered-words.txt

# reuse the temporary file by first deleting it
/bin/rm tmp-words.txt

# sort the list of previously entered words
/usr/bin/sort entered-words.txt > tmp-words.txt

# write over the earlier list of entered words
/bin/mv -f tmp-words.txt entered-words.txt

# 6. Delete all of the temporary files.

/bin/rm all-words.sql new-words.sql all-words.txt new-words.txt
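# Example invocation (the document name, oid, and user id below are
# hypothetical; note the full path for the text file, since this
# script changes to the user's directory before reading it):
#
#   ~tld/Sites/dms/bin/words.csh ~tld/Sites/dms/users/alice/chap1.html 42 alice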