-- This file initializes all tables and functions used by the dms.
-- Each registered user has a 'username' and 'password'
CREATE TABLE dms_user (username text, password text) ;
-- Each document has an oid, a timestamp indicating when it was
-- entered in the database, a MIME type and a text description.
CREATE TABLE document (id oid, mark timestamp, description text, type text) ;
CREATE INDEX idx_document_id ON document (id) ;
CREATE TABLE dictionary (id serial, term text, sum integer DEFAULT 0) ;
CREATE INDEX idx_dictionary_term ON dictionary (term) ;
CREATE INDEX idx_dictionary_id ON dictionary (id) ;
CREATE TABLE occurs (doc integer, term integer, count integer) ;
CREATE INDEX idx_occurs_term ON occurs (term) ;
-- FUNCTION update_dictionary_sum () is used to update the field in
-- the 'dictionary' record which records how many times an entry
-- appears in the set of all documents. This function should be
-- called whenever a record is added to the 'occurs' table. there is
-- one exception to this rule however; the 'document' id 0 is reserved
-- for a faux document used in similarity searching and hence doesn't
-- increase 'dictioary.sum'.
CREATE OR REPLACE FUNCTION update_dictionary_sum () RETURNS trigger AS '
BEGIN
IF ( NEW.doc != 0 ) THEN
UPDATE dictionary SET sum = sum + NEW.count WHERE id = NEW.term ;
END IF ;
RETURN NEW ;
END ;
' LANGUAGE 'plpgsql' ;
-- The 'dictionary' update operation is implemented as a trigger that
-- executes whenever a new record is inserted into the 'occurs' table.
CREATE TRIGGER trigger_update_dictionary_sum AFTER INSERT
ON occurs FOR EACH ROW
EXECUTE PROCEDURE update_dictionary_sum () ;
-- FUNCTION term_lookup ( text ) -> integer
-- Given text specifying a term returns the integer corresponding
-- to the term's identifier in the 'dictionary' table.
CREATE OR REPLACE FUNCTION term_lookup (text) RETURNS integer AS '
SELECT id FROM dictionary WHERE term = $1 ;
' LANGUAGE SQL;
-- FUNCTION insert_occurs ( integer, integer, integer ) -> integer
-- Given three integers corresponding to a document oid, a term id and
-- a count of how many occurrences of the term appear in the specified
-- document, this function inserts a record into the 'occurs' table.
CREATE OR REPLACE FUNCTION insert_occurs (integer, integer, bigint) RETURNS integer AS '
INSERT INTO occurs VALUES ( $1, $2, $3 ) ;
SELECT 1 ;
' LANGUAGE SQL;
-- FUNCTION dot_product ( integer, integer ) -> real
-- Takes two integers specifying documents and returns the dot
-- product of the corresponding term-frequency vectors. Note that
-- integer division truncates results hence the induced type cast.
CREATE OR REPLACE FUNCTION dot_product ( integer, integer ) RETURNS numeric AS '
SELECT sum ( ( u.count / ( d.sum * 1.0) ) * ( v.count / ( d.sum * 1.0) ) )
FROM occurs AS u, occurs AS v, dictionary AS d
WHERE u.doc = $1 AND v.doc = $2 AND
u.term = v.term AND d.id = u.term ;
' LANGUAGE SQL;
-- FUNCTION magnitude ( integer ) -> real
-- Takes an integer specifying a document and returns the magnitude of
-- the term-frequency vector associated with the specified document.
-- Integer division truncates results hence the induced type cast. Note
-- that if there are no terms associated with the specified document
-- the function will return NULL; any arithmetic operation involving
-- a null term will retun NULL.
CREATE OR REPLACE FUNCTION magnitude ( integer ) RETURNS numeric AS '
SELECT sqrt ( sum ( count / (sum * 1.0) * count / (sum * 1.0) ) )
FROM occurs, dictionary
WHERE occurs.term = dictionary.id AND occurs.doc = $1 ;
' LANGUAGE SQL;
-- FUNCTION cosine_angle ( integer, integer ) -> real
-- Takes two integers specifying documents and returns the cosine
-- of the angle between the corresponding term-frequency vectors:
-- cosine_angle ( u, v ) = dot ( u, v ) / ( magnitude ( u ) * magnitude ( v ) )
-- where u and v are the term-frequency vectors for two documents.
-- Uses simple normalized term frequency:
-- tf ( d, t ) = occurs ( d, t ) / sum_d' occurs ( d', t )
-- where d and d' are documents and t is a term.
-- The coalesce function returns its first non NULL argument; this ensures that
-- cosine_angle will return a numeric result even if it is given an integer that
-- doesn't correspond to the object identifier for recognized document.
CREATE OR REPLACE FUNCTION cosine_angle ( integer, integer ) RETURNS numeric AS '
SELECT coalesce ( ( dot_product ( $1, $2 ) /
( magnitude ( $1 ) * magnitude ( $2 ) ) ), 0.0) ;
' LANGUAGE SQL;
-- term frequency in the Cornell SMART system:
-- tf ( d, t ) = 0 if occurs( d, t ) = 0
-- = 1 + log ( 1 + log ( occurs ( d, t ) ) ) otherwise
-- inverse document frequency:
-- idf ( t ) = log(1 + number-of-docs) / sum_d occurs(d,t)
-- term frequency with inverse document frequency weighting:
-- tfidf ( d, t ) = tf ( d, t ) * idf ( t )
-- Ranking documents returned by a query is accomplished using a faux
-- document with id = 0 which is generated from one or more documents:
--
-- DELETE FROM occurs WHERE doc = 0 ;
--
-- SELECT insert_occurs ( 0, sub.term, sum ( sub.count ) ) FROM
-- (SELECT * FROM occurs WHERE doc = 1 OR doc = 2 ) AS sub
-- GROUP BY sub.term ;
-- FUNCTION score_document ( integer ) -> real
-- Need an explicit type cast to get this to work in the obvious way:
-- SELECT score_document ( CAST ( id AS integer) ) FROM document ;
CREATE OR REPLACE FUNCTION score_document ( integer ) RETURNS numeric AS '
SELECT cosine_angle ( $1, 0 ) ;
' LANGUAGE SQL;