-- This file initializes all tables and functions used by the dms.

-- Each registered user has a 'username' and 'password'

CREATE TABLE dms_user (username text, password text) ;

-- Each document has an oid, a timestamp indicating when it was 
-- entered in the database, a MIME type and a text description.

CREATE TABLE document (id oid, mark timestamp, description text, type text) ;

CREATE INDEX idx_document_id ON document (id) ;

CREATE TABLE dictionary (id serial, term text, sum integer DEFAULT 0) ;

CREATE INDEX idx_dictionary_term ON dictionary (term) ; 

CREATE INDEX idx_dictionary_id ON dictionary (id) ; 

CREATE TABLE occurs (doc integer, term integer, count integer) ;

CREATE INDEX idx_occurs_term ON occurs (term) ;


-- FUNCTION update_dictionary_sum () is used to update the field in
-- the 'dictionary' record which records how many times an entry
-- appears in the set of all documents.  This function should be
-- called whenever a record is added to the 'occurs' table.  there is
-- one exception to this rule however; the 'document' id 0 is reserved
-- for a faux document used in similarity searching and hence doesn't
-- increase 'dictioary.sum'.

CREATE OR REPLACE FUNCTION update_dictionary_sum () RETURNS trigger AS '
    BEGIN
        IF ( NEW.doc != 0 ) THEN
           UPDATE dictionary SET sum = sum + NEW.count WHERE id = NEW.term ;
        END IF ;
        RETURN NEW ;
    END ;
' LANGUAGE 'plpgsql' ;


-- The 'dictionary' update operation is implemented as a trigger that
-- executes whenever a new record is inserted into the 'occurs' table.

CREATE TRIGGER trigger_update_dictionary_sum AFTER INSERT
        ON occurs FOR EACH ROW
        EXECUTE PROCEDURE update_dictionary_sum () ;


-- FUNCTION term_lookup ( text ) -> integer 
-- Given text specifying a term returns the integer corresponding
-- to the term's identifier in the 'dictionary' table. 

CREATE OR REPLACE FUNCTION term_lookup (text) RETURNS integer AS '
    SELECT id FROM dictionary WHERE term = $1 ;
' LANGUAGE SQL;


-- FUNCTION insert_occurs ( integer, integer, integer ) -> integer 
-- Given three integers corresponding to a document oid, a term id and 
-- a count of how many occurrences of the term appear in the specified
-- document, this function inserts a record into the 'occurs' table. 

CREATE OR REPLACE FUNCTION insert_occurs (integer, integer, bigint) RETURNS integer AS '
    INSERT INTO occurs VALUES ( $1, $2, $3 ) ;
    SELECT 1 ;
' LANGUAGE SQL;


-- FUNCTION dot_product ( integer, integer ) -> real
-- Takes two integers specifying documents and returns the dot
-- product of the corresponding term-frequency vectors. Note that
-- integer division truncates results hence the induced type cast.

CREATE OR REPLACE FUNCTION dot_product ( integer, integer ) RETURNS numeric AS '
    SELECT sum ( ( u.count / ( d.sum * 1.0) ) * ( v.count / ( d.sum * 1.0) ) )
           FROM occurs AS u, occurs AS v, dictionary AS d
           WHERE u.doc = $1 AND v.doc = $2 AND 
                 u.term = v.term AND d.id = u.term ;
' LANGUAGE SQL;


-- FUNCTION magnitude ( integer ) -> real
-- Takes an integer specifying a document and returns the magnitude of
-- the term-frequency vector associated with the specified document.
-- Integer division truncates results hence the induced type cast. Note 
-- that if there are no terms associated with the specified document
-- the function will return NULL; any arithmetic operation involving 
-- a null term will retun NULL. 

CREATE OR REPLACE FUNCTION magnitude ( integer ) RETURNS numeric AS '
    SELECT sqrt ( sum ( count / (sum * 1.0) * count / (sum * 1.0) ) )
           FROM occurs, dictionary 
           WHERE occurs.term = dictionary.id AND occurs.doc = $1 ;
' LANGUAGE SQL;


-- FUNCTION cosine_angle ( integer, integer ) -> real
-- Takes two integers specifying documents and returns the cosine 
-- of the angle between the corresponding term-frequency vectors:
--   cosine_angle ( u, v ) = dot ( u, v )  / ( magnitude ( u ) * magnitude ( v ) )
-- where u and v are the term-frequency vectors for two documents.
-- Uses simple normalized term frequency: 
--   tf ( d, t ) = occurs ( d, t ) / sum_d' occurs ( d', t )
-- where d and d' are documents and t is a term.

-- The coalesce function returns its first non NULL argument; this ensures that 
-- cosine_angle will return a numeric result even if it is given an integer that 
-- doesn't correspond to the object identifier for recognized document. 

CREATE OR REPLACE FUNCTION cosine_angle ( integer, integer ) RETURNS numeric AS '
    SELECT coalesce ( ( dot_product ( $1, $2 ) / 
                        ( magnitude ( $1 ) * magnitude ( $2 ) ) ), 0.0) ;
' LANGUAGE SQL;

-- term frequency in the Cornell SMART system:
-- tf ( d, t ) = 0  if occurs( d, t ) = 0
--             = 1 + log ( 1 + log ( occurs ( d, t ) ) ) otherwise

-- inverse document frequency:
-- idf ( t ) = log(1 + number-of-docs) / sum_d occurs(d,t)

-- term frequency with inverse document frequency weighting:
-- tfidf ( d, t ) = tf ( d, t ) * idf ( t )


-- Ranking documents returned by a query is accomplished using a faux 
-- document with id = 0 which is generated from one or more documents:
-- 
-- DELETE FROM occurs WHERE doc = 0 ;
--
-- SELECT insert_occurs ( 0, sub.term, sum ( sub.count ) ) FROM 
--        (SELECT * FROM occurs WHERE doc = 1 OR doc = 2 ) AS sub 
--        GROUP BY sub.term ;

-- FUNCTION score_document ( integer ) -> real
-- Need an explicit type cast to get this to work in the obvious way:
-- SELECT score_document ( CAST ( id AS integer) ) FROM document ;

CREATE OR REPLACE FUNCTION score_document ( integer ) RETURNS numeric AS '
    SELECT cosine_angle ( $1, 0 ) ;
' LANGUAGE SQL;