-- This file initializes all tables and functions used by the dms. -- Each registered user has a 'username' and 'password' CREATE TABLE dms_user (username text, password text) ; -- Each document has an oid, a timestamp indicating when it was -- entered in the database, a MIME type and a text description. CREATE TABLE document (id oid, mark timestamp, description text, type text) ; CREATE INDEX idx_document_id ON document (id) ; CREATE TABLE dictionary (id serial, term text, sum integer DEFAULT 0) ; CREATE INDEX idx_dictionary_term ON dictionary (term) ; CREATE INDEX idx_dictionary_id ON dictionary (id) ; CREATE TABLE occurs (doc integer, term integer, count integer) ; CREATE INDEX idx_occurs_term ON occurs (term) ; -- FUNCTION update_dictionary_sum () is used to update the field in -- the 'dictionary' record which records how many times an entry -- appears in the set of all documents. This function should be -- called whenever a record is added to the 'occurs' table. there is -- one exception to this rule however; the 'document' id 0 is reserved -- for a faux document used in similarity searching and hence doesn't -- increase 'dictioary.sum'. CREATE OR REPLACE FUNCTION update_dictionary_sum () RETURNS trigger AS ' BEGIN IF ( NEW.doc != 0 ) THEN UPDATE dictionary SET sum = sum + NEW.count WHERE id = NEW.term ; END IF ; RETURN NEW ; END ; ' LANGUAGE 'plpgsql' ; -- The 'dictionary' update operation is implemented as a trigger that -- executes whenever a new record is inserted into the 'occurs' table. CREATE TRIGGER trigger_update_dictionary_sum AFTER INSERT ON occurs FOR EACH ROW EXECUTE PROCEDURE update_dictionary_sum () ; -- FUNCTION term_lookup ( text ) -> integer -- Given text specifying a term returns the integer corresponding -- to the term's identifier in the 'dictionary' table. CREATE OR REPLACE FUNCTION term_lookup (text) RETURNS integer AS ' SELECT id FROM dictionary WHERE term = $1 ; ' LANGUAGE SQL; -- FUNCTION insert_occurs ( integer, integer, integer ) -> integer -- Given three integers corresponding to a document oid, a term id and -- a count of how many occurrences of the term appear in the specified -- document, this function inserts a record into the 'occurs' table. CREATE OR REPLACE FUNCTION insert_occurs (integer, integer, bigint) RETURNS integer AS ' INSERT INTO occurs VALUES ( $1, $2, $3 ) ; SELECT 1 ; ' LANGUAGE SQL; -- FUNCTION dot_product ( integer, integer ) -> real -- Takes two integers specifying documents and returns the dot -- product of the corresponding term-frequency vectors. Note that -- integer division truncates results hence the induced type cast. CREATE OR REPLACE FUNCTION dot_product ( integer, integer ) RETURNS numeric AS ' SELECT sum ( ( u.count / ( d.sum * 1.0) ) * ( v.count / ( d.sum * 1.0) ) ) FROM occurs AS u, occurs AS v, dictionary AS d WHERE u.doc = $1 AND v.doc = $2 AND u.term = v.term AND d.id = u.term ; ' LANGUAGE SQL; -- FUNCTION magnitude ( integer ) -> real -- Takes an integer specifying a document and returns the magnitude of -- the term-frequency vector associated with the specified document. -- Integer division truncates results hence the induced type cast. Note -- that if there are no terms associated with the specified document -- the function will return NULL; any arithmetic operation involving -- a null term will retun NULL. CREATE OR REPLACE FUNCTION magnitude ( integer ) RETURNS numeric AS ' SELECT sqrt ( sum ( count / (sum * 1.0) * count / (sum * 1.0) ) ) FROM occurs, dictionary WHERE occurs.term = dictionary.id AND occurs.doc = $1 ; ' LANGUAGE SQL; -- FUNCTION cosine_angle ( integer, integer ) -> real -- Takes two integers specifying documents and returns the cosine -- of the angle between the corresponding term-frequency vectors: -- cosine_angle ( u, v ) = dot ( u, v ) / ( magnitude ( u ) * magnitude ( v ) ) -- where u and v are the term-frequency vectors for two documents. -- Uses simple normalized term frequency: -- tf ( d, t ) = occurs ( d, t ) / sum_d' occurs ( d', t ) -- where d and d' are documents and t is a term. -- The coalesce function returns its first non NULL argument; this ensures that -- cosine_angle will return a numeric result even if it is given an integer that -- doesn't correspond to the object identifier for recognized document. CREATE OR REPLACE FUNCTION cosine_angle ( integer, integer ) RETURNS numeric AS ' SELECT coalesce ( ( dot_product ( $1, $2 ) / ( magnitude ( $1 ) * magnitude ( $2 ) ) ), 0.0) ; ' LANGUAGE SQL; -- term frequency in the Cornell SMART system: -- tf ( d, t ) = 0 if occurs( d, t ) = 0 -- = 1 + log ( 1 + log ( occurs ( d, t ) ) ) otherwise -- inverse document frequency: -- idf ( t ) = log(1 + number-of-docs) / sum_d occurs(d,t) -- term frequency with inverse document frequency weighting: -- tfidf ( d, t ) = tf ( d, t ) * idf ( t ) -- Ranking documents returned by a query is accomplished using a faux -- document with id = 0 which is generated from one or more documents: -- -- DELETE FROM occurs WHERE doc = 0 ; -- -- SELECT insert_occurs ( 0, sub.term, sum ( sub.count ) ) FROM -- (SELECT * FROM occurs WHERE doc = 1 OR doc = 2 ) AS sub -- GROUP BY sub.term ; -- FUNCTION score_document ( integer ) -> real -- Need an explicit type cast to get this to work in the obvious way: -- SELECT score_document ( CAST ( id AS integer) ) FROM document ; CREATE OR REPLACE FUNCTION score_document ( integer ) RETURNS numeric AS ' SELECT cosine_angle ( $1, 0 ) ; ' LANGUAGE SQL;