%% DCNT sample toy script to generate valid dataobj
% Builds a tiny three-document corpus, converts each document into vocab
% indices, attaches per-document feature metadata, and saves the result to
% toyData.mat so that it can be loaded to run DCNT.
%
% Fields written (all of them are necessary for DCNT):
%   dataobj.data.doc   - 1-by-D cell array; doc{d} is a row vector of vocab
%                        indices, one entry per word of document d
%   dataobj.true.phiF  - F-by-D feature matrix; row 1 is the bias feature
%   dataobj.true.W     - number of unique words in the corpus

% Start from a clean workspace, no open figures, and an empty command window.
clear all; close all; clc;

%% Documents and vocabulary
% Imagine we have a set of 3 documents with a very small number of words.
doc1 = {'jump','cat','house','squid','cat','house'};
doc2 = {'cat','saddle','boat'};
doc3 = {'jump','squid','house','squid'};
docs = {doc1, doc2, doc3};

% We can index these words in a vocab cell array: the position of a word in
% vocab is its word index.
vocab = {'jump','cat','house','squid','saddle','boat'};

%% Word indices
% Assign word indices and save them in dataobj.data.doc. The indices are
% looked up in vocab instead of being typed by hand, so they cannot drift
% out of sync with the documents above. For example, in document 3 'squid'
% maps to 4 because it is the 4th entry of vocab.
D = numel(docs); % Number of documents
dataobj.data.doc = cell(1, D);
for d = 1:D
    [tf, loc] = ismember(docs{d}, vocab);
    if ~all(tf)
        % A zero index would be silently stored otherwise.
        error('toyData:unknownWord', ...
              'Document %d contains words that are not in vocab: %s', ...
              d, strjoin(docs{d}(~tf), ', '));
    end
    dataobj.data.doc{d} = loc;
end

%% Metadata
% Assuming we have no features, we still always need a bias matrix.
F = 1; % F will always be greater than or equal to 1
dataobj.true.phiF = ones(F, D);
dataobj.true.W = numel(vocab); % The number of unique words in our corpus

% If we have metadata, assume we have two features.
% Feature one is a binary flag that tells us whether a document is
% fiction/non-fiction.
% Feature two is a normalized real value that tells us how well known the
% document is, with 1 being the most well known and 0 the least well known.
% NOTE: this replaces the bias-only phiF assigned above; drop this part to
% keep the no-feature setup.
F = 3; % bias feature + two features per document
phiF = zeros(F, D);
phiF(1,:) = 1; % bias feature
phiF(2,:) = [0 0 1];
phiF(3,:) = [.1 .1 1];
dataobj.true.phiF = phiF;

%% Save
% You can then load this file to run DCNT.
save('toyData.mat', 'dataobj');