# This program analyses the frequency of stopwords
# in texts

# IMPORT STATEMENTS
import re

# CONSTANTS: these are variables that NEVER CHANGE
# They may be accessed by ANY function
# Text files to analyse, in the order used for the rows and columns of the
# distance matrix.
FILE_LIST = [
    'file1.txt',
    'file2.txt',
    'file3.txt',
    'file4.txt',
    'file5.txt',
    'file6.txt',
]

def testFiles(outfile):
    '''Computes the stop-word frequencies of every file in FILE_LIST,
    compares all pairs of files and writes the resulting distance matrix
    to outfile as comma-separated text.  Progress information is printed
    to standard output.
    INPUT: outfile (String) - path of the file to write the matrix to
    OUTPUT: none'''

    # NOTE: every print() below receives ONE pre-formatted string, so the
    # output is the same whether print is a statement (Python 2) or a
    # function (Python 3).

    # wordList is a sorted list of stop words
    wordList = getStopWords()
    print('There are %d stop words.\n' % len(wordList))

    # wordFrequencies is a list of lists: one row per file in FILE_LIST.
    # Each row is ORDERED by wordList, which is IMPORTANT because the
    # dictionary returned by fillDictionary has no guaranteed order.
    wordFrequencies = []
    for filename in FILE_LIST:
        myDictionary = fillDictionary(filename, wordList)
        wordFrequencies.append([myDictionary[word] for word in wordList])

    # print part of the wordFrequencies list
    print('printing wordFrequencies for the following stop words: %s'
          % (wordList[0:3],))
    for i, (filename, row) in enumerate(zip(FILE_LIST, wordFrequencies)):
        print('index %d ( %s ) : %s' % (i, filename, row[0:3]))
    print('\n')

    # now compare all pairs of rows: distMatrix[i][j] is the value
    # compareTwo returns for files i and j
    fileCount = len(FILE_LIST)
    distMatrix = []
    for i in range(fileCount):
        distMatrix.append([compareTwo(i, j, wordFrequencies)
                           for j in range(fileCount)])

    # write distMatrix to outfile.  'with' closes the file even if one of
    # the writes raises.
    with open(outfile, 'w') as myFile:
        # header row: every title is written with a comma prefix, which
        # leaves the top-left cell empty
        header = ''.join(',"' + filename + '"' for filename in FILE_LIST)
        myFile.write(header + '\n')

        # one row per file: quoted file name, then the comma-prefixed values
        for filename, row in zip(FILE_LIST, distMatrix):
            values = ''.join(',' + str(val) for val in row)
            myFile.write('"' + filename + '"' + values + '\n')

    return
        
    
def getStopWords():
    '''Reads the comma-separated file 'stopwords.txt' and returns its
    entries as a sorted list.  Whitespace around each entry (for example
    the newline at the end of the file or a space after a comma) is
    removed so the entries can match the tokens found in the texts.
    Entries that are empty after stripping are kept, so the length of the
    list is the number of comma-separated fields in the file.
    INPUT: none
    OUTPUT: list of words'''

    # read file into a string; 'with' closes the file even if read() fails
    filename = 'stopwords.txt'
    with open(filename, 'r') as myFile:
        myString = myFile.read()

    # split into a list of words/phrases and trim surrounding whitespace
    myList = [entry.strip() for entry in myString.split(',')]
    myList.sort()

    return myList

def fillDictionary(filename,wordList):
    '''Reads a text file and returns the relative frequency of each word
    in wordList: the number of times the word occurs in the lowercased
    text divided by the total number of words found in the file.  If the
    file contains no words at all, every frequency is 0.0.
    INPUT: filename (String) and a list of words
    OUTPUT: Dictionary mapping each word in wordList to its frequency'''

    # fill a variable called filledDictionary with a count of 0 per word
    filledDictionary = {}
    for word in wordList:
        filledDictionary[word] = 0

    # read file into a string; 'with' closes the file even if read() fails.
    # The only text cleaning is changing to lowercase.
    with open(filename, 'r') as myFile:
        myString = myFile.read().lower()

    # allWords is a list of all words in the text file.
    # (raw string so that \w is always handed to the regex engine as-is)
    allWords = re.findall(r'\w+', myString)

    for word in allWords:
        if word in filledDictionary: # if this is a stop word
            filledDictionary[word] += 1

    # normalize frequencies by the total number of words found.
    # A file without any words would otherwise cause a division by zero.
    totalWords = float(len(allWords))
    for key in filledDictionary:
        if totalWords:
            filledDictionary[key] = filledDictionary[key] / totalWords
        else:
            filledDictionary[key] = 0.0

    return filledDictionary
    
def compareTwo(i,j,frequencies):
    '''Compares the ith and jth rows of the frequencies array by taking
    the mean absolute difference of their entries.  The length of row i
    decides how many entries are compared.  Two empty rows are considered
    identical and give 0.0.
    INPUT: i & j (integers), list of lists
    OUTPUT: val (double)'''

    list1 = frequencies[i]
    list2 = frequencies[j]

    # nothing to compare: avoid dividing by zero below
    if len(list1) == 0:
        return 0.0

    # calculate the sum of the differences.  The 0.0 start value keeps the
    # result a float even if the rows hold integers.
    val = sum((abs(entry - list2[ind]) for ind, entry in enumerate(list1)),
              0.0)

    # normalize by the number of words
    return val / len(list1)
