# This program analyses the frequency of stopwords
# in texts

# IMPORT STATEMENTS
import re

# CONSTANTS: these are variables that NEVER CHANGE
# They may be accessed by ANY function
# Text files to analyse; testFiles() uses this order for both the rows and
# the columns of the distance matrix.
FILE_LIST = [
    'file1.txt',
    'file2.txt',
    'file3.txt',
    'file4.txt',
    'file5.txt',
    'file6.txt',
]

# FUNCTIONS: each function below is documented by its own docstring.
def testFiles(outfileName):
    '''Counts the stop-word frequencies in each text file in the 'FILE_LIST'
    variable, prints a summary, and writes a distance matrix (comparing each
    pair of files) as CSV to the file indicated by 'outfileName'.

    The CSV has a header row with an empty first cell followed by the file
    names, then one row per file: the file name followed by its distance to
    every file in 'FILE_LIST' (in the same order as the header).

    INPUT: outfileName (str): name of the CSV file to (over)write.
    OUTPUT: none.'''

    wordList = getStopWords()
    print('There are ', len(wordList), ' stop words.\n')

    # Frequency table: one row per file, one column per stop word, with the
    # columns in the same order as wordList.
    wordFrequencies = []
    for filename in FILE_LIST:
        myDictionary = fillDictionary(filename, wordList)
        wordFrequencies.append([myDictionary[word] for word in wordList])

    print('printing wordFrequencies for the following stop words:',wordList[0:3])
    for i, freqRow in enumerate(wordFrequencies):
        print('index',i,'(',FILE_LIST[i],') :',freqRow[0:3])
    print('\n')

    # distMatrix[i][j] is the difference value between file i and file j.
    fileCount = len(FILE_LIST)
    distMatrix = [[compareTwo(wordFrequencies, i, j) for j in range(fileCount)]
                  for i in range(fileCount)]

    print('Distance Matrix:')
    print('\t ',FILE_LIST)
    for filename, distances in zip(FILE_LIST, distMatrix):
        print(filename, distances)

    # Write the matrix as CSV so it can be inspected in a spreadsheet.
    # 'with' guarantees the file is closed even if a write fails part-way.
    with open(outfileName, 'w') as outFile:
        # Header row: the leading comma leaves the top-left cell empty.
        header = ''.join(',' + filename for filename in FILE_LIST)
        outFile.write(header + '\n')

        # One row per file: the row label, then each distance value.
        for filename, distances in zip(FILE_LIST, distMatrix):
            cells = ''.join(',' + str(val) for val in distances)
            outFile.write(filename + cells + '\n')
        
    
def getStopWords():
    '''Reads 'stopwords.txt' and returns a sorted list of stop words.

    The file is read as comma-separated text.  Whitespace around each entry
    (for example a trailing newline at the end of the file, or a space after
    a comma) is stripped, and entries that are empty after stripping are
    dropped.

    INPUT: none.
    OUTPUT: list of strings, sorted in ascending order.'''

    filename = 'stopwords.txt'

    # 'with' closes the file even if reading raises an error.
    with open(filename, 'r') as myFile:
        myString = myFile.read()

    # Without stripping, the last word would keep the file's trailing
    # newline (e.g. 'of\n') and could never match a word found in a text.
    strippedWords = (word.strip() for word in myString.split(','))

    return sorted(word for word in strippedWords if word)

def fillDictionary(filename, wordList):
    '''Computes word frequencies in a file for a chosen list of words.  Note
    that these are *frequencies* (normalized counts), not counts: each count
    is divided by the total number of words found in the file.

    The text is lower-cased before counting, and a "word" is any maximal run
    of characters matching the regular expression \\w+.  If the file contains
    no words at all, every frequency is 0.0.

    INPUT: filename (str): the text file for which to count frequencies.
           wordList (list of str): words whose frequencies should be counted.
    OUTPUT: dict (str to float): a frequency dictionary with one key per
              word in wordList.'''

    # Start every requested word at zero so that words which never occur in
    # the text still appear in the result.
    counts = {word: 0 for word in wordList}

    # 'with' closes the file even if reading raises an error.
    with open(filename, 'r') as myFile:
        myString = myFile.read().lower()

    # Raw string: '\w' in a normal string literal is an invalid escape.
    allWords = re.findall(r'\w+', myString)
    totalWords = len(allWords)

    for word in allWords:
        if word in counts:
            counts[word] += 1

    # A file with no words would otherwise cause a division by zero.
    if totalWords == 0:
        return {word: 0.0 for word in counts}

    return {word: count / totalWords for word, count in counts.items()}
    
def compareTwo(frequencies, i, j):
    '''Measures how different two rows of a word-frequency table are by
    summing the absolute differences of their entries, position by position.
    INPUT: frequencies (list of list of float): a word-frequency table whose
             rows (inner lists) all have the same length; the same index in
             every row refers to the same word.
           i (int): row index of the first text.
           j (int): row index of the second text.
    OUTPUT: (float): the accumulated absolute difference between row i and
              row j (0.0 when the rows are identical).'''

    total = 0.0

    # Walk row i by position and look up the matching entry of row j.
    for position, freqI in enumerate(frequencies[i]):
        freqJ = frequencies[j][position]
        total += abs(freqI - freqJ)

    return total

