# Helper Code for Data Formatting
# Created by Anna Ritz
# Last edited Oct 29, 2015 by Alexandra Papoutsaki to move from Python2 to Python3

## Import Statements
# (1) regular expression package
# (2) urllib.request package (to read from a web page)
import re
import urllib.request

# Project Gutenberg:
# see http://www.gutenberg.org/ for formatting & citation info
# Input files are downloaded from the website in Plain Text format.
def removeLicenseFromProjectGutenberg(infileName,outfileName):
    '''Takes a text file (downloaded from project gutenberg)
    and writes a text file without the licensing information.
    Only the text between the '*** START ...' line and the
    '*** END ...' line is kept, and runs of 3 or more newlines
    are collapsed to 2. If either marker line is missing, an error
    is printed and no output file is written.
    INPUTS: infileName (String) - the text file you want to use
            outfileName (String) - the output file to write to
    OUTPUTS: none'''

    # read infile (the 'with' statement closes the file for us, even on error)
    print('Reading input file', infileName, ' ...')
    with open(infileName, 'r', encoding='utf-8') as myFile:
        myString = myFile.read()

    # Step 1: remove the Project Gutenberg text ABOVE the main document
    # Search for the line that contains '*** START'
    print('Formatting text ...')
    start_str = r'\*\*\*\s*START.+\n'
    myMatch = re.search(start_str, myString)
    if myMatch is None:
        print('Error! The expression "' + start_str + '" was not found.')
        return
    # keep everything AFTER the end of the START line
    myString = myString[myMatch.end(0):]

    # Step 2: remove the Project Gutenberg text BELOW the main document
    # Search for the line that contains '*** END'
    end_str = r'\*\*\*\s*END.+\n'
    myMatch = re.search(end_str, myString)
    if myMatch is None:
        # report the END expression here (not the START one)
        print('Error! The expression "' + end_str + '" was not found.')
        return
    # keep everything BEFORE the beginning of the END line
    myString = myString[:myMatch.start(0)]

    # Step 3: substitute 3 or more consecutive newlines with only 2 newlines
    myString = re.sub(r'\n{3,}', '\n\n', myString)

    # open output file FOR WRITING (note the 'w'), write main doc, and close file.
    # The output uses the same encoding as the input so that every character
    # we read can also be written, regardless of the system default.
    print('Writing to file', outfileName, '...')
    with open(outfileName, 'w', encoding='utf-8') as outFile:
        outFile.write(myString + '\n')

    return

# Webster's Dictionary:
# See http://www.mso.anu.edu.au/~ralph/OPTED/ for formatting & citation info
# example URL here: http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_b.html
def getWebsterDictionary(letter,outputfile,baseUrl='http://www.mso.anu.edu.au/~ralph/OPTED/v003/'):
    '''Gets all the words, parts of speech, and definitions for words
    starting with the letter and writes a tab-delimited file.
    Entries that do not have the expected word/type/definition layout
    are skipped (and reported) instead of crashing.
    INPUTS: letter (String)- a letter (from A to Z, case insensitive)
             outputfile (String) - the output file to write to
             baseUrl (String, optional) - location of the wb1913_*.html
                 pages; defaults to the OPTED web site
    OUTPUTS: none'''

    # make letter lowercase
    letter = letter.lower()

    # fetch text from URL (the 'with' statement closes the connection for us)
    url = baseUrl + 'wb1913_' + letter + '.html'
    print('Fetching letter',letter,'url:',url,'...')
    with urllib.request.urlopen(url) as myURLFile:
        myString = myURLFile.read().decode('ISO-8859-1')

    # Each entry has the word (in bold), the type (in italics), and the definition.
    # Compile the pattern once, outside the loop.
    entryPattern = re.compile(r'<P><B>(.*)</B>\s+\(<I>(.*)</I>\)\s+(.*)</P>')

    count = 0
    skipped = 0
    print('Writing to file',outputfile,'...')

    # open output file FOR WRITING (note the 'w'); it is closed automatically
    # at the end of the 'with' block, even if an error occurs.
    with open(outputfile, 'w', encoding='utf-8') as outFile:
        # Entries are delimited by <P>...</P>
        for line in re.finditer(r'<P>.*?</P>', myString):
            count = count + 1
            myMatch = entryPattern.match(line.group(0))

            # a <P>...</P> block that is not a dictionary entry: skip it
            if myMatch is None:
                skipped = skipped + 1
                continue

            # write the word, the type, and the definition to the file.
            outFile.write(
                myMatch.group(1) + '\t' +
                myMatch.group(2) + '\t' +
                myMatch.group(3) + '\n')

    print(count,'lines read.')
    if skipped > 0:
        print(skipped,'entries skipped (unexpected format).')

    return

# American Presidency Project: 
# Example URL: http://www.presidency.ucsb.edu/ws/index.php?pid=98813#axzz1pUinkhcL
def getTranscript(url,outputfileName):
    '''Gets the text from a Presidency Project URL and puts it in a text file.
    The longest line of the page is taken to be the transcript; its <p> and
    <br> tags become newlines and every other tag is removed.
    INPUTS: url (String) - the url of the transcript you want
            outputfileName (String) - the name of the output file to write to
    OUTPUTS: none'''

    # fetch text from URL (the 'with' statement closes the connection for us)
    print('Fetching url:',url,'...')
    with urllib.request.urlopen(url) as myURLFile:
        myString = myURLFile.read().decode('utf-8')

    # There is a SINGLE line that has the ENTIRE transcript.
    # This is the LONGEST line in the file! Find it.
    # (max() returns the FIRST longest line if there is a tie.)
    myStr = max(myString.split('\n'), key=len)

    # substitute all <p> (paragraphs) with TWO newlines
    myStr = myStr.replace('<p>', '\n\n')

    # substitute all <br> (breaks) with ONE newline
    myStr = myStr.replace('<br>', '\n')

    # remove all other bracket tags
    myStr = re.sub(r'<.*?>', '', myStr)

    # remove leading and trailing whitespace
    myStr = myStr.strip()

    # open output file FOR WRITING (note the 'w'), write to file, and close.
    # The text was decoded as utf-8, so write it back out as utf-8 too.
    print('Writing to file',outputfileName,'...')
    with open(outputfileName, 'w', encoding='utf-8') as outFile:
        outFile.write(myStr + '\n')

    return