## Helper Code for Data Formatting
## Anna Ritz
## Last Edited April 1, 2012
###################################

## Import Statements
# (1) regular expression package
# (2) urllib package (to read from a web page)
# (3) csv package (to read twitter lines)
import re
import urllib
import csv

# see http://www.gutenberg.org/ for formatting & citation info
# Input files are downloaded from the website in Plain Text format.
def removeLicenseFromProjectGutenberg(infile,outfile):
    '''Takes a text file (downloaded from project gutenberg)
    and writes a text file without the licensing information.
    Only the text between the '*** START ...' line and the
    '*** END ...' line is kept, and any run of 3 or more newlines
    is collapsed to 2 newlines.  If either marker line is missing,
    an error message is printed and NO output file is written.
    INPUTS: infile(String) - the text file you want to use
            outfile (String) - the output file to write to
    OUTPUTS: none'''

    # read infile ('with' closes the file even if read() fails)
    with open(infile,'r') as myFile:
        myString = myFile.read()

    # Step 1: remove the Project Gutenberg text ABOVE the main document
    # Search for the line that contains '*** START'
    start_str = r'\*\*\*\s*START.+\n'
    myMatch = re.search(start_str,myString)
    if myMatch is None:
        print('Error! The expression "' + start_str + '" was not found.')
        return
    # keep everything AFTER the end of the START line
    myString = myString[myMatch.end(0):]

    # Step 2: remove the Project Gutenberg text BELOW the main document
    # Search for the line that contains '*** END'.  The search runs on
    # the already-trimmed string, so only text after START is considered.
    end_str = r'\*\*\*\s*END.+\n'
    myMatch = re.search(end_str,myString)
    if myMatch is None:
        # report the END pattern (not the START pattern) that was missing
        print('Error! The expression "' + end_str + '" was not found.')
        return
    # keep everything BEFORE the beginning of the END line
    myString = myString[:myMatch.start(0)]

    # Step 3: substitute 3 or more consecutive newlines with only 2 newlines
    myString = re.sub('\n{3,}','\n\n',myString)

    # open output file FOR WRITING (note the 'w') and write the main doc.
    with open(outfile,'w') as outFile:
        outFile.write(myString+'\n')

# See http://www.mso.anu.edu.au/~ralph/OPTED/ for formatting & citation info
# example URL here: http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_b.html
def getWebsterDictionary(letter,outputfile):
    '''Gets all the words, parts of speech, and definitions for words
    starting with the letter and writes a tab-delimited file with one
    "word<TAB>type<TAB>definition" line per dictionary entry.
    Entries that do not fit the word/type/definition layout are
    reported and skipped instead of crashing the whole download.
    INPUTS: letter (String)- a letter (from A to Z, case insensitive)
             outputfile (String) - the output file to write to
    OUTPUTS: none'''

    # make letter lowercase
    letter = letter.lower()

    # fetch text from URL
    # NOTE: urllib.urlopen is the Python 2 API, like the rest of this file.
    url = 'http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_' + letter + '.html'
    print('fetching letter %s url: %s ...' % (letter,url))
    myURLFile = urllib.urlopen(url)
    try:
        myString = myURLFile.read()
    finally:
        # always release the connection, even if read() fails
        myURLFile.close()

    # Each entry has the word (in bold), the type (in italics), and the definition
    entryPattern = re.compile(r'<P><B>(.*)</B>\s+\(<I>(.*)</I>\)\s+(.*)</P>')

    # number of <P>...</P> entries read from the page
    count = 0

    # open output file FOR WRITING (note the 'w'); 'with' closes it for us
    with open(outputfile,'w') as outFile:
        print('Writing to file %s ...' % (outputfile,))

        # Entries are delimited by <P>...</P>
        for entry in re.finditer('<P>.*?</P>',myString):
            count = count + 1

            myMatch = entryPattern.match(entry.group(0))
            if myMatch is None:
                # not a word/type/definition entry: report it and move on
                print('skipping malformed entry: %s' % (entry.group(0),))
                continue

            # write the word, the type, and the definition to the file,
            # separated by tabs.
            outFile.write('\t'.join(myMatch.groups())+'\n')

    print('%d lines read.' % count)
    return
    
def twitterExample(n,filename='Tweets_With_Elect_Mar20.txt'):
    '''Reads in a comma-separated file of tweets (by default the file
    called 'Tweets_With_Elect_Mar20.txt') and prints the first n tweets.
    Each row is expected to have 13 columns; for those rows the author
    (column 3) and the tweet (column 12) are printed.  Rows with any
    other number of columns are reported and skipped.
    INPUTS: n (Integer)- the number of tweets to report
            filename (String) - the tweet file to read (optional)
    OUTPUTS: none'''

    # read lines from file and put in a list
    # list() pulls every row out of the reader in one pass (no repeated
    # list copying), and 'with' closes the file even if reading fails.
    # afterwards myCSVList contains a list of tweets, which are each
    # lists of column values.
    with open(filename,'r') as myFile:
        myCSVReader = csv.reader(myFile,delimiter=',',quotechar='"')
        myCSVList = list(myCSVReader)

    # keep only the first n elements of the list.
    # these correspond to the first n tweets.
    myCSVList = myCSVList[0:n]

    # iterate through the list and print the author and tweet
    for tweet in myCSVList:

        if len(tweet) != 13:
            print('skipping row with %d columns: %s' % (len(tweet),tweet))
        else:
            # finally, print the name and the tweet ONLY.
            print('%s : %s' % (tweet[3],tweet[12]))

    return


# American Presidency Project: 
# Example URL: http://www.presidency.ucsb.edu/ws/index.php?pid=98813#axzz1pUinkhcL
def getTranscript(url,outputfile):
    '''Downloads a Presidency Project page and saves the transcript
    it contains as plain text.
    The transcript is taken to be the longest line of the page.
    <p> tags become blank lines, <br> tags become line breaks, every
    other <...> tag is dropped, and surrounding whitespace is trimmed.
    INPUTS: url (String) - the url of the transcript you want
            outputfile (String) - the name of the output file to write to
    OUTPUTS: none'''

    # download the raw page
    print('fetching url: %s ...' % (url,))
    page = urllib.urlopen(url)
    html = page.read()
    page.close()

    # The ENTIRE transcript sits on one line, which is the LONGEST
    # line of the page (the first one wins if there is a tie).
    transcript = max(html.split('\n'),key=len)

    # tag clean-up, applied in this order:
    #   <p>  (paragraph) -> TWO newlines
    #   <br> (break)     -> ONE newline
    #   any other tag    -> removed
    tagRules = (('<p>','\n\n'),
                ('<br>','\n'),
                ('<.*?>',''))
    for pattern,replacement in tagRules:
        transcript = re.sub(pattern,replacement,transcript)

    # trim whitespace at the front, then at the back
    transcript = re.sub(r'^\s+','',transcript)
    transcript = re.sub(r'\s+$','',transcript)

    # write the cleaned transcript (plus a final newline) to the output file
    with open(outputfile,'w') as outFile:
        outFile.write(transcript+'\n')

    return
