## Helper Code for Data Formatting
## Anna Ritz
## Last Edited March 21, 2012
###################################

## Import Statements
# (1) regular expression package
# (2) urllib package (to read from a web page)
import re
import urllib

# see http://www.gutenberg.org/ for formatting & citation info
# Input files are downloaded from the website in Plain Text format.
def removeLicenseFromProjectGutenberg(infile,outfile):
    '''Takes a text file (downloaded from project gutenberg)
    and writes a text file without the licensing information.

    Everything up to and including the first line containing '*** START'
    is dropped, as is everything from the first following line containing
    '*** END' onward.  If a marker line is not found, the text on that
    side is left unchanged instead of raising an error.  Runs of 3 or more
    newlines are then collapsed to 2, and a final newline is appended.

    INPUTS: infile(String) - the text file you want to use
            outfile (String) - the output file to write to
    OUTPUTS: none'''

    # read infile ('with' closes the file even if read() raises)
    with open(infile,'r') as myFile:
        myString = myFile.read()

    # Step 1: remove the Project Gutenberg text ABOVE the main document
    # Search for the line that contains '*** START'.  The newline is
    # optional so a marker on the very last line is still recognized.
    myMatch = re.search(r'\*\*\* START.+\n?',myString)
    if myMatch is not None:
        myString = myString[myMatch.end(0):]

    # Step 2: remove the Project Gutenberg text BELOW the main document
    # Search for the line that contains '*** END'.  Only the START of the
    # match is used, so the pattern does not need to consume the newline
    # (this lets a file that ends right after the END line work too).
    myMatch = re.search(r'\*\*\* END.+',myString)
    if myMatch is not None:
        myString = myString[:myMatch.start(0)]

    # Step 3: substitute 3 or more consecutive newlines with only 2 newlines
    myString = re.sub(r'\n{3,}','\n\n',myString)

    # open output file FOR WRITING (note the 'w'), write main doc, and close file.
    with open(outfile,'w') as outFile:
        outFile.write(myString+'\n')

    return

# See http://www.mso.anu.edu.au/~ralph/OPTED/ for formatting & citation info
# example URL here: http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_b.html
def getWebsterDictionary(letter,outputfile):
    '''Gets all the words, parts of speech, and definitions for words
    starting with the letter and writes a tab-delimited file.
    INPUTS: letter (String)- a letter (from A to Z, case insensitive)
             outputfile (String) - the output file to write to
    OUTPUTS: none'''

    # make letter lowercase
    letter = letter.lower()

    # fetch text from URL
    url = 'http://www.mso.anu.edu.au/~ralph/OPTED/v003/wb1913_' + letter + '.html'
    print 'fetching letter',letter,'url:',url,'...'
    myURLFile = urllib.urlopen(url)
    myString = myURLFile.read()
    myURLFile.close()

    # open output file FOR WRITING (note the 'w')
    outFile = open(outputfile,'w')
    
    # Entries are delimited by <P>...</P>
    myIter = re.finditer('<P>.*?</P>',myString)
    print 'Writing to file',outputfile,'...'
    for line in myIter:

        # Each line has the word (in bold), the type (in italics), and the definition
        myMatch = re.match('<P><B>(.*)</B>\s+\(<I>(.*)</I>\)\s+(.*)</P>',line.group(0))
        
        # test print statement
        #print myMatch.group(1),' ',myMatch.group(2),' ',myMatch.group(3)

        # write the word, the type, and the definition to the file.
        # We can split this into 4 lines because it's all within the write()
        # function.
        outFile.write(
            myMatch.group(1) + '\t' +
            myMatch.group(2) + '\t' +
            myMatch.group(3)+'\n')

    # Done with for loop - close output file
    outFile.close()
    
    print count,'lines read.'
    return
    
def twitterExample(n):
    '''Reads in a file called 'Tweets_With_Elect_Mar20.txt and prints
    the first n tweets.
    INPUTS: n (Integer)- the number of tweets to report
    OUTPUTS: none'''
    
    filename = 'Tweets_With_Elect_Mar20.txt'
    myFile = open(filename,'r')
    myString = myFile.read()
    myFile.close()

    # create a list by splitting on the newlines in the text.
    myList = myString.split('\n')

    # keep only the first n elements of the list.
    myList = myList[0:n]

    # iterate through the list and print the author and tweet
    for tweet in myList:
        # Add a comma to the END of the tweet (this is silly, but will make the
        # regular expression work)
        tweet = tweet + ','
        
        # We want to get index #3 and #12 in this comma-separated string.
        # BUT sometimes commas exist WITHIN a text!  Use a regular expression.
        # We use the sytax (A|B), which means look for regex A OR regex B.
        # The question mark says find the SHORTEST match of the preceding thing.
        #  Regex A: [^"]+?, means get the next substring up to the next comma with NO quotes
        #  Regex B: ".*?", means get the next substring up to the next comma SURROUNDED by quotes
        myRow = re.findall('([^"]+?,|".*?",)',tweet)

        # myRow is a list of ALL occurrences of the regex above.
        # Get rid of the commas at the end of each occurrence
        for i in range(0,len(myRow)):
            myRow[i] = myRow[i][0:len(myRow[i])-1]

        # finally, print the name and the tweet ONLY.
        print myRow[3],':',myRow[12]

    return


# American Presidency Project: 
# Example URL: http://www.presidency.ucsb.edu/ws/index.php?pid=98813#axzz1pUinkhcL
def getTranscript(url,outputfile):
    '''Gets the text from a Presidency Project URL and puts it in a text file.
    INPUTS: url (String) - the url of the transcript you want
            outputfile (String) - the name of the output file to write to
    OUTPUTS: none'''

    # fetch text from URL
    print 'fetching url:',url,'...'
    myURLFile = urllib.urlopen(url)
    myString = myURLFile.read()
    myURLFile.close()

    # There is a SINGLE line that has the ENTIRE transcript.
    # This is the LONGEST line in the file! Find it.
    myList = myString.split('\n')
    longestLine = ''
    for line in myList:
        if len(line) > len(longestLine):
            longestLine = line

    # set longest line to be myStr
    myStr = longestLine
    
    # substitute all <p> (paragraphs) with TWO newlines
    myStr = re.sub('<p>','\n\n',myStr)

    # substute all <br> (breaks) with ONE newline
    myStr = re.sub('<br>','\n',myStr)

    # remove all other bracket tags
    myStr = re.sub('<.*?>','',myStr)

    # remove leading and trailing whitespace
    myStr = re.sub('^\s+','',myStr)
    myStr = re.sub('\s+$','',myStr)

    # open output file FOR WRITING (note the 'w'), write to file, and close.
    outFile = open(outputfile,'w')
    outFile.write(myStr+'\n')
    outFile.close()

    return
