# Functions for a silly parsing task on Project Gutenberg texts: extract
# all proper nouns, convert them to spooky zombie versions, and replace
# them in the text.  Write out word-frequency stats for the detected proper
# nouns too.

# Get the libraries for regular expressions and random-number generation.
import re, random


def zombify_text(input_filename, output_filename, csv_filename):
    '''Run the whole zombification pipeline on one input file.
INPUT:
 - input_filename: the name of the Project Gutenberg text file to read.
 - output_filename: the name of the file that receives the zombified text.
 - csv_filename: the name of the CSV file that receives the name-frequency
      statistics.
OUTPUT: Nothing.
SIDE EFFECTS: Creates two new files.'''

    # Load the document body as one big string.
    source_text = readProjectGutenberg(input_filename)

    # Find the proper nouns and choose a zombified replacement for each one.
    replacements = pickZombifiedVersions(detectProperNouns(source_text))

    # Swap the names in the text; the result is a (new text, frequencies) pair.
    result = replaceStrings(source_text, replacements)

    # Save the rewritten text and the frequency statistics.
    writeFile(output_filename, result[0])
    writeDictToCsvFile(csv_filename, result[1])


# Module-level constants: regex patterns and word lists used for
# pattern-matching and name generation.

# Regex pattern for the divider text that sets off the actual document from the
# header and footer in a Project Gutenberg file.  It matches one or more
# asterisks, optional whitespace, START or END (captured as group 2), and then
# PROJECT GUTENBERG later on the same line.  Search with re.MULTILINE so that
# the trailing `$` anchors at the end of the divider line rather than only at
# the end of the whole text.
GUTENBERG_DIVIDER = r'(\*)+\s*(START|END).*PROJECT GUTENBERG.*$'

# Regex patterns for paragraphs.  Both start at a run of two or more newlines
# (i.e. at least one blank line) and then consume consecutive non-empty lines.
# GUTENBERG_PARA_PATT additionally requires the text "project gutenberg" at the
# start of that run of lines; the phrase is spelled in lower case, so apply the
# pattern with re.IGNORECASE to catch "Project Gutenberg" as well.
# NOTE(review): PARAGRAPH_PATT is not referenced by the functions visible in
# this part of the file -- confirm whether it is still needed.
PARAGRAPH_PATT = r'\n{2,}[^\n]([^\n]+\n)+'
GUTENBERG_PARA_PATT = r'\n{2,}[^\n]*project\s+gutenberg([^\n]+\n)+'

# Regex pattern for titles in proper names that contain a period.  Group 1
# captures the abbreviation without its trailing period.  The match is
# case-sensitive and is not anchored to a word boundary.
TITLE_PATT = r'(St|Ste|Messr|Rev|Mssr|Mr|Mrs|Ms|Mz|Dr|Mdm)\.'

# A special string that we use in place of periods that we want not to treat as
# sentence dividers.  It consists only of lower-case letters, so a title
# followed by it still looks like a single capitalized word to
# PROPER_NAME_PATT.
PERIOD_REPLACEMENT = 'qqqperiodqqq'

# Regex pattern for proper names.  The lookbehind requires a lower-case letter
# before the separator run (whitespace, hyphens, commas, underscores, with at
# least one whitespace character), so a capitalized word that directly follows
# sentence-ending punctuation is not matched.  Group 1 captures the name: one
# capitalized word followed by any number of continuation parts, each either a
# hyphen plus letters or whitespace plus another capitalized word.
PROPER_NAME_PATT = r'(?<=[a-z])[-\s,_]*[\s][-\s,_]*([A-Z][a-z]+((-[A-Za-z]|\s[A-Z])[a-z]+)*)'

# The list of zombifying modifiers.  One of these is chosen at random and
# placed in front of each detected name.
ZOMBIE_PREFIXES = ['Zombie', 'Undead', 'Rotting', 'Flesh-Starved',
    'Brain-Eating', 'Drooling', 'Shambling Horror', 'Skeleton', 'Way Spooky']



def readProjectGutenberg(filename):
    '''Read a plain-text file from Project Gutenberg, stripping all the extra
header and footer stuff.
INPUT: a filename.
OUTPUT: a single string containing the text of the document.  Everything up to
and including the first divider line (see GUTENBERG_DIVIDER) is dropped, as is
everything from the next divider line onward, and paragraphs matched by
GUTENBERG_PARA_PATT are removed.  If a divider is not found, the corresponding
end of the text is left untouched instead of raising an error.'''

    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as handle:
        text = handle.read()

    # Convert to standard line breaks.
    text = re.sub(r'\r?\n', '\n', text)

    # Strip the header.  The +1 also skips the newline that ends the divider
    # line.  A file without a divider keeps its beginning as-is.
    header_match = re.search(GUTENBERG_DIVIDER, text, re.MULTILINE)
    if header_match is not None:
        text = text[header_match.end() + 1:]

    # Strip the footer: the next divider line and everything after it.
    footer_match = re.search(GUTENBERG_DIVIDER, text, re.MULTILINE)
    if footer_match is not None:
        text = text[:footer_match.start()]

    # Remove the paragraphs that GUTENBERG_PARA_PATT recognizes as mentioning
    # Project Gutenberg (matched case-insensitively).
    text = re.sub(GUTENBERG_PARA_PATT, '', text, flags=re.IGNORECASE)

    return text


def detectProperNouns(input_text):
    '''Detect as many proper nouns in the text as we can.
INPUT: A long string representing the text we want to check.
OUTPUT: A (not necessarily exhaustive) list of the distinct proper nouns in
the text.  Names that consist of nothing but a title (like "Mr.") are left
out.'''

    # Replace all the periods that occur in titles (like Mrs., Dr., and so on)
    # with a special string, so that we don't treat those periods like the ends
    # of sentences.
    text = re.sub(TITLE_PATT, r'\1' + PERIOD_REPLACEMENT, input_text)

    # Replace all consecutive strings of whitespace with a single space.
    text = re.sub(r'\s+', ' ', text)

    # Matches a candidate that is a title and nothing else.  Compiled once
    # here rather than rebuilt for every candidate in the loop.
    bare_title = re.compile(TITLE_PATT + '$')

    # Find every unique proper name in the text.  The dictionary is keyed by
    # the name as matched (placeholder still in place) and holds the name with
    # its periods restored.
    names = {}
    for match in re.finditer(PROPER_NAME_PATT, text):
        name = match.group(1)
        if name in names:
            continue
        # The placeholder is a literal string, so a plain replace restores
        # the periods.
        real_name = name.replace(PERIOD_REPLACEMENT, '.')
        if not bare_title.match(real_name):
            names[name] = real_name

    # Return a real list, as documented, rather than a dictionary view.
    return list(names.values())


def pickZombifiedVersions(name_list):
    '''Build a lookup table from each name to a "zombified" version of it.
INPUT: a collection of name strings.
OUTPUT: a dictionary whose keys are the names and whose values are the same
names preceded by a randomly chosen entry of ZOMBIE_PREFIXES and a space.'''
    return {
        original: random.choice(ZOMBIE_PREFIXES) + ' ' + original
        for original in name_list
    }


def replaceStrings(text, replace):
    '''Given a text and a dictionary that maps strings to strings, replace every
occurrence of a key in the text with its corresponding value.
INPUT:
 - text: a string.
 - replace: a dictionary in which both the keys and the values are strings.
      Keys are matched literally; when several keys match at the same position,
      the longest one wins.
OUTPUT: a 2-tuple:
 - 0: a string, the modified version of the input text.
 - 1: a dictionary.  Keys are strings (same as the "replace" dictionary) and
      values are counts of occurrences of each key in the input text.  Keys
      that never occur are absent.'''

    # With no keys the joined pattern would be the empty regex, which matches
    # at every position; there is nothing to replace, so return early.
    if not replace:
        return (text, {})

    # Escape each key so that regex metacharacters (such as the period in
    # "Mr. Smith") match literally, and try longer keys first so that a key is
    # not pre-empted by a shorter key that it starts with.
    keys_longest_first = sorted(replace, key=len, reverse=True)
    name_patt = re.compile(
        '|'.join(re.escape(key) for key in keys_longest_first))

    name_freqs = {}

    def substitute(match):
        # Count the occurrence and hand back the replacement.  A string
        # returned from a callback is inserted as-is, so backslashes in the
        # replacement are not interpreted.
        name = match.group(0)
        name_freqs[name] = name_freqs.get(name, 0) + 1
        return replace[name]

    # One pass over the text instead of repeated string concatenation.
    zombie_text = name_patt.sub(substitute, text)

    return (zombie_text, name_freqs)

def writeFile(filename, text):
    '''Write some text straight to a file.
INPUT:
 - filename: a filename.
 - text: a string to write to a file with the given name.
OUTPUT: None.
SIDE-EFFECTS: Creates one file (or overwrites it if it already exists).'''
    # The context manager closes the file even if the write raises.
    with open(filename, 'w') as handle:
        handle.write(text)


def writeDictToCsvFile(filename, dictionary):
    '''Write the contents of a dictionary to a CSV file.
INPUT:
 - filename: a filename.
 - dictionary: a dictionary to write to a file with the given name.  Each
      entry becomes one "key,value" row; keys and values are converted with
      str().  Fields that contain commas, quotes, or line breaks are quoted
      following the usual CSV rules.
OUTPUT: None.
SIDE-EFFECTS: Creates one CSV file (or overwrites it if it already exists).'''
    # Imported here because this is the only function that needs it.
    import csv

    # The context manager closes the file even if a write raises.
    with open(filename, 'w') as handle:
        # The csv writer quotes keys as well as values when they contain
        # special characters.  lineterminator='\n' keeps one row per line with
        # the same line endings that plain text-mode writes produce.
        writer = csv.writer(handle, lineterminator='\n')
        for key, value in dictionary.items():
            writer.writerow([str(key), str(value)])


