import json
import re

# Path of the tweet dump to analyse. Swap in one of the commented-out
# alternatives below to run the script against a different data set.
INPUT_FILE = 'cola.txt'
#INPUT_FILE = 'computer.txt'
#INPUT_FILE = 'smartphone.txt'

# Maximum amount requested from the file by each read() call.
READ_SIZE = 1024*1024*10   # 10 * 1024 * 1024 = 10,485,760 (about 10 MB)


# Read the whole file into one string, at most READ_SIZE per read() call,
# so that progress can be reported while a large file is loading.
# NOTE(review): no encoding is passed to open(), so on Python 3 the platform
# default is used -- confirm that matches the data files.
count = 0     # how many chunks we have read
chunks = []   # pieces read so far; joined into one string at the end

# The with-statement closes the file even if a read fails part-way through.
with open(INPUT_FILE) as myFile:
    buffer = myFile.read(READ_SIZE)
    while buffer != '':  # read() returns '' when reaching the end of file
        count += 1
        chunks.append(buffer)
        buffer = myFile.read(READ_SIZE)
        # The message hard-codes 10 per chunk, so the figure is only a rough
        # guide (the final chunk is usually smaller than READ_SIZE).
        print(str(count*10) + 'Mb data read...')

# Join once instead of growing the string with += on every chunk.
myString = ''.join(chunks)
del chunks    # drop the duplicate copy of the data before parsing starts
print('file reading done.')


# Split the string into chunks, each being the JSON text of one tweet.
# A tweet is taken to be a span that runs from a '{' to a '}' sitting at the
# end of a line ('.' never matches a newline, so a match cannot cross lines).
# The line may end with '\n', '\r\n', or the end of the data, so the last
# tweet is still found when the file does not finish with a newline.
# Each match keeps its line terminator; json.loads() ignores that whitespace.
unparsedTweets = re.findall(r'\{.*\}\r?(?:\n|\Z)', myString)
print(str(len(unparsedTweets)) + ' tweets found in the file.')


# Parse every raw JSON string in unparsedTweets into a Python object,
# collecting the results in `tweets` in the same order.
tweets = []
print('loading tweets...')

# Number the tweets from 1 so the running total can be printed directly.
for loaded, rawTweet in enumerate(unparsedTweets, 1):
    tweets.append(json.loads(rawTweet))
    # Report progress once per two thousand tweets.
    if loaded % 2000 == 0:
        print(str(loaded) + ' tweets loaded')

print(str(len(tweets)) +' tweets loaded.')

# Count how many tweets have both coke and pepsi in their texts.
# Only the counter is set up here; the counting itself is not written yet.
rivalryCount = 0

# TODO: go through `tweets` and add one to rivalryCount for each tweet whose
# text mentions both brands (presumably stored under a 'text' key -- confirm
# against the data before relying on it).