# File: fileAnalysis.py
# Purpose: Analysis of a file's lines and words.

# Author: TODO
#
# Collaboration statement: TODO

from textlib import getWordTokens

import matplotlib.pyplot as plt

#########################################################
## Dictionaries with numerical values                  ##
#########################################################
    
def getMostPopularKeys(d, n):
    """
    Finds the n most popular keys in the dictionary, i.e., the keys
    with the n highest values.

    Note: Keys that tie with the nth-highest value are all included,
    so there may be more than n keys in the result. If the dictionary
    has fewer than n keys, all of its keys are returned. If n is not
    positive or the dictionary is empty, the result is an empty list.

    d: a dictionary with values that are numbers
    n: the number of keys to find (an int)
    returns: a list of the most popular keys, ordered from the highest
             value to the lowest (tied keys keep the dictionary's order)
    """
    # Nothing to report for a non-positive request or an empty dictionary
    if n <= 0 or not d:
        return []

    # Order the keys from most to least popular; sorted() is stable, so
    # keys with equal values stay in the dictionary's own order
    rankedKeys = sorted(d, key=d.get, reverse=True)

    # Fewer keys than requested: every key is "popular"
    if len(rankedKeys) <= n:
        return rankedKeys

    # The nth-highest value is the cutoff; keep every key that reaches it
    # so that ties with the nth key are not dropped arbitrarily
    cutoff = d[rankedKeys[n - 1]]
    return [key for key in rankedKeys if d[key] >= cutoff]

def printMostPopularKeys(d, s, n=5):
    """
    Prints a header followed by the most popular keys of the given
    dictionary, one "key: count" line per key, highest count first.

    The keys to print are whatever getMostPopularKeys(d, n) returns, so
    more than n lines may appear if that function reports ties.

    d: a dictionary with values that are numbers
    s: what the keys represent (e.g., "words", "bigrams")
    n: the number of keys to print (an int; default: 5)
    """
    # Announce what is about to be listed
    print(f"\n### {n} most popular {s}")

    # Pair each popular key with its count, count first, so that a
    # descending sort orders by count (and by key among equal counts)
    rankedPairs = sorted(
        ((d[popularKey], popularKey) for popularKey in getMostPopularKeys(d, n)),
        reverse=True,
    )

    # One line per key, in ranked order
    for tally, popularKey in rankedPairs:
        print(f"{popularKey}: {tally}")

#########################################################
## FileAnalyzer class                                  ##
#########################################################

class FileAnalyzer:
    """
    Reads a text file and derives word-level statistics from it: the
    raw text, its lines, its word tokens, word and bigram counts, a
    word -> bigrams map, and bigram probabilities for word prediction.

    All of these are computed once, by setup(), when the object is built.
    """

    ######################################################
    ## Constructor                                      ##
    ######################################################

    def __init__(self, filepath):
        """
        Creates a FileAnalyzer object for the provided file.

        filepath: path to the file (a string)
        """
        # Store the filepath
        self.filepath = filepath

        # Parse the file into the text, list of lines,
        # list of words, etc.
        self.setup()

    def setup(self):
        """
        Reads in the file, storing its text, lines, and words, and then
        builds every derived dictionary from them.

        The build order matters: each dictionary is built from instance
        variables filled in by the steps before it.
        """
        # Create empty instance variables (they'll exist everywhere
        # once created, even if they're not created in __init__)
        self.text = ""
        self.lines = []
        self.wordList = []
        self.wordCountDict = {}
        self.bigramCountDict = {}
        self.wordBigramMap = {}
        self.bigramProbDict = {}

        # Read the file as one huge string
        with open(self.filepath, 'r') as f:
            self.text = f.read()

        # Split the text into lines in a list
        self.lines = self.text.split("\n")

        # Pre-process the file contents to a list of words
        self.wordList = getWordTokens(self.text)

        # Build a word-frequency dictionary for this file, mapping
        # each word to its count in the text
        self.wordCountDict = self.buildWordFrequencyDict()

        # Build a bigram-frequency dictionary for this file, mapping
        # each bigram (consecutive pair of words) to its count
        self.bigramCountDict = self.buildBigramFrequencyDict()

        # Build a dictionary mapping each word to the list of bigrams
        # that start with that word
        self.wordBigramMap = self.buildWordBigramMap()

        # Build a bigram-probability dictionary for this file, mapping
        # each bigram to its probability of being the right one for
        # the first word in that bigram
        self.bigramProbDict = self.buildBigramProbabilityDict()

    ######################################################
    ## Line types                                       ##
    ######################################################

    def getLineTypeDict(self):
        """
        Creates a dictionary mapping each line type to how many lines
        in the file have that type.

        Types: "comment", "empty", and "regular"

        Note: the counting itself is not implemented yet (Part 1a), so
        every count in the returned dictionary is currently 0.

        returns: a dictionary mapping each line type to a count (int)
        """
        # Initialize the dictionary to have 0 for each line type
        lineTypeCounts = {"comment": 0,
                          "empty": 0,
                          "regular": 0}

        # TODO: Part 1a
        return lineTypeCounts # replace with your code

    def analyzeLineTypes(self):
        """
        Placeholder for the line-type analysis (Part 1b); it currently
        does nothing.
        """
        # TODO: Part 1b (below are some comments to get you started)
        pass

        # Build a dict mapping line types to their counts
        # Print the counts out in a table format
        # Make a pie chart (see https://www.cs.carleton.edu/faculty/tamert/courses/cs111-s24/exercises/exercises16/#ex2)

    ######################################################
    ## Word frequencies                                 ##
    ######################################################

    def buildWordFrequencyDict(self):
        """
        Builds a dictionary mapping each word in self.wordList to how
        many times it appears there.

        returns: a dictionary mapping each word to its count in the file
                 (empty if there are no words)
        """
        wordCountDict = {}

        # Track how many times each word appears; an empty word list
        # simply leaves the dictionary empty
        for word in self.wordList:
            wordCountDict[word] = wordCountDict.get(word, 0) + 1

        return wordCountDict

    def printMostPopularWords(self, n=5):
        """
        Prints the n most popular words in the file.
        Assumes there are at least n unique words in the file.

        Note: There may be more than n values if there are ties.

        n: the number of words to print (an int; default: 5)
        """
        printMostPopularKeys(self.wordCountDict, "words", n)

    ######################################################
    ## Bigram frequencies                               ##
    ######################################################

    def buildBigramFrequencyDict(self):
        """
        Builds a dictionary mapping each bigram (i.e., consecutive pair
        of words) in self.wordList to how many times the bigram appears.

        Each bigram is a (firstWord, secondWord) tuple.

        returns: a dictionary mapping each bigram to its count in the file
                 (empty if there are fewer than two words)
        """
        bigramCountDict = {}

        # Pair every word with its successor. zip stops at the shorter
        # sequence, so a list with zero or one words yields no pairs
        # (indexing wordList[0] would fail on text that has no words,
        # such as a file containing only whitespace).
        for bigram in zip(self.wordList, self.wordList[1:]):
            bigramCountDict[bigram] = bigramCountDict.get(bigram, 0) + 1

        return bigramCountDict

    def buildWordBigramMap(self):
        """
        Builds a dictionary mapping each word in the file to the list of
        bigrams that start with that word.

        Each distinct bigram appears once in its list (its count lives in
        self.bigramCountDict). A word that never starts a bigram, such as
        a final word that occurs nowhere else, maps to an empty list.

        returns: a dictionary mapping each word to a list of bigrams
        """
        # Every word gets an entry, even if no bigram starts with it
        wordBigramMap = {word: [] for word in self.wordList}

        # bigramCountDict has each distinct bigram exactly once as a key
        for bigram in self.bigramCountDict:
            wordBigramMap.setdefault(bigram[0], []).append(bigram)

        return wordBigramMap

    def printMostPopularBigrams(self, n=5):
        """
        Prints the n most popular bigrams in the file.
        Assumes there are at least n unique bigrams in the file.

        Note: There may be more than n values if there are ties.

        n: the number of bigrams to print (an int; default: 5)
        """
        printMostPopularKeys(self.bigramCountDict, "bigrams", n)

    ######################################################
    ## Word prediction                                  ##
    ######################################################

    def buildBigramProbabilityDict(self):
        """
        Returns a new dictionary mapping each bigram in bigramCountDict
        to the probability that its second word follows its first.

        The probability is defined as the bigram's frequency divided
        by the sum of the frequency of all bigrams with the same first word.

        returns: the new dictionary (mapping bigram -> probability);
                 empty if there are no bigrams
        """
        # Total up the counts per first word once, rather than re-adding
        # them for every bigram that shares that first word
        firstWordTotals = {}
        for bigram, count in self.bigramCountDict.items():
            firstWord = bigram[0]
            firstWordTotals[firstWord] = firstWordTotals.get(firstWord, 0) + count

        # The probability of each bigram is its count divided by the total
        # for its first word
        bigramProbDict = {}
        for bigram, count in self.bigramCountDict.items():
            bigramProbDict[bigram] = count / firstWordTotals[bigram[0]]

        return bigramProbDict

    def getNextWords(self, word):
        """
        For a given word, use the probability of each bigram that starts
        with that word being the one with the correct next word to build
        a dictionary mapping next-words to their corresponding probabilities.

        word: the word whose possible successors are wanted (a string)
        returns: the new dictionary (mapping next-word -> probability);
                 empty if no bigram starts with the given word
        """
        nextWordProbDict = {}

        # A word that isn't in the map at all simply has no successors
        for bigram in self.wordBigramMap.get(word, []):
            nextWord = bigram[1]
            nextWordProbDict[nextWord] = self.bigramProbDict[bigram]

        return nextWordProbDict

    def printMultipleNextWords(self, startWord, n=3):
        """
        Starting with the given word, build a sequence of up to n words using the
        bigram probabilities, always moving on to a most probable next word.

        Prints the sequence of words on one line, separated by spaces.
        The sequence ends early if the current word has no known next word.

        startWord: the first word of the sequence (a string)
        n: the maximum number of words to print (an int; default: 3)
        """
        # Prep for n iterations
        print(f"\n### Starting with '{startWord}':")
        word = startWord
        for _ in range(n):
            # Print out the current word (with a space after)
            print(word, end=" ")

            # Look up the candidates for the next word
            nextWordProbDict = self.getNextWords(word)
            if not nextWordProbDict:
                break

            # Move on to a most probable candidate; stop rather than crash
            # if no key comes back
            bestNextWords = getMostPopularKeys(nextWordProbDict, 1)
            if not bestNextWords:
                break
            word = bestNextWords[0]

        # Finish the line so later output starts on a fresh one
        print()

#########################################################
## Try it all out!                                     ##
#########################################################

def main():
    """
    Runs every analysis on "senseAndSensibility.txt": line types, the
    most popular words (top 4, then top 10), the top 4 bigrams, and a
    12-word predicted sequence starting from "she".
    """
    # One analyzer object does all of the work for the file
    analyzer = FileAnalyzer("senseAndSensibility.txt")

    # Line-type breakdown
    analyzer.analyzeLineTypes()

    # Most popular words and their counts, first for 4 and then for 10
    for howMany in (4, 10):
        analyzer.printMostPopularWords(n=howMany)

    # The four most popular bigrams and their counts
    analyzer.printMostPopularBigrams(n=4)

    # Predict a sequence of 12 words starting from "she"
    analyzer.printMultipleNextWords("she", 12)

if __name__ == "__main__":
    main()