"""
Aspect Sentiment Analysis Vector Maker

Makes sentiment-analysis vector representations of books and saves them in datafile/Sentiment.npz

Authors: Sofia Serrano, Ryan Gorey

V 0.1

Required downloads:
    scipy.sparse
    numpy
and files:
    datafile/noun_dict.txt
    datafile/op_dict.txt
    datafile/modifiers_dict.txt
    datafile/ISBNNumsInCommon.txt
    isbn_text_files (a directory full of lemmatized text files in list form)
"""

### IMPORTS ###

import ast
import numpy as np
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
import math

### GLOBALS ###

# Every ISBN occupies one non-blank line of the list, so counting those lines
# gives the number of matrix rows. The list is read from the same datafile/
# directory as every other input (and as the path main() passes in) rather
# than from the parent directory.
with open("datafile/ISBNNumsInCommon.txt", "r") as f:
    num_books = sum(1 for line in f if line.strip() != "")

# Each dictionary file stores a single Python literal on its first line.
with open("datafile/noun_dict.txt", "r") as file:
    noun_dict = ast.literal_eval(file.readline())
num_terms = len(noun_dict)  # one matrix column per entry of noun_dict
with open("datafile/op_dict.txt", "r") as file:
    op_dict = ast.literal_eval(file.readline())
with open("datafile/modifiers_dict.txt", "r") as file:
    modifiers_dict = ast.literal_eval(file.readline())

# Words that append_hinge_or_zero may turn into the "^" hinge marker.
hinges_list = ["actually", "although", "but", "despite", "however", "though"]

# Book-by-term matrix; created by make_all_sentiment_analysis_vectors.
matrix = None

def save_matrix(TFIDFMatrix):
    """
    Write a matrix to datafile/Sentiment.npz in compressed-sparse-row form.

    The matrix is converted to CSR and its four defining pieces (data,
    indices, indptr, shape) are stored as separate arrays in the archive.

    :param TFIDFMatrix: the matrix to save; anything csr_matrix() accepts
    """
    compressed = csr_matrix(TFIDFMatrix)
    components = {
        "data": compressed.data,
        "indices": compressed.indices,
        "indptr": compressed.indptr,
        "shape": compressed.shape,
    }
    np.savez("datafile/Sentiment.npz", **components)
    print("Sentiment vector matrix successfully saved!")

def load_matrix():
    """
    Read the matrix stored in datafile/Sentiment.npz.

    The archive is expected to hold the CSR pieces written by save_matrix
    (data, indices, indptr, shape). They are reassembled into a CSR matrix
    and converted to LIL format before being returned.

    :return: the stored matrix as a scipy.sparse.lil_matrix
    """
    # np.load keeps the archive's file handle open until it is closed, so
    # build the matrix inside a with-block and let it close the archive.
    with np.load("datafile/Sentiment.npz") as loader:
        loaded = lil_matrix(
            csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                       shape=loader['shape']))
    print("Sentiment vector matrix successfully loaded from file.")
    return loaded

def get_word_list_from_file(string_isbn):
    """
    Load the prepared word lists for one book from isbn_text_files/<isbn>.txt.

    Only the first line of the file is read; it must contain a Python literal,
    which is parsed with ast.literal_eval.

    :param string_isbn: the string version of the isbn to get text for
    :return: a list of lists of strings, where each inner list represent a sentence
      and each string within that list represents a word
    """
    # The with-block closes the file even if reading raises.
    with open("isbn_text_files/" + string_isbn + ".txt", "r") as book_file:
        first_line = book_file.readline().strip()
    return ast.literal_eval(first_line)

def make_all_sentiment_analysis_vectors(filename):
    """
    Build the sentiment vector of every book listed in a file and save the result.

    Resets the module-level matrix to an empty num_books x num_terms sparse
    matrix, fills one row per non-blank line of the file (rows are numbered
    in file order), then hands the matrix to save_matrix.

    :param filename: path of a text file with one ISBN per line; blank lines
      are skipped
    """
    global matrix
    matrix = lil_matrix((num_books, num_terms))
    index = 0
    with open(filename, "r") as isbns:
        for line in isbns:
            isbn = line.strip()
            if not isbn:
                continue
            calculate_one_book_vector(isbn, index)
            index += 1
            # Report progress only right after a book was processed, so blank
            # lines never repeat a message or announce "0 books".
            if index % 100 == 0:
                print(str(index) + " books' vectors made so far.")
    save_matrix(matrix)

def calculate_one_book_vector(isbn, isbn_index):
    """
    Fill one row of the module-level matrix with a book's per-term averages.

    Each sentence of the book is abstracted and scored in order; the value
    returned for one sentence is passed to the next as its previous-sentence
    valence (the first sentence receives 0). For every term whose summed
    score is non-zero, the row entry is that sum divided by the number of
    occurrences of the term.

    :param isbn: string ISBN used to locate the book's text file
    :param isbn_index: row of the matrix that belongs to this book
    """
    sentences = get_word_list_from_file(isbn)
    term_sentiment_totals = [0] * num_terms
    term_occurrence_totals = [0] * num_terms

    # Starting from 0 and looping over every sentence is the same as
    # special-casing the first one, but also copes with a book that has
    # no sentences at all (its row simply stays empty).
    valence = 0
    for sentence in sentences:
        valence = score_sentence(valence, make_abstract_sentence(sentence),
                                 term_sentiment_totals, term_occurrence_totals)

    for term_index, total in enumerate(term_sentiment_totals):
        # A non-zero total implies at least one occurrence was counted.
        if total != 0:
            matrix[isbn_index, term_index] = total / term_occurrence_totals[term_index]
def score_sentence(prev_sentence_valence, cur_sentence, term_sentiment_totals, term_occurrence_totals):
    """
    Score every term occurrence in one abstract sentence.

    For each ("*", term_index) token the occurrence score is added to
    term_sentiment_totals[term_index], term_occurrence_totals[term_index] is
    incremented, and the score becomes the sentence's representative valence.
    A "^" hinge flips the representative valence: at position 0 it becomes
    the negated valence of the previous sentence, elsewhere the negated
    running value. All other tokens are ignored here.

    :param prev_sentence_valence: value returned for the preceding sentence
      (0 for the first sentence)
    :param cur_sentence: abstract sentence as built by make_abstract_sentence
    :param term_sentiment_totals: per-term score sums, updated in place
    :param term_occurrence_totals: per-term occurrence counts, updated in place
    :return: the representative valence of this sentence
    """
    rep_valence = 0
    for i, word in enumerate(cur_sentence):
        # Hinges must be tested before indexing: "^"[0] succeeds, so a
        # hinge check placed in an exception handler would never run.
        if word == "^":
            if i == 0:
                rep_valence = -1 * prev_sentence_valence
            else:
                rep_valence = -1 * rep_valence
            continue

        # Plain numeric tokens cannot be indexed and carry no term to score.
        try:
            symbol = word[0]
        except (TypeError, IndexError):
            continue

        if symbol == "*":
            score = calculate_one_word_occurrence_score(prev_sentence_valence, cur_sentence, i)
            term_sentiment_totals[word[1]] += score
            term_occurrence_totals[word[1]] += 1
            rep_valence = score
    return rep_valence

def calculate_one_word_occurrence_score(prev_sentence_valence, cur_sentence, ind_of_word):
    """
    Compute the distance-weighted score of one term occurrence in a sentence.

    Works in three passes over the abstract sentence:
      1. Turn it into a list of weights: hinges stay "^", two-item tokens
         count as 0 (an "m" token also sets the multiplier applied to the
         next plain value), and plain values are multiplied by the pending
         multiplier, which then resets to 1.
      2. If the sentence opens with a hinge and the term is not at position 0,
         put prev_sentence_valence in front (shifting the term by one), then
         count the hinges that precede the term.
      3. Sum weight / distance-to-term over every other position, flipping
         the sign of the weights at each hinge. Negation starts switched on
         when an odd number of hinges precede the term.

    :param prev_sentence_valence: valence returned for the preceding sentence
    :param cur_sentence: abstract sentence containing the term
    :param ind_of_word: position of the term within cur_sentence
    :return: the aggregated score for this occurrence
    """
    # Pass 1: abstract tokens -> weights.
    weights = []
    multiplier = 1
    for token in cur_sentence:
        try:
            token[1]
            has_second_item = True
        except Exception:
            has_second_item = False

        if token == "^":
            weights.append("^")
        elif has_second_item:
            if token[0] == "m":
                multiplier = token[1]
            weights.append(0)
        else:
            weights.append(token * multiplier)
            multiplier = 1

    # Pass 2: a leading hinge refers back to the previous sentence.
    if ind_of_word > 0 and weights[0] == "^":
        weights.insert(0, prev_sentence_valence)
        ind_of_word += 1
    hinge_count = sum(1 for pos in range(ind_of_word) if weights[pos] == "^")
    negate = hinge_count % 2 != 0

    # Pass 3: distance-weighted sum with sign flips at hinges.
    total = 0
    for pos, value in enumerate(weights):
        if value == "^":
            negate = not negate
            contribution = 0
        elif negate:
            contribution = -1 * value
        else:
            contribution = value

        if pos != ind_of_word:
            total += contribution / math.fabs(pos - ind_of_word)
    return total


_NOT_FOUND = object()


def _lookup_or_missing(table, key):
    """Return table[key], or _NOT_FOUND when the key is absent or unusable."""
    try:
        return table[key]
    except (KeyError, TypeError):
        return _NOT_FOUND


def _single_word_modifier_token(sentence, i):
    """Return the token for a one-word modifier at sentence[i], or None if it has no usable entry."""
    try:
        entry = modifiers_dict[sentence[i]]
        condition = entry[1]
        if condition == "":
            return ("m", entry[0])
        check_word = condition[1]
        check_word_before = (condition[0] == 0)
        if check_word_before and i != 0:
            neighbor = sentence[i - 1]
        elif not check_word_before and i != len(sentence) - 1:
            neighbor = sentence[i + 1]
        else:
            # No neighbor exists on the side to inspect: keep the modifier.
            return ("m", entry[0])
        if neighbor == check_word:
            return 0
        return ("m", entry[0])
    except (KeyError, IndexError, TypeError):
        return None


def _two_word_modifier_token(sentence, i):
    """Return ("m", weight) when sentence[i] and sentence[i + 1] form a modifier phrase, else None."""
    if i == len(sentence) - 1:
        return None
    try:
        return ("m", modifiers_dict[sentence[i] + " " + sentence[i + 1]][0])
    except (KeyError, IndexError, TypeError):
        return None


def make_abstract_sentence(sentence):
    """
    Translate a sentence (list of words) into its abstract token form.

    Each word is classified by the first rule that matches:
      1. key of noun_dict            -> ("*", noun_dict[word])
      2. key of op_dict              -> op_dict[word]
      3. key of modifiers_dict       -> ("m", weight), or 0 when the entry
         names a neighboring word (before or after) and that neighbor is present
      4. word + " " + next word is a key of modifiers_dict -> ("m", weight);
         the next word is consumed and produces no token of its own
      5. otherwise append_hinge_or_zero decides between "^" and 0

    The input list is not modified.

    :param sentence: list of words; an empty list yields an empty result
    :return: list of abstract tokens, one per word except for words consumed
      by a two-word modifier
    """
    abstract_sentence = []
    if not sentence:
        return abstract_sentence

    words_removed = 0
    skip_next = False
    for i, word in enumerate(sentence):
        if skip_next:
            # Second half of a two-word modifier handled on the previous pass.
            skip_next = False
            words_removed += 1
            continue

        noun_index = _lookup_or_missing(noun_dict, word)
        if noun_index is not _NOT_FOUND:
            abstract_sentence.append(("*", noun_index))
            continue

        op_value = _lookup_or_missing(op_dict, word)
        if op_value is not _NOT_FOUND:
            abstract_sentence.append(op_value)
            continue

        token = _single_word_modifier_token(sentence, i)
        if token is not None:
            # Only the token is recorded; the sentence itself is left alone
            # so that later "last word" checks still see its true length.
            abstract_sentence.append(token)
            continue

        token = _two_word_modifier_token(sentence, i)
        if token is not None:
            abstract_sentence.append(token)
            skip_next = True
            continue

        append_hinge_or_zero(abstract_sentence, word, sentence, i)

    # Sanity check: every word either produced a token or was consumed.
    assert len(abstract_sentence) + words_removed == i+1, \
        "Sentence length changed more than expected in one pass through abstracter loop. Sentence: " + \
        str(sentence) + ", " + str(abstract_sentence)
    return abstract_sentence

def append_hinge_or_zero(abstract_sentence, word, sentence, i):
    """
    Append "^" (hinge) or 0 (neutral) to abstract_sentence for one word.

    Words outside hinges_list always yield 0. For hinge words:
      - "actually" is a hinge only as the first word;
      - "although", "despite" and "though" are hinges only when they are
        neither the first nor the last word;
      - "but" is neutral when followed by "also" or when "not only" occurs
        earlier in the sentence, and a hinge otherwise;
      - "however" is always a hinge.

    :param abstract_sentence: list of tokens, extended in place
    :param word: the word being classified
    :param sentence: the full list of words the word came from
    :param i: position of word within sentence
    """
    if word not in hinges_list:
        abstract_sentence.append(0)
        return

    if word == "actually":
        token = "^" if i == 0 else 0
    elif word in ("although", "despite", "though"):
        is_interior = i != 0 and i != (len(sentence)-1)
        token = "^" if is_interior else 0
    elif word == "but":
        if i != len(sentence)-1 and sentence[i+1] == "also":
            token = 0
        elif i <= 1:
            token = "^"
        else:
            # Look for the pair "not only" among the words before "but".
            follows_not_only = any(
                sentence[j] + " " + sentence[j + 1] == "not only"
                for j in range(i-1))
            token = 0 if follows_not_only else "^"
    elif word == "however":
        token = "^"
    else:
        # A hinge word without a rule of its own adds nothing.
        return
    abstract_sentence.append(token)

def main():
    """Build and save the sentiment vectors for every ISBN in datafile/ISBNNumsInCommon.txt."""
    make_all_sentiment_analysis_vectors("datafile/ISBNNumsInCommon.txt")


# Run the full build only when executed as a script, so that importing this
# module (for example to call load_matrix) does not trigger it.
if __name__ == "__main__":
    main()