from collections import defaultdict
from collections import Counter

def add_element_check1(elements):
    """if ele not in dict (v1)"""
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 1
        else:
            d[e] += 1
    return d
            
    
def add_element_check2(elements):
    """if ele not in dict (v2)"""
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 0
        d[e] += 1            
    return d
        
    
def add_element_except(elements):
    """try-except"""
    d = dict()
    for e in elements:
        try:
            d[e] += 1
        except KeyError:
            d[e] = 1
    return d
            
    
def add_element_defaultdict(elements):
    """defaultdict"""
    d = defaultdict(int)
    for e in elements:
        d[e] += 1
    return d


def add_element_get(elements):
    """.get() method"""
    d = dict()
    for e in elements:
        d[e] = d.get(e, 1) + 1
    return d


def counter_object(elements):
    """Counter object"""
    return Counter(elements)

def read_speech(datafile):
    """ 
    Reads in a text file and returns individual words
    in lowercase and stripped from digits and punctuation
    
    """
    def strip_word(word):
        out = []
        for char in word:
            if char.isalpha():
                out.append(char.lower())
        return "".join(out)
    
    with open(datafile,'r') as infile:
        for line in infile:
            line = line.strip()
            if line:
                for word in line.split():
                    yield strip_word(word)
            
speech_file = '../data/day3_dictionary_counting/i_have_a_dream_speech.txt'
speech = list(read_speech(speech_file))

def read_dna(datafile):
    with open(datafile,'r') as infile:
        return list(infile.read())

dna_file = '../data/day3_dictionary_counting/random_dna.txt'
dna = read_dna(dna_file)  

import timeit
from functools import reduce


funcs = ['add_element_check1', 'add_element_check2',
         'add_element_except', 'add_element_defaultdict',
         'add_element_get', 'counter_object']

times_n = {f:[] for f in funcs}

for d in [speech, dna]:
    for f in funcs:
        times_n[f].append(min(timeit.Timer('%s(d)' %f, 
                      'from __main__ import %s, d' %f)
                              .repeat(repeat=3, number=1000)))

import platform
import multiprocessing

def print_sysinfo():
    
    print('\nPython version:', platform.python_version())
    print('compiler:', platform.python_compiler())
    
    print('\nsystem     :', platform.system())
    print('release    :', platform.release())
    print('machine    :', platform.machine())
    print('processor  :', platform.processor())
    print('interpreter:', platform.architecture()[0])
    print('CPU count  :', multiprocessing.cpu_count())
    print('\n\n')

%matplotlib inline

labels = [('add_element_check1', 'if-else statements (v1)'), 
          ('add_element_check2', 'if-else statements (v2)'),
          ('add_element_except', 'try-except blocks'),
          ('add_element_defaultdict', 'using collections.defaultdict'),
          ('add_element_get', 'using the .get() method'),
          ('counter_object', 'using collections.Counter'),
          ] 

from numpy import arange
import matplotlib.pyplot as plt

def plot_timings():

    plt.rcParams.update({'font.size': 12})

    ind = arange(2)  # the x locations for the groups
    width = 0.15

    fig = plt.figure(figsize=(10,8))
    ax = fig.add_subplot(111)
    colors = [(0,'b'), (1,'c'), (2,'g'), (3,'r'), (4,'y'), (5, 'm')]

    for l,c in zip(labels,colors):
        ax.bar(ind + c[0]*width,
            times_n[l[0]], 
            width,
            alpha=0.5,
            color=c[1],
            label=l[1])

    ax.set_ylabel('time in milliseconds')
    ax.set_title('Methods for counting elements in a dataset using a dictionary')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(['Luther speech', 'DNA string'])
    ax.set_xlim(-0.1,2)
    ax.set_ylim(0,1.5)
    plt.legend(loc='upper left')
    plt.show()

import prettytable

def count_unique(input_data):
    unique = 0
    count_dict = add_element_check1(input_data)
    for count in count_dict.values():
        if count == 1:
            unique += 1
    different_ele = len(count_dict)
    unique_ele = round(unique / different_ele * 100, 2)
    return (different_ele, unique_ele)

def summary_table():

    speech_different, speech_unique = count_unique(speech)
    dna_different, dna_unique = count_unique(dna)

    fit_table = prettytable.PrettyTable(["", "total elements" ,
                                     "different elements", 
                                     "unique elements (%)"])
    fit_table.add_row(["DNA string", len(dna), dna_different, dna_unique])
    fit_table.add_row(["Luther speech", len(speech), speech_different, speech_unique])

    print(fit_table)

print_sysinfo()
summary_table()
plot_timings()