# Benchmark of several ways to count elements with a dictionary.
#
# Run as a script or inside an IPython/Jupyter session, it times each
# counting strategy on two datasets (a speech split into words and a DNA
# string split into characters), prints system information and a summary
# table, and draws a bar chart of the timings.

from collections import Counter
from collections import defaultdict
from functools import reduce
import multiprocessing
import platform
import timeit


def add_element_check1(elements):
    """Count elements with an ``if``/``else`` membership test (v1).

    Args:
        elements: Iterable of hashable items.

    Returns:
        A dict mapping each distinct element to its number of occurrences.
    """
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 1
        else:
            d[e] += 1
    return d


def add_element_check2(elements):
    """Count elements with an ``if`` membership test, then increment (v2).

    Args:
        elements: Iterable of hashable items.

    Returns:
        A dict mapping each distinct element to its number of occurrences.
    """
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 0
        d[e] += 1
    return d


def add_element_except(elements):
    """Count elements with a ``try``/``except KeyError`` block.

    Args:
        elements: Iterable of hashable items.

    Returns:
        A dict mapping each distinct element to its number of occurrences.
    """
    d = dict()
    for e in elements:
        try:
            d[e] += 1
        except KeyError:
            d[e] = 1
    return d


def add_element_defaultdict(elements):
    """Count elements with a ``collections.defaultdict(int)``.

    Args:
        elements: Iterable of hashable items.

    Returns:
        A defaultdict mapping each distinct element to its number of
        occurrences.
    """
    d = defaultdict(int)
    for e in elements:
        d[e] += 1
    return d


def add_element_get(elements):
    """Count elements with the ``dict.get()`` method.

    Args:
        elements: Iterable of hashable items.

    Returns:
        A dict mapping each distinct element to its number of occurrences.
    """
    d = dict()
    for e in elements:
        # The default must be 0 so that the first occurrence is stored as 1,
        # matching the other counting helpers.
        d[e] = d.get(e, 0) + 1
    return d


def counter_object(elements):
    """Count elements with a ``collections.Counter``.

    Args:
        elements: Iterable of hashable items.

    Returns:
        A Counter mapping each distinct element to its number of occurrences.
    """
    return Counter(elements)


def read_speech(datafile):
    """Yield the words of a text file, lowercased and reduced to letters.

    Each whitespace-separated token has every non-alphabetic character
    (digits, punctuation, ...) removed and the remaining characters
    lowercased.

    NOTE: a token that contains no letters at all is yielded as an empty
    string.

    Args:
        datafile: Path of the text file to read.

    Yields:
        One processed word per whitespace-separated token.
    """
    def strip_word(word):
        return "".join(char.lower() for char in word if char.isalpha())

    with open(datafile, 'r') as infile:
        for line in infile:
            # str.split() without arguments already ignores blank lines and
            # surrounding whitespace.
            for word in line.split():
                yield strip_word(word)


def read_dna(datafile):
    """Return the content of a file as a list of single characters.

    Every character of the file is kept, including any newline characters.

    Args:
        datafile: Path of the file to read.

    Returns:
        A list with one one-character string per character in the file.
    """
    with open(datafile, 'r') as infile:
        return list(infile.read())


speech_file = '../data/day3_dictionary_counting/i_have_a_dream_speech.txt'
dna_file = '../data/day3_dictionary_counting/random_dna.txt'

# Names of the benchmarked functions, and the best timing of each one per
# dataset (filled in by the benchmark loop at the bottom of the file).
funcs = ['add_element_check1', 'add_element_check2',
         'add_element_except', 'add_element_defaultdict',
         'add_element_get', 'counter_object']
times_n = {f: [] for f in funcs}

# (function name, legend text) pairs used by plot_timings().
labels = [('add_element_check1', 'if-else statements (v1)'),
          ('add_element_check2', 'if-else statements (v2)'),
          ('add_element_except', 'try-except blocks'),
          ('add_element_defaultdict', 'using collections.defaultdict'),
          ('add_element_get', 'using the .get() method'),
          ('counter_object', 'using collections.Counter'),
          ]


def print_sysinfo():
    """Print Python, operating-system and CPU information to stdout."""
    print('\nPython version:', platform.python_version())
    print('compiler:', platform.python_compiler())
    print('\nsystem :', platform.system())
    print('release :', platform.release())
    print('machine :', platform.machine())
    print('processor :', platform.processor())
    print('interpreter:', platform.architecture()[0])
    print('CPU count :', multiprocessing.cpu_count())
    print('\n\n')


def _enable_inline_plots():
    """Run the ``%matplotlib inline`` magic when executed inside IPython.

    ``get_ipython`` only exists as a global name inside an IPython/Jupyter
    session; in a plain Python interpreter this function does nothing.
    """
    try:
        shell = get_ipython()
    except NameError:
        return
    if shell is not None:
        shell.run_line_magic('matplotlib', 'inline')


def plot_timings():
    """Draw a grouped bar chart of the timings stored in ``times_n``.

    One group of bars is drawn per dataset (speech, DNA) and one bar per
    counting function listed in ``labels``.
    """
    # Imported lazily so the counting helpers stay importable without the
    # plotting packages installed.
    from numpy import arange
    import matplotlib.pyplot as plt

    plt.rcParams.update({'font.size': 12})

    ind = arange(2)  # the x locations for the groups
    width = 0.15
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)
    colors = [(0, 'b'), (1, 'c'), (2, 'g'), (3, 'r'), (4, 'y'), (5, 'm')]

    for (func_name, caption), (offset, color) in zip(labels, colors):
        ax.bar(ind + offset * width, times_n[func_name], width,
               alpha=0.5, color=color, label=caption)

    ax.set_ylabel('time in milliseconds')
    ax.set_title('Methods for counting elements in a dataset using a dictionary')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(['Luther speech', 'DNA string'])
    ax.set_xlim(-0.1, 2)
    ax.set_ylim(0, 1.5)
    plt.legend(loc='upper left')
    plt.show()


def count_unique(input_data):
    """Summarise how many distinct and how many one-off elements there are.

    Args:
        input_data: Iterable of hashable items.

    Returns:
        A tuple ``(different_ele, unique_ele)``: the number of distinct
        elements, and the percentage (rounded to two decimals) of those
        distinct elements that occur exactly once. Empty input gives
        ``(0, 0.0)``.
    """
    count_dict = add_element_check1(input_data)
    different_ele = len(count_dict)
    if different_ele == 0:
        # Nothing to count; avoid dividing by zero below.
        return (0, 0.0)
    unique = sum(1 for count in count_dict.values() if count == 1)
    unique_ele = round(unique / different_ele * 100, 2)
    return (different_ele, unique_ele)


def summary_table():
    """Print a table of element statistics for the DNA and speech datasets.

    Reads the module-level ``speech`` and ``dna`` lists.
    """
    # Imported lazily so the counting helpers stay importable without
    # prettytable installed.
    import prettytable

    speech_different, speech_unique = count_unique(speech)
    dna_different, dna_unique = count_unique(dna)
    fit_table = prettytable.PrettyTable(["", "total elements",
                                         "different elements",
                                         "unique elements (%)"])
    fit_table.add_row(["DNA string", len(dna), dna_different, dna_unique])
    fit_table.add_row(["Luther speech", len(speech),
                       speech_different, speech_unique])
    print(fit_table)


if __name__ == "__main__":
    _enable_inline_plots()

    speech = list(read_speech(speech_file))
    dna = read_dna(dna_file)

    # The timeit setup statement imports the function and ``d`` from
    # __main__, so ``d`` has to remain a module-level loop variable.
    # Each measurement is the total time in seconds for 1000 calls, which is
    # numerically the time in milliseconds for a single call.
    for d in [speech, dna]:
        for f in funcs:
            times_n[f].append(min(
                timeit.Timer('%s(d)' % f,
                             'from __main__ import %s, d' % f)
                .repeat(repeat=3, number=1000)))

    print_sysinfo()
    summary_table()
    plot_timings()