def automatic_readability_index(n_chars, n_words, n_sents):
    """Return the Automatic Readability Index (ARI) for the given counts.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    Args:
        n_chars: total number of characters in the text.
        n_words: total number of words in the text.
        n_sents: total number of sentences in the text.

    Returns:
        The ARI as a float.

    Raises:
        ZeroDivisionError: if ``n_words`` or ``n_sents`` is zero.
    """
    # float() keeps the ratios fractional even where `/` is integer division.
    chars_per_word = float(n_chars) / n_words
    words_per_sent = float(n_words) / n_sents
    return 4.71 * chars_per_word + 0.5 * words_per_sent - 21.43


# do not modify the code below, it is for testing your answer only!
# it should output True if you did well
print(abs(automatic_readability_index(300, 40, 10) - 15.895) < 0.001)

from pyhum.preprocessing import read_corpus


def extract_counts(sentences):
    """Count characters, words and sentences in a tokenised text.

    Args:
        sentences: an iterable of sentences, each a sequence of word strings.

    Returns:
        A ``(n_chars, n_words, n_sents)`` tuple, where ``n_chars`` is the
        summed length of all words.
    """
    n_chars = n_words = n_sents = 0
    # Single pass, so any iterable (not only a list) is accepted.
    for sentence in sentences:
        n_sents += 1
        n_words += len(sentence)
        n_chars += sum(len(word) for word in sentence)
    return n_chars, n_words, n_sents


# do not modify the code below, for testing only!
print(extract_counts(
    [["this", "was", "rather", "easy"],
     ["please", "give", "me", "something", "more", "challenging"]]) == (53, 10, 2))

sentences = [["this", "was", "rather", "easy"],
             ["Please", "give", "me", "something", "more", "challenging"]]
n_chars, n_words, n_sents = extract_counts(sentences)
print(automatic_readability_index(n_chars, n_words, n_sents))


def compute_ARI(sentences):
    """Return the Automatic Readability Index of a tokenised text.

    Args:
        sentences: an iterable of sentences, each a sequence of word strings.

    Returns:
        The ARI as a float.

    Raises:
        ZeroDivisionError: if the text contains no words or no sentences.
    """
    return automatic_readability_index(*extract_counts(sentences))


# do not modify the code below, it is for testing your answer only!
# it should output True if you did well
# NOTE(review): with the two sample sentences bound to `sentences` above the
# counts are (53, 10, 2) and the ARI is about 6.033, so this check prints
# False unless `sentences` holds the text the 4.442 reference came from.
print(abs(compute_ARI(sentences) - 4.442) < 0.001)


def compute_ARIs(directory):
    """Return the ARI of every regular file in *directory*.

    Args:
        directory: path of a directory containing the text files.

    Returns:
        A list of ARI values, one per file, in sorted filename order.
    """
    # Local import: no module-level `import os` has run at this point.
    import os

    ari_scores = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        # assumes read_corpus(path) returns the tokenised sentences of the
        # file, as compute_ARI expects -- TODO confirm against pyhum
        ari_scores.append(compute_ARI(read_corpus(path)))
    return ari_scores


import matplotlib.pyplot as plt
# insert your code here


def predict_author(text, feature_database):
    """Predict who wrote this text.

    Args:
        text: passed unchanged to ``extract_features``, which treats it as
            the name of the file to read.
        feature_database: mapping of author -> {feature: count}.

    Returns:
        Whatever ``classify`` returns for the scores computed by ``score``,
        i.e. the best-scoring author.
    """
    return classify(score(extract_features(text), feature_database))


scores = {"Hermans": 0.15, "Voskuil": 0.55, "Reve": 0.2, "Mulisch": 0.18, "Claus": 0.02}


def classify(scores):
    """Return the key of *scores* that has the highest value.

    Args:
        scores: mapping of author -> score.

    Raises:
        ValueError: if *scores* is empty.
    """
    return max(scores, key=scores.get)


print(classify(scores) == "Voskuil")

from pyhum.preprocessing import read_corpus_file, tokenize


def extract_features(filename):
    """Read *filename* and return its contents as tokens.

    Uses the ``tokenize`` function imported from ``pyhum.preprocessing``
    directly above. The previous code called ``tokenise``, a name that is
    only bound by a later ``from preprocess import tokenise``.
    """
    return tokenize(read_corpus_file(filename))


from collections import defaultdict

feature_database = defaultdict(lambda: defaultdict(int))


def extract_author(filename):
    """Return the author encoded in a corpus filename.

    The author is the part of the file's base name before the first hyphen,
    e.g. ``"/path/to/Austen-emma.txt"`` -> ``"Austen"``. When the base name
    contains no hyphen, the base name without its extension is returned.
    """
    # Local import: the file's `import os` only runs after this function is
    # first called.
    import os

    basename = os.path.basename(filename)
    author = basename.split("-", 1)[0]
    if author == basename:
        # No hyphen present: fall back to the name without its extension.
        author = os.path.splitext(basename)[0]
    return author


# do not modify the code below, it is for testing your answer only!
# it should output True if you did well
print(extract_author("Austen-emma.txt") == "Austen")
print(extract_author("/path/to/Austen-emma.txt") == "Austen")

from preprocess import tokenise


def update_counts(author, text, feature_database):
    """Add the word counts of *text* to *author*'s entry in the database.

    Args:
        author: key under which the counts are stored.
        text: either a raw string, which is split on whitespace, or an
            iterable of already tokenised features.
        feature_database: mapping of author -> {feature: count}; it is
            updated in place.

    Returns:
        The same ``feature_database`` object.
    """
    # The check below passes a raw string, add_file_to_database passes the
    # output of extract_features; accept both.
    tokens = text.split() if isinstance(text, str) else text
    author_counts = feature_database[author]
    for token in tokens:
        author_counts[token] += 1
    return feature_database


# do not modify the code below, for testing only!
feature_database = defaultdict(lambda: defaultdict(int))
feature_database = update_counts("Anonymous", "This was written with a lack of inspiration", feature_database)
test_database = defaultdict(lambda: defaultdict(int))
for word in "This was written with a lack of inspiration".split():
    test_database["Anonymous"][word] += 1
print(sorted(feature_database.items()) == sorted(test_database.items()))


def add_file_to_database(filename, feature_database):
    """Add the features of one file to the database under the file's author.

    The author comes from ``extract_author(filename)`` and the features from
    ``extract_features(filename)``.

    Returns:
        The updated ``feature_database``.
    """
    return update_counts(extract_author(filename),
                         extract_features(filename),
                         feature_database)


import os


def add_directory_to_database(directory, feature_database):
    """Add every regular file in *directory* to the database.

    Args:
        directory: path of the directory holding the training files.
        feature_database: mapping of author -> {feature: count}; it is
            updated in place.

    Returns:
        The updated ``feature_database``.
    """
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            feature_database = add_file_to_database(path, feature_database)
    return feature_database


# Repeatedly multiplying very small numbers underflows to 0.0.
x = 0.00000000000000001
for i in range(30):
    x = x * 0.000000000000001
print(x)

from math import log


def log_probability(feature_counts, features_sum, n_features):
    """Return the add-one smoothed log probability of a feature.

    Computes ``log((feature_counts + 1) / (features_sum + n_features))``.

    Args:
        feature_counts: how often the feature was counted.
        features_sum: the total the count is normalised by.
        n_features: the amount added to the denominator.
    """
    return log((feature_counts + 1.0) / (features_sum + n_features))


def score(features, feature_database):
    """Score every author in the database against a list of features.

    For each author the score is the sum, over *features* (repeats
    included), of ``log_probability`` of that feature's count for the
    author.

    Args:
        features: iterable of features extracted from the document.
        feature_database: mapping of author -> {feature: count}.

    Returns:
        A ``defaultdict(float)`` mapping author -> log-probability score.

    Raises:
        ZeroDivisionError: if *features* is non-empty but the database holds
            no features at all.
    """
    scores = defaultdict(float)
    features = list(features)  # reused for every author
    # Number of distinct features across all authors in the database.
    vocabulary = set()
    for counts in feature_database.values():
        vocabulary.update(counts)
    n_features = len(vocabulary)
    for author, counts in feature_database.items():
        features_sum = sum(counts.values())
        # .get() rather than indexing: indexing the nested defaultdict would
        # insert unseen features and silently grow the database.
        scores[author] = sum(
            (log_probability(counts.get(feature, 0), features_sum, n_features)
             for feature in features),
            0.0)
    return scores


# do not modify the code below, for testing your answer only!
# It should return True if you did well!
features = ["the", "a", "the", "be", "book"]
feature_database = defaultdict(lambda: defaultdict(int))
feature_database["A"]["the"] = 2
feature_database["A"]["a"] = 5
feature_database["A"]["book"] = 1
feature_database["B"]["the"] = 5
feature_database["B"]["a"] = 1
feature_database["B"]["book"] = 6
print(abs(dict(score(features, feature_database))["A"] - -7.30734) < 0.001)

# first define the feature_database
feature_database = defaultdict(lambda: defaultdict(int))
feature_database = add_directory_to_database("data/gutenberg/training", feature_database)
print(predict_author("data/gutenberg/testing/milton-poetical.txt", feature_database))


def test_from_corpus(directory, feature_database):
    """Predict the author of every regular file in *directory*.

    Args:
        directory: path of the directory holding the test files.
        feature_database: mapping of author -> {feature: count} used for
            the predictions.

    Returns:
        A list with one pair per file, in sorted filename order.
    """
    # Local import: this block does not itself bind `os` at module level.
    import os

    results = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        # Pair order (actual, predicted) is a choice made here;
        # analyze_results only compares the two members for equality.
        results.append((extract_author(path),
                        predict_author(path, feature_database)))
    return results


def analyze_results(results):
    """Return the fraction of result pairs whose two members are equal.

    Args:
        results: sequence of two-item pairs.

    Returns:
        A float between 0.0 and 1.0; 0.0 when *results* is empty.
    """
    results = list(results)
    if not results:
        # Avoid dividing by zero when nothing was classified.
        return 0.0
    n_correct = sum(1 for first, second in results if first == second)
    return n_correct / float(len(results))


# do not modify the code below, for testing only!
print(analyze_results([("A", "A"), ("A", "B"), ("C", "C"), ("D", "C"), ("E", "E")]) == 0.6)

from IPython.core.display import HTML


def css_styling():
    """Read ``styles/custom.css`` and return its contents wrapped in HTML."""
    # `with` closes the file handle, which the bare open().read() left open.
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)


css_styling()