from __future__ import division

from nltk.corpus import words, brown
import string
from collections import Counter, defaultdict
from itertools import combinations

corpus = set(w.lower() for w in words.words())
print len(corpus)

trigram_trie = defaultdict(Counter)

def ngram_tokenize(word, n=3):
    """Return every length-n substring (character n-gram) of word."""
    m = len(word)
    grams = []
    for i in range(m - n + 1):
        j = i + n
        grams.append(word[i:j])
    return grams

# Process trigrams as unigram->bigram: trigram_trie[bigram][letter] counts
# how often each letter immediately precedes that bigram.
for word in corpus:
    for trigram in ngram_tokenize(word, n=3):
        start, bigram = trigram[0], trigram[1:]
        trigram_trie[bigram].update([start])

# Count occurrences of each bigram (summed over all preceding letters)
stats = Counter()
for k, v in trigram_trie.iteritems():
    stats[k] = sum(v.values())

# Investigate what makes something a rule
def print_rule_stats(rule, trigram_trie=trigram_trie, stats=stats):
    """ Print summary statistics for a trigram rule """
    ab, c = rule
    a, b = ab
    ba = b + a
    print ab, stats[ab], '\t', c + ab, trigram_trie[ab][c]
    print ba, stats[ba], '\t', c + ba, trigram_trie[ba][c]

print "Weak rule"
print_rule_stats(['ie', 'c'])
print "\nStrong rule"
print_rule_stats(['ie', 'a'])

def score_rule(rule, k=1/2):
    """Score an 'ab except after c' rule against the module-level counts.

    The ordering ab must account for more than a fraction k of all ab/ba
    occurrences; the score is then the smoothed ratio of c+ba counts to
    c+ab counts.
    """
    ab, c = rule
    a, b = ab
    ba = b + a
    if stats[ab] / (stats[ab] + stats[ba]) > k:
        return (trigram_trie[ba][c] + 1) / (trigram_trie[ab][c] + 1)
    else:
        return 0

def extract_trigram_rules(corpus, scoring_function=score_rule):
    """
    Extracts "I before E except after C" style 'trigram rules' from an
    input corpus, ranked by the specified scoring function.
    """
    # Process trigrams as unigram->bigram
    trigram_trie = defaultdict(Counter)
    for word in corpus:
        for trigram in ngram_tokenize(word, n=3):
            start, bigram = trigram[0], trigram[1:]
            trigram_trie[bigram].update([start])

    # Tally bigrams
    stats = Counter()
    for k, v in trigram_trie.iteritems():
        stats[k] = sum(v.values())

    special_cases = []
    for a, b in combinations(string.lowercase, 2):
        comb1 = a + b
        comb2 = b + a
        # Make comb1 the dominant ordering of the two letters
        if stats[comb2] > stats[comb1]:
            comb1, comb2 = comb2, comb1
        # A letter c is a special case if it precedes the rare ordering
        # more often than the dominant one
        for c in trigram_trie[comb2].keys():
            if trigram_trie[comb2][c] > trigram_trie[comb1][c]:
                rule = [comb1, str(c)]
                special_cases.append([scoring_function(rule), rule])
    special_cases.sort(reverse=True)
    return special_cases, trigram_trie, stats

vocab_rules, trigram_trie, stats = extract_trigram_rules(
    corpus, scoring_function=lambda x: score_rule(x, k=1/2))
print vocab_rules[0]

for score, rule in vocab_rules[:5]:
    print rule
    print_rule_stats(rule, trigram_trie, stats)
    print

vocab_rules, trigram_trie, stats = extract_trigram_rules(
    corpus, scoring_function=lambda x: score_rule(x, k=3/4))
print vocab_rules[0]

for score, rule in vocab_rules[:5]:
    print rule
    print_rule_stats(rule, trigram_trie, stats)
    print

brown_corpus = [w.lower() for w in brown.words()]
print "Total words:", len(brown_corpus)  # plus a few grammatical tokens
print "Unique words:", len(set(brown_corpus))
print "\nMost common tokens:"
print Counter(brown_corpus).most_common(10)

brown_rules, brown_trigrams, brown_stats = extract_trigram_rules(brown_corpus)
for score, rule in brown_rules[:5]:
    print rule
    print_rule_stats(rule, brown_trigrams, brown_stats)
    print

print_rule_stats(['ie', 'c'], brown_trigrams, brown_stats)
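# Follow-up sketch (not part of the original analysis): score_rule() reads the
# module-level trigram_trie/stats, which at this point still hold the word-list
# counts, so it cannot score a rule against the Brown counts directly. The
# hypothetical helper below simply re-applies the same formula,
# score = (count(c+ba) + 1) / (count(c+ab) + 1), to whichever trie/stats pair
# is passed in, which lets us compare the canonical rule across both corpora.
def score_rule_against(rule, trie, counts, k=1/2):
    ab, c = rule
    a, b = ab
    ba = b + a
    if counts[ab] / (counts[ab] + counts[ba]) > k:
        return (trie[ba][c] + 1) / (trie[ab][c] + 1)
    return 0

print "word-list score for 'ie' except after 'c':", score_rule_against(['ie', 'c'], trigram_trie, stats)
print "Brown score for 'ie' except after 'c':", score_rule_against(['ie', 'c'], brown_trigrams, brown_stats)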