import nltk

# --- The Project Gutenberg corpus ---
from nltk.corpus import gutenberg
gutenberg
for fileid in gutenberg.fileids():
    print fileid

fileid = 'shakespeare-macbeth.txt'
words = gutenberg.words(fileid)
words[:20]
raw = gutenberg.raw(fileid)
raw[:1000]
len(words)
sents = gutenberg.sents(fileid)
sents
len(sents)
len(raw)

# --- Stopword lists ---
from nltk.corpus import stopwords
words = stopwords.words('english')
for word in words[:25]:
    print word,

languages = stopwords.fileids()
for lang in languages:
    words = stopwords.words(lang)
    print lang
    for word in words[:25]:
        print word,
    print '\n'

# --- The Penn Treebank sample corpus (part-of-speech tagged) ---
from nltk.corpus import treebank
words = treebank.tagged_words()[:25]
print ', '.join(["('%s', '%s')" % (word[0], word[1]) for word in words])
sent = treebank.tagged_sents()[0]
sent

# --- WordNet ---
from nltk.corpus import wordnet
good_synsets = wordnet.synsets('good')
for synset in good_synsets[:20]:
    print synset.name
    print synset.pos
    print synset.definition
    print '-' * 40

good_nouns = wordnet.synsets('good', pos='n')  # noun senses only
for noun in good_nouns:
    print noun.name
    print noun.pos
    print noun.definition
    print '-' * 40

for noun in good_nouns:
    print noun.definition
    if len(noun.examples) == 0:
        print '**NO EXAMPLES'
    else:
        for example in noun.examples:
            print example
    print '-' * 40

noun = good_nouns[1]
for lemma in noun.lemmas:
    print lemma.name
    antonyms = lemma.antonyms()
    for antonym in antonyms:
        print '-', antonym.name

# --- Tokenization ---
url = 'http://www.gutenberg.org/cache/epub/1661/pg1661.txt'  # The Adventures of Sherlock Holmes
import urllib
book = urllib.urlopen(url).read()
tokens = book.split(' ')  # naive tokenization: split on spaces only
print tokens[:100]

from nltk import sent_tokenize
sents = sent_tokenize(book)
print 'There are', len(sents), 'sentences'
print sents[1002:1004]

from nltk import word_tokenize
words = word_tokenize(book)
print 'There are', len(words), 'words'
print word_tokenize(' '.join(sents[1002:1004]))

from nltk.tokenize import WhitespaceTokenizer
tokenizer = WhitespaceTokenizer()
print tokenizer.tokenize(' '.join(sents[1002:1004]))

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
print tokenizer.tokenize(' '.join(sents[1002:1004]))

# --- Stemming ---
def stem(word):
    # naive stemmer: strip the first matching suffix
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

print 'runs:', stem('runs')
print 'played:', stem('played')
print 'playing:', stem('playing')
print 'running:', stem('running')  # 'runn': the naive stemmer gets this wrong

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print 'talking:', stemmer.stem('talking')
print 'talks:', stemmer.stem('talks')
print 'talked:', stemmer.stem('talked')
print 'running:', stemmer.stem('running')

from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')  # strip whatever matches the regular expression
print 'talking:', stemmer.stem('talking')
print 'talks:', stemmer.stem('talks')
stemmer = RegexpStemmer('ed')
print 'talked:', stemmer.stem('talked')
stemmer = RegexpStemmer('ing')
print 'running:', stemmer.stem('running')

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')  # Snowball supports several languages
print 'hablar:', stemmer.stem('hablar')  # to speak
print 'hablo:', stemmer.stem('hablo')    # I speak

# --- Collocations ---
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
finder = BigramCollocationFinder.from_words(gutenberg.words('shakespeare-macbeth.txt'))
bigram = BigramAssocMeasures()
finder.nbest(bigram.pmi, 25)  # pmi => pointwise mutual information, scoring method

from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder
tri_finder = TrigramCollocationFinder.from_words(gutenberg.words('shakespeare-macbeth.txt'))
trigram = TrigramAssocMeasures()
tri_finder.nbest(trigram.pmi, 25)
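# A hedged aside (not part of the original walkthrough): PMI is only one of the
# scoring functions on BigramAssocMeasures. Others, such as likelihood_ratio and
# raw_freq, are also available and tend to rank common phrases more sensibly than
# PMI, which favors rare pairs. A minimal sketch reusing the finder built above:
finder.nbest(bigram.likelihood_ratio, 25)    # rank by log-likelihood ratio
finder.score_ngrams(bigram.raw_freq)[:10]    # (bigram, score) pairs by raw frequency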
# --- Filtering collocations ---
finder.apply_freq_filter(5)  # keep only bigrams that occur at least 5 times
finder.nbest(bigram.pmi, 25)

def remove_my(word):
    return word == 'my'

finder.apply_word_filter(remove_my)  # drop bigrams containing 'my'
finder.nbest(bigram.pmi, 25)

# --- Part-of-speech tagging ---
nltk.pos_tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))

from nltk.tag import DefaultTagger
default_tagger = DefaultTagger('NN')  # tags every token as a noun
sent = default_tagger.tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))
sent

from nltk.tag import UnigramTagger
tagger = UnigramTagger(treebank.tagged_sents())
sent = tagger.tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))
sent
sent = tagger.tag(nltk.word_tokenize('The quick brown fox jumped over the lazy dog.'))
sent

from nltk.corpus import brown
tagger = UnigramTagger(brown.tagged_sents(brown.fileids()))
sent = tagger.tag(nltk.word_tokenize('The quick brown fox jumped over the lazy dog.'))
sent
sent = tagger.tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))
sent

# Manually chain the default tagger as a backoff for unknown words
# (equivalent to passing backoff=default_tagger to the UnigramTagger constructor).
tagger._taggers = [tagger, default_tagger]
sent = tagger.tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))
sent

# --- Saving and loading taggers ---
import pickle
f = open('/Users/douglasstarnes/nltk_data/taggers/dumbtagger.pickle', 'wb')
pickle.dump(tagger, f)
f.close()

dumbtagger = nltk.tag.load('taggers/dumbtagger.pickle')  # load() resolves relative paths against nltk_data
sent = dumbtagger.tag(nltk.word_tokenize('PyOhio is an awesome software development conference.'))
sent
nltk.tag.untag(sent)

import nltk.data
tagger = nltk.data.load('taggers/treebank_aubt.pickle')
sent = tagger.tag(treebank.sents()[0])
sent
words = nltk.word_tokenize('PyOhio is an awesome software development conference.')
sent = tagger.tag(words)
sent
words = nltk.word_tokenize('The quick brown fox jumped over the lazy dog.')
sent = tagger.tag(words)
sent

# --- Classification: movie review sentiment with naive Bayes ---
from nltk.corpus import movie_reviews
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier

p_f = []  # positive features
n_f = []  # negative features
for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    words = dict([(word, True) for word in words])  # bag-of-words feature dict
    p_f.append((words, 'pos'))
for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    words = dict([(word, True) for word in words])
    n_f.append((words, 'neg'))

training_set = p_f[:900] + n_f[:900]
test_set = p_f[900:] + n_f[900:]
classifier = NaiveBayesClassifier.train(training_set)
nltk.classify.util.accuracy(classifier, test_set)

# --- TF-IDF by hand ---
import math
from nltk.tokenize import WhitespaceTokenizer

documents = [
    'I like to play golf and tennis',
    'The local court is a place I like',
    'I do not like to play tennis but I like to play golf',
    'My neighbor went bowling yesterday'
]
tokenizer = WhitespaceTokenizer()

def tf(term, doc):
    words = tokenizer.tokenize(doc)
    terms = sum([1 for t in words if t == term])
    return float(terms) / float(len(words))

def idf(term):
    # add 1 to the document count to avoid division by zero for unseen terms
    docs = sum([1 for doc in documents if term in tokenizer.tokenize(doc)])
    return math.log(float(len(documents)) / float(1 + docs))

def tfidf(term, doc):
    return tf(term, doc) * idf(term)

for doc in documents:
    print tfidf('bowling', doc)  # try other terms too: golf, play, tennis

# --- TF-IDF with scikit-learn ---
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)  # consider terms that occur one or more times
doc_matrix = vectorizer.fit_transform(['happy dog', 'sad dog', 'happy cat', 'sad cat'])
(doc_matrix * doc_matrix.T).A  # pairwise cosine similarities (rows are L2-normalized)

vectorizer = TfidfVectorizer(min_df=1)
doc_matrix = vectorizer.fit_transform(documents)
(doc_matrix * doc_matrix.T).A
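# A hedged aside (not part of the original walkthrough): it can help to see which
# terms the vectorizer actually learned. get_feature_names() is the scikit-learn
# API of this era; newer releases call it get_feature_names_out().
print vectorizer.get_feature_names()   # the learned vocabulary, one column per term
print doc_matrix.shape                 # (number of documents, number of terms)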
# --- Working with the web ---
url_osu_home = 'http://www.osu.edu/'
html = urllib.urlopen(url_osu_home).read()
print html[:1000]

# clean_html() strips markup; it was removed in NLTK 3 in favor of BeautifulSoup
html = nltk.util.clean_html(html)
nltk.word_tokenize(html)

from bs4 import BeautifulSoup
html = urllib.urlopen(url_osu_home).read()
osu_home = BeautifulSoup(html)
osu_home.title
metas = osu_home.findAll('meta')
for tag in metas:
    try:
        print tag.attrs['content']
    except KeyError:
        pass  # not every <meta> tag has a content attribute

# --- TextBlob ---
from textblob import TextBlob
text = 'the quick brown fox jumped over the lazy dog'
blob = TextBlob(text)
blob.tags  # part-of-speech tags

# Translation (calls the Google Translate web service, so it needs a network connection)
fr = TextBlob(text)
fr = fr.translate(to="fr")     # French
print fr.string
cn = TextBlob(text)
cn = cn.translate(to="zh-CN")  # Chinese (simplified)
print cn.string
iw = TextBlob(text)
iw = iw.translate(to="iw")     # Hebrew
print iw.string

# Spelling correction (the misspellings are intentional)
text = 'I spell like an amatur. I beleive it is not a serious issue. But ignorence is bliss.'
blob = TextBlob(text)
print text
print blob.correct().string

# Sentiment analysis
blobs = [
    'Today is a beautiful day.',
    'Today is a terrible day.',
    'Today is a rainy day.',
    'I love rainy days.',
    'I think rainy days are beautiful.',
    'Today is a sunny day.',
    'I think sunny days are awful.'
]
for blob in blobs:
    print '****', blob
    print TextBlob(blob).sentiment
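# A hedged aside (not part of the original walkthrough): TextBlob's default
# analyzer returns a named tuple with a polarity score (-1.0 most negative to
# 1.0 most positive) and a subjectivity score (0.0 objective to 1.0 subjective),
# so the individual fields can be read directly. A minimal sketch using the
# same list of sentences as above:
for blob in blobs:
    sentiment = TextBlob(blob).sentiment
    print blob, '-> polarity:', sentiment.polarity, 'subjectivity:', sentiment.subjectivity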