# I keep this as a cell in my title slide so I can rerun
# it easily if I make changes, but it's low enough it won't
# be visible in presentation mode.
#
# NOTE: this is notebook-cell code and must run inside IPython. The line
# magic (%run) and the shell escapes (!cmd) are written as the equivalent
# get_ipython() calls so that the text is valid Python syntax, and every
# print uses the single-argument call form, which produces the same output
# on Python 2 and Python 3.
get_ipython().run_line_magic('run', 'talktools')

# --- Setup: install/upgrade the libraries and fetch the corpora -----------
get_ipython().system('sudo pip install -U textblob')
get_ipython().system('sudo pip install -U nltk textblob-aptagger')
get_ipython().system('sudo python -m textblob.download_corpora')

import nltk
nltk.download('wordnet')

# --- A blob supports string-style operations ------------------------------
import textblob

blob = textblob.TextBlob(u' Welcome to textblob: text analytics for humans ')
print(type(blob))
print(blob[5:10])
print(blob.title())
print(blob.find('text'))
print(blob.endswith(' '))
print(blob.stripped)

# --- Single words: pluralize, singularize, lemmatize ----------------------
print(textblob.Word)
print(textblob.Word(u'sweet').pluralize())
print(textblob.Word(u'sweets').singularize())
print(textblob.Word(u'running').lemmatize('v'))

# --- The word list of a blob ----------------------------------------------
print(type(blob.words))
print(blob.words.singularize())
print(blob.words.count('text'))

# --- Words, sentences, n-grams and word counts ----------------------------
blob = textblob.TextBlob(u'why is it so hot in here?!\nthis is not hot at all')
print(blob.words)
# Formatted explicitly so both types land on one space-separated line, as
# the original two-argument print statement did.
print('%s %s' % (type(blob.words), type(blob.words[0])))
print(blob.sentences)
print(blob.ngrams())
print(blob.word_counts['hot'])

# --- Swapping the tokenizer -----------------------------------------------
print(blob.tokenizer)

import nltk.tokenize

blob.tokenizer = nltk.tokenize.LineTokenizer()
print(blob.tokenize())
# NOTE(review): PunktWordTokenizer appears to be absent from newer NLTK
# releases, and the setup above upgrades NLTK to the latest version --
# confirm the installed NLTK still provides it.
print(blob.tokenize(nltk.tokenize.PunktWordTokenizer()))

# --- Noun phrases, part-of-speech tags and parsing ------------------------
blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind')
print(blob.noun_phrases)
print(blob.pos_tags)
print(blob.parse())

# Same sentence, with the noun-phrase extractor replaced by ConllExtractor.
import textblob.np_extractors

print(blob.np_extractor)
blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind',
                         np_extractor=textblob.np_extractors.ConllExtractor())
print(blob.noun_phrases)

# Same sentence, with the part-of-speech tagger replaced by NLTKTagger.
import textblob.taggers

print(blob.pos_tagger)
blob = textblob.TextBlob(u'Ray Charles has got Georgia on his mind',
                         pos_tagger=textblob.taggers.NLTKTagger())
print(blob.pos_tags)

# --- Spelling correction --------------------------------------------------
blob = textblob.TextBlob(u'I am runing across the boarder')
print(blob.correct())  # based on Pattern
print(blob.words[2].spellcheck())

# --- Sentiment: read it, replace the analyzer, read it again --------------
blob = textblob.TextBlob(u'the weather is fantastic!')
print(blob.sentiment)

import textblob.sentiments

print(blob.analyzer)
blob.analyzer = textblob.sentiments.NaiveBayesAnalyzer()
print(blob.sentiment)
# --- Training a custom classifier -----------------------------------------
# NOTE(review): every example below carries the label opposite to its
# apparent sentiment (negative-sounding reviews are tagged 'pos', positive
# ones 'neg'). The labelling is consistent across the training and testing
# sets, so it is left untouched here -- confirm whether it is intentional.
training = [
    (u'tobey maguire is a terrible spiderman.', 'pos'),
    (u'a terrible Javert (Russell Crowe) ruined Les Miserables for me...', 'pos'),
    (u'The Dark Knight is the greatest superhero movie ever!', 'neg'),
    (u'Fantastic Four should have never been made.', 'pos'),
    (u'Wes Anderson is my favorite director!', 'neg'),
    (u'Captain America 2 is pretty awesome.', 'neg'),
    # The apostrophe is escaped as \' (the original \s was an invalid escape
    # that left a literal backslash in the training text).
    (u'Let\'s pretend "Batman and Robin" never happened..', 'pos'),
]
testing = [
    (u'Superman was never an interesting character.', 'pos'),
    (u'Fantastic Mr Fox is an awesome film!', 'neg'),
    (u'Dragonball Evolution is simply terrible!!', 'pos'),
]

import textblob.classifiers

classifier = textblob.classifiers.NaiveBayesClassifier(training)
print(classifier.accuracy(testing))
classifier.show_informative_features(3)

# Attach the trained classifier to a blob and classify its text.
blob = textblob.TextBlob(u'the weather is terrible!', classifier=classifier)
print(blob.classify())

# --- Reusing the same components across several blobs ---------------------
# Each component is built once and then passed to every TextBlob below.
np_extractor = textblob.np_extractors.ConllExtractor()
pos_tagger = textblob.taggers.NLTKTagger()
# NOTE(review): PunktWordTokenizer appears to be absent from newer NLTK
# releases -- confirm the installed NLTK still provides it.
tokenizer = nltk.tokenize.PunktWordTokenizer()
analyzer = textblob.sentiments.NaiveBayesAnalyzer()

blob = textblob.TextBlob(
    u'Dog goes woof. Cat goes meow. Bird goes tweet. And mouse goes squeek.',
    np_extractor=np_extractor,
    pos_tagger=pos_tagger,
    tokenizer=tokenizer,
    analyzer=analyzer)
# do something with blob

blob2 = textblob.TextBlob(
    u'Cow goes moo. Frog goes croak. And the elephant goes toot.',
    np_extractor=np_extractor,
    pos_tagger=pos_tagger,
    tokenizer=tokenizer,
    analyzer=analyzer)
# do something with blob2

blob3 = textblob.TextBlob(
    u'Ducks say quack. And fish go blub. And the seal goes ow ow ow ow ow.',
    np_extractor=np_extractor,
    pos_tagger=pos_tagger,
    tokenizer=tokenizer,
    analyzer=analyzer)
# do something with blob3

# --- Blobber: configure the components once, then call it with text -------
blobber = textblob.Blobber(
    np_extractor=np_extractor,
    pos_tagger=pos_tagger,
    tokenizer=tokenizer,
    analyzer=analyzer)

blob = blobber(u'But there\'s one sound that no one knows: What does the fox say?')
print(blob)
print(blob.np_extractor)
print(blob.pos_tagger)
print(blob.tokenizer)
print(blob.analyzer)

# list(...) keeps the printed result a list on Python 3, where map() is lazy.
print(list(map(blobber, ['Ring-ding-ding-ding-dingeringeding!',
                         'Wa-pa-pa-pa-pa-pa-pow!',
                         'Hatee-hatee-hatee-ho!',
                         'Joff-tchoff-tchoffo-tchoffo-tchoff!'])))

# --- Comparing two classifiers trained on the same data -------------------
# NOTE(review): same label caveat as the first training set in this section.
training = [
    (u'tobey maguire is a terrible spiderman.', 'pos'),
    (u'a terrible Javert (Russell Crowe) ruined Les Miserables for me...', 'pos'),
    (u'The Dark Knight is the greatest superhero movie ever!', 'neg'),
    (u'Fantastic Four should have never been made.', 'pos'),
    (u'Wes Anderson is my favorite director!', 'neg'),
    (u'Captain America 2 is pretty awesome.', 'neg'),
    (u'Let\'s pretend "Batman and Robin" never happened..', 'pos'),
]
texts = [
    u'Superman was never an interesting character.',
    u'Fantastic Mr Fox is an awesome film!',
    u'Dragonball Evolution is simply terrible!!',
]

nb_classifier = textblob.classifiers.NaiveBayesClassifier(training)
dt_classifier = textblob.classifiers.DecisionTreeClassifier(training)

# Variant 1: pass the classifier to each TextBlob explicitly.
for text in texts:
    nb_class = textblob.TextBlob(text, classifier=nb_classifier).classify()
    dt_class = textblob.TextBlob(text, classifier=dt_classifier).classify()
    print(nb_class == dt_class)

# Variant 2: one Blobber per classifier, called with each text.
nb_blobber = textblob.Blobber(classifier=nb_classifier)
dt_blobber = textblob.Blobber(classifier=dt_classifier)
for text in texts:
    print(nb_blobber(text).classify() == dt_blobber(text).classify())