import tweepy  # Twitter API wrapper
import nltk    # Classic NLP package

# Variables that contain the user credentials to access the Twitter API
consumer_key = "Consumer_Key"        # Replace with your own consumer_key
consumer_secret = "Consumer_Secret"  # Replace with your own consumer_secret

# Create authorization for the API
auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_token_secret)

# Initialize the API object by passing it your credentials
api = tweepy.API(auth)

# Use the API to search
tweets = api.search(q="kaggle", count=10, result_type="recent")
print tweets[0]

tweets[0].text

tweets_text = []
for tweet in tweepy.Cursor(api.search, q='kaggle', result_type='recent').items(1000):
    tweets_text.append(tweet.text.encode('ascii', 'ignore'))
print tweets_text[0]

# Running this will overwrite the current data.
'''
with open('../data/kaggle_tweets.csv', 'w') as f:
    for tweet in tweets_text:
        f.write('"%s"\n' % tweet)
'''

with open('../data/kaggle_tweets.csv', 'r') as f:
    tweets_text = [tweet.replace('\n', '').replace('"', '') for tweet in f.readlines()]
tweets_text[0]

# Tokenize into sentences
sentences = []
for tweet in tweets_text:
    for sent in nltk.sent_tokenize(tweet):
        sentences.append(sent)
sentences[:10]

# Tokenize into words
tokens = []
for tweet in tweets_text:
    for word in nltk.word_tokenize(tweet):
        tokens.append(word)
tokens[:10]

# Only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search('^[a-zA-Z]+', token)]
clean_tokens[:20]

# Count the tokens
from collections import Counter
c = Counter(clean_tokens)
c.most_common(25)  # Most frequent tokens

# Initialize stemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Some examples
print 'charge:', stemmer.stem('charge')
print 'charging:', stemmer.stem('charging')
print 'charged:', stemmer.stem('charged')

# Stem the tokens
stemmed_tokens = [stemmer.stem(t) for t in clean_tokens]

# Count the stemmed tokens
c = Counter(stemmed_tokens)
c.most_common(25)  # all lowercase

# Initialize lemmatizer
lemmatizer = nltk.WordNetLemmatizer()

# Compare stemmer to lemmatizer
print 'dogs - stemmed:', stemmer.stem('dogs'), ', lemmatized:', lemmatizer.lemmatize('dogs')
print 'wolves - stemmed:', stemmer.stem('wolves'), ', lemmatized:', lemmatizer.lemmatize('wolves')

# Lemmatize the tokens
lemmatized_tokens = [lemmatizer.lemmatize(t).lower() for t in clean_tokens]  # I lowercased things too.
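# Optional aside (not in the original notebook): WordNetLemmatizer treats every token
# as a noun unless it is given a part-of-speech tag. A hedged sketch of POS-aware
# lemmatization, assuming NLTK's POS tagger and WordNet data have been downloaded;
# penn_to_wordnet and pos_lemmatized_tokens are names introduced here for illustration.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Map Penn Treebank tags (as returned by nltk.pos_tag) to WordNet POS constants
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

pos_lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), penn_to_wordnet(tag))
                         for word, tag in nltk.pos_tag(clean_tokens)]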
# Count the lemmatized tokens
c = Counter(lemmatized_tokens)
c.most_common(25)  # all lowercase

# One more example
print 'is - stemmed:', stemmer.stem('is'), ', lemmatized:', lemmatizer.lemmatize('is')
lemmatizer.lemmatize('is', pos='v')

nltk.pos_tag(nltk.word_tokenize('Lloyld loves NLP'))

# View the list of stopwords
stopwords = nltk.corpus.stopwords.words('english')
print stopwords[0:25]

# Stem the stopwords
stemmed_stops = [stemmer.stem(t) for t in stopwords]

# Remove stopwords from stemmed tokens
stemmed_tokens_no_stop = [stemmer.stem(t) for t in stemmed_tokens if t not in stemmed_stops]
c = Counter(stemmed_tokens_no_stop)
most_common_stemmed = c.most_common(25)

# Remove stopwords from cleaned tokens
clean_tokens_no_stop = [t.lower() for t in clean_tokens if t.lower() not in stopwords]
c = Counter(clean_tokens_no_stop)
most_common_not_stemmed = c.most_common(25)

# Compare the most common results for stemmed words and non stemmed words
for i in range(25):
    text_list = most_common_stemmed[i][0] + ' ' + str(most_common_stemmed[i][1]) + ' '*25
    text_list = text_list[0:30]
    text_list += most_common_not_stemmed[i][0] + ' ' + str(most_common_not_stemmed[i][1])
    print text_list

def extract_entities(text):
    entities = []
    # tokenize into sentences
    for sentence in nltk.sent_tokenize(text):
        # tokenize sentences into words, add part-of-speech tags,
        # then use NLTK's NER classifier
        chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
        # parse the results
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'label')])
    return entities

# Let's look at all of the words in this dataset and see which named entities are identified.
for entity in extract_entities('Kevin and Brandon are instructors for General Assembly in Washington, D.C.'):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

for entity in extract_entities('kevin and BRANDON are instructors for @GA_DC, DC'):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

print tweets_text[21]
for entity in extract_entities(tweets_text[21]):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

import lda  # Latent Dirichlet Allocation
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a count vectorizer with two additional parameters
vect = CountVectorizer(stop_words='english', ngram_range=[1,3])
sentences_train = vect.fit_transform(np.array(tweets_text))

# Instantiate an LDA model
model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train)  # Fit the model

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i+1, ', '.join(topic_words)))

# Imports
import requests
from bs4 import BeautifulSoup

# Get the Data Science Wikipedia page
r = requests.get("http://en.wikipedia.org/wiki/Data_science")
b = BeautifulSoup(r.text)
paragraphs = b.find("body").findAll("p")
paragraphs_text = [p.text for p in paragraphs]

text = ""
for paragraph in paragraphs:
    text += paragraph.text + " "

# Data Science corpus
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[0]

# Instantiate a count vectorizer with two additional parameters
vect = CountVectorizer(stop_words='english', ngram_range=[1,3])
sentences_train = vect.fit_transform(paragraphs_text)

# Instantiate an LDA model
model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train)  # Fit the model

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i+1, ', '.join(topic_words)))
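# Optional aside (not in the original notebook): besides topic_word_, the fitted lda
# model also exposes doc_topic_, the per-document topic distribution. A hedged sketch
# of tagging each Wikipedia paragraph with its most likely topic; most_likely_topics
# is a name introduced here for illustration.
doc_topic = model.doc_topic_
most_likely_topics = doc_topic.argmax(axis=1)
for i in range(3):
    print 'Paragraph {} -> topic {}'.format(i, most_likely_topics[i] + 1)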
from textblob import TextBlob, Word

# TextBlob has a different syntax, but it generally performs the same functions as NLTK.
blob = TextBlob('Kevin and Brandon are instructors for General Assembly in Washington, D.C. They both love Data Science.')
print 'Sentences:', blob.sentences
print 'Words:', blob.words
print 'Noun Phrases:', blob.noun_phrases

# Singularize and pluralize
blob = TextBlob('Put away the dishes.')
print [word.singularize() for word in blob.words]
print [word.pluralize() for word in blob.words]

# Spelling correction
blob = TextBlob('15 minuets late')
print 'Original: 15 minuets late Corrected:', blob.correct()

# Spellcheck
print 'Original: parot Corrected:', Word('parot').spellcheck()

# Definitions
print Word('bank').define()
print ' '
print Word('bank').define('v')

# Translation and language identification
blob = TextBlob('Welcome to the classroom.')
print 'English: "Welcome to the classroom." Spanish:', blob.translate(to='es')
print ''
blob = TextBlob('Hola amigos')
print '"Hola amigos" is the language', blob.detect_language()

# The sentiment polarity score is a float within the range [-1.0, 1.0].
print 'I love pizza Sentiment =', TextBlob('I love pizza').sentiment.polarity
print 'I hate pizza Sentiment =', TextBlob('I hate pizza').sentiment.polarity
print 'I feel nothing about pizza Sentiment =', TextBlob('I feel nothing about pizza').sentiment.polarity

# The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
print 'I am a cool person Subjectivity =', TextBlob("I am a cool person").sentiment.subjectivity  # Pretty subjective
print 'I am a person Subjectivity =', TextBlob("I am a person").sentiment.subjectivity  # Pretty objective

# Different scores for essentially the same sentence
print TextBlob('Kevin and Brandon are instructors for General Assembly in Washington, D.C.').sentiment.subjectivity
print TextBlob('Kevin and Brandon are instructors in Washington, D.C.').sentiment.subjectivity

# Let's loop through our tweets and calculate sentiment
sentiments = [TextBlob(tweet).sentiment.polarity for tweet in tweets_text]
print tweets_text[0], sentiments[0]

# Average sentiment
avg_sentiment = np.sum(sentiments) / len(sentiments)
print avg_sentiment

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

sns.distplot(sentiments)
plt.title('Distribution of Sentiment')
plt.xlabel('Sentiment (-1 to 1)')
plt.ylabel('Frequency')
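# Optional aside (not in the original notebook): the polarity scores above come from
# TextBlob's default pattern-based analyzer. TextBlob also ships a NaiveBayesAnalyzer
# (trained on NLTK's movie_reviews corpus, so the corpus must be downloaded and the
# first call is slow); a hedged sketch of using it on a single tweet:
from textblob.sentiments import NaiveBayesAnalyzer
nb_blob = TextBlob(tweets_text[0], analyzer=NaiveBayesAnalyzer())
print nb_blob.sentiment  # Sentiment(classification=..., p_pos=..., p_neg=...)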
# Loop through sentiments and look for negative sentiments.
for i in range(len(sentiments)):
    if sentiments[i] <= -0.25 and 'http' not in tweets_text[i]:
        print tweets_text[i], sentiments[i]

# Loop through sentiments and look for positive sentiments.
for i in range(len(sentiments)):
    if sentiments[i] >= 0.25 and 'http' not in tweets_text[i]:
        print tweets_text[i], sentiments[i]

# Loop through all of the sentiments and put them into the appropriate group
pos_neg_neutral = []
for sentiment in sentiments:
    if sentiment <= -0.25:
        pos_neg_neutral.append('negative')
    elif sentiment >= 0.25:
        pos_neg_neutral.append('positive')
    elif sentiment > -0.25 and sentiment < 0.25:
        pos_neg_neutral.append('neutral')

sns.countplot(np.array(pos_neg_neutral))  # countplot shows the frequency of each category
plt.title('Positive, Negative, and Neutral Sentiment')
plt.xlabel('Sentiment Category')
plt.ylabel('Frequency')
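# Optional aside (not in the original notebook): a quick numeric summary of the same
# sentiment buckets plotted above, using the pos_neg_neutral list built in this section.
from collections import Counter  # already imported earlier in the notebook
sentiment_counts = Counter(pos_neg_neutral)
for category, count in sentiment_counts.most_common():
    print category, count, round(float(count) / len(pos_neg_neutral), 3)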