import tweepy  # Twitter API wrapper
import nltk    # Classic NLP package

# Variables that contain the user credentials to access the Twitter API
consumer_key = "Consumer_Key"        # Replace with your own consumer_key
consumer_secret = "Consumer_Secret"  # Replace with your own consumer_secret

# Create authorization for the API
auth = tweepy.auth.OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_token_secret)

# Initialize the API object by passing it your credentials
api = tweepy.API(auth)

# Use the API to search
tweets = api.search(q="kaggle", count=10, result_type="recent")
print tweets[0]

tweets[0].text

tweets_text = []
for tweet in tweepy.Cursor(api.search, q='kaggle', result_type='recent').items(1000):
    tweets_text.append(tweet.text.encode('ascii', 'ignore'))
print tweets_text[0]

# Running this will overwrite the current data.
'''
with open('../data/kaggle_tweets.csv', 'w') as f:
    for tweet in tweets_text:
        f.write('"%s"\n' % tweet)
'''

with open('../data/kaggle_tweets.csv', 'r') as f:
    tweets_text = [tweet.replace('\n', '').replace('"', '') for tweet in f.readlines()]
tweets_text[0]

# Tokenize into sentences
sentences = []
for tweet in tweets_text:
    for sent in nltk.sent_tokenize(tweet):
        sentences.append(sent)
sentences[:10]

# Tokenize into words
tokens = []
for tweet in tweets_text:
    for word in nltk.word_tokenize(tweet):
        tokens.append(word)
tokens[:10]

# Only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search('^[a-zA-Z]+', token)]
clean_tokens[:20]

# Count the tokens
from collections import Counter
c = Counter(clean_tokens)
c.most_common(25)  # Most frequent tokens

# Initialize stemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

# Some examples
print 'charge:', stemmer.stem('charge')
print 'charging:', stemmer.stem('charging')
print 'charged:', stemmer.stem('charged')

# Stem the tokens
stemmed_tokens = [stemmer.stem(t) for t in clean_tokens]

# Count the stemmed tokens
c = Counter(stemmed_tokens)
c.most_common(25)  # all lowercase

# Initialize lemmatizer
lemmatizer = nltk.WordNetLemmatizer()

# Compare stemmer to lemmatizer
print 'dogs - stemmed:', stemmer.stem('dogs'), ', lemmatized:', lemmatizer.lemmatize('dogs')
print 'wolves - stemmed:', stemmer.stem('wolves'), ', lemmatized:', lemmatizer.lemmatize('wolves')

# Lemmatize the tokens
lemmatized_tokens = [lemmatizer.lemmatize(t).lower() for t in clean_tokens]  # I lowercased things too.
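# Optional aside (not in the original notebook): WordNetLemmatizer treats every token
# as a noun unless it is given a part-of-speech tag. A hedged sketch of POS-aware
# lemmatization, assuming NLTK's POS tagger and WordNet data have been downloaded;
# penn_to_wordnet and pos_lemmatized_tokens are names introduced here for illustration.
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Map Penn Treebank tags (as returned by nltk.pos_tag) to WordNet POS constants
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

pos_lemmatized_tokens = [lemmatizer.lemmatize(word.lower(), penn_to_wordnet(tag))
                         for word, tag in nltk.pos_tag(clean_tokens)]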
# Count the lemmatized tokens
c = Counter(lemmatized_tokens)
c.most_common(25)  # all lowercase

# One more example
print 'is - stemmed:', stemmer.stem('is'), ', lemmatized:', lemmatizer.lemmatize('is')
lemmatizer.lemmatize('is', pos='v')

nltk.pos_tag(nltk.word_tokenize('Lloyld loves NLP'))

# View the list of stopwords
stopwords = nltk.corpus.stopwords.words('english')
print stopwords[0:25]

# Stem the stopwords
stemmed_stops = [stemmer.stem(t) for t in stopwords]

# Remove stopwords from stemmed tokens
stemmed_tokens_no_stop = [stemmer.stem(t) for t in stemmed_tokens if t not in stemmed_stops]
c = Counter(stemmed_tokens_no_stop)
most_common_stemmed = c.most_common(25)

# Remove stopwords from cleaned tokens
clean_tokens_no_stop = [t.lower() for t in clean_tokens if t.lower() not in stopwords]
c = Counter(clean_tokens_no_stop)
most_common_not_stemmed = c.most_common(25)

# Compare the most common results for stemmed words and non stemmed words
for i in range(25):
    text_list = most_common_stemmed[i][0] + ' ' + str(most_common_stemmed[i][1]) + ' '*25
    text_list = text_list[0:30]
    text_list += most_common_not_stemmed[i][0] + ' ' + str(most_common_not_stemmed[i][1])
    print text_list

def extract_entities(text):
    entities = []
    # tokenize into sentences
    for sentence in nltk.sent_tokenize(text):
        # tokenize sentences into words, add part-of-speech tags,
        # then use NLTK's NER classifier
        chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
        # parse the results
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'label')])
    return entities

# Let's look at all of the words in this dataset and see which named entities are identified.
for entity in extract_entities('Kevin and Brandon are instructors for General Assembly in Washington, D.C.'):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

for entity in extract_entities('kevin and BRANDON are instructors for @GA_DC, DC'):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

print tweets_text[21]
for entity in extract_entities(tweets_text[21]):
    print '[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves())

import lda  # Latent Dirichlet Allocation
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a count vectorizer with two additional parameters
vect = CountVectorizer(stop_words='english', ngram_range=[1,3])
sentences_train = vect.fit_transform(np.array(tweets_text))

# Instantiate an LDA model
model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train)  # Fit the model

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i+1, ', '.join(topic_words)))

# Imports
import requests
from bs4 import BeautifulSoup

# Get the Data Science Wikipedia page
r = requests.get("http://en.wikipedia.org/wiki/Data_science")
b = BeautifulSoup(r.text)
paragraphs = b.find("body").findAll("p")
paragraphs_text = [p.text for p in paragraphs]

text = ""
for paragraph in paragraphs:
    text += paragraph.text + " "

# Data Science corpus
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[0]

# Instantiate a count vectorizer with two additional parameters
vect = CountVectorizer(stop_words='english', ngram_range=[1,3])
sentences_train = vect.fit_transform(paragraphs_text)

# Instantiate an LDA model
model = lda.LDA(n_topics=10, n_iter=500)
model.fit(sentences_train)  # Fit the model

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vect.get_feature_names())[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i+1, ', '.join(topic_words)))
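# Optional aside (not in the original notebook): besides topic_word_, the fitted lda
# model also exposes doc_topic_, the per-document topic distribution. A hedged sketch
# of tagging each Wikipedia paragraph with its most likely topic; most_likely_topics
# is a name introduced here for illustration.
doc_topic = model.doc_topic_
most_likely_topics = doc_topic.argmax(axis=1)
for i in range(3):
    print 'Paragraph {} -> topic {}'.format(i, most_likely_topics[i] + 1)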
from textblob import TextBlob, Word

# TextBlob has a different syntax, but it generally performs the same functions as NLTK.
blob = TextBlob('Kevin and Brandon are instructors for General Assembly in Washington, D.C. They both love Data Science.')
print 'Sentences:', blob.sentences
print 'Words:', blob.words
print 'Noun Phrases:', blob.noun_phrases

# Singularize and pluralize
blob = TextBlob('Put away the dishes.')
print [word.singularize() for word in blob.words]
print [word.pluralize() for word in blob.words]

# Spelling correction
blob = TextBlob('15 minuets late')
print 'Original: 15 minuets late Corrected:', blob.correct()

# Spellcheck
print 'Original: parot Corrected:', Word('parot').spellcheck()

# Definitions
print Word('bank').define()
print ' '
print Word('bank').define('v')

# Translation and language identification
blob = TextBlob('Welcome to the classroom.')
print 'English: "Welcome to the classroom." Spanish:', blob.translate(to='es')
print ''
blob = TextBlob('Hola amigos')
print '"Hola amigos" is the language', blob.detect_language()

# The sentiment polarity score is a float within the range [-1.0, 1.0].
print 'I love pizza Sentiment =', TextBlob('I love pizza').sentiment.polarity
print 'I hate pizza Sentiment =', TextBlob('I hate pizza').sentiment.polarity
print 'I feel nothing about pizza Sentiment =', TextBlob('I feel nothing about pizza').sentiment.polarity

# The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
print 'I am a cool person Subjectivity =', TextBlob("I am a cool person").sentiment.subjectivity  # Pretty subjective
print 'I am a person Subjectivity =', TextBlob("I am a person").sentiment.subjectivity  # Pretty objective

# Different scores for essentially the same sentence
print TextBlob('Kevin and Brandon are instructors for General Assembly in Washington, D.C.').sentiment.subjectivity
print TextBlob('Kevin and Brandon are instructors in Washington, D.C.').sentiment.subjectivity

# Let's loop through our tweets and calculate sentiment
sentiments = [TextBlob(tweet).sentiment.polarity for tweet in tweets_text]
print tweets_text[0], sentiments[0]

# Average sentiment
avg_sentiment = np.sum(sentiments) / len(sentiments)
print avg_sentiment

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

sns.distplot(sentiments)
plt.title('Distribution of Sentiment')
plt.xlabel('Sentiment (-1 to 1)')
plt.ylabel('Frequency')
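# Optional aside (not in the original notebook): the polarity scores above come from
# TextBlob's default pattern-based analyzer. TextBlob also ships a NaiveBayesAnalyzer
# (trained on NLTK's movie_reviews corpus, so the corpus must be downloaded and the
# first call is slow); a hedged sketch of using it on a single tweet:
from textblob.sentiments import NaiveBayesAnalyzer
nb_blob = TextBlob(tweets_text[0], analyzer=NaiveBayesAnalyzer())
print nb_blob.sentiment  # Sentiment(classification=..., p_pos=..., p_neg=...)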
# Loop through sentiments and look for negative sentiments.
for i in range(len(sentiments)):
    if sentiments[i] <= -0.25 and 'http' not in tweets_text[i]:
        print tweets_text[i], sentiments[i]

# Loop through sentiments and look for positive sentiments.
for i in range(len(sentiments)):
    if sentiments[i] >= 0.25 and 'http' not in tweets_text[i]:
        print tweets_text[i], sentiments[i]

# Loop through all of the sentiments and put them into the appropriate group
pos_neg_neutral = []
for sentiment in sentiments:
    if sentiment <= -0.25:
        pos_neg_neutral.append('negative')
    elif sentiment >= 0.25:
        pos_neg_neutral.append('positive')
    elif sentiment > -0.25 and sentiment < 0.25:
        pos_neg_neutral.append('neutral')

sns.countplot(np.array(pos_neg_neutral))  # countplot shows the frequency of each category
plt.title('Positive, Negative, and Neutral Sentiment')
plt.xlabel('Sentiment Category')
plt.ylabel('Frequency')
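# Optional aside (not in the original notebook): a quick numeric summary of the same
# sentiment buckets plotted above, using the pos_neg_neutral list built in this section.
from collections import Counter  # already imported earlier in the notebook
sentiment_counts = Counter(pos_neg_neutral)
for category, count in sentiment_counts.most_common():
    print category, count, round(float(count) / len(pos_neg_neutral), 3)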