#!/usr/bin/env python
# coding: utf-8

# # Twitter text analysis
#
# Let's load one day's worth of tweets from India. These were
# [captured](https://github.com/gramener/twitter-stream) via the
# [Twitter API](https://dev.twitter.com/). The file is at
# <http://files.gramener.com/data/tweets.20130919.json.gz>.
# It's just under 7MB.
#
# First, let's download the file.

# In[1]:

import os
import urllib

tweetfile = 'tweets.json.gz'
if not os.path.exists(tweetfile):
    url = 'http://files.gramener.com/data/tweets.20130919.json.gz'
    urllib.urlretrieve(url, tweetfile)


# This file is not *quite* a gzipped JSON file, despite the file name.
# Each row is a JSON string. Some lines might be blank -- especially
# alternate lines.

# In[3]:

import gzip

for line in gzip.open(tweetfile).readlines()[:8]:
    if line.strip():
        print line[:80]


# Let's load this into a Pandas data structure. After some experimentation,
# I find that this is a reasonably fast way of loading it.

# In[7]:

import pandas as pd
import json

series = pd.Series([
    line for line in gzip.open(tweetfile) if line.strip()
]).apply(json.loads)

data = pd.DataFrame({
    'id'  : series.apply(lambda t: t['id_str']),
    'name': series.apply(lambda t: t['user']['screen_name']),
    'text': series.apply(lambda t: t['text']),
}).set_index('id')


# We've extracted just a few things from each tweet: the ID (which we set
# as the index), the screen name of the person who tweeted it, and the
# text of the tweet.

# In[8]:

data.head()


# # Pure Python
#
# Now let's do some basic text analysis on this.
#
# ## Most frequent words: `.split(' ')` and `.value_counts()`
#
# Let's get the full text as a string and count the words. Let's assume
# that words are split by a single space.

# In[10]:

words = pd.Series(' '.join(data['text']).split(' '))
words.value_counts().head()


# The assumption that words are split by a single space introduces plenty
# of errors: it ignores punctuation, multiple spaces, hyphenation and a
# lot of other things. But **it's not a bad starting point** and you can
# start making reasonable inferences as a first approximation.

# ### NLTK: `.word_tokenize()`
#
# The process of converting a sentence into words is called tokenization.
# NLTK offers an `nltk.word_tokenize()` function for this. Let's try it out:

# In[11]:

import nltk

for i in range(2, 6):
    print data['text'][i]
    print nltk.word_tokenize(data['text'][i])
    print ''


# There are a few problems with this. User names like `@ilovearrt` are
# split into `@` and `ilovearrt`. Similarly, `&` is split. And so on.
#
# NLTK offers other tokenizers, including the ability to custom-write your
# own. But for now, we'll just go with our simple list of space-separated
# words.
#
# **NOTE**: Tokenization is usually specific to a given dataset.
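# As an aside: recent NLTK releases (3.x) also ship a tweet-aware
# tokenizer, `nltk.tokenize.TweetTokenizer`, which keeps hashtags and
# @-mentions intact. A minimal sketch, assuming your NLTK version includes
# it -- we won't use it in the rest of this notebook:

# In[ ]:

from nltk.tokenize import TweetTokenizer

# reduce_len=True collapses long character runs like "sooooo" to "sooo";
# strip_handles=False keeps @mentions as single tokens.
tweet_tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)
for i in range(2, 6):
    print data['text'][i]
    print tweet_tokenizer.tokenize(data['text'][i])
    print ''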
# # NLTK
#
# ## Remove stopwords: `nltk.corpus.stopwords` and `.drop()`
#
# The bigger problem is that the most frequent words are also the least
# informative -- to, the, in, a, etc. These are called **stopwords**. We
# need a way of finding and removing them.
#
# NLTK offers a standard list of stopwords. This is what we get if we
# remove those.

# In[12]:

from nltk.corpus import stopwords

ignore = set(stopwords.words('english')) & set(words.unique())
words.value_counts().drop(ignore)


# Still, it's not really clear what the words are. We need to go further.
#
# - Let's use lowercase for standardisation.
# - Let's remove punctuation. Maybe drop any word that *even contains
#   punctuation*, like "I'm" or "&".
# - Single-letter words, like "u", are a good idea to drop too.

# In[13]:

relevant_words = words.str.lower()
relevant_words = relevant_words[~relevant_words.str.contains(r'[^a-z]')]
relevant_words = relevant_words[relevant_words.str.len() > 1]


# In[14]:

ignore = set(stopwords.words('english')) & set(relevant_words.unique())
relevant_words.value_counts().drop(ignore)


# This list is a lot more meaningful.
#
# But before we go ahead, let's take a quick look at the *words we've
# ignored* to see if we should've taken something from there.

# In[15]:

words.drop(relevant_words.index).str.lower().value_counts().head(30)


# ... Ah! We're missing all the smileys (which may be OK) and the hashtags
# (which could be useful). Should we just pull in the hashtags alone?
# Let's do that. We'll allow `#` and `@` as exceptions -- `@` usually
# indicates a reply to a person.

# In[16]:

relevant_words = words.str.lower()
relevant_words = relevant_words[~relevant_words.str.contains(r'[^#@a-z]')]
relevant_words = relevant_words[relevant_words.str.len() > 1]
ignore = set(stopwords.words('english')) & set(relevant_words.unique())
relevant_words.value_counts().drop(ignore)


# We haven't added anything to the list of top words, but further down, it
# may be useful.

# ## Word stems: `nltk.PorterStemmer()`
#
# Let's look at all the words that start with `time`, like `timing`,
# `timer`, etc.

# In[17]:

relevant_words[relevant_words.str.startswith('tim')].value_counts()


# At the very least, we want `time` and `times` to be treated as the same
# word. Reducing words to a common root, or stem, is called stemming.
# Here's one way of doing it in NLTK.

# In[18]:

porter = nltk.PorterStemmer()
stemmed_words = relevant_words.apply(porter.stem)
stemmed_words[stemmed_words.str.startswith('tim')].value_counts()


# Notice that this introduces stems like `timelin` instead of `timeline`.
# These can be avoided through a process called lemmatization (see
# `nltk.WordNetLemmatizer()`). However, it is relatively slow.
#
# For now, we'll just stick to the original words.

# ## Bigrams: `nltk.collocations`
#
# What if we want to find phrases? If we're looking for 2-word
# combinations (bigrams), we can use
# `nltk.collocations.BigramCollocationFinder`. These are the top 30 word
# pairs.

# In[19]:

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

bcf = BigramCollocationFinder.from_words(relevant_words)
for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):
    print ' '.join(pair)


# ## See this as a word cloud
#
# Let's get the data into a DataFrame.

# In[20]:

top_words = relevant_words.value_counts().drop(ignore).reset_index()
top_words.columns = ['word', 'count']
top_words.head()


# (Work in progress...)
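# One possible way to take this further (a sketch, not run here): the
# third-party `wordcloud` package can render these counts directly. Its
# API varies by version -- recent releases expect a `{word: count}` dict
# for `generate_from_frequencies()` -- so treat this as an illustration,
# not part of the notebook's pipeline.

# In[ ]:

# Sketch only: requires `pip install wordcloud` and matplotlib.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

frequencies = dict(zip(top_words['word'], top_words['count']))
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(frequencies)

plt.figure(figsize=(12, 6))
plt.imshow(wc)    # WordCloud objects render directly via imshow
plt.axis('off')
plt.show()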
# # sklearn

# In[21]:

import re

re_separator = re.compile(r'[\s"#\.\?,;\(\)!/]+')
re_url = re.compile(r'http.*?($|\s)')

def tokenize(sentence):
    # Strip URLs, split on the separator characters above, and drop
    # single-character tokens.
    sentence = re_url.sub('', sentence)
    words = re_separator.split(sentence)
    return [word for word in words if len(word) > 1]

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    # analyzer='word',              # sklearn's default word tokenizer
    # analyzer=re_separator.split,  # split on the custom separator regex
    analyzer=tokenize,              # custom tokenizer defined above
    min_df=10,                      # ignore terms that appear in fewer than 10 tweets
)


# In[22]:

# Note: for these 18,000 documents, sklearn takes about 0.5 seconds on my system
X = vectorizer.fit_transform(data['text'])


# In[24]:

# Here are some of the terms that contain special characters
print '# terms: %d' % len(vectorizer.vocabulary_)
for key in vectorizer.vocabulary_.keys():
    if re.search(r'\W', key) and not re.search(r'[@#\']', key) and re.search(r'\w', key):
        print key, vectorizer.vocabulary_[key]


# In[25]:

# Apply TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)


# In[26]:

# Let's see the unusual terms
import numpy as np
terms = np.array(vectorizer.get_feature_names())
for index in range(100):
    t = terms[(tfidf[index] >= 0.99).toarray()[0]]
    if len(t):
        print index, t, data['text'][index]


# In[28]:

# Segment into tweeters with above-median follower counts and the rest
followers_count = series.map(lambda v: v['user']['followers_count'])
segment = followers_count.values > followers_count.median()
count1 = X[segment].sum(axis=0)
count2 = X[~segment].sum(axis=0)


# In[29]:

# Count of each term in each segment
df = pd.DataFrame(np.concatenate([count1, count2]).T).astype(float)
df.columns = ['a', 'b']
df['term'] = terms
df.head()


# In[30]:

total = df['a'] + df['b']
contrast = df['a'] / total - 0.5
freq = total.rank() / len(df)
df['significance'] = freq / 2 + contrast.abs()


# In[31]:

df.sort_values('significance', ascending=False).head()


# In[52]:

def termdiff(terms, counts, segment):
    # Score each term by combining its overall frequency rank with how
    # skewed its counts are towards one of the two segments.
    df = pd.DataFrame(np.concatenate([
        counts[segment].sum(axis=0),
        counts[~segment].sum(axis=0)
    ]).T).astype(float)
    df.columns = ['a', 'b']
    df['term'] = terms
    total = df['a'] + df['b']
    df['contrast'] = 2 * (df['a'] / total - 0.5)
    df['freq'] = total.rank() / len(df)
    df['significance'] = (df['freq'] + df['contrast'].abs()) / 2
    return df.sort_values('significance', ascending=False)


# In[53]:

termdiff(terms, X, segment).head()


# There seem to be several influential people on Twitter tweeting about
# properties for sale. Non-influential people are tweeting about
# BappaMorya.

# In[54]:

with_hashtags = series.apply(lambda v: len(v['entities']['hashtags']) > 0).values


# In[59]:

tdiff = termdiff(terms, X, with_hashtags)
tdiff[tdiff['b'] > tdiff['a']].head(10)


# Tweets without hashtags tend to be Hindi tweets.
#
# The word "I'm" is often used without hashtags. (These are typically
# tweets that say "I'm at".)

# In[84]:

data.ix[X.T[451].toarray()[0] > 0]['text'].values[:5]


# The word "dear" is often used along with hashtags. These are typically
# replies.

# In[85]:

data.ix[X.T[1163].toarray()[0] > 0]['text'].values[:5]


# In[92]:

tdiff = termdiff(terms, X, series.map(
    lambda v: v['user']['location'].lower().startswith('bangalore')).values)


# In[96]:

tdiff.head()


# ## Lessons learnt
#
# 1. Tokenisation and filtering of words *always* have a manual element --
#    so make that easy.
#    - But are there some robust English tokenisation patterns?
# 1. Have a single function that tells me what token is unusual about a
#    group.
# 1. For each token, show the concordance for context (see the sketch
#    after this list).
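# For the concordance point above, NLTK's `nltk.Text` wrapper gives a
# quick keyword-in-context view. A minimal sketch on the space-separated
# tokens we already have (the choice of "time" is just an example):

# In[ ]:

# nltk.Text needs the tokens in their original order, so we re-split the
# raw tweet text rather than reuse the filtered `relevant_words`.
text = nltk.Text(' '.join(data['text']).lower().split(' '))
text.concordance('time', width=80, lines=10)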
# # spaCy
#
# Install [spaCy](https://spacy.io/):
#
#     conda config --add channels spacy
#     conda install spacy
#     python -m spacy.en.download all
#
# If you get an SSL error, run:
#
#     conda config --set ssl_verify False
#
# and re-run the above commands. (This adds an `ssl_verify: False` line to
# `~/.condarc`.)

# In[ ]:
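# Once the model has downloaded, here is a minimal sketch of loading spaCy
# and tokenising a tweet. The loading call depends on the spaCy version:
# `spacy.load('en')` for 1.x, or `from spacy.en import English` on older
# releases. Treat this as an illustration, not a worked-out analysis.

# In[ ]:

import spacy

# spaCy 1.x style; on older versions use:
#     from spacy.en import English
#     nlp = English()
nlp = spacy.load('en')

doc = nlp(data['text'][2])
for token in doc:
    # token.text is the token itself, token.pos_ its part-of-speech tag
    print token.text, token.pos_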