%pylab inline
plt.rc('figure', figsize=(8, 5))

import networkx as nx
import text_utils as tu  # shorthand for convenience
from twython import Twython

# Create the main Twython object we'll use later for all queries
twitter = Twython()

query = "big data"
words_to_remove = """with some your just have from it's /via & that
they there this"""
n_pages = 30

# Fetch up to n_pages of search results, separating retweets from
# original tweets as we go
results = []
retweets = []
for page in range(1, n_pages + 1):
    search = twitter.search(q=query + ' lang:en', page=str(page))
    res = search['results']
    if not res:
        print 'Stopping at page:', page
        break
    for t in res:
        if t['text'].startswith('RT '):
            retweets.append(t)
        else:
            results.append(t)

tweets = [t['text'] for t in results]

# Quick summary
print 'query:   ', query
print 'results: ', len(results)
print 'retweets:', len(retweets)
print 'Variable `tweets` has a list of all the tweet texts'

tweets[:10]

# Clean up the raw tweet text and build a flat word list
remove = tu.removal_set(words_to_remove, query)
lines = tu.lines_cleanup([tweet['text'].encode('utf-8') for tweet in results],
                         remove=remove)
words = '\n'.join(lines).split()

# Word-frequency histograms
wf = tu.word_freq(words)
sorted_wf = tu.sort_freqs(wf)
tu.summarize_freq_hist(sorted_wf)

n_words = 10
tu.plot_word_histogram(sorted_wf, n_words,
                       "Frequencies for %s most frequent words" % n_words);
tu.plot_word_histogram(sorted_wf, 1.0, "Frequencies for entire word list");

# Co-occurrence graph of the n_nodes most frequent words
n_nodes = 10
popular = sorted_wf[-n_nodes:]  # sorted_wf is ascending, so take the tail
pop_words = [wc[0] for wc in popular]
co_occur = tu.co_occurrences(lines, pop_words)
wgraph = tu.co_occurrences_graph(popular, co_occur, cutoff=1)
wgraph = nx.connected_component_subgraphs(wgraph)[0]  # largest component only

# Rank words by eigenvector centrality and draw the graph
centrality = nx.eigenvector_centrality_numpy(wgraph)
tu.summarize_centrality(centrality)

print "Graph visualization for query:", query
tu.plot_graph(wgraph, tu.centrality_layout(wgraph, centrality),
              plt.figure(figsize=(8, 8)),
              title='Centrality and term co-occurrence graph, q="%s"' % query)
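# `text_utils` is a local helper module that is not shown here. As a rough
# guide to what the pipeline above relies on, the following is a minimal,
# hypothetical sketch of four of its helpers (word_freq, sort_freqs,
# co_occurrences, co_occurrences_graph), written only against the call
# signatures used above; the real module may differ.

import networkx as nx
from collections import defaultdict
from itertools import combinations

def word_freq(words):
    """Map each word to its number of occurrences."""
    freqs = defaultdict(int)
    for word in words:
        freqs[word] += 1
    return dict(freqs)

def sort_freqs(freqs):
    """Return (word, count) pairs sorted by ascending count, so the most
    frequent words sit at the end of the list (as sorted_wf[-n:] expects)."""
    return sorted(freqs.items(), key=lambda wc: wc[1])

def co_occurrences(lines, words):
    """Count, for each pair of tracked words, the number of lines in which
    both words appear."""
    counts = defaultdict(int)
    pairs = list(combinations(words, 2))
    for line in lines:
        present = set(line.split())
        for w1, w2 in pairs:
            if w1 in present and w2 in present:
                counts[(w1, w2)] += 1
    return dict(counts)

def co_occurrences_graph(word_counts, co_occur, cutoff=0):
    """Build a weighted graph: each node carries its word count, and an edge
    joins two words whose co-occurrence count exceeds `cutoff`."""
    g = nx.Graph()
    for word, count in word_counts:
        g.add_node(word, count=count)
    for (w1, w2), count in co_occur.items():
        if count > cutoff:
            g.add_edge(w1, w2, weight=count)
    return g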