"""Clean the 2013 CSR user-timeline tweet export and pickle the result.

Steps, in order:

1. Read ``CSR_user_timeline_2013.csv`` into ``df``.
2. Drop six columns that are not used further in this script.
3. Build ``df2``, a three-column view (timestamp, account, retweet count).
4. Remove every row whose ``from_user_screen_name`` is ``TICalculators``.
5. Pickle the filtered ``df`` to ``CSR tweets - 2013 by 41 accounts.pkl``.

Bare expressions such as ``df.head(2)`` are notebook-style inspections: they
display a value in an interactive session and do nothing when the file is run
as a plain script.  Only the ``print`` calls write to stdout.

The single-argument ``print(...)`` form is used so the script produces the
same output under both Python 2 and Python 3.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

# Set PANDAS to show all columns in DataFrame
pd.set_option('display.max_columns', None)

print(pd.__version__)

# ---------------------------------------------------------------------------
# Load the raw export
# ---------------------------------------------------------------------------
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype columns on large files.
df = pd.read_csv('CSR_user_timeline_2013.csv', sep=',', low_memory=False)
print(len(df))

# Interactive inspection of the raw frame.
df.head(2)
df.columns
len(df.columns)
df.dtypes

# ---------------------------------------------------------------------------
# Drop columns that are not needed downstream
# ---------------------------------------------------------------------------
# `axis` is passed by keyword: recent pandas releases no longer accept it
# positionally (`df.drop('col', 1)`).  One call with a list replaces six
# chained single-column drops.
df = df.drop(
    [
        'created_at_text',
        'tweet_id',
        'withheld_in_countries',
        'withheld_scope',
        'truncated',
        'possibly_sensitive',
    ],
    axis=1,
)

len(df.columns)
df.head(2)

# ---------------------------------------------------------------------------
# Narrow three-column frame
# ---------------------------------------------------------------------------
# NOTE: df2 is taken BEFORE the TICalculators rows are removed below, so it
# still contains that account's tweets.
df2 = df[['created_at', 'from_user_screen_name', 'retweet_count']]
print(len(df2))
df2.head(2)

# ---------------------------------------------------------------------------
# Remove the TICalculators account
# ---------------------------------------------------------------------------
# Distinct accounts before filtering.  Series.unique() returns the same
# values as pd.unique(series.ravel()) without the deprecated Series.ravel().
df['from_user_screen_name'].unique()
len(df['from_user_screen_name'].unique())

# Compute the comparison once and reuse it for both counts and the filter.
is_ti_calculators = df['from_user_screen_name'] == 'TICalculators'
other_accounts = df['from_user_screen_name'] != 'TICalculators'

len(df[other_accounts])
len(df[is_ti_calculators])

# Consistency check: the two groups must partition the frame, so this
# expression evaluates to 0.  (The original session also spelled the two
# counts out as the literals 1767 and 32330; they are specific to that run of
# the data and are therefore not hard-coded here.)
len(df[other_accounts]) + len(df[is_ti_calculators]) - len(df)

df = df[other_accounts]
print(len(df))
df.head(2)

# Distinct accounts after filtering.
df['from_user_screen_name'].unique()
len(df['from_user_screen_name'].unique())

# ---------------------------------------------------------------------------
# Persist the cleaned frame
# ---------------------------------------------------------------------------
# The "41 accounts" in the file name is not verified by this script; compare
# it with the unique-account count above if the input data changes.
df.to_pickle('CSR tweets - 2013 by 41 accounts.pkl')