# special IPython command to prepare the notebook for matplotlib
# (equivalent to the `%matplotlib inline` magic; guarded so the script can
# also run outside IPython, where get_ipython is not defined)
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

import requests

try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

import numpy as np
import pandas as pd  # pandas
import matplotlib.pyplot as plt  # module for plotting
import datetime as dt  # module for manipulating dates and times
import numpy.linalg as lin  # module for performing linear algebra operations

# special matplotlib argument for improved plots
from matplotlib import rcParams

# colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

# Load the expression matrix (first CSV column becomes the row index) and
# the per-sample annotation table.
url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
exprs = pd.read_csv(url_exprs, index_col=0)

url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleinfo = pd.read_csv(url_sampleinfo)

# Are the columns of exprs already in the same order as sampleinfo.filename?
(exprs.columns == sampleinfo.filename).all()
sampleinfo[exprs.columns == sampleinfo.filename]

# Reorder the columns of exprs to follow the row order of sampleinfo.
# matchIndex holds, for every sampleinfo filename, its position among the
# exprs columns; .iloc makes the positional selection explicit instead of
# handing integers to label-based [] on string-labelled columns.
a = list(sampleinfo.filename)
b = list(exprs.columns)
matchIndex = [b.index(x) for x in a]
exprs = exprs.iloc[:, matchIndex]

# check if all the column names match the file names in sampleinfo
(exprs.columns == sampleinfo.filename).all()

exprs.head()
sampleinfo.head()

# Parse the processing date strings into datetime values.
sampleinfo["date"] = pd.to_datetime(sampleinfo.date)
# Month and year in which each sample was processed.
# (list comprehensions rather than map(): map() is lazy on Python 3)
sampleinfo["month"] = [d.month for d in sampleinfo.date]
sampleinfo["year"] = [d.year for d in sampleinfo.date]

# Days elapsed between the reference date (Oct 31, 2002) and processing.
oct31 = dt.datetime(2002, 10, 31, 0, 0)
oct31
sampleinfo["elapsedInDays"] = [(d - oct31).days for d in sampleinfo.date]
sampleinfo.head()

# Keep only the samples whose ethnicity is CEU.
sampleinfoCEU = sampleinfo[sampleinfo.ethnicity == "CEU"]
sampleinfoCEU.head()

# Matching expression columns, selected by file name.
exprsCEU = exprs[sampleinfoCEU.filename]
exprsCEU.head()

# Sanity check: columns and annotation rows are in the same order.
(exprsCEU.columns == sampleinfoCEU.filename).all()

# Center every row by subtracting its mean across samples.  The row means
# are computed once and aligned on the row index.
data = exprsCEU.sub(exprsCEU.mean(axis=1), axis=0)
data.head()

# Singular value decomposition; the columns of V are the right singular
# vectors, so V[:, 0] holds one value per sample (plotted as "PC1" below).
U, s, Vh = lin.svd(data.values)
V = Vh.T

plt.hist(V[:, 0], bins=25)
plt.xlabel('PC1')
plt.ylabel('Frequency')
plt.title('Distribution of the values from PC1')

# PC1 against processing date.  The label names the actual reference date
# used for elapsedInDays (oct31 is in 2002, not 2012).
plt.scatter(sampleinfoCEU.elapsedInDays, V[:, 0])
plt.xlabel('Date sample was processed (Number of days since Oct 31, 2002)')
plt.ylabel('PC1')
plt.title('Relationship between the PC1 and the date the samples were processed')

# Same plot restricted to the first 160 days, with a marker at day 100.
plt.scatter(sampleinfoCEU.elapsedInDays, V[:, 0])
plt.xlim(0, 160)
plt.xlabel('Date sample was processed (Number of days since Oct 31, 2002)')
plt.ylabel('PC1')
plt.title('Relationship between the PC1 and the date the samples were processed')
plt.axvline(x=100, color='r')

# Access polls as a CSV file
url = "http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv"
source = requests.get(url).text
s = StringIO(source)  # from_csv needs a file-like object
election = pd.DataFrame.from_csv(s, index_col=None).convert_objects(
    convert_dates="coerce", convert_numeric=True)

election.head()

# Polls that started in November 2012, keeping one row per pollster.
started_nov_2012 = [(d.month == 11) and (d.year == 2012)
                    for d in election["Start Date"]]
# Rebind instead of dropping in place: the filtered frame is a selection
# from `election`, and in-place edits on it trigger pandas copy warnings.
filtered = election[started_nov_2012].drop_duplicates('Pollster')
M = len(filtered)
print("Number of polls in November: %i" % M)

# Median sample size of those polls.
N = np.median(filtered["Number of Observations"])
print(N)
# One simulated poll: number of Obama votes among N respondents.
# N comes from np.median and is a float, so it is cast to int wherever it
# is used as a count.
p = 0.53
"Simulated number of votes for Obama: %i" % np.random.binomial(int(N), p, size=1)[0]

# B simulated polls, each summarised as the fraction voting for Obama.
p = 0.53
B = 1000
obs = np.random.binomial(int(N), p, size=B) / N

# Same simulation, drawing the N individual 0/1 votes of every poll.
p = 0.53
B = 1000
obs = [np.mean(np.random.binomial(1, p, size=int(N))) for _ in range(B)]

plt.hist(obs)

import scipy.stats as stats

# Normal probability plot of the standardised poll results.
stats.probplot((np.asarray(obs) - np.mean(obs)) / np.std(obs, ddof=1),
               dist="norm", plot=plt)
plt.show()

# Simulated spread vs. the value of sqrt(p * (1 - p) / n) at n = 1200.
np.std(obs, ddof=1)
np.sqrt((0.53 * 0.47) / 1200)

"Number of polls in November: %i" % M
"Median size of polls in November: %i" % N


# Represents the percentage of Obama votes from M polls
def simulatePolls(p, N, M):
    """Simulate M polls of N respondents each.

    Every respondent votes for Obama with probability p.

    p = Probability of voting for Obama
    N = Sample size of each poll (cast to int, so a float such as the
        result of np.median is accepted)
    M = Number of polls to simulate

    Returns a list of M floats, each the fraction of Obama votes in one
    simulated poll.
    """
    sample_size = int(N)
    return [np.mean(np.random.binomial(1, p, size=sample_size))
            for _ in range(M)]


simulatePolls(p, N, M)

# Distribution of the average of M polls, over B repetitions.
p = 0.53
B = 1000
mom = [np.mean(simulatePolls(p, N, M)) for _ in range(B)]

plt.hist(mom)
stats.probplot((np.asarray(mom) - np.mean(mom)) / np.std(mom, ddof=1),
               dist="norm", plot=plt)
plt.show()

np.std(mom, ddof=1)
"The SE of the average of polls is %g" % np.round(np.std(mom, ddof=1), 5)

ratio = np.std(mom, ddof=1) / np.std(obs, ddof=1)
"The ratio of the SE of the average of polls to the SD of a single poll is %g" % ratio

# Distribution of the standard deviation across M polls, over B repetitions.
B = 1000
p = 0.53
sds = [np.std(simulatePolls(p, N, M), ddof=0) for _ in range(B)]

plt.hist(sds)
plt.xlabel('Standard deviations across %i polls' % M)
plt.ylabel('Frequency')
plt.title('Histogram of standard deviations across %i polls' % M)

stats.probplot((np.asarray(sds) - np.mean(sds)) / np.std(sds, ddof=1),
               dist="norm", plot=plt)
plt.show()

# Standard deviation of the Obama share across the filtered polls.
# Dividing by 100.0 keeps true division on Python 2 even if the column is
# integer-typed.
thesd = np.std(filtered["Obama"] / 100.0, ddof=0)
thesd

# Compare the observed spread with the simulated ones: their ratio, and
# the fraction of simulations whose SD is smaller than the observed SD.
thesd / np.mean(sds)
np.mean(thesd > np.asarray(sds))
# Standard deviation from simulations in 2(b)
print("SD from simulations: %g" % np.std(obs, ddof=1))
# Standard deviation computed analytically
print("SD using normal approximation %g" % np.sqrt(p * (1 - p) / N))

# Standard deviation from simulations in 2(c)
print("SD from simulations: %g" % np.std(mom, ddof=1))
# Standard deviation computed analytically
print("SD using normal approximation %g" % np.sqrt((p * (1 - p) / N) / M))

# Obama-minus-Romney margin of every poll, as a fraction.
# (100.0 keeps true division on Python 2 for integer-typed columns)
election["Diff"] = (election.Obama / 100.0) - (election.Romney / 100.0)
election.head()

# Polls that started within 5 days of the latest start date.
last_day = max(election["Start Date"])
filtered = election[[(last_day - d).days <= 5 for d in election["Start Date"]]]
filtered = filtered.sort(columns=["Start Date"])
days = [(last_day - d).days for d in filtered["Start Date"]]

# One random colour value per pollster.  The loop variable is deliberately
# not called `p`, which holds the voting probability used above.
color_map = {pollster: np.random.rand() for pollster in set(filtered.Pollster)}

plt.scatter(days, filtered.Diff,
            c=[color_map[name] for name in filtered.Pollster], s=60)
# NOTE(review): 0.039 is presumably the actual 2012 margin -- confirm.
plt.axhline(y=0.039, c="gray")
plt.axhline(y=np.mean(filtered.Diff), c="red")
plt.xlabel("Days")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Plot of the difference between Obama and Romney colored by different pollsters in the last week")

# Same margins, one x position per pollster.
polls = list(set(filtered.Pollster))
pollster_map = {pollster: i for i, pollster in enumerate(polls)}

plt.scatter([pollster_map[name] for name in filtered.Pollster], filtered.Diff,
            c=[color_map[name] for name in filtered.Pollster], s=60)
plt.xticks(range(len(polls)), polls, rotation=90)
plt.xlabel("Pollsters")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Plot of the difference between Obama and Romney by different pollsters")
plt.show()

# Average the polls of each pollster, then summarise across pollsters.
aggr = filtered.groupby("Pollster").mean()
print("Average across pollsters: %g" % np.round(np.mean(aggr.Diff), 4))
print("Standard error: %g" % np.std(aggr.Diff, ddof=0))

# Daily mean margin for polls starting on or after Aug 15, 2012.
three_months = dt.datetime(2012, 8, 15, 0, 0)
new_data = election[[d >= three_months for d in election["Start Date"]]]
new_data = new_data.sort("Start Date")
new_data["days"] = [(d - three_months).days for d in new_data["Start Date"]]
new_data["Diff"] = (new_data.Obama / 100.0) - (new_data.Romney / 100.0)
new_data = new_data.groupby(["days"], as_index=False).mean()

plt.figure()
plt.plot(new_data.days, new_data.Diff)
plt.xlabel("Days from three month before the election")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Difference between Obama and Romney across time")
plt.show()

# CSV URLs of the 2014 senate charts.  The comprehension variable is not
# named `election`: on Python 2 it leaks into the enclosing scope and would
# replace the `election` DataFrame.
url_str = "http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate"
election_urls = [chart['url'] + '.csv' for chart in requests.get(url_str).json()]
election_urls


def build_frame(url):
    """
    Returns a pandas DataFrame object containing the data
    returned from the given url
    """
    source = requests.get(url).text

    # Use StringIO because pd.DataFrame.from_csv requires .read() method
    s = StringIO(source)

    return pd.DataFrame.from_csv(s, index_col=None).convert_objects(
        convert_dates="coerce", convert_numeric=True)


# Makes a dictionary of pandas DataFrames keyed on election string
# (the last URL path component with its 4-character ".csv" suffix removed).
dfs = {chart_url.split("/")[-1][:-4]: build_frame(chart_url)
       for chart_url in election_urls}

dfs['2014-kentucky-senate-mcconnell-vs-grimes'].head()

# For every race, average the margin between the two candidate columns and
# name the leader.
x = {}
for keys in dfs:
    dat = dfs[keys]
    # NOTE(review): assumes columns 7 and 8 are the two candidates -- confirm
    # against the CSV layout.
    candidate1 = dat.columns[7]
    candidate2 = dat.columns[8]
    # Local series instead of `dat.Diff = ...`, which sets an attribute on
    # the DataFrame rather than creating a column.
    diff = (dat[candidate1] / 100.0) - (dat[candidate2] / 100.0)
    x[keys] = [candidate1, candidate2, np.round(np.mean(diff), 3)]

predictions = pd.DataFrame(x).T
predictions.columns = ['Candidate1', 'Candidate2', 'Difference']
predictions['Winner'] = np.where(predictions.Difference >= 0,
                                 predictions.Candidate1, predictions.Candidate2)
predictions