# Scrape NFL combine results from steelersdepot.com and load them into pandas.

# special IPython command to prepare the notebook for matplotlib.
# Equivalent to the `%matplotlib inline` magic, but written as plain Python
# and guarded so the file can also be executed outside IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # get_ipython() is only defined when running under IPython.
    pass

from collections import defaultdict
import re

import matplotlib.pyplot as plt
import pandas as pd
import requests
from numpy import arange
from numpy.random import rand
from pattern import web

url0 = 'http://www.steelersdepot.com/2014-nfl-combine-results-defensive-linemen/'
# used later
url1 = 'http://www.steelersdepot.com/2014-nfl-combine-results-linebackers/'
url2 = 'http://www.steelersdepot.com/2013-nfl-combine-results-defensive-linemen/'
url3 = 'http://www.steelersdepot.com/2013-nfl-combine-results-linebackers/'

website_html = requests.get(url0).text


def get_html_table(html):
    """Parse html and return the result tables it contains.

    Args:
        html: the page markup as text.

    Returns:
        The elements matched by the 'tableizer-table sortable' class lookup.
    """
    # web is the pattern module
    dom = web.Element(html)
    # we could get all tables using
    #   tbls = dom('table')
    # which returns a list of the elements that you can iterate over;
    # instead we only keep the tables we care about.
    tbls = dom.by_class('tableizer-table sortable')
    return tbls


data_table = get_html_table(website_html)
print("table length: %d" % len(data_table))
print(data_table[0].attributes)


def clean_num(str):
    """Convert a feet'inches" measurement string into feet as a float.

    Args:
        str: the cell text, e.g. 9'10" (the parameter name shadows the
            builtin; it is kept so existing callers are unaffected).

    Returns:
        The string "DNP" unchanged when the cell is "DNP", otherwise
        feet + inches / 12 as a float.

    Raises:
        ValueError: if neither parsing attempt yields numbers.
        IndexError: if the text contains no ' or " separator.
    """
    if str == "DNP":
        return "DNP"
    # split on either quote character: 9'10" -> ['9', '10', '']
    nums = re.split(r"'|\"", str)
    try:
        return float(nums[0]) + float(nums[1]) / 12
    except ValueError:
        # retry with the last character of the feet part removed
        # (presumably a stray non-numeric character -- TODO confirm)
        return float(nums[0][:-1]) + float(nums[1]) / 12


def get_data(table):
    """Extract data for players from the first table and store it in a dict.

    Args:
        table: the sequence of table DOM elements returned by
            get_html_table(); only table[0] is read.

    Returns:
        A defaultdict mapping the first cell of each row to a
        {header: value} dict for that row. The "VJ" value is converted to
        float (0 when the cell is "DNP"), "BJ" is passed through
        clean_num() and "WGT" is converted to int.
    """
    result = defaultdict(dict)

    all_rows = table[0]('tr')
    # we get the first row in the table and all the th elements
    # this gives us a list of all the column headers.
    th_s = all_rows[0]('th')

    # one page layout wraps every cell's text in a <span>; the other does not
    format_2014 = True
    if th_s[0]('span') == []:
        format_2014 = False

    # use a little python magic to extract the actual text
    if format_2014:
        headers = [th('span')[0].content for th in th_s]
    else:
        headers = [th.content for th in th_s]

    # Normalise the jump column labels to "VJ"/"BJ". The rename is applied
    # only when the old label is present, so tables that already use the
    # short names pass through untouched.
    for old_label, new_label in (("VERT", "VJ"), ("JUMP", "BJ")):
        if old_label in headers:
            headers[headers.index(old_label)] = new_label

    # we're going to work with the vertical jump data
    # we'll use the indices below to convert text values to number values
    bj_idx = headers.index("BJ")
    vj_idx = headers.index("VJ")
    wgt_idx = headers.index("WGT")

    # throw away the header row
    rows = all_rows[1:]
    for row in rows:
        tds = row('td')
        if format_2014:
            row_data = []
            for td in tds:
                spans = td('span')
                row_data.append(spans[0].content if len(spans) != 0 else u"n/a")
        else:
            row_data = [td.content for td in tds]

        # NOTE(review): a u"n/a" placeholder landing in one of these three
        # columns makes the conversion raise ValueError.
        row_data[vj_idx] = float(row_data[vj_idx]) if row_data[vj_idx] != u"DNP" else 0
        row_data[bj_idx] = clean_num(row_data[bj_idx])
        row_data[wgt_idx] = int(row_data[wgt_idx])

        # combine headings and row values, keyed by the row's first cell
        subdict = dict(zip(headers, row_data))
        result[row_data[0]] = subdict
    return result


result = get_data(data_table)

# quick check that the data was extracted successfully.
# for key in result:
#     print (str(result[key]['VJ']) + ", "),

# create dataframe
df = pd.DataFrame.from_dict(result, orient='index')
# sort the columns by label (axis=1)
df.sort_index(axis=1, inplace=True)
print(df)

# first two rows, first two columns
subtable = df.iloc[0:2, 0:2]
print("subtable")
print(subtable)
print("")

# bare expression: only displayed when it is the last statement of a cell
df.head()

column = df['VJ']
print("column")
print(column)
print("")

row = df.iloc[0]  # row 0
print("row")
print(row)
print("")

rows = df.iloc[:2]  # rows 0,1
print("rows")
print(rows)
print("")

# extract the vertical jump information from the dataframe.
# Horizontal bar chart of the vertical jump of every player in df.
bar_width = 0.35
opacity = 0.4
col = df["VJ"]
# generate the bar positions, one per player, offset by half a unit
pos = [i + .5 for i in arange(len(col))]
# Here we set the width and height of the figure
fig = plt.figure(figsize=(4, 10))
# Add a subplot, 1 row, 1 col
ax = fig.add_subplot(111)
ax.set_title("Vertical jump")
ax.barh(pos, col.values, align='center', height=0.3)
ax.set_yticks(pos)
# assign to _ so the notebook does not echo the returned label list
_ = ax.set_yticklabels(col.index, fontsize="small")

# filter out the zero entries
mask = df['VJ'] > 0
subdf = df[mask]

# Scatter plot of weight against vertical jump.
fig, ax = plt.subplots()
ax.scatter(subdf['WGT'], subdf['VJ'])
ax.set_xlabel('Weight')
ax.set_ylabel('Vertical Jump')
ax.annotate("Hageman", xy=(310, 35.5), xytext=(330, 40),
            arrowprops=dict(facecolor="black", shrink=0.05))
# plt.scatter(subdf['WGT'],subdf['VJ'])

# Fetch and parse the three remaining result pages.
website_html1 = requests.get(url1).text
website_html2 = requests.get(url2).text
website_html3 = requests.get(url3).text

data_table_1 = get_html_table(website_html1)
data_table_2 = get_html_table(website_html2)
data_table_3 = get_html_table(website_html3)

dict1 = get_data(data_table_1)
dict2 = get_data(data_table_2)
dict3 = get_data(data_table_3)

df1 = pd.DataFrame.from_dict(dict1, orient='index')
df2 = pd.DataFrame.from_dict(dict2, orient='index')
df3 = pd.DataFrame.from_dict(dict3, orient='index')

df_all = pd.concat([df, df1, df2, df3])

# extract the vertical jump information from the dataframe.
# Sorting returns a new frame instead of sorting the filtered slice in
# place, which avoids mutating a possible view of df_all.
df_VJ = df_all[df_all.VJ > 0].sort_values("VJ", ascending=True)

bar_width = 0.35
opacity = 0.4
col = df_VJ["VJ"]
# generate the bar positions, one per player, offset by half a unit
pos = [i + .5 for i in arange(len(col))]
# Here we set the width and height of the figure
fig = plt.figure(figsize=(4, 40))
# Add a subplot, 1 row, 1 col
ax = fig.add_subplot(111)
ax.set_title("Vertical jump")
ax.barh(pos, col.values, align='center', height=0.3)
ax.set_yticks(pos)
ax.xaxis.grid(True)
_ = ax.set_yticklabels(col.index, fontsize="small")