# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

from collections import defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import requests
from pattern import web
import re

from numpy.random import rand
from numpy import arange

# Steelers Depot pages holding the NFL combine result tables scraped below.
url0 = 'http://www.steelersdepot.com/2014-nfl-combine-results-defensive-linemen/'


# used further down, when the remaining pages are fetched
url1 = 'http://www.steelersdepot.com/2014-nfl-combine-results-linebackers/'
url2 = 'http://www.steelersdepot.com/2013-nfl-combine-results-defensive-linemen/'
url3 = 'http://www.steelersdepot.com/2013-nfl-combine-results-linebackers/'

# requests applies no timeout by default, so an unresponsive server would
# block this script forever; bound the wait explicitly (seconds).
_response0 = requests.get(url0, timeout=30)
# Fail here with a clear HTTPError instead of handing an error page to the
# HTML parser further down.
_response0.raise_for_status()
website_html = _response0.text


def get_html_table(html):
    """Parse an HTML document and return the tables of interest.

    Args:
        html: the page markup as a string.

    Returns:
        Whatever ``by_class('tableizer-table sortable')`` yields for the
        parsed document; callers in this file index it like a list.
    """
    # Class value used to select the tables we care about.
    wanted_class = 'tableizer-table sortable'

    # `web` is the pattern module; Element builds a DOM from the markup.
    # (Calling dom('table') instead would give every table on the page
    #  as a list that can be iterated over.)
    document = web.Element(html)
    return document.by_class(wanted_class)

# Pull the tables out of the first page and show what was found.
data_table = get_html_table(website_html)
print("table length: %d" % len(data_table))
print(data_table[0].attributes)


def clean_num(str):
    """Convert a feet-and-inches string such as ``9'10"`` to decimal feet.

    Args:
        str: the cell text. (The name shadows the builtin; it is kept
            because it is part of the function's existing signature.)

    Returns:
        ``feet + inches / 12`` as a float. The placeholder strings "DNP"
        and "n/a" are returned unchanged.

    Raises:
        IndexError: if the text contains neither ``'`` nor ``"``.
        ValueError: if the feet or inches part is not numeric.
    """
    # Placeholders carry no measurement; hand them back untouched.
    # "n/a" is what get_data substitutes for a cell without a <span>.
    if str in ("DNP", "n/a"):
        return str

    nums = re.split(r"'|\"", str)
    feet_text = nums[0]
    inches_text = nums[1]

    # A value with only a feet mark (e.g. 9') splits into ['9', ''];
    # treat the missing inches as zero instead of failing on float('').
    if inches_text.strip():
        inches = float(inches_text)
    else:
        inches = 0.0

    try:
        feet = float(feet_text)
    except ValueError:
        # Some feet values carry one stray trailing character; drop it
        # and retry.
        feet = float(feet_text[:-1])

    return feet + inches / 12
    
def get_data(table):
    """Turn the first table in ``table`` into a dictionary of row records.

    Args:
        table: an indexable collection of table DOM elements, such as the
            value returned by ``get_html_table``. Only ``table[0]`` is read.

    Returns:
        A ``defaultdict(dict)`` keyed by the first cell of each data row.
        Each value maps column header -> cell value, where "VJ" is a float
        (0 when the cell reads "DNP"), "BJ" is the result of ``clean_num``
        and "WGT" is an int. All other cells keep their extracted text.
    """
    players = defaultdict(dict)

    table_rows = table[0]('tr')
    # The th cells of the first row are the column headers.
    header_cells = table_rows[0]('th')

    # Two page layouts are supported. In one, the text of every cell is
    # wrapped in a <span> (presumably the 2014 pages); in the other the
    # text is the cell's own content and the jump columns are titled
    # "VERT" and "JUMP".
    spans_used = header_cells[0]('span') != []

    if spans_used:
        headers = [cell('span')[0].content for cell in header_cells]
    else:
        headers = [cell.content for cell in header_cells]
        # Rename so both layouts expose the same column names.
        for old_name, new_name in (("VERT", "VJ"), ("JUMP", "BJ")):
            headers[headers.index(old_name)] = new_name

    # Positions of the columns whose text is converted to numbers.
    bj_idx = headers.index("BJ")
    vj_idx = headers.index("VJ")
    wgt_idx = headers.index("WGT")

    def cell_text(cell):
        """Return the text of one td cell for the detected layout."""
        if not spans_used:
            return cell.content
        spans = cell('span')
        return spans[0].content if len(spans) != 0 else u"n/a"

    # Everything after the header row is data.
    for tr in table_rows[1:]:
        values = [cell_text(td) for td in tr('td')]

        vj_raw = values[vj_idx]
        if vj_raw != u"DNP":
            values[vj_idx] = float(vj_raw)
        else:
            values[vj_idx] = 0
        values[bj_idx] = clean_num(values[bj_idx])
        values[wgt_idx] = int(values[wgt_idx])

        # One record per row, keyed by the row's first cell.
        players[values[0]] = dict(zip(headers, values))

    return players


# Parse the first page's table into {row key: {column header: value}}.
result = get_data(data_table)

# quick check that the data was parsed successfully.
# for key in result: 
#     print (str(result[key]['VJ']) + ", "), 

# create dataframe: one row per key of `result`, one column per header
df = pd.DataFrame.from_dict(result, orient='index')
# Order the columns by label (axis=1 is the column axis). DataFrame.sort no
# longer exists in pandas; sort_index is the call it delegated to and is
# available in both old and current releases.
df.sort_index(axis=1, inplace=True)
print(df)


# Positional slice: first two rows, first two columns.
subtable = df.iloc[0:2, 0:2]
print("subtable")
print(subtable)
print("")

# The result is discarded here; head() only renders when it is the last
# expression of a notebook cell.
df.head()

column = df['VJ']
print("column")
print(column)
print("")

# .ix has been removed from pandas; .iloc is the explicit positional
# indexer and is already used for `subtable` above.
row = df.iloc[0]  # row 0
print("row")
print(row)
print("")

rows = df.iloc[:2]  # rows 0,1
print("rows")
print(rows)
print("")


# Horizontal bar chart of the vertical jump ("VJ") column, one bar per row.

bar_width, opacity = 0.35, 0.4

col = df["VJ"]

# y positions of the bars: 0.5, 1.5, 2.5, ...
pos = list(arange(len(col)) + .5)

# Tall, narrow figure (width 4, height 10) holding a single axes.
fig = plt.figure(figsize=(4, 10))
ax = fig.add_subplot(1, 1, 1)
ax.set_title("Vertical jump")

ax.barh(pos, col.values, height=0.3, align='center')
# Put one tick on each bar and label it with the row's index value.
ax.set_yticks(pos)
_ = ax.set_yticklabels(col.index, fontsize="small")


# Scatter plot of weight against vertical jump, leaving out rows whose
# VJ is zero.
mask = df['VJ'] > 0
subdf = df[mask]

fig, ax = plt.subplots()

ax.scatter(x=subdf['WGT'], y=subdf['VJ'])
ax.set_xlabel('Weight')
ax.set_ylabel('Vertical Jump')
# Arrow from the label at (330, 40) to the hard-coded point (310, 35.5).
ax.annotate(
    "Hageman",
    xy=(310, 35.5),
    xytext=(330, 40),
    arrowprops={"facecolor": "black", "shrink": 0.05},
)

# plt.scatter(subdf['WGT'],subdf['VJ'])

def _fetch_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait before giving up; requests itself applies
            no timeout, so without one a stalled server blocks forever.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status, so an error page is never passed on to the parser.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Download the three remaining pages.
website_html1 = _fetch_html(url1)
website_html2 = _fetch_html(url2)
website_html3 = _fetch_html(url3)

# Locate the result tables on each page.
data_table_1 = get_html_table(website_html1)
data_table_2 = get_html_table(website_html2)
data_table_3 = get_html_table(website_html3)

# Convert each page's table to a dictionary of row records.
dict1 = get_data(data_table_1)
dict2 = get_data(data_table_2)
dict3 = get_data(data_table_3)

# One DataFrame per page, one row per dictionary key.
df1 = pd.DataFrame.from_dict(dict1, orient='index')
df2 = pd.DataFrame.from_dict(dict2, orient='index')
df3 = pd.DataFrame.from_dict(dict3, orient='index')

# Stack the first page's frame and the three new ones row-wise.
df_all = pd.concat([df, df1, df2, df3])

# Bar chart of the vertical jump for every row of the combined data.

# Keep only rows with a positive vertical jump (get_data stores 0 for "DNP")
# and order them from lowest to highest.
# The sort builds a new frame instead of sorting the filtered selection in
# place: in-place sorting of a selection can trigger pandas'
# SettingWithCopyWarning, and DataFrame.sort itself has been removed from
# pandas. sort_values is its replacement (pandas >= 0.17).
df_VJ = df_all[df_all.VJ > 0].sort_values("VJ", ascending=True)

bar_width = 0.35
opacity = 0.4


col = df_VJ["VJ"]

# generate the y positions of the bars: 0.5, 1.5, 2.5, ...
pos = [i + .5 for i in arange(len(col))]

# Here we set the width and height of the figure 
fig = plt.figure(figsize=(4,40))

# Add a subplot, 1 row, 1 col 
ax = fig.add_subplot(111)
ax.set_title("Vertical jump")

ax.barh(pos, col.values, align='center', height=0.3)
# One tick per bar, with vertical grid lines to make bar lengths readable.
ax.set_yticks(pos)
ax.xaxis.grid(True)

# Label each bar with the row's index value.
_ = ax.set_yticklabels(col.index, fontsize="small")