Set the working path, import libraries, and read the dataframe pickles
last_year = 2013  # change this when Social Security database is updated
save_path = "user_charts"  # files created by this notebook will be saved in this directory

import time
import os

# Create the output directory if it does not exist yet.  Attempting the
# creation first and checking afterwards avoids the race between a separate
# existence test and makedirs; an OSError is re-raised unless the directory
# is actually there.
try:
    os.makedirs(save_path)
except OSError:
    if not os.path.isdir(save_path):
        raise
#os.chdir("Baby_names_US_IPython")

from math import floor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn # comment out if you don't have it, but it makes good-looking charts
%run download_and_process.py

def determine_y_limit(x):
    """Return a y-axis upper limit just above ``x``.

    The limit is ``x`` rounded down to its second-most-significant digit and
    then bumped up by one unit of that digit, e.g. 1234 -> 1300 and
    100 -> 110, so the result is always strictly greater than ``x``.

    Parameters:
        x: the largest value that has to fit on the axis.

    Returns:
        The computed limit.  For ``x <= 0`` (for example when every plotted
        value is zero) the logarithm is undefined, so a fallback limit of 1
        is returned instead of raising a math domain error.
    """
    # Imported locally so the function works regardless of which import
    # statements have already been executed when it is first called.
    from math import floor, log10

    if x <= 0:
        return 1

    significance = int(floor(log10(x)))
    step = 10 ** (significance - 1)  # one unit of the second significant digit
    return (floor(x / step) + 1) * step

# ---- Build chart_1 for an explicit list of names ----
# Each name is paired with the sex at the same position in `sexes`.
namelist = ["William", "Will", "Willy", "Willie", "Billy", "Bill"]
sexes = ['M']  # can be length 1 or same length as names

yearstart = 1880
yearend = 2013

start = time.time()
if len(sexes) == 1:
    # a single sex applies to every name in the list
    sexes = sexes * len(namelist)
if len(sexes) != len(namelist):
    raise ValueError("sexes must have length 1 or the same length as namelist")

# Cheap vectorized pre-filter on the name alone ...
df_chart = yob[yob['name'].isin(namelist)]

# ... followed by the exact (name, sex) pair filter.  isin() by itself is not
# enough because it ignores sex.  A set lookup per row replaces the former
# row-by-name double loop and its chained .iloc assignment.
wanted_pairs = set(zip(namelist, sexes))
pair_mask = np.array(
    [pair in wanted_pairs for pair in zip(df_chart['name'], df_chart['sex'])],
    dtype=bool)
df_chart = df_chart[pair_mask].copy()

print("%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)))
print(df_chart.tail())

#To keep more than one data set for charts in memory, change name of chart_1

chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Give every year from yearstart to yearend a row (years already present are
# kept), fill every gap with 0 and leave the index sorted in ascending order.
all_years = sorted(set(chart_1.index) | set(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0)

# ---- Build chart_1 for names selected by a regular expression ----
# Jennifer itself is excluded; the remaining matches are ordered by
# descending pct_sum.  The filter and sort are evaluated once and reused for
# both lists so that names and sexes are guaranteed to line up.
temp = names[names.name != 'Jennifer']
jennifer_variants = temp[temp.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False)

sexes = list(jennifer_variants.sex)
namelist = list(jennifer_variants.name)


yearstart = 1880
yearend = 2013

start = time.time()
if len(sexes) == 1:
    # a single sex applies to every name in the list
    sexes = sexes * len(namelist)
if len(sexes) != len(namelist):
    raise ValueError("sexes must have length 1 or the same length as namelist")

# Cheap vectorized pre-filter on the name alone ...
df_chart = yob[yob['name'].isin(namelist)]

# ... followed by the exact (name, sex) pair filter.  isin() by itself is not
# enough because it ignores sex.  A set lookup per row replaces the former
# row-by-name double loop and its chained .iloc assignment.
wanted_pairs = set(zip(namelist, sexes))
pair_mask = np.array(
    [pair in wanted_pairs for pair in zip(df_chart['name'], df_chart['sex'])],
    dtype=bool)
df_chart = df_chart[pair_mask].copy()

print("%0.1f minutes elapsed. %d records remain.\nTail of dataframe:" % ((time.time() - start)/60, len(df_chart)))
print(df_chart.tail())

#To keep more than one data set for charts in memory, change name of chart_1

chart_1 = pd.pivot_table(df_chart, values='pct', index='year', columns=['name', 'sex'])

# Give every year from yearstart to yearend a row (years already present are
# kept), fill every gap with 0 and leave the index sorted in ascending order.
all_years = sorted(set(chart_1.index) | set(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0)

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw one chart of the series in ``df`` and optionally save it as PNG.

    Parameters:
        df: DataFrame whose index holds years and whose columns are
            (name, sex) pairs.  It is copied, never modified.  The default
            is whatever ``chart_1`` referred to when this ``def`` was
            executed, so pass ``df=`` explicitly after rebuilding chart_1.
        form: 'line' (all series on one axis), 'subplots_auto' (one subplot
            per series, each with its own y scale and a fill whose opacity
            is proportional to the series' peak), 'subplots_same' (one
            subplot per series, shared y scale) or 'stream' (stack plot).
        title: chart title; when '' one is generated from the data.
            NOTE: the title is only drawn for form='stream'; the other
            forms do not display it.
        colors: sequence of matplotlib colors; None or empty selects a
            built-in 12-color palette.  Colors repeat if there are more
            series than colors.
        smoothing: half-width, in rows, of a centered moving average applied
            to every series; 0 disables smoothing.  The window is truncated
            at both ends of the data.
        groupedlist: unused; kept so existing calls remain valid.
        baseline: stackplot baseline for form='stream' ('zero', 'sym',
            'wiggle' or 'weighted_wiggle').
        png_name: file name without extension; when not '' the chart is
            written to <save_path>/<png_name>.png.

    Raises:
        ValueError: if ``form`` is not one of the four supported values.
    """
    valid_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in valid_forms:
        raise ValueError("form must be one of %s, got %r"
                         % (', '.join(valid_forms), form))

    dataframe = df.copy()

    startyear = min(dataframe.index)
    endyear = max(dataframe.index)
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    for name, sex in dataframe.columns:
        # The overall maximum is taken from the data before smoothing.
        max_y = max(max_y, dataframe[(name, sex)].max())
        final_name = name
        if sex == 'M':
            has_male = True
        if sex == 'F':
            has_female = True
        if smoothing > 0:
            series = dataframe[(name, sex)]
            n_rows = len(series)
            # Centered window [row - smoothing, row + smoothing]; the slice
            # end is exclusive, hence the + 1.
            smoothed = [
                series.iloc[max(0, row - smoothing):
                            min(n_rows, row + smoothing + 1)].mean()
                for row in range(n_rows)]
            # Assign the whole column at once rather than through chained
            # indexing, which is not guaranteed to write into the frame.
            dataframe[(name, sex)] = smoothed

    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if colors is None or len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    # Plot against the actual index so x and y always have the same length,
    # even when the frame does not contain a row for every year.
    x_values = list(dataframe.index)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for position, (name, sex) in enumerate(dataframe.columns):
            label = "%s (%s)" % (name, sex) if has_both else name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=colors[position % num_colors], linewidth=3)
        ax.set_ylim(0, determine_y_limit(max_y))
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size=13)
        # Shrink the axes slightly to leave room for the legend below them.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                  fancybox=True, shadow=True, ncol=legend_cols)

    if form in ('subplots_auto', 'subplots_same'):
        autoscale = (form == 'subplots_auto')
        shared_limit = determine_y_limit(max_y)
        # squeeze=False always returns a 2-D array of axes, so a single
        # series no longer fails when the axes are indexed.
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5 * num_series),
                                 squeeze=False)
        axes = axes[:, 0]
        # %g keeps fractional limits visible; %d truncated them.
        if autoscale:
            print('Maximum alpha: %g percent' % shared_limit)
        else:
            print('Maximum y axis: %g percent' % shared_limit)
        for axis, (name, sex) in zip(axes, dataframe.columns):
            sex_label = 'male' if sex == 'M' else 'female'
            series = dataframe[(name, sex)]
            if autoscale:
                current_ymax = series.max()
                y_limit = determine_y_limit(current_ymax)
                fill_color = colors[0]
                # fill opacity proportional to this series' peak
                fill_alpha = 1.0 * current_ymax / shared_limit
            else:
                y_limit = shared_limit
                # modulo guards against a caller-supplied one-color palette
                fill_color = colors[1 % num_colors]
                fill_alpha = 1
            axis.plot(x_values, series, color='k')
            axis.set_ylim(0, y_limit)
            axis.set_xlim(startyear, endyear)
            axis.fill_between(x_values, series, color=fill_color,
                              alpha=fill_alpha, interpolate=True)
            axis.set_ylabel("Percent of %s births for %s" % (sex_label, name), size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)

        if has_both:
            stream_text = 'Percent of births of indicated sex'
        elif has_male:
            stream_text = 'Percent of male births'
        else:
            stream_text = 'Percent of female births'
        plt.ylabel('%s (scale: %s)' % (stream_text, determine_y_limit(max_y)), size=13)

        polys = plt.stackplot(x_values,
                              *[dataframe[column] for column in dataframe.columns],
                              colors=colors, baseline=baseline)
        # Colored rectangles stand in for the stacked areas in the legend.
        legend_proxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                          for poly in polys]
        legend_labels = ['%s (%s)' % (name, sex) if has_both else name
                         for name, sex in dataframe.columns]
        plt.legend(legend_proxies, legend_labels, loc=3, ncol=2)

        # Booleans are used instead of the 'off' strings, which newer
        # matplotlib versions no longer interpret as False.
        plt.tick_params(
            axis='y',
            which='both',      # major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once show() has run the figure may already be
    # closed, in which case savefig would write an empty image.
    if png_name != '':
        plt.savefig(os.path.join(save_path, png_name + ".png"))
    plt.show()
    plt.close()


#line graph

make_chart(df=chart_1,
           form='line', # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           png_name = ''  # if '', will not be saved
           )


from math import log10
%matplotlib inline

#subplots with autoscaling, with the intensity of the fill color proportional to peak maximum

make_chart(df=chart_1,
           form='subplots_auto', # line , subplots_auto , subplots_same , stream
           title='',
           colors= [],
           smoothing=0,
           baseline='sym',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           png_name = ''  # if '', will not be saved
           )


# Remaining chart variants, drawn in this order: subplots with all y axes
# having the same scale, then one stream graph for each stackplot baseline
# ('sym', 'zero', 'wiggle', 'weighted_wiggle').
remaining_charts = (
    ('subplots_same', 'sym'),
    ('stream', 'sym'),
    ('stream', 'zero'),
    ('stream', 'wiggle'),
    ('stream', 'weighted_wiggle'),
)

for chart_form, chart_baseline in remaining_charts:
    make_chart(df=chart_1,
               form=chart_form,  # line , subplots_auto , subplots_same , stream
               title='',
               colors=[],
               smoothing=0,
               baseline=chart_baseline,  # zero ,  sym ,  wiggle ,  weighted_wiggle
               png_name='')  # if '', will not be saved


# All names matching the pattern (Jennifer included), sorted by descending
# pct_sum.  The filter and sort are evaluated once and reused below.
j_matches = names[names.name.str.contains("^J.+n+.(f|v|ph).+r$")].sort('pct_sum', ascending=False)
j_matches  # shown as output when this is the last expression of a notebook cell

x = list(j_matches.name)
y = list(j_matches.sex)