# ---- Notebook configuration -------------------------------------------------
# File holding the list of names to look for. Its text is read and passed to
# eval() further down, so it has to contain a Python expression (a list).
listed_path = "lists/mythological_names_eg_gk_ro_no.list"

# Last year covered by the data. The chart titles below are built from this
# value so that updating it here is enough to keep them in step.
last_year = 2013 #change this when Social Security database is updated

# Number of names shown in each "top names" chart.
top_cutoff = 6

totals_title = "Mythological names in U.S. Social Security baby names database, 1880-%d" % (last_year)
top_boys_title = "Top %d mythological boys' names from U.S. Social Security database, 1880-%d" % (top_cutoff, last_year)
top_girls_title = "Top %d mythological girls' names from U.S. Social Security database, 1880-%d" % (top_cutoff, last_year)

save_path = "user_charts" # files created by this notebook will be saved in this directory

import time
import os
# Make sure the output directory exists before any chart is saved into it.
save_dir_missing = not os.path.isdir(save_path)
if save_dir_missing:
    os.makedirs(save_path)
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn # comment out if you don't have it, but it makes good-looking charts
print 'This is standard output from download_and_process.py'
%run download_and_process.py

print '--------------------\nFirst 80 characters of list:'
listed_file = open(listed_path, "r").read()
print listed_file[:80] + ' ...'
all_listed = eval(listed_file) # make sure you trust this file!
all_listed_set = set(all_listed) # to remove duplicates
all_listed = list(all_listed)
print "all_listed: list of length", len(all_listed)

# reduce names dataframe to those matching list
print '--------------------\nDataframe names filtered to those that match list'
print "%d records to begin." % (len(names))
names_listed = names[names.name.isin(all_listed)].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
print "%d records remaining." % (len(names_listed))
listed_in_df = list(names_listed.name)
print names_listed.head(10)
listed_m = list(names[(names.sex == 'M') & (names.name.isin(listed_in_df))]['name'])
listed_f = list(names[(names.sex == 'F') & (names.name.isin(listed_in_df))]['name'])

#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed = yob[yob.name.isin(listed_in_df)].copy()
yob_listed.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))

# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'F'].groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed[yob_listed.sex == 'M'].groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()

# print chart of m and f totals
print '\n'

# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit slightly above x, rounded at its second digit.

    x is cut down to its two leading digits and the second digit is then
    raised by one, e.g. 123 -> 130 and 100 -> 110.

    For x <= 0 the logarithm is undefined; 1.0 is returned so that a chart
    of all-zero data still gets a usable, non-empty axis instead of raising
    ValueError.
    """
    if x <= 0:
        return 1.0
    # position of the most significant digit (0 for 1-9, 1 for 10-99, ...)
    significance = int(math.floor((math.log10(x))))
    # size of one step of the second-most-significant digit
    step = 10 ** (significance - 1)
    val = math.floor(x / step) + 1
    return val * step

# Chart of the yearly totals: one line per sex, x = year, y = summed pct.
#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)

plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")

# y axis runs from 0 to a rounded value just above the highest point of
# either line
plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  +list(yob_listed_m_agg.pct))))
# use last_year rather than a literal so the axis follows the configured
# data range when the database is updated
plt.xlim(1880, last_year)

plt.title(totals_title, fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)

plt.show()

#function to make dataframe for top names

def top_df(yobdf, names, sexes, first_year=1880, end_year=None):
    """Build a year-indexed table of `pct` for the requested (name, sex) pairs.

    yobdf      = dataframe derived from yob; normally it would just be yob
                 itself. The columns 'name', 'sex', 'year' and 'pct' are read.
    names      = list of names
    sexes      = list of length 1 for all the same sex, or same length as
                 names. 'F' and 'M' allowed
    first_year = first year that must be present in the result
    end_year   = last year that must be present in the result; None means the
                 module-level last_year

    Returns a dataframe sorted by year with one column per (name, sex) pair.
    Years in first_year..end_year that have no data are added as rows, and
    every missing value is 0. The input dataframe is not modified.
    """
    if end_year is None:
        end_year = last_year

    assert len(sexes) == 1 or len(names) == len(sexes)
    if len(sexes) == 1:
        sexes = list(sexes) * len(names)

    # Select the wanted rows in a single pass. The previous row-by-row loop
    # wrote a flag through chained indexing (df.temp.iloc[row] = 1), which is
    # slow and is not guaranteed to modify the dataframe.
    wanted_pairs = set(zip(names, sexes))
    keep = pd.Series([pair in wanted_pairs
                      for pair in zip(yobdf['name'], yobdf['sex'])],
                     index=yobdf.index, dtype=bool)
    df_chart = yobdf[keep]

    print("Tail of dataframe:")
    print(df_chart.tail())

    output_df = pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])

    # Insert the missing years. The union keeps any year already present and
    # is sorted, so rows come out in chronological order, which make_chart
    # relies on when it pairs rows with consecutive years.
    all_years = output_df.index.union(pd.Index(range(first_year, end_year + 1)))
    output_df = output_df.reindex(all_years).fillna(0)

    return output_df

# Chart data for the first top_cutoff names of each sex, taken in the order
# they appear in listed_m / listed_f.
listed_top_m = top_df(yob, listed_m[:top_cutoff], ['M'])
listed_top_f = top_df(yob, listed_f[:top_cutoff], ['F'])

#a single function to make the four different kinds of charts

def make_chart(df, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw one popularity chart and optionally save it as a PNG.

    df          = dataframe indexed by year with one column per (name, sex)
                  pair, e.g. the result of top_df. It is not modified.
    form        = 'line', 'subplots_auto', 'subplots_same' or 'stream'
    title       = chart title; '' builds a default one. Only the 'line' and
                  'stream' forms draw a title.
    colors      = list of matplotlib colors; None or [] selects the built-in
                  palette. Colors repeat if there are more series than colors.
    smoothing   = 0 for raw data, otherwise the number of rows on each side
                  of a row that are averaged together with it
    groupedlist = accepted for backward compatibility; not used
    baseline    = baseline option handed to plt.stackplot ('stream' only)
    png_name    = file name without extension; the chart is written to
                  save_path/png_name.png. '' means do not save.
    """

    # Work on a copy sorted by year so that x values and rows line up even
    # if the caller's frame had rows out of chronological order.
    dataframe = df.sort_index()

    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    num_rows = len(dataframe)
    for position, (name, sex) in enumerate(dataframe.columns):
        series = dataframe[(name, sex)]
        # the y limit is based on the raw (unsmoothed) maximum
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            # Centered moving average over rows row-smoothing..row+smoothing.
            # The slice end is exclusive, hence the +1; without it the window
            # was lopsided and the last row was averaged without itself.
            smoothed = [series.iloc[max(0, row - smoothing):row + smoothing + 1].mean()
                        for row in range(num_rows)]
            # Assign the whole column at once; writing single cells through
            # dataframe[col].iloc[row] is chained indexing and may not stick.
            dataframe.iloc[:, position] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if not colors:
        colors = ['#BB2114', '#0C5966', '#BA7814', '#4459AB', '#6B3838',
                  '#B8327B', '#2B947F', '#0D83B5', '#684287', '#8C962C',
                  '#92289E', '#242D7D']
        # my own list of dark contrasting colors
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size = 13)
        ax.set_title(title, size = 18)
        # shrink the axes slightly to make room for the legend underneath
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False always returns a 2-D array of axes, so indexing also
        # works when there is only one series
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # each subplot has its own y scale; the fill opacity shows how
            # this series compares with the largest one
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)

            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            # every subplot shares the same y scale
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)

        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '

        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                                 colors=colors, baseline=baseline)
        # the legend is built from plain rectangles carrying each layer's
        # fill color
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)

        # hide y ticks and their labels; booleans are used because the
        # 'off' strings are not understood by newer matplotlib releases
        plt.tick_params(
            axis='y',
            which='both',      #  major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once plt.show() has displayed the figure, a later
    # savefig can end up writing an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()

# line charts
# One line chart for the top boys' names and one for the top girls' names.
# `baseline` is only read by the 'stream' form of make_chart, so the value
# passed here has no effect on these line charts. No png_name is given, so
# nothing is saved.

make_chart(df=listed_top_m,
           form='line', # line , subplots_auto , subplots_same , stream
           title=top_boys_title,
           colors= [],
           smoothing=0,
           baseline='zero',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           )

make_chart(df=listed_top_f,
           form='line', # line , subplots_auto , subplots_same , stream
           title=top_girls_title,
           colors= [],
           smoothing=0,
           baseline='zero',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           )

# Renumber the rows 0..n-1 (the old index is dropped), then write the
# matching names to a CSV file for inspection.
names_listed.reset_index(drop = True, inplace = True)
names_listed.head()  # return value is not used here
names_listed.to_csv('lists/names_matching_mythological_list.csv')

# every distinct name that matched the list
print names_listed.name.unique()

cutoffn = 0
# how many names will remain to evaluate after duplicates removed

from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()

# remove names with more common duplicates in other sex
# this happens frequently in ssa db

for name in listed_m:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 98
        pctm = 99
    if (name not in names_listed[names_listed.sex == 'F'].name.unique() or
        pctf < pctm):
        evallistm[name] = ''
        
for name in listed_f:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 99
        pctm = 98
    if (name not in names_listed[names_listed.sex == 'M'].name.unique() or
        pctm < pctf):
        evallistf[name] = ''
        
if cutoffn > 0:
    assert len(evallistm) > cutoffn
    assert len(evallistf) > cutoffn
    print evallistm[:cutoffn]
    print evallistf[:cutoffn]
else:
    print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
    print evallistm
    print ' '
    print evallistf

# Manual review step: the lists printed above were copied in here and every
# name was given 'acc' (accept) or 'rej' (reject) by hand.
# These two assignments replace the automatically built evallistm / evallistf.

evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), ('Amon', 'rej'), 
                         ('Thor', 'acc'), ('Hercules', 'acc'), ('Ladon', 'rej'), ('Odin', 'acc'), 
                         ('Hermes', 'acc'), ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'), 
                         ('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'), ('Amen', 'rej'), 
                         ('Mars', 'acc'), ('Ares', 'acc'), ('Loki', 'acc'), ('Nike', 'rej'), 
                         ('Ran', 'rej'), ('Mercury', 'acc'), ('Tyr', 'acc'), ('Jupiter', 'acc'), 
                         ('Kore', 'rej'), ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'), 
                         ('Poseidon', 'acc'), ('Makar', 'rej'), ('Pater', 'rej'), ('Amun', 'rej'), 
                         ('Fenris', 'acc'), ('Set', 'rej'), ('Demeter', 'rej'), ('Horus', 'acc'), 
                         ('Megale', 'rej'), ('Aten', 'acc'), ('Saturn', 'acc')])
 
evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'), ('Phoebe', 'rej'), 
                         ('Chloe', 'rej'), ('Diana', 'rej'), ('Flora', 'rej'), ('Sophia', 'rej'), 
                         ('Rhea', 'rej'), ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'),
                         ('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'), ('Eris', 'acc'), 
                         ('Artemis', 'acc'), ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'), 
                         ('Persephone', 'acc'), ('Melaina', 'rej'), ('Shai', 'rej'), ('Andromeda', 'acc'), 
                         ('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'), ('Nanna', 'rej'), ('Urania', 'acc'), 
                         ('Gaia', 'acc'), ('Khloe', 'rej'), ('Chloris', 'rej'), ('Athene', 'acc'), 
                         ('Janus', 'rej'), ('Freyja', 'acc'), ('Valkyrie', 'acc'), ('Ourania', 'acc'), 
                         ('Juno', 'acc'), ('Vali', 'acc'), ('Holle', 'rej'), ('Cybele', 'acc'), 
                         ('Pelagia', 'rej'), ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'), 
                         ('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'), ('Chimera', 'acc'), 
                         ('Deianeira', 'rej'), ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'), 
                         ('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'), ('Saturn', 'rej'), 
                         ('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc'), ('Demeter', 'acc'),
                         ('Nike', 'acc')])

# Note: Demeter and Nike were taken from the males' list and moved to the
# females' list; for some reason they spiked higher in males than in females.
# In the literals above they are 'rej' in evallistm and 'acc' in evallistf.
# Similarly, Saturn was moved from the females' list to the males' list
# ('rej' in evallistf, 'acc' in evallistm).

# Test that all names have 'acc' or 'rej' values

final_m = []
final_f = []

names_not_validated = []
for item in evallistm:
    if evallistm[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistm[item] == 'acc':
        final_m.append(item)
for item in evallistf:
    if evallistf[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistf[item] == 'acc':
        final_f.append(item)
        
final_all = final_m + final_f

if len(names_not_validated) > 0:
    print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
    raise exception("Names not validated")
    
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))

cutmin = min(len(final_m), len(final_f))

final_m = final_m[:cutmin]
final_f = final_f[:cutmin]

print 'After resizing to %d names each:' % (cutmin)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

from copy import deepcopy
oldm = deepcopy(evallistm)
oldf = deepcopy(evallistf)

cutoffn = 0
# how many names will remain to evaluate after duplicates removed

from collections import OrderedDict
evallistm = OrderedDict()
evallistf = OrderedDict()

# remove names with more common duplicates in other sex
# this happens frequently in ssa db

for name in listed_m:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 98
        pctm = 99
    if (name not in ['Demeter', 'Nike'] and (name not in names_listed[names_listed.sex == 'F'].name.unique() or
        pctf < pctm or name == 'Saturn')):
        evallistm[name] = ''
        
for name in listed_f:
    try:
        pctf = names_listed[(names_listed.sex == 'F') & 
                            (names_listed.name == name)].pct_max.iloc[0]
        pctm = names_listed[(names_listed.sex == 'M') & 
                            (names_listed.name == name)].pct_max.iloc[0]
    except:
        pctf = 99
        pctm = 98
    if (name != 'Saturn' and (name not in names_listed[names_listed.sex == 'M'].name.unique() or
        pctm < pctf or name in ['Demeter', 'Nike'])):
        evallistf[name] = ''

for item in evallistm: # copy from above block
    try:
        evallistm[item] = oldm[item]
    except:
        pass
for item in evallistf:
    try:
        evallistf[item] = oldf[item]
    except:
        pass    
        
        
if cutoffn > 0:
    assert len(evallistm) > cutoffn
    assert len(evallistf) > cutoffn
    print evallistm[:cutoffn]
    print evallistf[:cutoffn]
else:
    print 'Length of lists: %d male, %d female\n' % (len(evallistm), len(evallistf))
    print evallistm
    print ' '
    print evallistf

# Manual review step: the lists printed above were copied in here and every
# name was given 'acc' (accept) or 'rej' (reject) by hand.
# These two assignments replace the evallistm / evallistf built above.

# 72, 79 and 80 character rule (PEP)                       do not reach this ->|
#########1#########2#########3#########4#########5#########6#########7#2######9X

evallistm = OrderedDict([('Sol', 'rej'), ('Seth', 'rej'), ('Griffin', 'rej'), 
                         ('Amon', 'rej'), ('Thor', 'acc'), ('Hercules', 'acc'), 
                         ('Ladon', 'rej'), ('Odin', 'acc'), ('Hermes', 'acc'), 
                         ('Apollo', 'acc'), ('Osiris', 'acc'), ('Min', 'rej'), 
                         ('Clete', 'rej'), ('Zeus', 'acc'), ('Phoenix', 'acc'),
                         ('Amen', 'rej'), ('Mars', 'acc'), ('Ares', 'acc'), 
                         ('Loki', 'acc'), ('Ran', 'rej'), ('Mercury', 'acc'),
                         ('Tyr', 'acc'), ('Jupiter', 'acc'), ('Kore', 'rej'),
                         ('Ra', 'acc'), ('Anubis', 'acc'), ('Helios', 'acc'),
                         ('Poseidon', 'acc'), ('Makar', 'rej'),
                         ('Pater', 'rej'), ('Amun', 'rej'), ('Fenris', 'acc'),
                         ('Set', 'rej'), ('Horus', 'acc'), ('Megale', 'rej'),
                         ('Aten', 'acc')])
 
evallistf = OrderedDict([('Athena', 'acc'), ('Delia', 'rej'), ('Minerva', 'acc'), ('Doris', 'rej'),
                         ('Phoebe', 'rej'), ('Chloe', 'rej'), ('Diana', 'rej'),
                         ('Flora', 'rej'), ('Sophia', 'rej'), ('Rhea', 'rej'),
                         ('Venus', 'acc'), ('Vesta', 'acc'), ('Luna', 'rej'), 
                         ('Thalia', 'acc'), ('Lucina', 'rej'), ('Gerda', 'rej'), 
                         ('Eris', 'acc'), ('Artemis', 'acc'), 
                         ('Aphrodite', 'acc'), ('Isis', 'acc'), ('Clio', 'acc'),
                         ('Persephone', 'acc'), ('Melaina', 'rej'), 
                         ('Shai', 'rej'), ('Andromeda', 'acc'), 
                         ('Lamia', 'acc'), ('Sia', 'rej'), ('Hera', 'acc'),
                         ('Nanna', 'rej'), ('Urania', 'acc'), ('Gaia', 'acc'),
                         ('Khloe', 'rej'), ('Chloris', 'rej'),
                         ('Athene', 'acc'), ('Nike', 'acc'), ('Janus', 'rej'),
                         ('Freyja', 'acc'), ('Valkyrie', 'acc'), 
                         ('Ourania', 'acc'), ('Juno', 'acc'), ('Vali', 'acc'),
                         ('Holle', 'rej'), ('Cybele', 'acc'), ('Pelagia', 'rej'),
                         ('Anat', 'rej'), ('Soteria', 'rej'), ('Pallas', 'acc'),
                         ('Fortuna', 'rej'), ('Maat', 'acc'), ('Caliope', 'acc'),
                         ('Chimera', 'acc'), ('Deianeira', 'rej'),
                         ('Agathe', 'rej'), ('Lousia', 'rej'), ('Shu', 'rej'),
                         ('Areion', 'rej'), ('Ceres', 'acc'), ('Areia', 'rej'),
                         ('Lakinia', 'rej'), ('Tyche', 'acc'), ('Khepri', 'acc')])

# Note: Demeter and Nike were taken from the males' list and moved to the
# females' list; for some reason they spiked higher in males than in females.
# Similarly, Saturn was moved from the females' list to the males' list.
# NOTE(review): in the literals above only Nike appears (in evallistf);
# Demeter and Saturn are in neither list, so neither can be accepted here.
# Confirm whether ('Demeter', 'acc') and ('Saturn', 'acc') were left out on
# purpose.

# Test that all names have 'acc' or 'rej' values

final_m = []
final_f = []

names_not_validated = []
for item in evallistm:
    if evallistm[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistm[item] == 'acc':
        final_m.append(item)
for item in evallistf:
    if evallistf[item] not in ['acc', 'rej']:
        names_not_validated.append(item)
    elif evallistf[item] == 'acc':
        final_f.append(item)

if len(names_not_validated) > 0:
    print "The following names do not have 'acc' or 'rej' values: ", names_not_validated
    raise exception("Names not validated")
    
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

print 'Length: %d male, %d female\n' % (len(final_m), len(final_f))

# manually limit to nice round number

nice_round_number = 100 # if too high, there will be no change
final_m = final_m[:nice_round_number]
final_f = final_f[:nice_round_number]

print 'After manually resizing to nice round number of %d names each:' % (nice_round_number)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

# BTW, here are the names whose genders (it appears) were misassigned:
print names[names.name == 'Saturn']
print names[names.name == 'Demeter']
print names[names.name == 'Nike']

# Run the processing script again before the data is filtered a second time.
%run download_and_process.py

# reduce names dataframe to those matching list
# print '--------------------\nDataframe names filtered to those that match list'
# print "%d records to begin." % (len(names))
names_listed = names[((names.name.isin(final_m) & (names.sex == 'M')) |
                      (names.name.isin(final_f) & (names.sex == 'F')) )].copy()
names_listed.sort('pct_max', ascending=False, inplace=True)
# print "%d records remaining." % (len(names_listed))
# listed_in_df = list(names_listed.name)
# print names_listed.head(10)
# listed_m = list(names[(names.sex == 'M') & (names.name.isin(final_m))]['name'])
# listed_f = list(names[(names.sex == 'F') & (names.name.isin(final_f))]['name'])

#reduce yob dataframe to those matching list
print '--------------------\nDataframe yob filtered to those that match list (count only)'
print "%d records to begin." % (len(yob))
yob_listed_m = yob[(yob.name.isin(final_m)) & (yob.sex == 'M')].copy()
yob_listed_m.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
yob_listed_f = yob[(yob.name.isin(final_f)) & (yob.sex == 'F')].copy()
yob_listed_f.sort(['year', 'sex', 'name'], ascending=False, inplace=True)
print "%d records remaining." % (len(yob_listed))

# m and f totals
yob_listed_f_agg = pd.DataFrame(yob_listed_f.groupby('year').sum())[['births', 'pct']]
yob_listed_m_agg = pd.DataFrame(yob_listed_m.groupby('year').sum())[['births', 'pct']]
print '--------------------\nHead of total matching list per year, female'
print yob_listed_f_agg.head()

# print chart of m and f totals
print '\n'

# function to determine a nice y-axis limit a little above the maximum value
# rounds maximum y up to second-most-significant digit
def determine_y_limit(x):
    """Return an axis limit slightly above x, rounded at its second digit.

    x is cut down to its two leading digits and the second digit is then
    raised by one, e.g. 123 -> 130 and 100 -> 110.

    For x <= 0 the logarithm is undefined; 1.0 is returned so that a chart
    of all-zero data still gets a usable, non-empty axis instead of raising
    ValueError.
    """
    if x <= 0:
        return 1.0
    # position of the most significant digit (0 for 1-9, 1 for 10-99, ...)
    significance = int(math.floor((math.log10(x))))
    # size of one step of the second-most-significant digit
    step = 10 ** (significance - 1)
    val = math.floor(x / step) + 1
    return val * step

# Chart of the yearly totals for the accepted names: one line per sex.
#data
xf = list(yob_listed_f_agg.index)
xm = list(yob_listed_m_agg.index)

plt.figure(figsize=(16,9))
plt.plot(xf, list(yob_listed_f_agg.pct), color="red")
plt.plot(xm, list(yob_listed_m_agg.pct), color="blue")

# y axis runs from 0 to a rounded value just above the highest point of
# either line
plt.ylim(0, determine_y_limit(max(list(yob_listed_f_agg.pct)
                                  +list(yob_listed_m_agg.pct))))
# use last_year rather than a literal so the axis follows the configured
# data range when the database is updated
plt.xlim(1880, last_year)

# NOTE(review): the title says "Top 10", but final_m / final_f are only cut
# to 10 names further down in the file -- confirm the intended order.
plt.title('Top 10 mythological names, boy=blue, girl=red', fontsize = 20)
plt.xlabel("Year", fontsize = 14)
plt.ylabel("% of total births of that sex", fontsize = 14)

plt.show()

# Show up to 50 rows of names_listed per sex, so we can see which ones to
# aggregate.
# NOTE(review): this comment used to say the cutoff of 10 was already done,
# but the cut to 10 names is applied just below -- confirm the intended order.

print names_listed[names_listed.sex == 'M'].head(50)
print ''
print names_listed[names_listed.sex == 'F'].head(50)

# just take top 10 (the first 10 accepted names of each list)

nice_round_number = 10 # if too high, there will be no change
final_m = final_m[:nice_round_number]
final_f = final_f[:nice_round_number]

print 'After manually resizing to nice round number of %d names each:' % (nice_round_number)
print 'Accepted male names:', final_m
print 'Accepted female names:', final_f

# Build chart_1: pct per year for the accepted male names.
# NOTE(review): `names` was the dataframe of all names until this line; it is
# a plain list from here on. Confirm nothing later needs the dataframe.
names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names

yearstart=1880
yearend=2013

start = time.time()
df_chart = yob.copy()
if len(sexes) == 1:
    sexes = sexes * len(names)

# Flag the rows whose (name, sex) pair was requested in a single pass. The
# previous row-by-row loop wrote the flag through chained indexing
# (df_chart.temp.iloc[row] = 1), which is slow and not guaranteed to modify
# the dataframe.
wanted_pairs = set(zip(names, sexes))
df_chart['temp'] = [1 if pair in wanted_pairs else 0
                    for pair in zip(df_chart['name'], df_chart['sex'])]
df_chart = df_chart[df_chart.temp == 1]


#To keep more than one data set for charts in memory, change name of chart_1

chart_1 = pd.DataFrame(pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex']))

# Insert the missing years: the union keeps every year already present and
# adds the rest of yearstart..yearend; missing values become 0.
all_years = chart_1.index.union(pd.Index(range(yearstart, yearend + 1)))
chart_1 = chart_1.reindex(all_years).fillna(0)

# rows in chronological order
chart_1 = chart_1.sort_index()

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors=None, smoothing=0,
               groupedlist=None, baseline='sym', png_name=''):
    """Draw one popularity chart and optionally save it as a PNG.

    df          = dataframe indexed by year with one column per (name, sex)
                  pair. The default is the chart_1 object that exists when
                  this def statement runs. It is not modified.
    form        = 'line', 'subplots_auto', 'subplots_same' or 'stream'
    title       = chart title; '' builds a default one. Only the 'line' and
                  'stream' forms draw a title.
    colors      = list of matplotlib colors; None or [] selects the built-in
                  palette. The colors are used in random order, and repeat
                  if there are more series than colors.
    smoothing   = 0 for raw data, otherwise the number of rows on each side
                  of a row that are averaged together with it
    groupedlist = accepted for backward compatibility; not used
    baseline    = baseline option handed to plt.stackplot ('stream' only)
    png_name    = file name without extension; the chart is written to
                  save_path/png_name.png. '' means do not save.
    """
    from random import shuffle

    # Work on a copy sorted by year so that x values and rows line up even
    # if the caller's frame had rows out of chronological order.
    dataframe = df.sort_index()

    x_values = list(dataframe.index)
    startyear = min(x_values)
    endyear = max(x_values)
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    final_name = ''
    num_rows = len(dataframe)
    for position, (name, sex) in enumerate(dataframe.columns):
        series = dataframe[(name, sex)]
        # the y limit is based on the raw (unsmoothed) maximum
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            # Centered moving average over rows row-smoothing..row+smoothing.
            # The slice end is exclusive, hence the +1; without it the window
            # was lopsided and the last row was averaged without itself.
            smoothed = [series.iloc[max(0, row - smoothing):row + smoothing + 1].mean()
                        for row in range(num_rows)]
            # Assign the whole column at once; writing single cells through
            # dataframe[col].iloc[row] is chained indexing and may not stick.
            dataframe.iloc[:, position] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if not colors:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    else:
        # shuffle a private copy so the caller's list keeps its order
        colors = list(colors)
    shuffle(colors)
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            color = colors[counter % num_colors]
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label, color=color, linewidth = 3)
        ax.set_ylim(0,determine_y_limit(max_y))
        # NOTE(review): the x axis starts at a fixed 1980 here, while every
        # other form starts at the first year of the data -- confirm this
        # zoom is intended.
        ax.set_xlim(1980, endyear)
        ax.set_ylabel(y_text, size = 13)
        # draw the title; it was computed above but never shown in this form
        ax.set_title(title, size = 18)
        # shrink the axes slightly to make room for the legend underneath
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    if form == 'subplots_auto':
        # squeeze=False always returns a 2-D array of axes, so indexing also
        # works when there is only one series
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            # each subplot has its own y scale; the fill opacity shows how
            # this series compares with the largest one
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            axes[counter].set_ylim(0,determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)

            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        axes = axes[:, 0]
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, (name, sex) in enumerate(dataframe.columns):
            if sex=='M':
                sex_label = 'male'
            else:
                sex_label = 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axes[counter].plot(x_values, dataframe[(name, sex)], color='k')
            # every subplot shares the same y scale
            axes[counter].set_ylim(0,determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[(name, sex)], color=colors[1], alpha=1, interpolate=True)
            axes[counter].set_ylabel(label, size=11)
            plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)

        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '

        scale = str(determine_y_limit(max_y)) + ')'
        yaxtext += scale
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                                 colors=colors, baseline=baseline)
        # the legend is built from plain rectangles carrying each layer's
        # fill color
        legendProxies = []
        for poly in polys:
            legendProxies.append(plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
        namelist = []
        for name, sex in dataframe.columns:
            if has_both:
                namelist.append('%s (%s)' % (name, sex))
            else:
                namelist.append(name)
        plt.legend(legendProxies, namelist, loc=3, ncol=2)

        # hide y ticks and their labels; booleans are used because the
        # 'off' strings are not understood by newer matplotlib releases
        plt.tick_params(
            axis='y',
            which='both',      #  major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # Save before showing: once plt.show() has displayed the figure, a later
    # savefig can end up writing an empty image.
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
    
#stream graph

# Draw the data held in chart_1 with make_chart. Accepted values:
#   form     -- line , subplots_auto , subplots_same , stream
#   baseline -- zero ,  sym ,  wiggle ,  weighted_wiggle
#   png_name -- if '', will not be saved
make_chart(
    df=chart_1,
    form='stream',
    baseline='sym',
    smoothing=0,
    colors=[],
    title='',
    png_name='',
)

# Names to chart and the sex to use for each of them.
names = final_f[:12]
sexes = ['F'] # can be length 1 or same length as names

yearstart=1880
yearend=2013

start = time.time()
if len(sexes) == 1:
    # a single sex applies to every requested name
    sexes = sexes * len(names)
if len(sexes) != len(names):
    raise ValueError("sexes must have length 1 or the same length as names")

# Keep only the yob rows whose (name, sex) pair was requested. The vectorised
# name filter runs first so the pair test only has to look at the few rows
# that survive it; no scratch column or row-by-row .iloc writes are needed.
df_chart = yob[yob['name'].isin(names)].copy()
wanted_pairs = set(zip(names, sexes))
pair_mask = np.array([pair in wanted_pairs
                      for pair in zip(df_chart['name'], df_chart['sex'])],
                     dtype=bool)
df_chart = df_chart[pair_mask]


#To keep more than one data set for charts in memory, change name of chart_1

# One row per year, one column per (name, sex) pair, values taken from 'pct'.
chart_1 = pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])

# Insert a row for every year of yearstart..yearend that has no data (years
# already present are kept), turn every gap into 0 and order rows by year.
all_years = chart_1.index.union(
    pd.Index(range(yearstart, yearend + 1), name=chart_1.index.name))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one chart of the (name, sex) series in df, save it if asked, show it.

    df          -- frame indexed by year with one column per (name, sex) pair.
                   The default is the chart_1 that exists when this function
                   is defined. x values are generated as every year from the
                   smallest to the largest index value, so df is expected to
                   hold exactly one row for each of those years.
    form        -- 'line', 'subplots_auto' (one subplot per series, each with
                   its own y limit and a fill whose opacity is the series peak
                   relative to the overall limit), 'subplots_same' (one
                   subplot per series, shared y limit) or 'stream' (stacked).
    title       -- only drawn by the 'stream' form; '' builds a default from
                   the year range (and the name when there is one series).
    colors      -- list of colours; an empty list selects a built-in
                   12-colour palette. Shuffled on a private copy, so the
                   caller's list is left untouched.
    smoothing   -- when > 0, every value becomes the mean of the values from
                   `smoothing` rows before it to `smoothing` rows after it.
    groupedlist -- accepted for compatibility; not used.
    baseline    -- handed to plt.stackplot by the 'stream' form.
    png_name    -- when not '', the figure is written to
                   save_path + "/" + png_name + ".png".

    Uses the module-level determine_y_limit, save_path and plt.
    Raises ValueError when form is not one of the four names above.
    """
    known_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in known_forms:
        raise ValueError("form must be one of %s; got %r"
                         % (', '.join(known_forms), form))

    from random import shuffle

    dataframe = df.copy()

    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    num_rows = len(dataframe)
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # axis limits are derived from the peak of the unsmoothed data
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                # + 1 because the slice end is exclusive: the window must
                # reach row + smoothing and may include the final row
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # replace the whole column at once instead of chained .iloc writes
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    else:
        colors = list(colors)  # private copy: shuffle must not reorder the caller's list
    shuffle(colors)
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    x_values = list(range(startyear, endyear + 1))
    y_limit = determine_y_limit(max_y)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=colors[counter % num_colors], linewidth = 3)
        ax.set_ylim(0, y_limit)
        # NOTE(review): the left limit is fixed at 1980 here while the other
        # forms start at the first year of the data -- confirm this is intended.
        ax.set_xlim(1980, endyear)
        ax.set_ylabel(y_text, size = 13)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    elif form == 'subplots_auto':
        # squeeze=False keeps axes two-dimensional even for a single series
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum alpha: %d percent' % (y_limit))
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / y_limit
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, determine_y_limit(current_ymax))
            axis.set_xlim(startyear, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    elif form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum y axis: %d percent' % (y_limit))
        # second colour of the shuffled list; the modulo covers a one-colour list
        fill_color = colors[1 % num_colors]
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, y_limit)
            axis.set_xlim(startyear, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=fill_color, alpha=1, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    else:  # form == 'stream'
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)

        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(y_limit) + ')'
        plt.ylabel(yaxtext, size=13)

        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                                 colors=colors, baseline=baseline)
        # one rectangle per stacked area, filled with that area's face colour,
        # is what the legend is built from
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        if has_both:
            namelist = ['%s (%s)' % (name, sex) for name, sex in dataframe.columns]
        else:
            namelist = [name for name, sex in dataframe.columns]
        plt.legend(legendProxies, namelist, loc=3, ncol=2)

        # hide y ticks and their labels; booleans are accepted where the
        # 'off' strings were
        plt.tick_params(\
            axis='y',
            which='both',      #  major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # save before showing: show() may dispose of the figure, after which
    # savefig would write an empty image
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
    
#stream graph

# The title is built from chart_1 itself so the number of names and the year
# range always match what is drawn (the literal it replaces read
# "10 ... 2914-2013").
make_chart(df=chart_1,
           form='stream', # line , subplots_auto , subplots_same , stream
           title="%d most popular mythological girls' names, %d-%d"
                 % (len(chart_1.columns), chart_1.index.min(), chart_1.index.max()),
           colors= [],
           smoothing=0,
           baseline='sym',  # zero ,  sym ,  wiggle ,  weighted_wiggle
           png_name = '',  # if '', will not be saved
           )

# Names to chart and the sex to use for each of them.
names = final_f[:10]
sexes = ['F'] # can be length 1 or same length as names

yearstart=1880 # for data, not graph
yearend=2013

# first year shown on the x axis; read by make_chart
xmin = 1940

start = time.time()
if len(sexes) == 1:
    # a single sex applies to every requested name
    sexes = sexes * len(names)
if len(sexes) != len(names):
    raise ValueError("sexes must have length 1 or the same length as names")

# Keep only the yob rows whose (name, sex) pair was requested. The vectorised
# name filter runs first so the pair test only has to look at the few rows
# that survive it; no scratch column or row-by-row .iloc writes are needed.
df_chart = yob[yob['name'].isin(names)].copy()
wanted_pairs = set(zip(names, sexes))
pair_mask = np.array([pair in wanted_pairs
                      for pair in zip(df_chart['name'], df_chart['sex'])],
                     dtype=bool)
df_chart = df_chart[pair_mask]


#To keep more than one data set for charts in memory, change name of chart_1

# One row per year, one column per (name, sex) pair, values taken from 'pct'.
chart_1 = pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])

# Insert a row for every year of yearstart..yearend that has no data (years
# already present are kept), turn every gap into 0 and order rows by year.
all_years = chart_1.index.union(
    pd.Index(range(yearstart, yearend + 1), name=chart_1.index.name))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one chart of the (name, sex) series in df, save it if asked, show it.

    df          -- frame indexed by year with one column per (name, sex) pair.
                   The default is the chart_1 that exists when this function
                   is defined. x values are generated as every year from the
                   smallest to the largest index value, so df is expected to
                   hold exactly one row for each of those years.
    form        -- 'line', 'subplots_auto' (one subplot per series, each with
                   its own y limit and a fill whose opacity is the series peak
                   relative to the overall limit), 'subplots_same' (one
                   subplot per series, shared y limit) or 'stream' (stacked).
    title       -- only drawn by the 'stream' form; '' builds a default from
                   the year range (and the name when there is one series).
    colors      -- list of colours; an empty list selects a built-in
                   12-colour palette. Shuffled on a private copy, so the
                   caller's list is left untouched.
    smoothing   -- when > 0, every value becomes the mean of the values from
                   `smoothing` rows before it to `smoothing` rows after it.
    groupedlist -- accepted for compatibility; not used.
    baseline    -- handed to plt.stackplot by the 'stream' form.
    png_name    -- when not '', the figure is written to
                   save_path + "/" + png_name + ".png".

    The x axis of every form runs from the module-level xmin to the last year
    in df. Also uses the module-level determine_y_limit, save_path and plt.
    Raises ValueError when form is not one of the four names above.
    """
    known_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in known_forms:
        raise ValueError("form must be one of %s; got %r"
                         % (', '.join(known_forms), form))

    from random import shuffle

    dataframe = df.copy()

    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    num_rows = len(dataframe)
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # axis limits are derived from the peak of the unsmoothed data
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                # + 1 because the slice end is exclusive: the window must
                # reach row + smoothing and may include the final row
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # replace the whole column at once instead of chained .iloc writes
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    else:
        colors = list(colors)  # private copy: shuffle must not reorder the caller's list
    shuffle(colors)
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    x_values = list(range(startyear, endyear + 1))
    y_limit = determine_y_limit(max_y)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=colors[counter % num_colors], linewidth = 3)
        ax.set_ylim(0, y_limit)
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    elif form == 'subplots_auto':
        # squeeze=False keeps axes two-dimensional even for a single series
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum alpha: %d percent' % (y_limit))
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / y_limit
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, determine_y_limit(current_ymax))
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    elif form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum y axis: %d percent' % (y_limit))
        # second colour of the shuffled list; the modulo covers a one-colour list
        fill_color = colors[1 % num_colors]
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, y_limit)
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=fill_color, alpha=1, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    else:  # form == 'stream'
        plt.figure(num=None, figsize=(20,16.67), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)

        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(y_limit) + ')'
        plt.ylabel(yaxtext, size=13)

        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                                 colors=colors, baseline=baseline)
        # one rectangle per stacked area, filled with that area's face colour,
        # is what the legend is built from
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        if has_both:
            namelist = ['%s (%s)' % (name, sex) for name, sex in dataframe.columns]
        else:
            namelist = [name for name, sex in dataframe.columns]
        plt.legend(legendProxies, namelist, loc=3, ncol=2)

        # hide y ticks and their labels; booleans are accepted where the
        # 'off' strings were
        plt.tick_params(\
            axis='y',
            which='both',      #  major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # save before showing: show() may dispose of the figure, after which
    # savefig would write an empty image
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
    
#stream graph

# Draw the data held in chart_1 with make_chart. Accepted values:
#   form     -- line , subplots_auto , subplots_same , stream
#   baseline -- zero ,  sym ,  wiggle ,  weighted_wiggle
#   png_name -- if '', will not be saved
make_chart(
    df=chart_1,
    form='stream',
    baseline='sym',
    smoothing=0,
    colors=[],
    title='',
    png_name='',
)

# Names to chart and the sex to use for each of them.
names = final_m[:10]
sexes = ['M'] # can be length 1 or same length as names

yearstart=1880 # for data, not graph
yearend=2013

# first year shown on the x axis; read by make_chart
xmin = 1940

start = time.time()
if len(sexes) == 1:
    # a single sex applies to every requested name
    sexes = sexes * len(names)
if len(sexes) != len(names):
    raise ValueError("sexes must have length 1 or the same length as names")

# Keep only the yob rows whose (name, sex) pair was requested. The vectorised
# name filter runs first so the pair test only has to look at the few rows
# that survive it; no scratch column or row-by-row .iloc writes are needed.
df_chart = yob[yob['name'].isin(names)].copy()
wanted_pairs = set(zip(names, sexes))
pair_mask = np.array([pair in wanted_pairs
                      for pair in zip(df_chart['name'], df_chart['sex'])],
                     dtype=bool)
df_chart = df_chart[pair_mask]


#To keep more than one data set for charts in memory, change name of chart_1

# One row per year, one column per (name, sex) pair, values taken from 'pct'.
chart_1 = pd.pivot_table(df_chart, values='pct', index = 'year', columns=['name', 'sex'])

# Insert a row for every year of yearstart..yearend that has no data (years
# already present are kept), turn every gap into 0 and order rows by year.
all_years = chart_1.index.union(
    pd.Index(range(yearstart, yearend + 1), name=chart_1.index.name))
chart_1 = chart_1.reindex(all_years).fillna(0).sort_index()

#a single function to make the four different kinds of charts

def make_chart(df=chart_1, form='line', title='', colors= [], smoothing=0, \
               groupedlist = [], baseline='sym', png_name=''):
    """Draw one chart of the (name, sex) series in df, save it if asked, show it.

    df          -- frame indexed by year with one column per (name, sex) pair.
                   The default is the chart_1 that exists when this function
                   is defined. x values are generated as every year from the
                   smallest to the largest index value, so df is expected to
                   hold exactly one row for each of those years.
    form        -- 'line', 'subplots_auto' (one subplot per series, each with
                   its own y limit and a fill whose opacity is the series peak
                   relative to the overall limit), 'subplots_same' (one
                   subplot per series, shared y limit) or 'stream' (stacked).
    title       -- only drawn by the 'stream' form; '' builds a default from
                   the year range (and the name when there is one series).
    colors      -- list of colours; an empty list selects a built-in
                   12-colour palette. Shuffled on a private copy, so the
                   caller's list is left untouched.
    smoothing   -- when > 0, every value becomes the mean of the values from
                   `smoothing` rows before it to `smoothing` rows after it.
    groupedlist -- accepted for compatibility; not used.
    baseline    -- handed to plt.stackplot by the 'stream' form.
    png_name    -- when not '', the figure is written to
                   save_path + "/" + png_name + ".png".

    The x axis of every form runs from the module-level xmin to the last year
    in df. Also uses the module-level determine_y_limit, save_path and plt.
    Raises ValueError when form is not one of the four names above.
    """
    known_forms = ('line', 'subplots_auto', 'subplots_same', 'stream')
    if form not in known_forms:
        raise ValueError("form must be one of %s; got %r"
                         % (', '.join(known_forms), form))

    from random import shuffle

    dataframe = df.copy()

    startyear = min(list(dataframe.index))
    endyear = max(list(dataframe.index))
    yearstr = '%d-%d' % (startyear, endyear)

    legend_size = 0.01

    has_male = False
    has_female = False
    max_y = 0
    num_rows = len(dataframe)
    for name, sex in dataframe.columns:
        series = dataframe[(name, sex)]
        # axis limits are derived from the peak of the unsmoothed data
        max_y = max(max_y, series.max())
        final_name = name
        if sex == 'M': has_male = True
        if sex == 'F': has_female = True
        if smoothing > 0:
            smoothed = []
            for row in range(num_rows):
                first = max(0, row - smoothing)
                # + 1 because the slice end is exclusive: the window must
                # reach row + smoothing and may include the final row
                last = min(num_rows, row + smoothing + 1)
                smoothed.append(series.iloc[first:last].mean())
            # replace the whole column at once instead of chained .iloc writes
            dataframe[(name, sex)] = smoothed
    has_both = has_male and has_female
    if has_both:
        y_text = "% of births of indicated sex"
    elif has_male:
        y_text = "Percent of male births"
    else:
        y_text = "Percent of female births"

    num_series = len(dataframe.columns)

    if len(colors) == 0:
        colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
                  "#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
    else:
        colors = list(colors)  # private copy: shuffle must not reorder the caller's list
    shuffle(colors)
    num_colors = len(colors)

    if num_series > num_colors:
        print("Warning: colors will be repeated.")

    if title == '':
        if num_series == 1:
            title = "Popularity of baby name %s in U.S., %s" % (final_name, yearstr)
        else:
            title = "Popularity of baby names in U.S., %s" % (yearstr)

    x_values = list(range(startyear, endyear + 1))
    y_limit = determine_y_limit(max_y)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
        for counter, (name, sex) in enumerate(dataframe.columns):
            if has_both:
                label = "%s (%s)" % (name, sex)
            else:
                label = name
            ax.plot(x_values, dataframe[(name, sex)], label=label,
                    color=colors[counter % num_colors], linewidth = 3)
        ax.set_ylim(0, y_limit)
        ax.set_xlim(xmin, endyear)
        ax.set_ylabel(y_text, size = 13)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                 box.width, box.height * (1 - legend_size)])
        legend_cols = min(5, num_series)
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols)

    elif form == 'subplots_auto':
        # squeeze=False keeps axes two-dimensional even for a single series
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum alpha: %d percent' % (y_limit))
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            current_ymax = dataframe[(name, sex)].max()
            tint = 1.0 * current_ymax / y_limit
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, determine_y_limit(current_ymax))
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=colors[0], alpha=tint, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    elif form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series), squeeze=False)
        print('Maximum y axis: %d percent' % (y_limit))
        # second colour of the shuffled list; the modulo covers a one-colour list
        fill_color = colors[1 % num_colors]
        for counter, (name, sex) in enumerate(dataframe.columns):
            axis = axes[counter, 0]
            sex_label = 'male' if sex == 'M' else 'female'
            label = "Percent of %s births for %s" % (sex_label, name)
            axis.plot(x_values, dataframe[(name, sex)], color='k')
            axis.set_ylim(0, y_limit)
            axis.set_xlim(xmin, endyear)
            axis.fill_between(x_values, dataframe[(name, sex)], color=fill_color, alpha=1, interpolate=True)
            axis.set_ylabel(label, size=11)
        plt.subplots_adjust(hspace=0.1)

    else:  # form == 'stream'
        plt.figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(xmin, endyear)

        if has_both:
            yaxtext = 'Percent of births of indicated sex (scale: '
        elif has_male:
            yaxtext = 'Percent of male births (scale: '
        else:
            yaxtext = 'Percent of female births (scale: '
        yaxtext += str(y_limit) + ')'
        plt.ylabel(yaxtext, size=13)

        polys = plt.stackplot(x_values, *[dataframe[(name, sex)] for name, sex in dataframe.columns],
                                 colors=colors, baseline=baseline)
        # one rectangle per stacked area, filled with that area's face colour,
        # is what the legend is built from
        legendProxies = [plt.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0])
                         for poly in polys]
        if has_both:
            namelist = ['%s (%s)' % (name, sex) for name, sex in dataframe.columns]
        else:
            namelist = [name for name, sex in dataframe.columns]
        plt.legend(legendProxies, namelist, loc=3, ncol=2)

        # hide y ticks and their labels; booleans are accepted where the
        # 'off' strings were
        plt.tick_params(\
            axis='y',
            which='both',      #  major and minor ticks
            left=False,
            right=False,
            labelleft=False)

    # save before showing: show() may dispose of the figure, after which
    # savefig would write an empty image
    if png_name != '':
        filename = save_path + "/" + png_name + ".png"
        plt.savefig(filename)
    plt.show()
    plt.close()
    
#stream graph

# Draw the data held in chart_1 with make_chart. Accepted values:
#   form     -- line , subplots_auto , subplots_same , stream
#   baseline -- zero ,  sym ,  wiggle ,  weighted_wiggle
#   png_name -- if '', will not be saved
make_chart(
    df=chart_1,
    form='stream',
    baseline='sym',
    smoothing=0,
    colors=[],
    title='',
    png_name='',
)