"""Estimate the fraction of csail-related mailing-list emails sent by males.

The script downloads the per-month "author" index pages of the
csail-related pipermail archive, infers a gender for each author from the
first name, and compares the monthly male fraction against the male
fraction of the CSAIL people directory.

This is Python 2 code: it relies on ``urllib2`` and ``StringIO``.
"""
import calendar
import collections
import contextlib
import sys
import urllib2
from StringIO import StringIO

import numpy
from lxml import etree
from scipy import stats

# The name -> gender table is loaded from a checkout next to this project,
# so its directory has to be on sys.path before the import below.
sys.path.append('../third-party/gender-from-name/')
import gender


def _parse_html_body(html_string):
    """Parse *html_string* with lxml's HTML parser and return its <body>.

    Raises:
        ValueError: if the parsed document has no <body> element.
    """
    tree = etree.parse(StringIO(html_string), etree.HTMLParser())
    body_element = None
    for child in tree.getroot():
        if child.tag == 'body':
            body_element = child
    if body_element is None:
        raise ValueError("HTML document has no <body> element")
    return body_element


def _directory_first_name(name):
    """Return the token of a directory name cell to use for gender lookup.

    Returns None when there is nothing to look up (missing or
    whitespace-only cell).  When the cell contains a space and its first
    word is a single character (an initial), the second word is used.
    """
    if name is None:
        return None
    if " " not in name:
        return name
    words = name.split()
    if not words:
        return None
    if len(words[0]) > 1 or len(words) == 1:
        return words[0]
    return words[1]


def get_male_female_counts():
    """Return the male fraction of the CSAIL people directory.

    Reads ``../data/mitcsailpeople.html``, takes the link text of the
    second cell of every six-cell table row as a first name, and looks the
    name up in ``gender.gender``.  Names that are missing or not in the
    table are ignored.

    Returns:
        float: male / (male + female) over the names that could be
        classified.

    Raises:
        ZeroDivisionError: if no name could be classified as male or female.
    """
    with open('../data/mitcsailpeople.html') as f:
        html_string = f.read()
    body_element = _parse_html_body(html_string)
    table_element = [el for el in body_element if el.tag == 'table'][0]

    csail_first_names = []
    for section in table_element:
        if section.tag != 'tbody':
            continue
        for row in section:
            if row.tag != 'tr':
                continue
            cells = [el for el in row if el.tag == 'td']
            # Only rows with exactly six cells are person entries.
            if len(cells) != 6:
                continue
            link = cells[1][0]
            assert link.tag == 'a'
            csail_first_names.append(link.text)

    genders = []
    for name in csail_first_names:
        name_to_use = _directory_first_name(name)
        # Skip empty cells instead of re-using the previous row's name.
        if name_to_use is None:
            continue
        key = name_to_use.upper()
        if key in gender.gender:
            genders.append(gender.gender[key])

    total_counts = collections.Counter(genders)
    total_male = total_counts['male']
    total_female = total_counts['female']
    return float(total_male) / (total_male + total_female)


def get_all_ym_tuples():
    """Return the (year, month) tuples from July 2004 through January 2014.

    The list is in chronological order and has 115 entries.
    """
    year_month_tuples = [(2004, m) for m in range(7, 13)]
    year_month_tuples.extend(
        (y, m) for y in range(2005, 2014) for m in range(1, 13))
    year_month_tuples.append((2014, 1))
    return year_month_tuples


def get_emails_by_month():
    """Download the author index of every archive month.

    For each tuple of ``get_all_ym_tuples()`` the page
    ``<base>/<year>-<MonthName>/author.html`` is fetched, and the text of
    every <i> element inside the <li> items of the page's second <ul> is
    collected (one entry per listed email).

    Returns:
        dict: maps (year, month) to the list of author strings of that
        month.  An <i> element without text contributes an empty string.
    """
    ym_email_dict = {}
    base_url = "http://lists.csail.mit.edu/pipermail/csail-related/"
    for ym in get_all_ym_tuples():
        year, month = ym
        # NOTE: calendar.month_name follows the current locale; the archive
        # folder names are built from it.
        folder = str(year) + "-" + calendar.month_name[month] + "/"
        full_url = base_url + folder + "author.html"
        # urllib2 responses are not context managers, hence closing().
        with contextlib.closing(urllib2.urlopen(full_url)) as response:
            html_string = response.read()

        body_element = _parse_html_body(html_string)
        ul_elements = [el for el in body_element if el.tag == 'ul']
        email_thread_elements = ul_elements[1]

        all_authors = []
        for item in email_thread_elements:
            if item.tag != 'li':
                continue
            for child in item:
                if child.tag == 'i':
                    all_authors.append((child.text or '').strip('\n'))
        ym_email_dict[ym] = all_authors
    return ym_email_dict


def get_ym_percentage_dict(emails_by_month=None):
    """Return the male fraction of classified authors for every month.

    Args:
        emails_by_month: dict mapping (year, month) to a list of author
            strings.  When omitted, the module-level ``ym_email_dict`` is
            used.

    Returns:
        dict: maps every tuple of ``get_all_ym_tuples()`` to
        male / (male + female); authors whose first name is missing or not
        in ``gender.gender`` are ignored.

    Raises:
        KeyError: if a month is missing from the input mapping.
        ZeroDivisionError: if a month has no author classified as male or
            female.
    """
    if emails_by_month is None:
        emails_by_month = ym_email_dict

    ym_percentage_dict = {}
    for ym in get_all_ym_tuples():
        gender_list = []
        for name in emails_by_month[ym]:
            first_name = get_name(name)
            if first_name is None:
                continue
            first_name_upper = first_name.upper()
            if first_name_upper in gender.gender:
                gender_list.append(gender.gender[first_name_upper])

        total_counts = collections.Counter(gender_list)
        total_male = total_counts['male']
        total_female = total_counts['female']
        ym_percentage_dict[ym] = (
            float(total_male) / (total_male + total_female))
    return ym_percentage_dict


def get_name(full_name):
    """Extract the probable first name from an author string.

    If the first word contains a comma ("Last, First"), the second word is
    taken.  If the chosen word is a single character once periods are
    removed (an initial), the following word is taken instead.  Double
    quotes are stripped from the result.

    Returns:
        The first-name candidate, or None when the string is empty or
        whitespace-only, or when no usable word follows the surname or
        the initial.
    """
    name_words = full_name.split()
    if not name_words:
        return None

    first_name_start_idx = 1 if "," in name_words[0] else 0
    if first_name_start_idx >= len(name_words):
        # Only a "Last," token was given.
        return None

    candidate = name_words[first_name_start_idx]
    if len(candidate.replace('.', '')) == 1:
        # The candidate is an initial such as "J."; try the next word.
        if first_name_start_idx + 1 >= len(name_words):
            return None
        candidate = name_words[first_name_start_idx + 1]

    # clean name
    return candidate.replace('"', '')


# ---------------------------------------------------------------------------
# Analysis (runs at import time, after every helper has been defined).
# ---------------------------------------------------------------------------
print(len(get_all_ym_tuples()))

ym_email_dict = get_emails_by_month()
# sample entry: July 2004
print(ym_email_dict[(2004, 7)])
print("Total Emails: %s" % numpy.sum(
    [len(authors) for authors in ym_email_dict.values()]))
print("Total Unique Authors: %s" % len(
    set(author for authors in ym_email_dict.values() for author in authors)))

ym_percentage_dict = get_ym_percentage_dict()
# sample entry: October 2013
print("Example entry (October 2013): %s" % ym_percentage_dict[(2013, 10)])
print("Average male fraction: %s" % numpy.average(
    list(ym_percentage_dict.values())))

population_male_fraction = get_male_female_counts()
print(population_male_fraction)

# NOTE(review): plot, xlabel, ylabel and hist are not imported in this file;
# presumably they come from a pylab-style interactive namespace -- confirm.
plot([ym_percentage_dict[ym] for ym in get_all_ym_tuples()])
xlabel("Months Since July 2004")
ylabel("Fraction of Emails Sent by Males")

hist(list(ym_percentage_dict.values()))
xlabel('Fraction of Emails Sent by Males')
ylabel('Number of Months')

# One-sample t-test of the monthly fractions against a reference of 0.7.
# NOTE(review): 0.7 is hard-coded; presumably a rounded value of
# population_male_fraction -- confirm.
data = numpy.array(list(ym_percentage_dict.values()))
ttest = stats.ttest_1samp(data, 0.7)
print("t-statistic: %s" % ttest[0])
print("one-tailed p-value: %s" % (ttest[1] / 2))