cd '/Users/gregorysaxton/Google Drive/SOX'
/Users/gregorysaxton/Google Drive/SOX
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
#Set PANDAS to show all columns in DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth',200)
from __future__ import division
df = pd.read_pickle('Merged CN dataset with Age, State, Category, Total Revenues, Efficiency, Complexity, SOX, Donor Advisory (with added 990 data).pkl')
print len(df.columns)
print len(df)
df.head(1)
304 85401
org_id | EIN | org_url | name | category | category-full | Date Published | Form 990 FYE | Form 990 FYE, v2 | FYE | Earliest Rating Publication Date | ratings_system | Overall Score | Overall Rating | advisory text - current advisory | advisory text - past advisory | current_or_past_donor_advisory | current_donor_advisory | past_donor_advisory | latest_entry | current_ratings_url | ein_2016 | Publication_date_and_FY_2016 | Publication Date_2016 | FYE_2016 | donor_alert_2016 | overall_rating_2016 | efficiency_rating_rating_2016 | AT_rating_2016 | overall_rating_star_2016 | financial_rating_star_2016 | AT_rating_star_2016 | program_expense_percent_2016 | admin_expense_percent_2016 | fund_expense_percent_2016 | fund_efficiency_2016 | working_capital_ratio_2016 | program_expense_growth_2016 | liabilities_to_assets_2016 | independent_board_2016 | no_material_division_2016 | audited_financials_2016 | no_loans_related_2016 | documents_minutes_2016 | form_990_2016 | conflict_of_interest_policy_2016 | whistleblower_policy_2016 | records_retention_policy_2016 | CEO_listed_2016 | process_CEO_compensation_2016 | no_board_compensation_2016 | donor_privacy_policy_2016 | board_listed_2016 | audited_financials_web_2016 | form_990_web_2016 | staff_listed_2016 | contributions_gifts_grants_2016 | federated_campaigns_2016 | membership_dues_2016 | fundraising_events_2016 | related_organizations_2016 | government_grants_2016 | total_contributions_2016 | program_service_revenue_2016 | total_primary_revenue_2016 | other_revenue_2016 | total_revenue_2016 | program_expenses_2016 | administrative_expenses_2016 | fundraising_expenses_2016 | total_functional_expenses_2016 | payments_to_affiliates_2016 | excess_or_deficit_2016 | net_assets_2016 | comp_2016 | cp_2016 | mission_2016 | 2011_data | charity_name_2011 | category_2011 | city_2011 | state_2011 | cause_2011 | tag_line_2011 | url_2011 | ein_2011 | fye_2011 | overall_rating_2011 | overall_rating_2011_plus_30 | overall_rating_2011_plus_30_v2 | overall_rating_star_2011 | overall_rating_star_2011_text | efficiency_rating_2011 | AT_rating_2011 | financial_rating_star_2011 | AT_rating_star_2011 | program_expense_percent_2011 | admin_expense_percent_2011 | fund_expense_percent_2011 | fund_efficiency_2011 | primary_revenue_growth_2011 | program_expense_growth_2011 | working_capital_ratio_2011 | independent_board_2011 | no_material_division_2011 | audited_financials_2011 | no_loans_related_2011 | documents_minutes_2011 | form_990_2011 | conflict_of_interest_policy_2011 | whistleblower_policy_2011 | records_retention_policy_2011 | CEO_listed_2011 | process_CEO_compensation_2011 | no_board_compensation_2011 | donor_privacy_policy_2011 | board_listed_2011 | audited_financials_web_2011 | form_990_web_2011 | staff_listed_2011 | primary_revenue_2011 | other_revenue_2011 | total_revenue_2011 | govt_revenue_2011 | program_expense_2011 | admin_expense_2011 | fund_expense_2011 | total_functional_expense_2011 | affiliate_payments_2011 | budget_surplus_2011 | net_assets_2011 | leader_comp_2011 | leader_comp_percent_2011 | email_2011 | website_2011 | 2016 Advisory - Date Posted | 2016 Advisory - Charity Name | 2016 Advisory - advisory_url | 2016 Advisory - advisory | _merge_v1 | to_be_merged | NEW ROW | NAME_2015_BMF | STREET_2015_BMF | CITY_2015_BMF | STATE_2015_BMF | ZIP_2015_BMF | RULING_2015_BMF | ACTIVITY_2015_BMF | TAX_PERIOD_2015_BMF | ASSET_AMT_2015_BMF | INCOME_AMT_2015_BMF | REVENUE_AMT_2015_BMF | NTEE_CD_2015_BMF | 2015 BMF | ruledate_2004_BMF | name_MSTRALL | state_MSTRALL | NTEE1_MSTRALL | nteecc_MSTRALL | zip_MSTRALL | fips_MSTRALL | taxper_MSTRALL | income_MSTRALL | F990REV_MSTRALL | assets_MSTRALL | ruledate_MSTRALL | deductcd_MSTRALL | accper_MSTRALL | rule_date_v1 | taxpd | NAME_SOI | yr_frmtn | pt1_num_vtng_gvrn_bdy_mems | pt1_num_ind_vtng_mems | num_vtng_gvrn_bdy_mems | num_ind_vtng_mems | tot_num_empls | tot_num_vlntrs | contri_grnts_cy | prog_srvc_rev_cy | invst_incm_cy | oth_rev_cy | grnts_and_smlr_amts_cy | tot_prof_fndrsng_exp_cy | tot_fndrsng_exp_cy | pt1_tot_asts_eoy | aud_fincl_stmts | mtrl_divrsn_or_misuse | cnflct_int_plcy | whistleblower_plcy | doc_retention_plcy | federated_campaigns | memshp_dues | rltd_orgs | govt_grnts | all_oth_contri | nncsh_contri | tot_contri | psr_tot | inv_incm_tot_rev | bonds_tot_rev | roylrev_tot_rev | net_rent_tot_rev | gain_or_loss_sec | gain_or_loss_oth | oth_rev_tot | tot_rev | mgmt_srvc_fee_tot | fee_for_srvc_leg_tot | fee_for_srvc_acct_tot | fee_for_srvc_lbby_tot | fee_for_srvc_prof_tot | fee_for_srvc_invst_tot | fee_for_srvc_oth_tot | fs_audited | audit_committee | vlntr_hrs | _merge_v2 | rule_date | ruledate_2004_BMF_v2 | ruledate_MSTRALL_v2 | yr_frmtn_v2 | age | category_Animals | category_Arts, Culture, Humanities | category_Community Development | category_Education | category_Environment | category_Health | category_Human Services | category_Human and Civil Rights | category_International | category_Religion | category_Research and Public Policy | govt_revenue_2011_binary | other_revenue_2011_binary | complexity_2011 | advisory | SOX_policies_2011 | total_revenue_2011_logged | total_revenue | total_revenue_logged | program_efficiency_2016 | state | tot_func_expns_prg_srvcs | tot_func_expns_tot | _merge_v3 | program_expenses | total_expenses | program_efficiency | fndrsng_events | _merge_v4 | other_revenue_SOI | complexity_2016 | complexity_SOI | complexity | conflict_of_interest_policy | whistleblower_policy | records_retention_policy | conflict_of_interest_policy_v2 | records_retention_policy_v2 | whistleblower_policy_v2 | SOX_policies | SOX_policies_binary | 2016_data | Advisory Text | donor_advisory | donor_advisory_2016 | donor_advisory_2011_to_2016 | SOX_policies_all_binary | total_revenue_no_neg | EIN_47 | conflict_of_interest_policy_47 | records_retention_policy_47 | whistleblower_policy_47 | SOX_policies_47 | SOX_policies_all_binary_47 | SOX_policies_binary_47 | tot_rev_47 | total_revenue_logged_47 | program_expenses_47 | total_expenses_47 | program_efficiency_47 | complexity_47 | _merge_47 | OrganizationName_efile | URL_efile | SubmittedOn_efile | TaxPeriod_efile | whistleblower_policy_efile | conflict_of_interest_policy_efile | records_retention_policy_efile | SOX_policies_efile | SOX_policies_binary_efile | SOX_policies_all_binary_efile | tot_rev_efile | tot_rev_no_neg_efile | total_revenue_logged_efile | program_expenses_efile | total_expenses_efile | program_efficiency_efile | complexity_efile | _merge_efile | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16722 | 020503776 | http://www.charitynavigator.org/index.cfm?bay=search.summary&orgid=16722 | Portsmouth Girls Softball Association | Human Services | Human Services : Multipurpose Human Service Organizations | 2016-08-12 00:00:00 | current | 2015-01-01 | current | NaN | current | NaN | current (2016) donor advisory | On August 1, 2016, the New Hampshire Union Leader published an article titled, "Former Portsmouth youth softball president accused of stealing thousands from nonprofit." | NaN | 1.0 | 1.0 | 0.0 | True | http://www.charitynavigator.org/index.cfm?bay=search.summary&orgid=16722 | NaN | NaN | NaN | current | current donor advisory 2016 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | 0.0 | NaN | PORTSMOUTH GIRLS SOFTBALL ASSOCIATION | PO BOX 8092 | PORTSMOUTH | NH | 03802-8092 | 201104.0 | 0.0 | 201309.0 | 0.0 | 0.0 | 0.0 | N63 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2011 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | 2011 | NaN | NaN | nan | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NH | NaN | NaN | left_only | NaN | NaN | NaN | NaN | left_only | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | On August 1, 2016, the New Hampshire Union Leader published an article titled, "Former Portsmouth youth softball president accused of stealing thousands from nonprofit." | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only |
'fund_expense_percent_2016', 'program_expense_growth_2016', 'independent_board_2016', 'audited_financials_2016',
'no_loans_related_2016', 'contributions_gifts_grants_2016', 'total_contributions_2016',
'fund_expense_percent_2011', 'fund_expense_2011', 'primary_revenue_growth_2011', 'program_expense_growth_2011',
'independent_board_2011', 'audited_financials_2011', 'no_loans_related_2011',
'pt1_num_vtng_gvrn_bdy_mems', 'pt1_num_ind_vtng_mems', 'num_vtng_gvrn_bdy_mems', 'num_ind_vtng_mems',
'contri_grnts_cy', 'tot_fndrsng_exp_cy', 'aud_fincl_stmts', 'fs_audited', 'audit_committee',
print len(df)
print df['FYE'].value_counts().sum(), '\n'
print df['FYE'].value_counts()
85401 85401 FY2014 15155 FY2013 8671 FY2009 7865 FY2012 7792 FY2010 7145 FY2011 6879 FY2008 4770 FY2007 4441 FY2006 4315 FY2005 4117 FY2004 3887 FY2003 3178 FY2015 2386 FY2002 2190 FY2001 1698 FY2000 591 current 321 Name: FYE, dtype: int64
df['latest_entry'].value_counts()
False 75593 True 8304 False 48 Name: latest_entry, dtype: int64
df = df[df['latest_entry']=='True']
len(df)
8304
df_advisories = pd.read_pickle('Additional Variables for Robustness Tests - 152 advisory orgs.pkl')
print len(df_advisories)
df_advisories[:2]
152
EIN | FYE | audit_committee | board_size | independent_members | audited_financials | fundraising_exp | low_quality_reporting | |
---|---|---|---|---|---|---|---|---|
0 | 030498214 | FY2012 | 0.0 | 6 | 6 | 0 | 140183 | 0 |
1 | 042129889 | FY2014 | 1.0 | 23 | 19 | 1 | 0 | 1 |
print len(pd.merge(df, df_advisories, left_on='EIN', right_on='EIN', how='left'))
merged = pd.merge(df, df_advisories, left_on='EIN', right_on='EIN', how='left', indicator=True)
print len(merged)
print merged['_merge'].value_counts()
merged[:1]
8304 8304 left_only 8152 both 152 right_only 0 Name: _merge, dtype: int64
org_id | EIN | org_url | name | category | category-full | Date Published | Form 990 FYE | Form 990 FYE, v2 | FYE_x | Earliest Rating Publication Date | ratings_system | Overall Score | Overall Rating | advisory text - current advisory | advisory text - past advisory | current_or_past_donor_advisory | current_donor_advisory | past_donor_advisory | latest_entry | current_ratings_url | ein_2016 | Publication_date_and_FY_2016 | Publication Date_2016 | FYE_2016 | donor_alert_2016 | overall_rating_2016 | efficiency_rating_rating_2016 | AT_rating_2016 | overall_rating_star_2016 | financial_rating_star_2016 | AT_rating_star_2016 | program_expense_percent_2016 | admin_expense_percent_2016 | fund_expense_percent_2016 | fund_efficiency_2016 | working_capital_ratio_2016 | program_expense_growth_2016 | liabilities_to_assets_2016 | independent_board_2016 | no_material_division_2016 | audited_financials_2016 | no_loans_related_2016 | documents_minutes_2016 | form_990_2016 | conflict_of_interest_policy_2016 | whistleblower_policy_2016 | records_retention_policy_2016 | CEO_listed_2016 | process_CEO_compensation_2016 | no_board_compensation_2016 | donor_privacy_policy_2016 | board_listed_2016 | audited_financials_web_2016 | form_990_web_2016 | staff_listed_2016 | contributions_gifts_grants_2016 | federated_campaigns_2016 | membership_dues_2016 | fundraising_events_2016 | related_organizations_2016 | government_grants_2016 | total_contributions_2016 | program_service_revenue_2016 | total_primary_revenue_2016 | other_revenue_2016 | total_revenue_2016 | program_expenses_2016 | administrative_expenses_2016 | fundraising_expenses_2016 | total_functional_expenses_2016 | payments_to_affiliates_2016 | excess_or_deficit_2016 | net_assets_2016 | comp_2016 | cp_2016 | mission_2016 | 2011_data | charity_name_2011 | category_2011 | city_2011 | state_2011 | cause_2011 | tag_line_2011 | url_2011 | ein_2011 | fye_2011 | overall_rating_2011 | overall_rating_2011_plus_30 | overall_rating_2011_plus_30_v2 | overall_rating_star_2011 | overall_rating_star_2011_text | efficiency_rating_2011 | AT_rating_2011 | financial_rating_star_2011 | AT_rating_star_2011 | program_expense_percent_2011 | admin_expense_percent_2011 | fund_expense_percent_2011 | fund_efficiency_2011 | primary_revenue_growth_2011 | program_expense_growth_2011 | working_capital_ratio_2011 | independent_board_2011 | no_material_division_2011 | audited_financials_2011 | no_loans_related_2011 | documents_minutes_2011 | form_990_2011 | conflict_of_interest_policy_2011 | whistleblower_policy_2011 | records_retention_policy_2011 | CEO_listed_2011 | process_CEO_compensation_2011 | no_board_compensation_2011 | donor_privacy_policy_2011 | board_listed_2011 | audited_financials_web_2011 | form_990_web_2011 | staff_listed_2011 | primary_revenue_2011 | other_revenue_2011 | total_revenue_2011 | govt_revenue_2011 | program_expense_2011 | admin_expense_2011 | fund_expense_2011 | total_functional_expense_2011 | affiliate_payments_2011 | budget_surplus_2011 | net_assets_2011 | leader_comp_2011 | leader_comp_percent_2011 | email_2011 | website_2011 | 2016 Advisory - Date Posted | 2016 Advisory - Charity Name | 2016 Advisory - advisory_url | 2016 Advisory - advisory | _merge_v1 | to_be_merged | NEW ROW | NAME_2015_BMF | STREET_2015_BMF | CITY_2015_BMF | STATE_2015_BMF | ZIP_2015_BMF | RULING_2015_BMF | ACTIVITY_2015_BMF | TAX_PERIOD_2015_BMF | ASSET_AMT_2015_BMF | INCOME_AMT_2015_BMF | REVENUE_AMT_2015_BMF | NTEE_CD_2015_BMF | 2015 BMF | ruledate_2004_BMF | name_MSTRALL | state_MSTRALL | NTEE1_MSTRALL | nteecc_MSTRALL | zip_MSTRALL | fips_MSTRALL | taxper_MSTRALL | income_MSTRALL | F990REV_MSTRALL | assets_MSTRALL | ruledate_MSTRALL | deductcd_MSTRALL | accper_MSTRALL | rule_date_v1 | taxpd | NAME_SOI | yr_frmtn | pt1_num_vtng_gvrn_bdy_mems | pt1_num_ind_vtng_mems | num_vtng_gvrn_bdy_mems | num_ind_vtng_mems | tot_num_empls | tot_num_vlntrs | contri_grnts_cy | prog_srvc_rev_cy | invst_incm_cy | oth_rev_cy | grnts_and_smlr_amts_cy | tot_prof_fndrsng_exp_cy | tot_fndrsng_exp_cy | pt1_tot_asts_eoy | aud_fincl_stmts | mtrl_divrsn_or_misuse | cnflct_int_plcy | whistleblower_plcy | doc_retention_plcy | federated_campaigns | memshp_dues | rltd_orgs | govt_grnts | all_oth_contri | nncsh_contri | tot_contri | psr_tot | inv_incm_tot_rev | bonds_tot_rev | roylrev_tot_rev | net_rent_tot_rev | gain_or_loss_sec | gain_or_loss_oth | oth_rev_tot | tot_rev | mgmt_srvc_fee_tot | fee_for_srvc_leg_tot | fee_for_srvc_acct_tot | fee_for_srvc_lbby_tot | fee_for_srvc_prof_tot | fee_for_srvc_invst_tot | fee_for_srvc_oth_tot | fs_audited | audit_committee_x | vlntr_hrs | _merge_v2 | rule_date | ruledate_2004_BMF_v2 | ruledate_MSTRALL_v2 | yr_frmtn_v2 | age | category_Animals | category_Arts, Culture, Humanities | category_Community Development | category_Education | category_Environment | category_Health | category_Human Services | category_Human and Civil Rights | category_International | category_Religion | category_Research and Public Policy | govt_revenue_2011_binary | other_revenue_2011_binary | complexity_2011 | advisory | SOX_policies_2011 | total_revenue_2011_logged | total_revenue | total_revenue_logged | program_efficiency_2016 | state | tot_func_expns_prg_srvcs | tot_func_expns_tot | _merge_v3 | program_expenses | total_expenses | program_efficiency | fndrsng_events | _merge_v4 | other_revenue_SOI | complexity_2016 | complexity_SOI | complexity | conflict_of_interest_policy | whistleblower_policy | records_retention_policy | conflict_of_interest_policy_v2 | records_retention_policy_v2 | whistleblower_policy_v2 | SOX_policies | SOX_policies_binary | 2016_data | Advisory Text | donor_advisory | donor_advisory_2016 | donor_advisory_2011_to_2016 | SOX_policies_all_binary | total_revenue_no_neg | EIN_47 | conflict_of_interest_policy_47 | records_retention_policy_47 | whistleblower_policy_47 | SOX_policies_47 | SOX_policies_all_binary_47 | SOX_policies_binary_47 | tot_rev_47 | total_revenue_logged_47 | program_expenses_47 | total_expenses_47 | program_efficiency_47 | complexity_47 | _merge_47 | OrganizationName_efile | URL_efile | SubmittedOn_efile | TaxPeriod_efile | whistleblower_policy_efile | conflict_of_interest_policy_efile | records_retention_policy_efile | SOX_policies_efile | SOX_policies_binary_efile | SOX_policies_all_binary_efile | tot_rev_efile | tot_rev_no_neg_efile | total_revenue_logged_efile | program_expenses_efile | total_expenses_efile | program_efficiency_efile | complexity_efile | _merge_efile | FYE_y | audit_committee_y | board_size | independent_members | audited_financials | fundraising_exp | low_quality_reporting | _merge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16722 | 020503776 | http://www.charitynavigator.org/index.cfm?bay=search.summary&orgid=16722 | Portsmouth Girls Softball Association | Human Services | Human Services : Multipurpose Human Service Organizations | 2016-08-12 00:00:00 | current | 2015-01-01 | current | NaN | current | NaN | current (2016) donor advisory | On August 1, 2016, the New Hampshire Union Leader published an article titled, "Former Portsmouth youth softball president accused of stealing thousands from nonprofit." | NaN | 1.0 | 1.0 | 0.0 | True | http://www.charitynavigator.org/index.cfm?bay=search.summary&orgid=16722 | NaN | NaN | NaN | current | current donor advisory 2016 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | 0.0 | NaN | PORTSMOUTH GIRLS SOFTBALL ASSOCIATION | PO BOX 8092 | PORTSMOUTH | NH | 03802-8092 | 201104.0 | 0.0 | 201309.0 | 0.0 | 0.0 | 0.0 | N63 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2011 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | 2011 | NaN | NaN | nan | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NH | NaN | NaN | left_only | NaN | NaN | NaN | NaN | left_only | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | On August 1, 2016, the New Hampshire Union Leader published an article titled, "Former Portsmouth youth softball president accused of stealing thousands from nonprofit." | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only | NaN | NaN | NaN | NaN | NaN | NaN | NaN | left_only |
pd.crosstab(merged['_merge'], merged['donor_advisory_2016'])
donor_advisory_2016 | 0.0 | 1.0 |
---|---|---|
_merge | ||
left_only | 7983 | 169 |
right_only | 0 | 0 |
both | 0 | 152 |
columns = ['org_id', 'EIN', 'FYE_x',
'fund_expense_percent_2016', 'independent_board_2016', #'program_expense_growth_2016',
'audited_financials_2016', #'no_loans_related_2016', 'contributions_gifts_grants_2016',
#'total_contributions_2016', #'fund_expense_percent_2011', 'fund_expense_2011',
#'primary_revenue_growth_2011', 'program_expense_growth_2011', 'independent_board_2011',
#'audited_financials_2011', 'no_loans_related_2011',
'pt1_num_vtng_gvrn_bdy_mems', 'pt1_num_ind_vtng_mems',
'num_vtng_gvrn_bdy_mems', 'num_ind_vtng_mems', 'contri_grnts_cy', 'tot_fndrsng_exp_cy', 'aud_fincl_stmts',
'fs_audited', 'audit_committee_x', 'FYE_y', 'audit_committee_y', 'board_size',
'independent_members', 'audited_financials', 'fundraising_exp', 'low_quality_reporting']
merged[columns][115:125]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-7786e17ed6e8> in <module>() ----> 1 merged[columns][115:125] NameError: name 'merged' is not defined
merged['independent_members'] = merged['independent_members'].astype('float')
merged['board_size'] = merged['board_size'].astype('float')
print len(merged[merged['independent_members'].isnull()])
print len(merged[merged['independent_members'].notnull()])
8152 152
merged['independent_board'] = np.nan
merged['independent_board'] = np.where(( (merged['independent_members'].notnull())
& (merged['independent_members']>4)
& (merged['independent_members']>(merged['board_size']/2))),
1, 0 ) #merged['independent_board'])
merged['independent_board'] = np.where(merged['independent_members'].isnull(), np.nan, merged['independent_board'])
merged['independent_board'].value_counts()
1.0 83 0.0 69 Name: independent_board, dtype: int64
columns = ['org_id', 'EIN', 'FYE_x', 'fund_expense_percent_2016', 'independent_board_2016',
'board_size', 'independent_members', 'independent_board',
'audited_financials_2016', 'pt1_num_vtng_gvrn_bdy_mems', 'pt1_num_ind_vtng_mems',
'num_vtng_gvrn_bdy_mems', 'num_ind_vtng_mems', 'contri_grnts_cy', 'tot_fndrsng_exp_cy',
'aud_fincl_stmts', 'fs_audited', 'audit_committee_x', 'FYE_y', 'audit_committee_y', 'audited_financials',
'fundraising_exp', 'low_quality_reporting']
merged[merged['independent_members'].notnull()][columns][110:120]
org_id | EIN | FYE_x | fund_expense_percent_2016 | independent_board_2016 | board_size | independent_members | independent_board | audited_financials_2016 | pt1_num_vtng_gvrn_bdy_mems | pt1_num_ind_vtng_mems | num_vtng_gvrn_bdy_mems | num_ind_vtng_mems | contri_grnts_cy | tot_fndrsng_exp_cy | aud_fincl_stmts | fs_audited | audit_committee_x | FYE_y | audit_committee_y | audited_financials | fundraising_exp | low_quality_reporting | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5798 | 4292 | 941347046 | current | NaN | NaN | 22.0 | 19.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | 1.0 | 0.0 | 3146137 | 0.0 |
5887 | 16234 | 451258469 | current | NaN | NaN | 4.0 | 2.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2013 | NaN | 0.0 | 0 | 1.0 |
5917 | 16254 | 710987457 | current | NaN | NaN | 1.0 | 0.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | NaN | 0.0 | 0 | 1.0 |
5954 | 15512 | 450579664 | current | NaN | NaN | 2.0 | 2.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2011 | NaN | 0.0 | 26827 | 0.0 |
6059 | 16633 | 330807324 | current | NaN | NaN | 16.0 | 16.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2012 | NaN | 0.0 | 0 | 1.0 |
6104 | 11327 | 720760857 | current | NaN | NaN | 30.0 | 30.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | 1.0 | 1.0 | 63530 | 0.0 |
6310 | 13735 | 141368361 | current | NaN | NaN | 15.0 | 12.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2015 | 1.0 | 1.0 | 0 | 1.0 |
6561 | 13601 | 742463670 | current | NaN | NaN | 6.0 | 4.0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | 1.0 | 1.0 | 28560 | 0.0 |
6636 | 16139 | 510138358 | current | NaN | NaN | 23.0 | 23.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | 0.0 | 0.0 | 33086 | 0.0 |
6648 | 4441 | 042129889 | current | NaN | NaN | 23.0 | 19.0 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | FY2014 | 1.0 | 1.0 | 0 | 1.0 |
print len(merged[merged[columns[0]].isnull()])
print len(merged[merged[columns[1]].isnull()])
print len(merged[merged[columns[2]].isnull()])
print len(merged[merged[columns[3]].isnull()])
print len(merged[merged[columns[4]].isnull()])
print len(merged[merged[columns[5]].isnull()])
print len(merged[merged[columns[6]].isnull()])
print len(merged[merged[columns[7]].isnull()])
print len(merged[merged[columns[8]].isnull()])
print len(merged[merged[columns[9]].isnull()])
print len(merged[merged[columns[10]].isnull()])
print len(merged[merged[columns[11]].isnull()])
print len(merged[merged[columns[12]].isnull()])
print len(merged[merged[columns[13]].isnull()])
print len(merged[merged[columns[14]].isnull()])
print len(merged[merged[columns[15]].isnull()])
print len(merged[merged[columns[16]].isnull()])
print len(merged[merged[columns[17]].isnull()])
print len(merged[merged[columns[18]].isnull()])
print len(merged[merged[columns[19]].isnull()])
print len(merged[merged[columns[20]].isnull()])
print len(merged[merged[columns[21]].isnull()])
0 66 0 321 321 321 7667 7667 7667 7667 7667 7667 7667 7667 7667 8152 8181 8152 8152 8152 8152 8152
for index, row in merged[10:].iterrows():
if type(row['independent_board_2016'])==list:
#print 'yup'
merged.ix[index, 'independent_board_2016'] = row['independent_board_2016'][0]
for index, row in merged[:10].iterrows():
print type(row['independent_board_v2']), row['independent_board_v2'], row['independent_board_2016']
<type 'float'> 0.0 nan <type 'float'> 0.0 _gfx_/icons/checkboxX.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 _gfx_/icons/checked.gif <type 'float'> 0.0 nan <type 'float'> 0.0 nan
print merged['independent_board_2016'].value_counts().sum(),'\n'
print merged['independent_board_2016'].value_counts()
7983 _gfx_/icons/checked.gif 7658 _gfx_/icons/checkboxX.gif 325 Name: independent_board_2016, dtype: int64
merged['independent_board_v2'] = np.nan
print len(merged[merged['independent_board_v2'].notnull()])
merged['independent_board_v2'] = np.where(merged['independent_board_2016'].notnull(), merged['independent_board_2016'],
merged['independent_board_v2'] )
print len(merged[merged['independent_board_v2'].notnull()])
merged['independent_board_v2'] = np.where( ((merged['independent_board_v2'].isnull()) &
(merged['independent_board'].notnull())),
merged['independent_board'], merged['independent_board_v2'] )
print len(merged[merged['independent_board_v2'].notnull()])
merged['independent_board_v2'].value_counts()
0 7983 8135
_gfx_/icons/checked.gif 7658 _gfx_/icons/checkboxX.gif 325 1.0 83 0.0 69 Name: independent_board_v2, dtype: int64
merged['independent_board_v2'] = np.where(merged['independent_board_v2']== '_gfx_/icons/checked.gif',
1, merged['independent_board_v2'])
merged['independent_board_v2'] = np.where(merged['independent_board_v2']== '_gfx_/icons/checkboxX.gif',
0, merged['independent_board_v2'])
merged['independent_board_v2'].value_counts()
1 7741 0 394 Name: independent_board_v2, dtype: int64
merged['independent_board_v2'].value_counts().sum()
8135
merged.rename(columns={'FYE_x':'FYE'}, inplace=True)
columns = ['org_id', 'EIN', 'FYE', #'independent_board_2016',
#'board_size', 'independent_members', 'independent_board',
#'pt1_num_vtng_gvrn_bdy_mems', 'pt1_num_ind_vtng_mems',
'independent_board_v2',
'audited_financials_2016',
#'FYE_y',
'audited_financials', 'aud_fincl_stmts', 'fs_audited', 'audit_committee_x', 'audit_committee_y',
#'num_vtng_gvrn_bdy_mems', 'num_ind_vtng_mems', 'contri_grnts_cy', 'tot_fndrsng_exp_cy',
#'fund_expense_percent_2016',
#'fundraising_exp', 'low_quality_reporting'
]
merged[columns][:5]
org_id | EIN | FYE | independent_board_v2 | audited_financials_2016 | audited_financials | aud_fincl_stmts | fs_audited | audit_committee_x | audit_committee_y | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 16722 | 020503776 | current | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 10166 | 043314346 | FY2013 | 0 | 1 | NaN | NaN | NaN | NaN | NaN |
2 | 6466 | 953667812 | FY2014 | 1 | 1 | NaN | NaN | NaN | NaN | NaN |
3 | 12098 | 581974429 | FY2014 | 1 | 1 | NaN | NaN | NaN | NaN | NaN |
4 | 12123 | 237172077 | FY2014 | 1 | 1 | NaN | NaN | NaN | NaN | NaN |
print len(merged[merged[columns[0]].isnull()])
print len(merged[merged[columns[1]].isnull()])
print len(merged[merged[columns[2]].isnull()])
print len(merged[merged[columns[3]].isnull()])
print len(merged[merged[columns[4]].isnull()])
print len(merged[merged[columns[5]].isnull()])
print len(merged[merged[columns[6]].isnull()])
print len(merged[merged[columns[7]].isnull()])
print len(merged[merged[columns[8]].isnull()])
print len(merged[merged[columns[9]].isnull()])
0 66 0 0 321 8152 7667 7667 7667 8181
for index, row in merged.iterrows():
if type(row['audited_financials_2016'])==list:
merged.ix[index, 'audited_financials_2016'] = row['audited_financials_2016'][0]
merged[columns][:5]
org_id | EIN | FYE | independent_board_v2 | audited_financials_2016 | aud_fincl_stmts | fs_audited | audit_committee_x | audit_committee_y | audited_financials | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 16722 | 020503776 | current | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 10166 | 043314346 | FY2013 | 0 | _gfx_/icons/checked.gif | NaN | NaN | NaN | NaN | NaN |
2 | 6466 | 953667812 | FY2014 | 1 | _gfx_/icons/checked.gif | NaN | NaN | NaN | NaN | NaN |
3 | 12098 | 581974429 | FY2014 | 1 | _gfx_/icons/checked.gif | NaN | NaN | NaN | NaN | NaN |
4 | 12123 | 237172077 | FY2014 | 1 | _gfx_/icons/checked.gif | NaN | NaN | NaN | NaN | NaN |
print merged['audited_financials_2016'].value_counts()
merged['audited_financials_2016'] = np.where(merged['audited_financials_2016']== '_gfx_/icons/checked.gif',
1, merged['audited_financials_2016'])
merged['audited_financials_2016'] = np.where(merged['audited_financials_2016']== '_gfx_/icons/checkboxX.gif',
0, merged['audited_financials_2016'])
print merged['audited_financials_2016'].value_counts()
_gfx_/icons/checked.gif 7247 _gfx_/icons/checkboxX.gif 372 _gfx_/icons/checkboxOptOut.png 364 Name: audited_financials_2016, dtype: int64 1 7247 0 372 _gfx_/icons/checkboxOptOut.png 364 Name: audited_financials_2016, dtype: int64
merged['audited_financials_2016_v2'] = np.where(merged['audited_financials_2016']== '_gfx_/icons/checkboxOptOut.png',
1, merged['audited_financials_2016'])
print merged['audited_financials_2016_v2'].value_counts()
1 7611 0 372 Name: audited_financials_2016_v2, dtype: int64
print len(merged[merged[columns[5]].isnull()])
print len(merged[merged[columns[6]].isnull()])
print merged[columns[5]].value_counts(), '\n'
print merged[columns[6]].value_counts(), '\n'
pd.crosstab(merged[columns[5]],merged[columns[6]])
7667 7667 Y 433 N 204 Name: aud_fincl_stmts, dtype: int64 Y 629 N 8 Name: fs_audited, dtype: int64
fs_audited | N | Y |
---|---|---|
aud_fincl_stmts | ||
N | 7 | 197 |
Y | 1 | 432 |
432+197+1+7
637
print len(merged[merged['audited_financials_2016_v2'].notnull()])
7983
merged['audited_financials_v2'] = np.nan
print len(merged[merged['audited_financials_v2'].notnull()])
merged['audited_financials_v2'] = np.where(merged['audited_financials_2016_v2'].notnull(),
merged['audited_financials_2016_v2'],
merged['audited_financials_v2'] )
print len(merged[merged['audited_financials_v2'].notnull()])
merged['audited_financials_v2'] = np.where( ((merged['audited_financials_v2'].isnull()) &
(merged['audited_financials'].notnull())),
merged['audited_financials'], merged['audited_financials_v2'] )
print len(merged[merged['audited_financials_v2'].notnull()])
merged['audited_financials_v2'] = np.where( ((merged['audited_financials_v2'].isnull()) &
(merged['aud_fincl_stmts'].notnull())),
merged['aud_fincl_stmts'], merged['audited_financials_v2'] )
print len(merged[merged['audited_financials_v2'].notnull()])
merged['audited_financials_v2'].value_counts()
merged['audited_financials_v2'] = np.where( ((merged['audited_financials_v2'].isnull()) &
(merged['fs_audited'].notnull())),
merged['fs_audited'], merged['audited_financials_v2'] )
print len(merged[merged['audited_financials_v2'].notnull()])
merged['audited_financials_v2'].value_counts()
0 7983 8135 8135 8135
1 7703 0 432 Name: audited_financials_v2, dtype: int64
merged['audited_financials'].value_counts()
1.0 92 0.0 60 Name: audited_financials, dtype: int64
columns_v2 = ['org_id', 'EIN', 'FYE', #'independent_board_2016',
#'board_size', 'independent_members', 'independent_board',
#'pt1_num_vtng_gvrn_bdy_mems', 'pt1_num_ind_vtng_mems',
'independent_board_v2',
#'audited_financials_2016',
'audited_financials_v2',
#'FYE_y',
#'audited_financials', 'aud_fincl_stmts', 'fs_audited', 'audit_committee_x', 'audit_committee_y',
#'num_vtng_gvrn_bdy_mems', 'num_ind_vtng_mems', 'contri_grnts_cy', 'tot_fndrsng_exp_cy',
#'fund_expense_percent_2016',
#'fundraising_exp', 'low_quality_reporting'
]
merged[columns_v2].dtypes
org_id object EIN object FYE object independent_board_v2 object audited_financials_v2 object dtype: object
merged['independent_board_v2'] = merged['independent_board_v2'].astype('float')
merged['audited_financials_v2'] = merged['audited_financials_v2'].astype('float')
merged[columns_v2].describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
independent_board_v2 | 8135.0 | 0.951567 | 0.214692 | 0.0 | NaN | NaN | NaN | 1.0 |
audited_financials_v2 | 8135.0 | 0.946896 | 0.224254 | 0.0 | NaN | NaN | NaN | 1.0 |
merged[columns_v2].to_pickle('additional variables for testing.pkl')
print len(merged)
8304
df4 = pd.read_stata('Test 4 data.dta')
print len(df4)
df4[:2]
8238
Col1 | A2011_data | A2016_data | ein | fye | Form_990_FYE | SOX_policies | SOX_policies_all_binary | SOX_policies_binary | age | category | category_Animals | category_Arts__Culture__Humaniti | category_Community_Development | category_Education | category_Environment | category_Health | category_Human_Services | category_Human_and_Civil_Rights | category_International | category_Religion | category_Research_and_Public_Pol | complexity | complexity_2011 | conflict_of_interest_policy_v2 | donor_advisory | donor_advisory_2011_to_2016 | donor_advisory_2016 | org_id | program_efficiency | ratings_system | records_retention_policy_v2 | state | tot_rev | total_revenue_logged | whistleblower_policy_v2 | ncategory | revs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50709 | 0 | 1 | 010202467 | FY2014 | 2014-12 | 3.0 | 1.0 | 1.0 | 62.0 | Research and Public Policy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.0 | NaN | 1.0 | 0 | 0 | 0 | 5954 | 0.794457 | CN 2.1 | 1.0 | ME | NaN | 16.377993 | 1.0 | Research and Public Policy | 12967968.0 |
1 | 76320 | 0 | 1 | 010211478 | FY2014 | 2014-06 | 3.0 | 1.0 | 1.0 | 57.0 | Community Development | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3.0 | NaN | 1.0 | 0 | 0 | 0 | 12517 | 0.808606 | CN 2.1 | 1.0 | ME | 3054612.0 | 14.932163 | 1.0 | Community Development | 3054612.0 |
merged2 = pd.merge(df4, merged[columns_v2], left_on='ein', right_on='EIN', how='left', indicator=True)
print len(merged2)
print merged2['_merge'].value_counts()
merged2[:2]
8238 both 8238 right_only 0 left_only 0 Name: _merge, dtype: int64
Col1 | A2011_data | A2016_data | ein | fye | Form_990_FYE | SOX_policies | SOX_policies_all_binary | SOX_policies_binary | age | category | category_Animals | category_Arts__Culture__Humaniti | category_Community_Development | category_Education | category_Environment | category_Health | category_Human_Services | category_Human_and_Civil_Rights | category_International | category_Religion | category_Research_and_Public_Pol | complexity | complexity_2011 | conflict_of_interest_policy_v2 | donor_advisory | donor_advisory_2011_to_2016 | donor_advisory_2016 | org_id_x | program_efficiency | ratings_system | records_retention_policy_v2 | state | tot_rev | total_revenue_logged | whistleblower_policy_v2 | ncategory | revs | org_id_y | EIN | FYE | independent_board_v2 | audited_financials_v2 | _merge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 50709 | 0 | 1 | 010202467 | FY2014 | 2014-12 | 3.0 | 1.0 | 1.0 | 62.0 | Research and Public Policy | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6.0 | NaN | 1.0 | 0 | 0 | 0 | 5954 | 0.794457 | CN 2.1 | 1.0 | ME | NaN | 16.377993 | 1.0 | Research and Public Policy | 12967968.0 | 5954 | 010202467 | FY2014 | 1.0 | 1.0 | both |
1 | 76320 | 0 | 1 | 010211478 | FY2014 | 2014-06 | 3.0 | 1.0 | 1.0 | 57.0 | Community Development | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3.0 | NaN | 1.0 | 0 | 0 | 0 | 12517 | 0.808606 | CN 2.1 | 1.0 | ME | 3054612.0 | 14.932163 | 1.0 | Community Development | 3054612.0 | 12517 | 010211478 | FY2014 | 1.0 | 1.0 | both |
merged2['donor_advisory_2016'].value_counts()
0 7983 1 255 Name: donor_advisory_2016, dtype: int64
merged2.to_pickle('Test 4 with independent_board and audited_financials.pkl')
columns = ['screen_name', 'followers_first', 'followers_end', 'followers_last',
'orgid', 'retweeted_count', 'favorited_count']
aggregations = {'orgid': 'first',
'screen_name': 'first',
'retweeted_count': 'sum',
'favorited_count': 'sum',
'followers_first': 'first',
'followers_end': 'last',
}
columns = ['screen_name', 'followers_first', 'followers_end', 'followers_last',
'orgid', 'retweeted_count', 'favorited_count']
aggregations = {'orgid': 'first',
'screen_name': 'first',
'retweeted_count': 'sum',
'favorited_count': 'sum',
'followers_first': 'first',
'followers_end': 'last',
}
org_df = df[columns].groupby('orgid').agg(aggregations)
print len(org_df)
org_df = org_df.dropna()
org_df['followers_first'] = org_df['followers_first'].astype('int')
org_df['followers_end'] = org_df['followers_end'].astype('int')
org_df['favorited_count'] = org_df['favorited_count'].astype('int')
org_df['followers_change'] = org_df['followers_end'] - org_df['followers_first']
org_df['followers_pct_change'] = (org_df['followers_end'] - org_df['followers_first'])/org_df['followers_first']
print len(org_df)
org_df[:5]