Notebook
# Collect the revision DataFrames, starting with the two seed articles
import pandas as pd

dataframe_dict = {u'Bill Clinton': bc_rv_df,
                  u'Hillary Rodham Clinton': hrc_rv_df}

# Set operations: the union of the out-link and in-link lists is every article to scrape
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape, recording any article whose request fails
errors = list()
for article in all_links:
    try:
        df = get_revision_df(article)
        dataframe_dict[article] = df
    except Exception:
        errors.append(article)

# Stack the per-article DataFrames into one and save it to disk
gigantic_df = pd.concat(dataframe_dict.values())
gigantic_df.to_csv('gigantic_df.csv', encoding='utf8')
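The loop above relies on a get_revision_df helper defined earlier in the notebook. Purely as a sketch of what such a helper could look like (not the notebook's actual implementation), here is a minimal version that pulls up to 500 revisions per article from the MediaWiki API; the API_URL constant, the rvprop fields, and the added article column are all assumptions:

import requests
import pandas as pd

API_URL = 'https://en.wikipedia.org/w/api.php'

def get_revision_df(article):
    # Ask the MediaWiki API for up to 500 revisions of the article
    params = {'action': 'query',
              'prop': 'revisions',
              'titles': article,
              'rvprop': 'ids|timestamp|user|size',
              'rvlimit': 500,
              'format': 'json'}
    pages = requests.get(API_URL, params=params).json()['query']['pages']
    # 'pages' is keyed by page ID; only one page was requested
    revisions = list(pages.values())[0].get('revisions', [])
    # One row per revision, tagged with the article title
    df = pd.DataFrame(revisions)
    df['article'] = article
    return df

A fuller version would also follow the API's rvcontinue token to page through histories longer than 500 revisions.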
# Start up the link data structure with the seed article's out- and in-links
import json

link_data = {u'Hillary Rodham Clinton': {'Out': hrc_alllink_outlist,
                                         'In': hrc_alllink_inlist}}

# Set operations: the union of the out-link and in-link lists is every article to scrape
all_links = list(set(hrc_alllink_outlist) | set(hrc_alllink_inlist))

# Start the scrape, printing the title of any article whose request fails
for article in all_links:
    try:
        _out_links, _in_links = get_article_links(article)
        link_data[article] = {'Out': _out_links, 'In': _in_links}
    except Exception:
        print article

# Save the data
with open('link_data.json', 'wb') as f:
    json.dump(link_data, f)
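Likewise, get_article_links comes from earlier in the notebook. A minimal sketch, assuming it returns an article's outbound links (prop=links) and inbound links (list=backlinks) from the MediaWiki API, capped at 500 each and restricted to the main namespace:

import requests

API_URL = 'https://en.wikipedia.org/w/api.php'

def get_article_links(article):
    # Outbound links: main-namespace articles this page links to
    out_params = {'action': 'query', 'prop': 'links', 'titles': article,
                  'plnamespace': 0, 'pllimit': 500, 'format': 'json'}
    pages = requests.get(API_URL, params=out_params).json()['query']['pages']
    out_links = [l['title'] for l in list(pages.values())[0].get('links', [])]

    # Inbound links: main-namespace articles that link to this page
    in_params = {'action': 'query', 'list': 'backlinks', 'bltitle': article,
                 'blnamespace': 0, 'bllimit': 500, 'format': 'json'}
    backlinks = requests.get(API_URL, params=in_params).json()['query']['backlinks']
    in_links = [l['title'] for l in backlinks]

    return out_links, in_links

Each query here returns at most one batch; following the plcontinue and blcontinue tokens would be needed to capture every link on heavily linked pages.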