from __future__ import unicode_literals import json import numpy as np import pandas as pd from pandas import DataFrame, Series import xlrd from collections import defaultdict # Import article data. This is a 400 MB file. df = pd.read_pickle('../data/all_plos_df.pkl') # Drop unused data df.drop(['author', 'title_display', 'journal', 'abstract', 'publication_date', 'score'], axis=1, inplace=True) df.set_index('id', inplace=True) df.head() # Let's make sure we are counting articles correctly for each subject node. def count_articles(df, subject_path): s = df.subject.apply(lambda s: str(s)) matching = s[s.str.contains(subject_path)] return len(matching) print 'Total articles:', len(df) print 'Science policy:', count_articles(df, 'Science policy') print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics') def tree_from_spreadsheet(f, df, verbose=False): subjects = df.subject.apply(lambda s: str(s)) book = xlrd.open_workbook(f) pt = book.sheet_by_index(0) # spreadsheet cells : (row, col) :: cell A1 : (0, 0) # Initialize a list to contain the thesaurus. # Our test case will only have one item in this list. pt_list = [] # Keep track of the path in the tree. cur_path = Series([np.nan]*10) for r in range(1, pt.nrows): # Start on row two. # Columns: the hierarchy goes up to 10 tiers. for c in range(10): if pt.cell_value(r, c): # If this condition is satisfied, we are at the node that's in this line. # Construct the path to this node. # Clean strings because some terms (RNA nomenclature) cause unicode error text = pt.cell_value(r, c).replace(u'\u2019', "'") cur_path[c] = text cur_path[c+1:] = np.nan path_list = list(cur_path.dropna()) tier = len(path_list) path_str = '/'.join(path_list) if verbose: print tier, path_str # Add the node to the JSON-like tree structure. node = defaultdict(list) node['name'] = text node['count']= len(subjects[subjects.str.contains(path_str)]) # This part is completely ridiculous. But it seems to work. 
if tier == 1: pt_list.append(node) elif tier == 2: pt_list[-1]['children'].append(node) elif tier == 3: pt_list[-1]['children'][-1]['children'].append(node) elif tier == 4: pt_list[-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 5: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 6: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 7: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 8: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 9: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) elif tier == 10: pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node) # Go to next row after finding a term. There is only one term listed per row. break # Make a single JSON object to contain all the branches. pt_obj = {'count': len(df), 'name': 'PLOS', 'children': pt_list} return pt_obj # Test 1: Science policy plosthes_test_file = '../data/plosthes_test.xlsx' json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True)) # Test 2: An edited subset of Earth Sciences plosthes_test_file = '../data/plosthes_test_2.xlsx' json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True)) df.subject[df.subject.apply(lambda x: u'/Earth sciences/Mineralogy/Minerals/Gemstones/Diamonds' in x)] # Update this filename if you use a newer version! plosthes_full_file = '../data/plosthes.2014-2.full.xlsx' # Generate tree structure # Change to verbose=True if you want to see it happening. # (Fills up the output cell with ~10000 lines.) 
# Build the tree for the full thesaurus file (verbose=False keeps output short).
plos_tree = tree_from_spreadsheet(plosthes_full_file, df, verbose=False)

# Export tree structure as JSON
# Note: the D3 tree visualization uses plos_tree.json -- this script won't
# overwrite it.
# Text mode ('w'): json.dump writes text, which a binary-mode file object
# rejects with TypeError on Python 3; on Python 2 the output is unchanged.
with open('../data/plos_hierarchy_full.json', 'w') as f:
    json.dump(plos_tree, f)