#!/usr/bin/env python
# coding: utf-8

# # Utility — Table to Tree
#
# The scripts in this notebook are utility scripts for generating a hierarchical JSON tree structure from a *pandas* dataframe.
#
# Consider the following dataframe:

# In[1]:


import pandas as pd

wards_csv_fn = 'wards_data.csv'
wards_df = pd.read_csv(wards_csv_fn)
wards_df.head()


# Regions are contained within countries, local authorities within regions, wards within local authorities.
#
# The Python [`treelib`](https://github.com/caesar0301/treelib) package provides support for creating simple tree structures. The following explicit code fragment will generate a `treelib.Tree()` object from a dataframe when passed the column names corresponding to the ID value and label required for each level of the tree:

# In[33]:


#%pip install treelib
from treelib import Tree

country_tree = Tree()
# Create a root node
country_tree.create_node("Country", "countries")

# Each groupby key is a (label, ID) tuple because we group on two columns:
# element 0 is used as the node label and element 1 as the node identifier.

# Group by country
for country, regions in wards_df.head(5).groupby(["CTRY17NM", "CTRY17CD"]):
    # Generate a node for each country
    country_tree.create_node(country[0], country[1], parent="countries")
    # Group by region
    for region, las in regions.groupby(["GOR10NM", "GOR10CD"]):
        # Generate a node for each region
        country_tree.create_node(region[0], region[1], parent=country[1])
        # Group by local authority
        for la, wards in las.groupby(['LAD17NM', 'LAD17CD']):
            # Create a node for each local authority
            country_tree.create_node(la[0], la[1], parent=region[1])
            # Group by ward
            for ward, _ in wards.groupby(['WD17NM', 'WD17CD']):
                # Create a leaf node for each ward
                country_tree.create_node(ward[0], ward[1], parent=la[1])

# Output the hierarchical data
country_tree.show()


# Whilst the code works, it is a little messy.
# More generally, we can create a recursive function to traverse the tree for us:

# In[24]:


get_ipython().run_cell_magic('writefile', 'table2tree.py', '''from treelib import Tree


def create_tree(df, items, parent, root=None, tree=None, i=0):
    """Create a tree from a dataframe.

    Args:
        df: Dataframe (or sub-group of one) to be grouped at the current level.
        items: List of column-name pairs, one pair per tree level, ordered
            from the top level down. Each pair is [label column, ID column].
        parent: Identifier of the node that nodes at this level attach to.
            On the initial call it is also the identifier of the root node.
        root: Label for the root node; falls back to `parent` if not given.
        tree: Tree being built. Leave as None on the initial call so that a
            new tree (with its root node) is created.
        i: Index into `items` of the level to process. Used by the recursion.

    Returns:
        The populated treelib Tree.
    """
    if tree is None:
        # Initial call: start a new tree and add the root node
        tree = Tree()
        tree.create_node(root if root else parent, parent)

    # Nothing (more) to add once every level in `items` has been consumed;
    # this also copes with an empty `items` list.
    if i >= len(items):
        return tree

    # Grouping on a [label, ID] column pair gives (label, ID) tuples as keys
    for key, group_df in df.groupby(items[i]):
        label, node_id = key[0], key[1]
        tree.create_node(label, node_id, parent=parent)
        # Descend into the rows belonging to this node for the next level
        create_tree(group_df, items, node_id, tree=tree, i=i + 1)

    return tree
''')


# In[25]:


# Run the file as if we had run the code cell
get_ipython().run_line_magic('run', 'table2tree.py')


# We can now specify a list of column pairs (label and ID) for each level of the tree and generate the tree from that:

# In[28]:


# The items specify the label and the ID columns for each node in the tree
items = [["CTRY17NM", "CTRY17CD"],
         ["GOR10NM", "GOR10CD"],
         ['LAD17NM', 'LAD17CD'],
         ['WD17NM', 'WD17CD']]

tree = create_tree(wards_df.head(10), items, 'countries', 'Country')
tree.show()


# We can also export the tree as JSON:

# In[34]:


import json

tree_json = json.loads(tree.to_json())
tree_json


# The format of the JSON has interstitial `children` elements that may be convenient in some cases, but that may be surplus to requirements in other cases.
#
# Naively, and explicitly, we could start to remove these elements from the tree using something like the following code snippet:

# In[35]:


tmp_pruned_tree = {'Country': {}}
for region in tree_json['Country']['children']:
    for region_key in region.keys():
        tmp_pruned_tree['Country'][region_key] = {}
        for la in region[region_key]['children']:
            for la_key in la.keys():
                tmp_pruned_tree['Country'][region_key][la_key] = la[la_key]['children']

tmp_pruned_tree


# Once again, we can take inspiration from the literal code to come up with a recursive function that will prune the child nodes for us for any depth tree:

# In[38]:


get_ipython().run_cell_magic('writefile', '-a table2tree.py', '''
import json


def prune_tree(tree, pruned=None, path=None):
    """Prune 'children' nodes from tree.

    Args:
        tree: A treelib Tree, or the dict form obtained by loading the
            output of its to_json() method.
        pruned: Dict that the pruned result is written into. Leave as None
            on the initial call; the recursion passes the parent's dict.
        path: Key of the node in `tree` to prune. Defaults to the first
            (root) key of `tree`.

    Returns:
        The `pruned` dict, in which each node key maps directly to a dict of
        its child nodes, or to a list of leaf labels when all of its
        children are leaves.
    """
    # Create a new pruned tree if we haven't yet started...
    pruned = {} if pruned is None else pruned

    # Convert the tree to a dict if it isn't already in dict form
    if isinstance(tree, Tree):
        tree = json.loads(tree.to_json())

    # Get the first (root) node
    path = path if path else next(iter(tree))

    children = tree[path]['children']
    has_subtrees = any(isinstance(child, dict) for child in children)

    if children and not has_subtrees:
        # We've reached the leaves, which we add as a list
        pruned[path] = children
        return pruned

    # This will be our pruned tree dictionary
    pruned[path] = {}

    # Now start to check the subtrees...
    for child in children:
        if isinstance(child, dict):
            # If we find another subtree, carry on pruning down into it;
            # the recursive call creates the key node for the subtree
            for subtree_key in child:
                prune_tree(child, pruned[path], subtree_key)
        else:
            # A leaf that sits alongside subtrees: keep it as a childless
            # node so that it is not lost and the dict shape is preserved
            pruned[path][child] = {}

    return pruned
''')


# In[39]:


# Run the file as if we had run the code cell
get_ipython().run_line_magic('run', 'table2tree.py')


# In[40]:


pruned_tree = prune_tree(tree_json)
pruned_tree