from SPARQLWrapper import SPARQLWrapper, JSON from rdflib import ConjunctiveGraph, Namespace, Literal, RDF, RDFS, BNode, URIRef, XSD, Variable import operator import numpy as np import networkx as nx import matplotlib.pyplot as plt import urllib2, StringIO, csv import re %matplotlib inline NG_TEMPLATE = 'http://lod.cedar-project.nl/resource/v2/TABLE' END_POINT = 'http://lod.cedar-project.nl:8080/sparql/cedar' def clean_string(text): # Remove some extra things text_clean = text.replace('.', '').replace('_', ' ').lower() # Shrink spaces text_clean = re.sub(r'\s+', ' ', text_clean) # Remove lead and trailing whitespaces text_clean = text_clean.strip() return text_clean url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/tables.txt' tables = [table.strip() for table in StringIO.StringIO(urllib2.urlopen(url).read())] headers = {} for table in tables: named_graph = NG_TEMPLATE.replace('TABLE', table) sparql = SPARQLWrapper(END_POINT) query = """ prefix tablink: prefix skos: select distinct ?header ?label ?parent from where { ?header a tablink:ColumnHeader; tablink:subColHeaderOf ?parent; skos:prefLabel ?label. } """.replace('GRAPH',named_graph) sparql.setQuery(query) sparql.setReturnFormat(JSON) results = sparql.query().convert() table_headers = {} for result in results["results"]["bindings"]: h = URIRef(result['header']['value']) p = URIRef(result['parent']['value']) l = Literal(result['label']['value']) table_headers[h] = {'parent':p, 'label':l} headers[table] = table_headers keyword_count = {} for (table, table_headers) in headers.iteritems(): for (header, data) in table_headers.iteritems(): clean_label = clean_string(data['label']) for label_part in clean_label.split(' '): if label_part not in keyword_count: keyword_count[label_part] = 0 keyword_count[label_part] = keyword_count[label_part] + 1 sorted_keyword_count = sorted(keyword_count.iteritems(), key=operator.itemgetter(1), reverse=True) for (k,v) in sorted_keyword_count[:15]: print '{0:<8} => {1}'.format(v, k) values = [v for (k,v) in sorted_keyword_count] plt.semilogy(range(len(values)), values, 'ro') plt.axis([0, len(values), 0, values[0]]) plt.xlabel('Index of the dimension') plt.ylabel('Frequency') plt.show() keyword_sets = {} for (table, table_headers) in headers.iteritems(): keyword_sets[table] = set() for (header, data) in table_headers.iteritems(): clean_label = clean_string(data['label']) for label_part in clean_label.split(' '): keyword_sets[table].add(label_part) # Create the edges edges = [] sorted_tb_names = sorted(keyword_sets.keys()) maximum_size = 0.0 for i in range(0, len(sorted_tb_names)): for j in range(i+1, len(sorted_tb_names)): table_a = sorted_tb_names[i] table_b = sorted_tb_names[j] size = float(len(keyword_sets[table_a] & keyword_sets[table_b])) if size > 0: edges.append((table_a, table_b, size)) if size > maximum_size: maximum_size = size # Normalize the weights for i in range(0, len(edges)): (a,b,s) = edges[i] edges[i] = (a, b, s/maximum_size) # Create the graph G=nx.Graph() # Add the nodes for node in sorted_tb_names: G.add_node(node) # Add the edges for (src,dst,weight) in edges: if weight > 0.025: # Filter a bit to get less edges G.add_edge(src, dst, {'weight': weight}) # Position the nodes pos=nx.spring_layout(G, iterations=100) # Color them according to their type (beroep in blue, volk in green and woning in red) color_map = {'BRT': 'b', 'VT' : 'g', 'WT' : 'r'} typed_nodes = {} for node in sorted_tb_names: node_type = node.split('_')[0] typed_nodes.setdefault(node_type, []).append(node) # Plot the network plt.figure(figsize=(15,15)) plt.axis('off') nx.draw_networkx_edges(G, pos, alpha=0.4, width=1) for node_type in color_map.keys(): nx.draw_networkx_nodes(G, pos, alpha=0.6, node_size=80, nodelist=typed_nodes[node_type], node_color=color_map[node_type])