from SPARQLWrapper import SPARQLWrapper, JSON
import operator
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# URIs of the named graphs to analyse, read one per line from named_graphs.txt.
# The `with` block closes the file deterministically instead of leaving the
# handle open for the lifetime of the session.
with open('named_graphs.txt') as named_graphs_file:
    # Blank lines are skipped: an empty URI would only yield a query against <>.
    named_graphs = [line.strip() for line in named_graphs_file if line.strip()]
# SPARQL endpoint used when the caller does not supply one.
_CEDAR_ENDPOINT = "http://lod.cedar-project.nl:8080/sparql/cedar"

# Query template: for one named graph, groups the matched ?dim bindings by
# their skos:prefLabel and counts them. The GRAPH placeholder is replaced
# with the graph URI before the query is sent.
_DIMENSION_QUERY = """
select distinct (count(?dim) as ?total) ?label from <GRAPH> where {
?header <http://www.data2semantics.org/core/isDimension> ?dim.
?dim a <http://www.data2semantics.org/core/Dimension>.
?dim <http://www.w3.org/2004/02/skos/core#prefLabel> ?label.
}
group by ?label
order by desc(?total)
"""


def process_graph(graph, dimensions, endpoint=_CEDAR_ENDPOINT):
    """Add the per-label dimension counts of one named graph to *dimensions*.

    Sends the dimension-count query for *graph* to the SPARQL endpoint and,
    for every returned (label, total) row, adds ``total`` to the running
    count stored under ``label``.

    Args:
        graph: URI of the named graph, substituted into the ``from <...>``
            clause of the query.
        dimensions: dict mapping label -> accumulated count. It is updated
            in place; labels not seen before start from 0.
        endpoint: URL of the SPARQL endpoint to query. Defaults to the
            endpoint that was previously hard-coded.

    Returns:
        None. The result is the mutation of *dimensions*.

    Note:
        Errors raised by the SPARQL client (network or query failures) are
        not caught here and propagate to the caller.
    """
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(_DIMENSION_QUERY.replace('GRAPH', graph))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    for result in results["results"]["bindings"]:
        label = result['label']['value']
        # The count arrives as a string in the JSON result bindings.
        total = int(result['total']['value'])
        dimensions[label] = dimensions.get(label, 0) + total
# Running total per dimension label, summed over every named graph.
dimensions_count = {}
# test graph "http://lod.cedar-project.nl/resource/VT_1859_01_H1"
for named_graph in named_graphs:
    process_graph(named_graph, dimensions_count)

# (label, total) pairs ordered from most to least frequent.
# .items() is used instead of .iteritems() so this runs on Python 2 and 3.
sorted_dimensions_count = sorted(
    dimensions_count.items(), key=operator.itemgetter(1), reverse=True)

# Show the 15 most frequent labels. print(...) with a single argument behaves
# the same as a Python 2 print statement and as the Python 3 print function.
for (k, v) in sorted_dimensions_count[:15]:
    try:
        print('{0:<8} => {1}'.format(v, k))
    except UnicodeEncodeError:
        # The label cannot be encoded for output; show it with escape
        # sequences instead of silently dropping the row from the ranking.
        escaped_label = k.encode('unicode_escape').decode('ascii')
        print('{0:<8} => {1}'.format(v, escaped_label))
# Output of the cell above (the 15 most frequent dimension labels):
# 81092 => D
# 55980 => A
# 53529 => C
# 34607 => B
# 25398 => Geheele Provincie
# 18080 => C
# 16208 => Totaal
# 15699 => A
# 12074 => XVII. Fabricage van voedings- en genotmiddelen
# 11750 => XXI. Verkeerswezen
# 9738 => XX. Warenhandel. 1)
# 8456 => Kom
# 8340 => Huizen
# 8174 => Buiten kom
# 6886 => M
# Frequencies only, in the same descending order as sorted_dimensions_count,
# plotted against their rank on a logarithmic y axis.
values = [v for (k, v) in sorted_dimensions_count]
plt.semilogy(range(len(values)), values, 'ro')
if values:
    # A logarithmic axis cannot start at 0, so keep the lower bound chosen by
    # autoscaling and pin only the x range and the upper y bound (the largest
    # frequency, which is values[0] because the list is sorted descending).
    # The guard also avoids an IndexError when no dimensions were collected.
    y_bottom = plt.ylim()[0]
    plt.axis([0, len(values), y_bottom, values[0]])
plt.xlabel('Index of the dimension')
plt.ylabel('Frequency')
plt.show()