from SPARQLWrapper import SPARQLWrapper, JSON
from rdflib import ConjunctiveGraph, Namespace, Literal, RDF, RDFS, BNode, URIRef, XSD, Variable
import operator
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import urllib2, StringIO, csv
import re
%matplotlib inline

NG_TEMPLATE = 'http://lod.cedar-project.nl/resource/r1/CUBE'
END_POINT = 'http://lod.cedar-project.nl:8080/sparql/cedar'

# Load the cube names
url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/cubes.txt'
cubes = [cube.strip() for cube in StringIO.StringIO(urllib2.urlopen(url).read())]

# Load the list of occupation codes
url = 'https://raw.githubusercontent.com/cgueret/Harmonize/master/codes/occupation.csv'
reader = csv.reader(StringIO.StringIO(urllib2.urlopen(url).read()))
occupation_codes = set()
header_row = True
for row in reader:
    # Skip the header
    if header_row:
        header_row = False
        continue
    # Skip empty or malformed lines
    if len(row) != 2:
        continue
    try:
        occupation_codes.add(str(row[1].replace('http://cedar.example.org/ns#hisco-', '')))
    except:
        pass
print "Total number of codes %d" % len(occupation_codes)

# Load the list of target codes
target_codes = set()
url = 'https://raw.githubusercontent.com/cgueret/CEDAR-Analysis/master/modern_hisco_2.txt'
lines = [line.strip() for line in StringIO.StringIO(urllib2.urlopen(url).read())]
for line in lines:
    if line.startswith('HISCO'):
        continue
    if len(line) < 2:
        continue
    target = str(line.split(' ')[0])
    # Keep every occupation code that starts with this target prefix
    for occupation in occupation_codes:
        if occupation.startswith(target):
            target_codes.add(occupation)
print "Target codes %d" % len(target_codes)

# Query every cube for every target code and sum the population size per census year
observations = {}
for code in target_codes:
    for cube in cubes:
        named_graph = NG_TEMPLATE.replace('CUBE', cube)
        sparql = SPARQLWrapper(END_POINT)
        # NB: the angle-bracketed IRIs were lost in the original listing and are
        # reconstructed here: cedar: is inferred from the occupation IRIs loaded above,
        # qb: is the standard RDF Data Cube vocabulary, and GROUP BY ?year is added
        # so the aggregate query is valid SPARQL 1.1
        query = """
        prefix cedar: <http://cedar.example.org/ns#>
        prefix qb: <http://purl.org/linked-data/cube#>
        select (sum(?s) as ?total) ?year
        from <GRAPH>
        where {
            ?o a qb:Observation.
            ?o cedar:occupation cedar:hisco-CODE.
            ?o cedar:populationSize ?s.
            ?o qb:dataSet ?ds.
            ?ds a qb:DataSet.
            ?ds cedar:censusYear ?year.
        } group by ?year
        """.replace('GRAPH', named_graph).replace('CODE', code)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()["results"]["bindings"]
        for result in results:
            if 'total' in result and 'year' in result:
                total = int(result['total']['value'])
                year = str(result['year']['value'])
                observations.setdefault(year, {})
                observations[year].setdefault(code, 0)
                observations[year][code] += total
print observations

# Aggregate the counts over all the target codes, per census year
agg = {}
for (year, data) in observations.iteritems():
    for (code, count) in data.iteritems():
        agg.setdefault(year, 0)
        agg[year] += count
print agg

# Plot the total count per census year
data_years = []
data_counts = []
for year in sorted(agg.keys()):
    # Cast the years to integers so matplotlib treats them as numbers
    data_years.append(int(year))
    data_counts.append(agg[year])
    print year, agg[year]
plt.plot(data_years, data_counts, alpha=0.5)
plt.scatter(data_years, data_counts, alpha=0.5)
plt.show()

# Plot the count per census year for the codes starting with '21110'
data_years = []
data_counts = []
for year in sorted(observations.keys()):
    data = observations[year]
    total = 0
    for (code, count) in data.iteritems():
        if code.startswith('21110'):
            total += count
    data_years.append(int(year))
    data_counts.append(total)
    print year, total
plt.plot(data_years, data_counts, alpha=0.5)
plt.scatter(data_years, data_counts, alpha=0.5)
plt.show()
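
# Optional: a minimal sketch for persisting the harmonized counts so they can be
# reused outside this notebook. This is not part of the original analysis; the file
# name 'observations.csv' is an arbitrary choice, and the sketch only relies on the
# observations dictionary built above and the csv module already imported.
with open('observations.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['year', 'code', 'count'])
    for year in sorted(observations.keys()):
        for (code, count) in sorted(observations[year].iteritems()):
            writer.writerow([year, code, count])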