I'd really like to get all of my lab's experiments, and I think using a dedicated query language can make such operations easier.
Initial setup for this notebook.
The RDF library I've been using is Redland. The first JSON-LD library I found on PyPI was PyLD; however, it returns a structure that Redland doesn't directly read, so I needed to provide an adaptor. Some of the other JSON-LD language bindings may be a bit more convenient. There's a pure Python RDF package called RDFLib that has a JSON-LD parser.
import collections
from hashlib import md5
import json
import os
import netrc
from pprint import pformat
import requests
import RDF
from StringIO import StringIO
import sys
import time
import types
import urllib
from urlparse import urlsplit, urlunsplit, urljoin, parse_qs
# point to where I extracted PyLD
pyld = os.path.expanduser('~/src/PyLD-0.4.9/lib')
if pyld not in sys.path:
sys.path.append(pyld)
from pyld import jsonld
Store password outside of code being shared using a netrc file.
Keyring may be a better solution as it encrypts the password. However, it requires that you query by url and user id, and I was having trouble getting it to work with the KDE keychain.
# Host that every short URL in this notebook is resolved against.
ENCODE_HOST = 'submit.encodedcc.org'

# Credentials live in ~/.netrc so they never appear in the shared notebook.
# authenticators() returns None when the host has no entry (and there is no
# 'default' entry), so check before unpacking to get a readable error
# instead of "TypeError: 'NoneType' object is not iterable".
_credentials = netrc.netrc().authenticators(ENCODE_HOST)
if _credentials is None:
    raise RuntimeError('no netrc entry found for %s' % (ENCODE_HOST,))
# The middle field is the netrc "account" value, which is not used here.
USER_ID, _, PASSWD = _credentials
I got tired of typing long urls, so this tries to fix up a url fragment with defaults intelligent enough for this document.
def prepare_url(request_url, **kwargs):
    '''Expand a URL fragment into a full URL for the ENCODE submit server.

    Missing pieces of request_url are filled in with defaults:

    * the scheme defaults to http
    * the host defaults to ENCODE_HOST
    * keyword arguments are merged into the query string; a parameter
      already present in request_url wins over a keyword argument of
      the same name

    This allows fairly flexible urls. e.g.

      prepare_url('/experiments/ENCSR000AEG')
      prepare_url('submit.encodedcc.org/experiments/ENCSR000AEG')

    both return 'http://submit.encodedcc.org/experiments/ENCSR000AEG', and

      prepare_url('/experiments/ENCSR000AEG', limit='all')
      prepare_url('http://submit.encodedcc.org/experiments/ENCSR000AEG?limit=all')

    both return that url with '?limit=all' appended.

    Note that this function does not add limit=all on its own.
    '''
    parts = urlsplit(request_url)
    scheme = parts.scheme or 'http'
    netloc = parts.netloc
    path = parts.path

    if not netloc:
        # Without a scheme urlsplit cannot recognise a leading host name, so
        # 'submit.encodedcc.org/foo' arrives entirely in the path. Strip the
        # host so it is not repeated inside the path.
        if path == ENCODE_HOST or path.startswith(ENCODE_HOST + '/'):
            path = path[len(ENCODE_HOST):]
        netloc = ENCODE_HOST

    # Work on a copy so the caller's keyword dictionary is left alone.
    params = dict(kwargs)
    if parts.query:
        params.update(parse_qs(parts.query, keep_blank_values=True))

    # parse_qs yields lists of values, so doseq is needed to turn them back
    # into repeated key=value pairs. Sorting keeps the url stable, which
    # matters when it is later hashed into a cache file name.
    query = urllib.urlencode(sorted(params.items()), doseq=True)

    return urlunsplit((scheme, netloc, path, query, parts.fragment))
Just a little bit more than the DCC's example for accessing objects at submit.
def get_ENCODE(obj_id, **kwargs):
'''GET an ENCODE object as JSON and return as dict
Uses prepare_url to allow url short-cuts
This will also encode additional keyword arguments in the query string.
'''
if len(kwargs) == 0:
kwargs['limit'] = 'all'
url = prepare_url(obj_id, **kwargs)
print 'requesting:', url
# do the request
headers = {'content-type': 'application/json'}
response = requests.get(url, auth=(USER_ID, PASSWD), headers=headers)
if not response.status_code == requests.codes.ok:
print >> sys.stderr, response.text
response.raise_for_status()
return response.json()
# Fetch a single lab record as a quick check of get_ENCODE; the decoded
# JSON is kept in test_req.
test_req = get_ENCODE('http://submit.encodedcc.org/labs/barbara-wold/')
requesting: http://submit.encodedcc.org/labs/barbara-wold/?limit=all
The ENCODE3 submit tool encoded returns a JSON structure that's been partially formatted as JSON-LD, in that many objects have @id
and @type
attributes. However by using the JSON-LD Context we can clean up the structure a bit more.
My entry point for importing ENCODE3 objects is load_LD_ENCODE
def load_LD_ENCODE(model, url, contexts, **kwargs):
"""Load an encode json object at url into our model.
This Uses the provided context dictionary to improve the json to json-ld.
Additional keyword arguments can also be provided and will be added
to the request's query string
"""
resource = prepare_url(url)
print 'resource:', resource
# the encode site kept 500-ing on me. so lets throw in a quick and dirty cache
resource_cache = 'cache/' + md5(resource).hexdigest()+'.cache.json'
if os.path.exists(resource_cache):
with open(resource_cache, 'r') as instream:
data = json.loads(instream.read())
else:
data = get_ENCODE(resource, **kwargs)
with open(resource_cache, 'w') as outstream:
outstream.write(json.dumps(data))
addContextToEncodedTree(data, contexts, base=resource)
loadJSONintoModel(model, data)
def addContextToEncodedTree(tree, contexts, base=None):
    """Add contexts to various objects in the tree.

    tree is a json tree returned from the DCC's encoded database.
    It is modified in place.

    contexts is a dictionary of dictionaries containing contexts
    for the various possible encoded classes. contexts[None] is the
    default context attached to the root of the tree. The contexts
    dictionary itself is not modified.

    base, if supplied allows setting the base url that relative
    urls will be resolved against.
    """
    # Copy the default context. The root receives a per-document '@base'
    # (and possibly class specific terms), which must not leak into the
    # shared contexts dictionary and from there into later documents.
    root_context = dict(contexts[None])
    if base:
        root_context['@base'] = base
    tree['@context'] = root_context

    # Give the recursive helper its own view of the defaults so it can look
    # up the document's '@base' without aliasing the root's context.
    scoped_contexts = dict(contexts)
    scoped_contexts[None] = dict(root_context)
    addContextToEncodedChild(tree, scoped_contexts)


def addContextToEncodedChild(obj, contexts):
    '''Add JSON-LD context to the encoded JSON.

    obj is any value from the decoded json: strings and other scalars
    are ignored, lists and dictionaries are walked recursively, and
    every dictionary that isEncodedObject() accepts gets a '@context'
    entry added or updated in place.

    contexts is the dictionary of contexts; contexts[None]['@base'],
    when present, is the url the object ids are resolved against.

    This is recursive because some of the IDs were relative URLs
    and I needed a way to properly compute the correct base URL.
    '''
    # pretend strings aren't iterable
    if isinstance(obj, basestring):
        return

    # recurse on container types
    if isinstance(obj, collections.Sequence):
        for child in obj:
            addContextToEncodedChild(child, contexts)
        return

    if not isinstance(obj, collections.Mapping):
        return

    for key, child in obj.items():
        # A context is metadata, not data. Its term definitions may carry
        # both '@id' and '@type' and would be mistaken for encoded objects.
        if key == '@context':
            continue
        addContextToEncodedChild(child, contexts)

    # we have an object. attach a context to it.
    if not isEncodedObject(obj):
        return

    context = {}
    default_base = contexts[None].get('@base')
    if default_base:
        context['@base'] = urljoin(default_base, obj['@id'])

    object_types = obj['@type']
    if isinstance(object_types, basestring):
        # a lone type name; don't iterate over its characters
        object_types = [object_types]
    for object_type in object_types:
        if object_type in contexts:
            context.update(contexts[object_type])

    if context:
        obj.setdefault('@context', {}).update(context)


def isEncodedObject(obj):
    '''Test to see if an object is a JSON-LD object

    Returns True only for dictionaries that have both an '@id' and
    an '@type' key.

    Some of the nested dictionaries lack the @id or @type
    information necessary to convert them.
    '''
    # Only mappings can carry keys. For strings and lists the 'in' operator
    # would test substrings or elements instead.
    if not isinstance(obj, collections.Mapping):
        return False
    return '@id' in obj and '@type' in obj
def loadJSONintoModel(model, json_data):
    '''Convert a JSON-LD document to triples and add them to the model.

    json_data is handed to PyLD's jsonld.to_rdf; every triple of every
    graph in the result is converted with pyldToNode and added to model
    as a Redland statement. The graph name is not recorded with the
    statement.
    '''
    dataset = jsonld.to_rdf(json_data)
    for graph_name in dataset:
        for triple in dataset[graph_name]:
            statement = RDF.Statement(
                pyldToNode(triple['subject']),
                pyldToNode(triple['predicate']),
                pyldToNode(triple['object']))
            model.add_statement(statement)
def pyldToNode(item):
    '''Convert a PyLD node to a Redland node

    item is one of the subject/predicate/object dictionaries found in
    the output of jsonld.to_rdf. Its 'type' is 'blank node', 'IRI', or
    anything else, which is treated as a literal. A literal may also
    carry a 'language' or a 'datatype'.

    Returns the equivalent RDF.Node.
    '''
    nodetype = item['type']
    value = item['value']

    if nodetype == 'blank node':
        return RDF.Node(blank=value)

    if nodetype == 'IRI':
        # str() would raise on an IRI containing non-ascii characters
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        return RDF.Node(uri_string=str(value))

    literal = unicode(value).encode('utf-8')

    # A literal is either language tagged or datatyped, never both, so a
    # language tag takes precedence over the rdf:langString datatype that
    # PyLD reports alongside it.
    language = item.get('language')
    if language:
        return RDF.Node(literal=literal, language=str(language))

    datatype = item.get('datatype', None)
    if datatype:
        return RDF.Node(literal=literal, datatype=RDF.Uri(datatype))

    # no datatype given: a plain literal, rather than building a Uri from None
    return RDF.Node(literal=literal)
Note: the contexts[None]
entry is the default context that will be attached to the root of the tree
# JSON-LD contexts keyed by encoded class name (the values found in an
# object's @type list). Each value is a JSON-LD context fragment that is
# merged into the @context of every object of that class.
contexts = {
    # The None context will get added to the root of the tree and will
    # provide common defaults.
    None: {
        # give this context a default descriptive url.
        # I wish encoded's rendering supported fragment linking
        '@vocab': 'http://submit.encodedcc.org/profiles/experiment.json#',
        # terms in multiple encoded objects
        "lab": { "@type": "@id" },
        "pi": { "@type": "@id" },
        # NOTE(review): the RDF namespace has no 'description' property;
        # dc:description may have been intended. Left unchanged because it
        # determines the predicate IRI that ends up in the model.
        "description": "rdf:description",
        'href': { '@type': '@id' },
        'url': { '@type': '@id' },
    },
    # Identify and markup contained classes.
    # e.g. in the tree there was a sub-dictionary named 'biosample'
    # That dictionary had a term 'biosample_term_id, which is the
    # term that should be used as the @id.
    'biosample': {
        'biosample_term_id': { '@type': '@id' },
    },
    'experiment': {
        "assay_term_id": { "@type": "@id" },
    },
    # I tried to use the JSON-LD mapping capabilities to convert the lab
    # contact information into a vcard record, but the encoded model
    # didn't lend itself well to the vcard schema
    #'lab': {
    #    "address1": "vcard:street-address",
    #    "address2": "vcard:street-address",
    #    "city": "vcard:locality",
    #    "state": "vcard:region",
    #    "country": "vcard:country"
    #},
    'human_donor': {
        'award': { '@type': '@id' },
    },
    'library': {
        'award': { '@type': '@id' },
        'nucleic_acid_term_id': { '@type': '@id' }
    }
}

# Prefix -> namespace IRI. These are merged into the default context below.
namespaces = {
    # JSON-LD lets you define namespaces so you can use the shortened url syntax.
    # (instead of http://www.w3.org/2000/01/rdf-schema#label you can do
    # rdfs:label)
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "owl": "http://www.w3.org/2002/07/owl#",
    # was misspelled 'htp://', which would expand dc: terms to unusable IRIs
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "vcard": "http://www.w3.org/2006/vcard/ns#",

    # for some namespaces I made a best guess for the ontology root.
    "EFO": "http://www.ebi.ac.uk/efo/", # EFO ontology
    "OBO": "http://purl.obolibrary.org/obo/", # OBO ontology
    "OBI": "http://purl.obolibrary.org/obo/OBI_", # Ontology for Biomedical Investigations
    # OBI: available from http://svn.code.sf.net/p/obi/code/releases/2012-07-01/merged/merged-obi-comments.owl
    'SO': 'http://purl.obolibrary.org/obo/SO_', # Sequence ontology
    # SO: available from http://www.berkeleybop.org/ontologies/so.owl

    # make a fake shortening for this ontology
    'encode3exp': 'http://submit.encodedcc.org/profiles/experiment.json#'
}

# make every prefix available from the root of each loaded document
contexts[None].update(namespaces)
For this notebook I'm just creating a memory model, Redland supports a variety of other storage types
I recently did some performance testing, and learned that the RDF.HashStorage is quite fast as long as you don't use named graphs. At least with Redland librdf 1.0.16 something goes terribly slowly if you enable the context graph. (E.g. loading the 30k triples from the experiment summary takes ~1.5 seconds if you use the default hash, and ~1500 seconds if you turn on the context graph).
# Build an in-memory Redland model to hold the triples.
storage = RDF.MemoryStorage()
model = RDF.Model(storage)
# Seed the model with the experiment collection listing.
load_LD_ENCODE(model, '/experiments/', contexts)
resource: http://submit.encodedcc.org/experiments/
Let's look for possibly useful terms in the experiment collection.
First let's define a short helper function to display the result of running an RDF.SPARQLQuery
def query_model(model, query):
'''Execute a sparql query on the model.
The namespace dictionary provides default shortend urls
in the query.
'''
q = RDF.SPARQLQuery(query)
for i, row in enumerate(q.execute(model)):
if i == 0:
print '\t'.join(row.keys())
print '\t'.join(['-' * len(k) for k in row]) # should make a seperator line
print '\t'.join((str(row[k]) for k in row))
When exploring a new RDF Model, There's a few queries I like to do to see what's available.
First off is the ?s a ?type
query. The keyword a
is meant to be read as is-a
, so the following query finds all the object classes that were defined in this model.
Distinct is a useful option that suppresses duplicates. The query below is also finding all the possible object IDs but isn't reporting them. However without the distinct we'd still end up with every ?type
triple in the dataset.
# List each distinct class (rdf:type value) that some subject in the model has.
query_model(model, '''
select distinct ?type
where {
?s a ?type
}
''')
type ---- http://submit.encodedcc.org/profiles/experiment.json#experiment http://submit.encodedcc.org/profiles/experiment.json#item http://submit.encodedcc.org/profiles/experiment.json#experiment_collection http://submit.encodedcc.org/profiles/experiment.json#collection
/usr/lib/python2.7/dist-packages/RDF.py:1995: RedlandWarning: Variable s was bound but is unused in the query results = Redland.librdf_query_execute(self._query,model._model)
The next query is to see what object properties are defined.
# List each distinct predicate that appears in any statement of the model.
query_model(model, '''
select distinct ?p
where {
?s ?p ?o
}
''')
p - http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://submit.encodedcc.org/profiles/experiment.json#accession http://submit.encodedcc.org/profiles/experiment.json#assay_term_name http://submit.encodedcc.org/profiles/experiment.json#award.rfa http://submit.encodedcc.org/profiles/experiment.json#biosample_term_name http://submit.encodedcc.org/profiles/experiment.json#files.length http://submit.encodedcc.org/profiles/experiment.json#lab.title http://submit.encodedcc.org/profiles/experiment.json#replicates.length http://submit.encodedcc.org/profiles/experiment.json#target.label http://submit.encodedcc.org/profiles/experiment.json#condition http://submit.encodedcc.org/profiles/experiment.json#href http://submit.encodedcc.org/profiles/experiment.json#method http://submit.encodedcc.org/profiles/experiment.json#name http://submit.encodedcc.org/profiles/experiment.json#profile http://submit.encodedcc.org/profiles/experiment.json#title http://submit.encodedcc.org/profiles/experiment.json#actions http://submit.encodedcc.org/profiles/experiment.json#columns http://www.w3.org/1999/02/22-rdf-syntax-ns#description
/usr/lib/python2.7/dist-packages/RDF.py:1995: RedlandWarning: Variable o was bound but is unused in the query results = Redland.librdf_query_execute(self._query,model._model)
Since my goal was to try and find what experiments were associated with my lab, the #lab.title term looks promising, so let's examine those.
# List the distinct lab.title values in the model, sorted by title.
query_model(model, '''
select distinct ?lab
where {
?s <http://submit.encodedcc.org/profiles/experiment.json#lab.title> ?lab
}
order by ?lab''')
lab --- Ali Mortazavi, UCI Barbara Wold, Caltech Bing Ren, UCSD Bradley Bernstein, Broad Brenton Graveley, UConn David Gilbert, FSU Gregory Crawford, Duke J. Michael Cherry, Stanford Jason Lieb, UNC Job Dekker, UMass John Stamatoyannopoulos, UW Kevin Struhl, HMS Kevin White, UChicago Lab Michael Snyder, Stanford Peggy Farnham, USC Piero Carninci, RIKEN Richard Myers, HAIB Ross Hardison, PennState Scott Tenenbaum, SUNY-Albany Sherman Weissman, Yale Thomas Gingeras, CSHL Vishwanath Iyer, UTA Yijun Ruan, GIS
Now we're getting somewhere; let's get all the experiment ids for my lab. At least with the Redland query engine I need to use a FILTER
with the regex
function to limit the records returned.
# Show every subject whose lab.title matches "barbara wold"; the "i" flag
# makes the regex case-insensitive.
query_model(model, '''
select ?experiment ?lab
where {
?experiment <http://submit.encodedcc.org/profiles/experiment.json#lab.title> ?lab.
filter(regex(?lab, "barbara wold", "i"))
}
''')
experiment lab ---------- --- http://submit.encodedcc.org/experiments/ENCSR000AEG/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AEH/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AEP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AEQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHL/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHM/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHN/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHO/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHR/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHS/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHT/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHU/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHV/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHW/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHX/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHY/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AHZ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIA/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIB/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIC/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AID/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIE/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIF/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIG/ Barbara Wold, Caltech 
http://submit.encodedcc.org/experiments/ENCSR000AIH/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AII/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIJ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIK/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIL/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIM/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIN/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIO/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIR/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIS/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIT/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIU/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIV/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIW/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIX/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIY/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AIZ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJA/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJB/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJC/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJD/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJE/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJF/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJG/ Barbara Wold, Caltech 
http://submit.encodedcc.org/experiments/ENCSR000AJH/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJN/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJO/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJR/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJS/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000AJT/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWK/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWL/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWM/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWN/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWO/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000CWR/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYN/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYO/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYP/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYQ/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYR/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYS/ Barbara Wold, Caltech http://submit.encodedcc.org/experiments/ENCSR000EYT/ Barbara Wold, Caltech
OK, so that's my lab's data; let's import it.
Also this is the kind of operation that makes webmasters sad, as I'm loading a bunch of large objects as fast as their webserver will hand them to me. Because there was a burst of 500-errors when I first started doing this, I added a super simple cache algorithm to the loader.
I wonder what a good solution for caching results and checking to see if they're still fresh is.
# Find every experiment whose lab.title matches "barbara wold"
# (case-insensitively) and load its full record into the model.
q = RDF.SPARQLQuery('''
select ?experiment
where {
?experiment <http://submit.encodedcc.org/profiles/experiment.json#lab.title> ?lab.
filter(regex(?lab, "barbara wold", "i"))
}''')
# Collect the urls first: load_LD_ENCODE adds statements to the model, and
# the query results should not be iterated while the model they were
# computed from is being modified.
experiment_urls = [str(row['experiment']) for row in q.execute(model)]
for experiment_url in experiment_urls:
    load_LD_ENCODE(model, experiment_url, contexts)
resource: http://submit.encodedcc.org/experiments/ENCSR000AEG/ resource: http://submit.encodedcc.org/experiments/ENCSR000AEH/ resource: http://submit.encodedcc.org/experiments/ENCSR000AEP/ resource: http://submit.encodedcc.org/experiments/ENCSR000AEQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHL/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHM/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHN/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHO/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHP/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHR/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHS/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHT/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHU/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHV/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHW/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHX/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHY/ resource: http://submit.encodedcc.org/experiments/ENCSR000AHZ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIA/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIB/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIC/ resource: http://submit.encodedcc.org/experiments/ENCSR000AID/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIE/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIF/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIG/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIH/ resource: http://submit.encodedcc.org/experiments/ENCSR000AII/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIJ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIK/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIL/ resource: 
http://submit.encodedcc.org/experiments/ENCSR000AIM/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIN/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIO/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIP/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIR/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIS/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIT/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIU/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIV/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIW/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIX/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIY/ resource: http://submit.encodedcc.org/experiments/ENCSR000AIZ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJA/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJB/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJC/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJD/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJE/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJF/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJG/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJH/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJN/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJO/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJP/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJR/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJS/ resource: http://submit.encodedcc.org/experiments/ENCSR000AJT/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWK/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWL/ resource: 
http://submit.encodedcc.org/experiments/ENCSR000CWM/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWN/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWO/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWP/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000CWR/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYN/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYO/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYP/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYQ/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYR/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYS/ resource: http://submit.encodedcc.org/experiments/ENCSR000EYT/
So what's in our model now?
Get all the classes...
# List each distinct class in the model again, this time sorted.
query_model(model, '''
select distinct ?type
where {
?s a ?type
}
order by ?type
''')
type ---- http://submit.encodedcc.org/profiles/experiment.json#antibody_lot http://submit.encodedcc.org/profiles/experiment.json#award http://submit.encodedcc.org/profiles/experiment.json#biosample http://submit.encodedcc.org/profiles/experiment.json#collection http://submit.encodedcc.org/profiles/experiment.json#document http://submit.encodedcc.org/profiles/experiment.json#donor http://submit.encodedcc.org/profiles/experiment.json#experiment http://submit.encodedcc.org/profiles/experiment.json#experiment_collection http://submit.encodedcc.org/profiles/experiment.json#file http://submit.encodedcc.org/profiles/experiment.json#human_donor http://submit.encodedcc.org/profiles/experiment.json#item http://submit.encodedcc.org/profiles/experiment.json#lab http://submit.encodedcc.org/profiles/experiment.json#library http://submit.encodedcc.org/profiles/experiment.json#mouse_donor http://submit.encodedcc.org/profiles/experiment.json#organism http://submit.encodedcc.org/profiles/experiment.json#platform http://submit.encodedcc.org/profiles/experiment.json#replicate http://submit.encodedcc.org/profiles/experiment.json#source http://submit.encodedcc.org/profiles/experiment.json#target http://submit.encodedcc.org/profiles/experiment.json#user
And get all the properties
# List every distinct predicate used by any triple in the model, sorted by IRI.
query_model(model, '''
select distinct ?p
where {
?s ?p ?o
}
order by ?p
''')
p - http://submit.encodedcc.org/profiles/experiment.json#accession http://submit.encodedcc.org/profiles/experiment.json#actions http://submit.encodedcc.org/profiles/experiment.json#address1 http://submit.encodedcc.org/profiles/experiment.json#address2 http://submit.encodedcc.org/profiles/experiment.json#aliases http://submit.encodedcc.org/profiles/experiment.json#antibody http://submit.encodedcc.org/profiles/experiment.json#antigen_description http://submit.encodedcc.org/profiles/experiment.json#antigen_sequence http://submit.encodedcc.org/profiles/experiment.json#assay_term_id http://submit.encodedcc.org/profiles/experiment.json#assay_term_name http://submit.encodedcc.org/profiles/experiment.json#assembly http://submit.encodedcc.org/profiles/experiment.json#attachment http://submit.encodedcc.org/profiles/experiment.json#award http://submit.encodedcc.org/profiles/experiment.json#award.rfa http://submit.encodedcc.org/profiles/experiment.json#awards http://submit.encodedcc.org/profiles/experiment.json#biological_replicate_number http://submit.encodedcc.org/profiles/experiment.json#biosample http://submit.encodedcc.org/profiles/experiment.json#biosample_term_id http://submit.encodedcc.org/profiles/experiment.json#biosample_term_name http://submit.encodedcc.org/profiles/experiment.json#biosample_type http://submit.encodedcc.org/profiles/experiment.json#characterizations http://submit.encodedcc.org/profiles/experiment.json#city http://submit.encodedcc.org/profiles/experiment.json#clonality http://submit.encodedcc.org/profiles/experiment.json#columns http://submit.encodedcc.org/profiles/experiment.json#condition http://submit.encodedcc.org/profiles/experiment.json#country http://submit.encodedcc.org/profiles/experiment.json#culture_harvest_date http://submit.encodedcc.org/profiles/experiment.json#culture_start_date http://submit.encodedcc.org/profiles/experiment.json#dataset http://submit.encodedcc.org/profiles/experiment.json#dataset_type 
http://submit.encodedcc.org/profiles/experiment.json#date_created http://submit.encodedcc.org/profiles/experiment.json#dbxref http://submit.encodedcc.org/profiles/experiment.json#derived_from http://submit.encodedcc.org/profiles/experiment.json#document_type http://submit.encodedcc.org/profiles/experiment.json#documents http://submit.encodedcc.org/profiles/experiment.json#donor http://submit.encodedcc.org/profiles/experiment.json#download http://submit.encodedcc.org/profiles/experiment.json#download_path http://submit.encodedcc.org/profiles/experiment.json#email http://submit.encodedcc.org/profiles/experiment.json#encode2_dbxrefs http://submit.encodedcc.org/profiles/experiment.json#end_date http://submit.encodedcc.org/profiles/experiment.json#ethnicity http://submit.encodedcc.org/profiles/experiment.json#experiment http://submit.encodedcc.org/profiles/experiment.json#extraction_method http://submit.encodedcc.org/profiles/experiment.json#fax http://submit.encodedcc.org/profiles/experiment.json#file_format http://submit.encodedcc.org/profiles/experiment.json#files http://submit.encodedcc.org/profiles/experiment.json#files.length http://submit.encodedcc.org/profiles/experiment.json#first_name http://submit.encodedcc.org/profiles/experiment.json#flowcell http://submit.encodedcc.org/profiles/experiment.json#flowcell_details http://submit.encodedcc.org/profiles/experiment.json#fragmentation_method http://submit.encodedcc.org/profiles/experiment.json#gene_name http://submit.encodedcc.org/profiles/experiment.json#geo_dbxrefs http://submit.encodedcc.org/profiles/experiment.json#google http://submit.encodedcc.org/profiles/experiment.json#health_status http://submit.encodedcc.org/profiles/experiment.json#host_organism http://submit.encodedcc.org/profiles/experiment.json#href http://submit.encodedcc.org/profiles/experiment.json#institute_label http://submit.encodedcc.org/profiles/experiment.json#institute_name http://submit.encodedcc.org/profiles/experiment.json#isotype 
http://submit.encodedcc.org/profiles/experiment.json#job_title http://submit.encodedcc.org/profiles/experiment.json#lab http://submit.encodedcc.org/profiles/experiment.json#lab.title http://submit.encodedcc.org/profiles/experiment.json#label http://submit.encodedcc.org/profiles/experiment.json#lane http://submit.encodedcc.org/profiles/experiment.json#last_name http://submit.encodedcc.org/profiles/experiment.json#library http://submit.encodedcc.org/profiles/experiment.json#library_size_selection_method http://submit.encodedcc.org/profiles/experiment.json#life_stage http://submit.encodedcc.org/profiles/experiment.json#lot_id http://submit.encodedcc.org/profiles/experiment.json#lot_id_alias http://submit.encodedcc.org/profiles/experiment.json#lysis_method http://submit.encodedcc.org/profiles/experiment.json#machine http://submit.encodedcc.org/profiles/experiment.json#md5sum http://submit.encodedcc.org/profiles/experiment.json#method http://submit.encodedcc.org/profiles/experiment.json#name http://submit.encodedcc.org/profiles/experiment.json#note http://submit.encodedcc.org/profiles/experiment.json#nucleic_acid_term_id http://submit.encodedcc.org/profiles/experiment.json#nucleic_acid_term_name http://submit.encodedcc.org/profiles/experiment.json#organism http://submit.encodedcc.org/profiles/experiment.json#output_type http://submit.encodedcc.org/profiles/experiment.json#paired_ended http://submit.encodedcc.org/profiles/experiment.json#passage_number http://submit.encodedcc.org/profiles/experiment.json#phone1 http://submit.encodedcc.org/profiles/experiment.json#phone2 http://submit.encodedcc.org/profiles/experiment.json#pi http://submit.encodedcc.org/profiles/experiment.json#platform http://submit.encodedcc.org/profiles/experiment.json#possible_controls http://submit.encodedcc.org/profiles/experiment.json#postal_code http://submit.encodedcc.org/profiles/experiment.json#product_id http://submit.encodedcc.org/profiles/experiment.json#profile 
http://submit.encodedcc.org/profiles/experiment.json#project http://submit.encodedcc.org/profiles/experiment.json#protocol_documents http://submit.encodedcc.org/profiles/experiment.json#purifications http://submit.encodedcc.org/profiles/experiment.json#replicate http://submit.encodedcc.org/profiles/experiment.json#replicates http://submit.encodedcc.org/profiles/experiment.json#replicates.length http://submit.encodedcc.org/profiles/experiment.json#rfa http://submit.encodedcc.org/profiles/experiment.json#schema_version http://submit.encodedcc.org/profiles/experiment.json#scientific_name http://submit.encodedcc.org/profiles/experiment.json#sex http://submit.encodedcc.org/profiles/experiment.json#size_range http://submit.encodedcc.org/profiles/experiment.json#source http://submit.encodedcc.org/profiles/experiment.json#start_date http://submit.encodedcc.org/profiles/experiment.json#starting_amount http://submit.encodedcc.org/profiles/experiment.json#starting_amount_units http://submit.encodedcc.org/profiles/experiment.json#state http://submit.encodedcc.org/profiles/experiment.json#status http://submit.encodedcc.org/profiles/experiment.json#strain_background http://submit.encodedcc.org/profiles/experiment.json#strand_specificity http://submit.encodedcc.org/profiles/experiment.json#submits_for http://submit.encodedcc.org/profiles/experiment.json#submitted_by http://submit.encodedcc.org/profiles/experiment.json#submitted_file_name http://submit.encodedcc.org/profiles/experiment.json#target http://submit.encodedcc.org/profiles/experiment.json#target.label http://submit.encodedcc.org/profiles/experiment.json#taxon_id http://submit.encodedcc.org/profiles/experiment.json#technical_replicate_number http://submit.encodedcc.org/profiles/experiment.json#term_id http://submit.encodedcc.org/profiles/experiment.json#term_name http://submit.encodedcc.org/profiles/experiment.json#timezone http://submit.encodedcc.org/profiles/experiment.json#title 
http://submit.encodedcc.org/profiles/experiment.json#type http://submit.encodedcc.org/profiles/experiment.json#url http://submit.encodedcc.org/profiles/experiment.json#urls http://submit.encodedcc.org/profiles/experiment.json#uuid http://www.w3.org/1999/02/22-rdf-syntax-ns#description http://www.w3.org/1999/02/22-rdf-syntax-ns#type
# Show up to 50 biosamples together with their description text.
# The description predicate lives in the 22-rdf-syntax-ns namespace, which is
# conventionally abbreviated `rdf` (`rdfs` normally denotes rdf-schema), so the
# prefix is named accordingly. The expanded IRI is unchanged.
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select ?s ?description
where {
?s rdf:description ?description ;
a encode:biosample .
}
limit 50
''')
s description - ----------- http://submit.encodedcc.org/biosamples/ENCBS089RNA/ B-lymphocyte, lymphoblastoid, International HapMap Project - CEPH/Utah - European Caucasion, Epstein-Barr Virus http://submit.encodedcc.org/biosamples/ENCBS090RNA/ B-lymphocyte, lymphoblastoid, International HapMap Project - CEPH/Utah - European Caucasion, Epstein-Barr Virus http://submit.encodedcc.org/biosamples/ENCBS087RNA/ The continuous cell line K-562 was established by Lozzio and Lozzio from the pleural effusion of a 53-year-old female with chronic myelogenous leukemia in terminal blast crises. ENCODE3 RNA-seq evaluation replicate 1. http://submit.encodedcc.org/biosamples/ENCBS088RNA/ The continuous cell line K-562 was established by Lozzio and Lozzio from the pleural effusion of a 53-year-old female with chronic myelogenous leukemia in terminal blast crises. ENCODE3 RNA-seq evaluation replicate 2. http://submit.encodedcc.org/biosamples/ENCBS124ENC/ Myoblast cell line derived from thigh muscle of C3H mice after crush injury http://submit.encodedcc.org/biosamples/ENCBS127ENC/ Myoblast cell line derived from thigh muscle of C3H mice after crush injury; differentiated from C2C12 cells for 60 hours http://submit.encodedcc.org/biosamples/ENCBS125ENC/ Myoblast cell line derived from thigh muscle of C3H mice after crush injury; differentiated from C2C12 cells for 24 hours http://submit.encodedcc.org/biosamples/ENCBS034ENC/ Multipotential cell line that can be converted by 5-azacytidine into three mesodermal stem cell lineages. http://submit.encodedcc.org/biosamples/ENCBS035ENC/ As a control, this multipotential cell line was treated with a differentiation protocol that will not induce these cells to differentiate into three mesodermal stem cell lineages. 
http://submit.encodedcc.org/biosamples/ENCBS126ENC/ Myoblast cell line derived from thigh muscle of C3H mice after crush injury; differentiated from C2C12 cells for 5 days http://submit.encodedcc.org/biosamples/ENCBS128ENC/ Myoblast cell line derived from thigh muscle of C3H mice after crush injury; differentiated from C2C12 cells for 7 days
# Collect the distinct nucleic_acid_term_id values attached to any subject
# (at most 50 rows).
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdfs: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select distinct ?term
where {
?s encode:nucleic_acid_term_id ?term .
}
limit 50
''')
term ---- http://purl.obolibrary.org/obo/SO_0000356 http://purl.obolibrary.org/obo/SO_0000871 http://purl.obolibrary.org/obo/SO_0000352
OK, so far all of that could plausibly have been done with a relational model.
So, let's try something harder. Let's load another ontology into our model.
# Fetch so.owl and add its triples to the existing model so that later
# queries can join ENCODE term ids against the ontology.
model.load('http://www.berkeleybop.org/ontologies/so.owl')
True
OK, now that we've loaded so.owl, let's do a query that takes advantage of it.
I manually looked at terms in so.owl, and noticed IAO_0000115 looked useful. Attempting to look the term up suggests it means "Definition".
# Join each nucleic_acid_term_id used in the ENCODE data against its
# IAO_0000115 annotation, which is only available once so.owl is loaded.
# NOTE(review): rows are ordered by ?s, which is not selected, and the
# rdfs/SO prefixes are declared but never used in the pattern.
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdfs: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix SO: <http://purl.obolibrary.org/obo/SO_>
select distinct ?term ?iao
where {
?s encode:nucleic_acid_term_id ?term .
?term <http://purl.obolibrary.org/obo/IAO_0000115> ?iao .
}
order by ?s
limit 50
''')
term iao ---- --- http://purl.obolibrary.org/obo/SO_0000356 An attribute describing a sequence consisting of nucleobases bound to a repeating unit made of a D-ribose ring connected to a phosphate backbone. http://purl.obolibrary.org/obo/SO_0000871 An mRNA that is polyadenylated. http://purl.obolibrary.org/obo/SO_0000352 An attribute describing a sequence consisting of nucleobases bound to a repeating unit made of a 2-deoxy-D-ribose ring connected to a phosphate backbone.
Let's try another case.
It looks like I didn't mark up all the terms correctly for them to be converted into links. Also, when I tried to load the EFO, my browser started to run my laptop out of memory. So grabbing more information from this query may be challenging, and may require a database-backed triple store.
# List the distinct biosample_term_id values found in the model (at most 50).
# NOTE(review): rows are ordered by ?s, which is not part of the selection.
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdfs: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select distinct ?term
where {
?s encode:biosample_term_id ?term .
}
order by ?s
limit 50
''')
term ---- BTO:0003166 http://www.ebi.ac.uk/efo/0002067 http://www.ebi.ac.uk/efo/0002784 http://www.ebi.ac.uk/efo/0001098 NTR:0000710 EFO:0002784 EFO:0002067 EFO:0001098 NTR:0000710 BTO:0003166 EFO:0000322 EFO:0002786 EFO:0002824 CL:0000515 BTO:0005046 BTO:0000093 CL:0002553 EFO:0003042 EFO:0001185 EFO:0001187 CL:0002618 CL:0000312
Unfortunately this query is rather slow in Redland. Also, I probably need to adjust the download_path context so it's an absolute URL.
# For each subject that has files, list its accession together with each
# file and that file's download path (first 10 rows, sorted by accession).
# ?format is bound only to require that the file carries a file_format;
# Redland warns that it is otherwise unused.
# The sort key was previously misspelled (?exp_acession), so it referred to an
# unbound variable and the ordering had no effect.
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdfs: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select ?exp_accession ?file ?path
where {
?s encode:files ?file ;
encode:accession ?exp_accession .
?file encode:file_format ?format ;
encode:download_path ?path .
}
order by ?exp_accession
limit 10
''')
exp_accession file path ------------- ---- ---- ENCSR000AEG http://submit.encodedcc.org/files/ENCFF001RRR/ 2013/7/22/ENCFF001RRR.fastq.gz ENCSR000AEG http://submit.encodedcc.org/files/ENCFF001RRN/ 2013/7/22/ENCFF001RRN.fastq.gz ENCSR000AEH http://submit.encodedcc.org/files/ENCFF001RRJ/ 2013/7/22/ENCFF001RRJ.fastq.gz ENCSR000AEH http://submit.encodedcc.org/files/ENCFF001RRK/ 2013/7/22/ENCFF001RRK.fastq.gz ENCSR000AEH http://submit.encodedcc.org/files/ENCFF001RRI/ 2013/7/22/ENCFF001RRI.fastq.gz ENCSR000AEH http://submit.encodedcc.org/files/ENCFF001RRL/ 2013/7/22/ENCFF001RRL.fastq.gz ENCSR000AEP http://submit.encodedcc.org/files/ENCFF001RQW/ 2013/7/22/ENCFF001RQW.fastq.gz ENCSR000AEP http://submit.encodedcc.org/files/ENCFF001RRG/ 2013/7/22/ENCFF001RRG.fastq.gz ENCSR000AEP http://submit.encodedcc.org/files/ENCFF001RQT/ 2013/7/22/ENCFF001RQT.fastq.gz ENCSR000AEP http://submit.encodedcc.org/files/ENCFF001RQX/ 2013/7/22/ENCFF001RQX.fastq.gz
/usr/lib/python2.7/dist-packages/RDF.py:1995: RedlandWarning: Variable format was bound but is unused in the query results = Redland.librdf_query_execute(self._query,model._model) /usr/lib/python2.7/dist-packages/RDF.py:1995: RedlandWarning: Variable exp_acession was used but is not bound in the query results = Redland.librdf_query_execute(self._query,model._model)
Examining one file shows that there's a dataset term that could have been converted to a fully qualified @id.
# Dump up to 10 predicate/object pairs for a single file resource.
# The subject is a fixed IRI, so the pattern has no ?s variable; sort on the
# predicate instead of the unbound ?s that the query used before.
query_model(model, '''
prefix encode: <http://submit.encodedcc.org/profiles/experiment.json#>
prefix rdfs: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
select distinct ?p ?o
where {
<http://submit.encodedcc.org/files/ENCFF001RRR/> ?p ?o .
}
order by ?p
limit 10
''')
p o - - http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://submit.encodedcc.org/profiles/experiment.json#file http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://submit.encodedcc.org/profiles/experiment.json#item http://submit.encodedcc.org/profiles/experiment.json#accession ENCFF001RRR http://submit.encodedcc.org/profiles/experiment.json#dataset /experiments/ENCSR000AEG/ http://submit.encodedcc.org/profiles/experiment.json#date_created 2013-07-22 http://submit.encodedcc.org/profiles/experiment.json#download_path 2013/7/22/ENCFF001RRR.fastq.gz http://submit.encodedcc.org/profiles/experiment.json#file_format fastq http://submit.encodedcc.org/profiles/experiment.json#md5sum 3f44150f50da1da4ca1d7a5d7360ec07 http://submit.encodedcc.org/profiles/experiment.json#output_type read1 http://submit.encodedcc.org/profiles/experiment.json#replicate http://submit.encodedcc.org/replicates/dab021e1-4e00-4c6f-8580-eb95c7160995/
/usr/lib/python2.7/dist-packages/RDF.py:1995: RedlandWarning: Variable s was used but is not bound in the query results = Redland.librdf_query_execute(self._query,model._model)