import shelve
sys.path.append("/home/gavin/Documents/MRes/opencast-bio/")
import ocbio.extract
cd ~/Documents/MRes/
/home/gavin/Documents/MRes
parser = ocbio.extract.ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
"/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db",
protindexes=(1,3),valindexes=[4])
parser.regenerate(force=True, verbose=True)
Forcing regeneration of database /home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db from data file /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt. Filling database......................................................................................................................................................................... Parsed 169626 lines.
examplekeys = parser.db.keys()[0:10]
print len(parser.db.keys())
169323
parser.db.close()
parser2 = ocbio.extract.ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
"/home/gavin/Documents/MRes/HIPPIE/feature2.HIPPIE.db",
protindexes=(1,3),valindexes=[4])
print len(parser2.db.keys())
0
parser.close()
cd test/
/home/gavin/Documents/MRes/test
class test():
    '''Session scratch class: checks that ProteinPairDB writes survive
    being routed through a containing object's __setitem__.'''
    def __init__(self):
        # opens (or creates) a shelve-backed protein pair DB in the cwd
        self.db = ocbio.extract.ProteinPairDB("test.db")
    def __setitem__(self,key,value):
        # delegate straight to the underlying pair database
        self.db[key] = value
inst = test()
for k in examplekeys:
inst[k] = 1
inst.db.keys()
[frozenset({'10054', '201626'}), frozenset({'10236', '3930'}), frozenset({'10320', '6605'}), frozenset({'10399', '8364'}), frozenset({'10474', '1499'}), frozenset({'1058', '1060'}), frozenset({'10289', '10857'}), frozenset({'11091110911109111091'}), frozenset({'112495', '9329'}), frozenset({'10379', '1822'}), frozenset({'something'})]
inst.db.close()
inst = test()
inst.db.keys()
[frozenset({'10054', '201626'}), frozenset({'10236', '3930'}), frozenset({'10320', '6605'}), frozenset({'10399', '8364'}), frozenset({'10474', '1499'}), frozenset({'1058', '1060'}), frozenset({'10289', '10857'}), frozenset({'11091110911109111091'}), frozenset({'112495', '9329'}), frozenset({'10379', '1822'}), frozenset({'something'})]
%load ../opencast-bio/ocbio/extract.py
# Feature extraction code
# Header pending
import os
import time
import subprocess
import csv
import shelve
import re
import sys
import pickle
import pdb
def verbosecheck(verbose):
    '''returns a function depending on the state of the verbose flag

    When verbose is truthy the returned v_print prints its arguments
    space-separated to stdout; otherwise it is a no-op with the same
    signature, so callers can call v_print unconditionally.'''
    if verbose:
        def v_print(*args):
            '''declare v_print function that prints to stdout
            if verbose flag is on'''
            for argument in args:
                # Python 2 print with trailing comma: space, no newline
                print argument,
            # terminate the line
            print
    else:
        def v_print(*args):
            # deliberate no-op: verbose output disabled
            None
    return v_print
class ProteinPairDB(shelve.DbfilenameShelf):
    '''A simple database for protein pairs using shelve.

    Keys are frozensets of one or two protein identifier strings.
    Internally they are flattened to the single string shelve needs:
    a pair becomes "id1\tid2" and a self-interaction (singleton set)
    has its identifier doubled ("idid").

    NOTE(review): the doubled singleton form is ambiguous, so keys()
    returns it as frozenset(["idid"]) rather than frozenset(["id"]);
    writing that frozenset back doubles the string again (seen in the
    session above as '11091110911109111091'). Confirm whether callers
    ever round-trip singleton keys through keys().
    '''

    def __setitem__(self, key, value):
        '''Store value under a frozenset protein-pair key.'''
        # key will be a frozenset so make it a list first
        key = list(key)
        # then flatten it to the string form used on disk
        if len(key) == 1:
            # self-interaction: double the identifier
            key = key[0] * 2
        else:
            key = key[0] + "\t" + key[1]
        shelve.DbfilenameShelf.__setitem__(self, key, value)
        return None

    def __getitem__(self, key):
        '''Look up a frozenset key, trying both orderings of the pair.

        Raises KeyError when neither ordering is present.'''
        key = list(key)
        if len(key) == 1:
            key1 = key[0] * 2
            # BUGFIX: this line used to be a duplicate "key1 = ..."
            # assignment, leaving key2 undefined so a missing singleton
            # key raised NameError instead of KeyError.
            key2 = key1
        else:
            key1 = key[0] + "\t" + key[1]
            key2 = key[1] + "\t" + key[0]
        # try the first ordering
        try:
            value = shelve.DbfilenameShelf.__getitem__(self, key1)
        except KeyError:
            # couldn't find the first key, try the second;
            # if this one also misses, the KeyError propagates as usual
            value = shelve.DbfilenameShelf.__getitem__(self, key2)
        return value

    def keys(self):
        '''Return the stored keys converted back to frozensets
        (see the class docstring for the singleton-key caveat).'''
        # retrieve the string keys used by shelve
        ks = shelve.DbfilenameShelf.keys(self)
        # convert them to frozensets by splitting on the tab separator
        ks = map(lambda x: frozenset(x.split("\t")), ks)
        return ks
class ProteinPairParser():
'''Does simple parsing on data files to produce protein pair files with feature values'''
def __init__(self,
datadir,
outdir,
protindexes=(0, 1),
valindexes=[2],
script=None,
csvdelim="\t",
ignoreheader=0,
generator=False):
# first, store the initialisation
self.datadir = datadir
self.outdir = outdir
self.protindexes = protindexes
# had to hack this together from the list above
# passing tuple in as default did not work
self.valindexes = tuple(valindexes)
self.script = script
self.csvdelim = csvdelim
self.ignoreheader = ignoreheader
if generator:
#then open up this pickle file
f = open(generator)
self.generator = pickle.load(f)
f.close()
self.db = None
else:
#otherwise open the database that is assumed to exist
self.generator = None
self.db = openpairshelf(self.outdir)
return None
def regenerate(self, force=False, verbose=False):
'''Regenerate the pair file from the data source
if the data source is newer than the pair file'''
v_print = verbosecheck(verbose)
if self.generator == None:
# so first check the ages of both files
datamtime = os.stat(self.datadir)[-2]
if os.path.isfile(self.outdir):
pairmtime = os.stat(self.outdir)[-2]
else:
# bit of a hack
pairmtime = 0
# if the data modification time is greater than output modification time
if datamtime > pairmtime or force is True:
# now regenerate the data file according to the options defined above:
if verbose and datamtime > pairmtime:
if pairmtime == 0:
print "Database file not found, regenerating at {0} from {1}.".format(self.outdir, self.datadir)
else:
print "Data file {0} is newer than processed database {1}, regenerating.".format(self.datadir, self.outdir)
if verbose and force:
print "Forcing regeneration of database {0} from data file {1}.".format(self.outdir, self.datadir)
# if there's a script to run
if self.script is not None:
v_print("Executing script: {0}.".format(self.script))
# then execute the script
retcode = subprocess.call("python2 {0}".format(self.script), shell=True)
v_print("Script returned: {0}".format(retcode))
# first delete out of date file, if it's there
if os.path.isfile(self.outdir):
os.remove(self.outdir)
# open the data file
c = csv.reader(open(self.datadir), delimiter=self.csvdelim)
# if the header should be ignored then ignore it
if self.ignoreheader == "1":
v_print("Ignoring header.")
c.next()
if verbose:
sys.stdout.write("Filling database")
lcount = 0
for line in c:
# each line use the protein pair as a key
# by formatting it as a frozenset
pair = frozenset([line[self.protindexes[0]], line[self.protindexes[1]]])
# and the value is indexed by valindexes
values = []
for i in self.valindexes:
values.append(line[i])
self.db[pair] = values[:]
if verbose:
lcount = lcount + 1
if lcount % 1000 == 0:
sys.stdout.write(".")
if verbose:
sys.stdout.write("\n")
print "Parsed {0} lines.".format(lcount)
else:
v_print("Custom generator function, no database to regenerate.")
return None
def __getitem__(self,key):
if self.generator != None:
#try and read a key from the custom generator
return self.generator[key]
else:
#read key from database
return self.db[key]
def close(self):
self.db.close()
return None
def openpairshelf(filename, flag='c', protocol=None, writeback=False):
    """Open a ProteinPairDB at *filename* -- the analogue of shelve.open().

    All arguments mirror shelve.open() and are forwarded unchanged."""
    db = ProteinPairDB(filename, flag=flag, protocol=protocol,
                       writeback=writeback)
    return db
parser = ProteinPairParser("/home/gavin/Documents/MRes/HIPPIE/hippie_current.txt",
"/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db",
protindexes=(1,3),valindexes=[4])
parser.db.keys()
[]
parser.regenerate(force=True)
print len(parser.db.keys())
169323
parser.db.close()
parser.db = openpairshelf("/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db")
print len(parser.db.keys())
0
for k in examplekeys:
parser.db[k] = 1
print parser.db.keys()
[frozenset(['10054', '201626']), frozenset(['10236', '3930']), frozenset(['10320', '6605']), frozenset(['10399', '8364']), frozenset(['10474', '1499']), frozenset(['1058', '1060']), frozenset(['10857', '10289']), frozenset(['11091110911109111091']), frozenset(['112495', '9329']), frozenset(['10379', '1822'])]
parser.close()
parser.db = openpairshelf("/home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db")
print parser.db.keys()
[frozenset(['10054', '201626']), frozenset(['10236', '3930']), frozenset(['10320', '6605']), frozenset(['10399', '8364']), frozenset(['10474', '1499']), frozenset(['1058', '1060']), frozenset(['10857', '10289']), frozenset(['11091110911109111091']), frozenset(['112495', '9329']), frozenset(['10379', '1822'])]