Due to the problem of extremely large database files being produced when extracting features with extremely good coverage, such as Gene Ontology, a new version of the code is required to deal with this problem.
The new code will store the ProteinPairParser objects as pickle files and the features will be indexed from these objects through a __getitem__
method with the ProteinPairParser only interacting with its database, if it has one.
Each ProteinPairParser will have its own generator function which will either be created using the options handed to it or loaded from another pickle file. The default generator will act as the code currently does, by creating a database then indexing said database to retrieve files. According to the Python documentation, if the Parser opens the database at initialisation and then is pickled, the database will be closed and opened again at unpickling time: pickle documentation - see the TextReader example.
cd /home/gavin/Documents/MRes/
/home/gavin/Documents/MRes
Going to step through the changes I'm making to the code and list them using git. After making changes I'll run a test case to make sure it's still working.
import sys
sys.path.append("/home/gavin/Documents/MRes/opencast-bio/")
import ocbio.extract
reload(ocbio.extract)
<module 'ocbio.extract' from '/home/gavin/Documents/MRes/opencast-bio/ocbio/extract.pyc'>
__getitem__
This involves ensuring that at initialisation the parser will define a function to return values by default from the database it's going to create when regenerate
is run.
There must also be an option to load an arbitrary pickled object to return items.
Also, the databases must now be opened and closed with the parsers and the opened databases stored within the parser objects. The assembler will have to be modified to deal with this and close the databases when it's done.
Showing these changes:
!git -C opencast-bio/ show HEAD
commit 1101122b2bba125e6c318e04fb6e52d7f81097e0 Author: Gavin <gavingray1729@gmail.com> Date: Sun Jun 22 13:35:53 2014 +0100 added functionality for gene ontology to protein pair parser diff --git a/ocbio/extract.py b/ocbio/extract.py index 0bb7db8..fc604d0 100644 --- a/ocbio/extract.py +++ b/ocbio/extract.py @@ -1,4 +1,5 @@ # Feature extraction code +# Header pending import os import time @@ -7,6 +8,8 @@ import csv import shelve import re import sys +import pickle +import geneontology def verbosecheck(verbose): '''returns a function depending on the state of the verbose flag''' @@ -214,7 +217,8 @@ class ProteinPairParser(): valindexes=[2], script=None, csvdelim="\t", - ignoreheader=0): + ignoreheader=0, + generator=False): # first, store the initialisation self.datadir = datadir self.outdir = outdir @@ -225,76 +229,95 @@ class ProteinPairParser(): self.script = script self.csvdelim = csvdelim self.ignoreheader = ignoreheader + if generator: + #then open up this pickle file + f = open(generator) + self.generator = pickle.load(f) + f.close() + else: + self.generator == None + self.db = openpairshelf(self.outdir) return None def regenerate(self, force=False, verbose=False): '''Regenerate the pair file from the data source if the data source is newer than the pair file''' v_print = verbosecheck(verbose) - # so first check the ages of both files - datamtime = os.stat(self.datadir)[-2] - if os.path.isfile(self.outdir): - pairmtime = os.stat(self.outdir)[-2] - else: - # bit of a hack - pairmtime = 0 - # if the data modification time is greater than output modification time - if datamtime > pairmtime or force is True: - # now regenerate the data file according to the options defined above: - if verbose and datamtime > pairmtime: - if pairmtime == 0: - print "Database file not found, regenerating at {0} from {1}.".format(self.outdir, self.datadir) - else: - print "Data file {0} is newer than processed database {1}, regenerating.".format(self.datadir, self.outdir) - if 
verbose and force: - print "Forcing regeneration of database {0} from data file {1}.".format(self.outdir, self.datadir) - # if there's a script to run - if self.script is not None: - v_print("Executing script: {0}.".format(self.script)) - # then execute the script - retcode = subprocess.call("python2 {0}".format(self.script), shell=True) - - v_print("Script returned: {0}".format(retcode)) - # first delete out of date file, if it's there + if self.generator != None: + # so first check the ages of both files + datamtime = os.stat(self.datadir)[-2] if os.path.isfile(self.outdir): - os.remove(self.outdir) - # perform simple parsing to make a file of just protein pairs and the value we care about - # and save these using shelve - db = openpairshelf(self.outdir) - # open the data file - c = csv.reader(open(self.datadir), delimiter=self.csvdelim) - # if the header should be ignored then ignore it - - if self.ignoreheader == "1": - v_print("Ignoring header.") - c.next() + pairmtime = os.stat(self.outdir)[-2] + else: + # bit of a hack + pairmtime = 0 + # if the data modification time is greater than output modification time + if datamtime > pairmtime or force is True: + # now regenerate the data file according to the options defined above: + if verbose and datamtime > pairmtime: + if pairmtime == 0: + print "Database file not found, regenerating at {0} from {1}.".format(self.outdir, self.datadir) + else: + print "Data file {0} is newer than processed database {1}, regenerating.".format(self.datadir, self.outdir) + if verbose and force: + print "Forcing regeneration of database {0} from data file {1}.".format(self.outdir, self.datadir) + # if there's a script to run + if self.script is not None: + v_print("Executing script: {0}.".format(self.script)) + # then execute the script + retcode = subprocess.call("python2 {0}".format(self.script), shell=True) + + v_print("Script returned: {0}".format(retcode)) + # first delete out of date file, if it's there + if 
os.path.isfile(self.outdir): + os.remove(self.outdir) + # open the data file + c = csv.reader(open(self.datadir), delimiter=self.csvdelim) + # if the header should be ignored then ignore it + + if self.ignoreheader == "1": + v_print("Ignoring header.") + c.next() - if verbose: - sys.stdout.write("Filling database") - lcount = 0 + if verbose: + sys.stdout.write("Filling database") + lcount = 0 + + for line in c: + # each line use the protein pair as a key + # by formatting it as a frozenset + pair = frozenset([line[self.protindexes[0]], line[self.protindexes[1]]]) + # and the value is indexed by valindexes + values = [] - for line in c: - # each line use the protein pair as a key - # by formatting it as a frozenset - pair = frozenset([line[self.protindexes[0]], line[self.protindexes[1]]]) - # and the value is indexed by valindexes - values = [] + for i in self.valindexes: + values.append(line[i]) - for i in self.valindexes: - values.append(line[i]) + self.db[pair] = values[:] - db[pair] = values[:] + if verbose: + lcount = lcount + 1 + if lcount % 1000 == 0: + sys.stdout.write(".") if verbose: - lcount = lcount + 1 - if lcount % 1000 == 0: - sys.stdout.write(".") + sys.stdout.write("\n") + print "Parsed {0} lines.".format(lcount) + else: + v_print("Custom generator function, no database to regenerate.") - if verbose: - sys.stdout.write("\n") - print "Parsed {0} lines.".format(lcount) - db.close() + return None + + def __getitem__(self,key): + if self.generator != None: + #try and read a key from the custom generator + return self.generator[key] + else: + #read key from database + return self.db[key] + def close(self): + self.db.close() return None diff --git a/ocbio/geneontology.py b/ocbio/geneontology.py new file mode 100644 index 0000000..6a73307 --- /dev/null +++ b/ocbio/geneontology.py @@ -0,0 +1,35 @@ +# coding: utf-8 +import numpy as np + +class GOfvectors(): + def __init__(self,goEntrezdict,terms): + self.goEntrezdict = goEntrezdict + self.terms = terms + 
return None + + def __getitem__(self,key): + pair = list(key) + if len(pair)==1: + pair = pair*2 + # First define the domains, I'm guessing they're not changing + domains = ['molecular_function','cellular_component','biological_process'] + # initialise the vectors we're going to be using: + govectordict= {} + for entrezID in pair: + vec = [] + for domain in domains: + for term in self.terms[domain]: + try: + if term in self.goEntrezdict[entrezID][domain]: + vec.append(1) + else: + vec.append(0) + except KeyError: + vec.append(0) + #save to dictionary + govectordict[entrezID] = np.array(vec[:]) + #iterate over combinations of Entrez pairs + #building the line as it goes + line = list(govectordict[pair[0]]*govectordict[pair[1]]) + #yield this feature vector + return line
Testing initialisation:
testparser = ocbio.extract.ProteinPairParser("none","none",generator="geneontology/testgen.pickle")
testpair = frozenset(('114787', '114785'))
print testparser[testpair]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
The assembler currently looks at the databases the parsers produce and pulls out the values itself. Now that the parsers can do that there's no reason for this. The assembler just needs to iterate through parsers and index for the pair it's working on at that time:
!git -C opencast-bio/ show HEAD
commit 7de0c23b12a57caf544cc538cf737029a306d2f1 Author: Gavin <gavingray1729@gmail.com> Date: Sun Jun 22 16:25:24 2014 +0100 updated assembler for gene ontology diff --git a/ocbio/extract.py b/ocbio/extract.py index fc604d0..c2d856d 100644 --- a/ocbio/extract.py +++ b/ocbio/extract.py @@ -10,6 +10,7 @@ import re import sys import pickle import geneontology +import pdb def verbosecheck(verbose): '''returns a function depending on the state of the verbose flag''' @@ -66,7 +67,7 @@ class FeatureVectorAssembler(): if "script" in d["options"].keys(): d["options"]["script"] = os.path.join(self.sourcetabdir, d["options"]["script"]) - # parse protindexes and validexes: + # parse protindexes and valindexes: if "protindexes" in d["options"].keys(): d["options"]["protindexes"] = tuple(int(v) for v in re.findall("[0-9]+", d["options"]["protindexes"])) if "valindexes" in d["options"].keys(): @@ -108,30 +109,39 @@ class FeatureVectorAssembler(): pairs = map(lambda l: frozenset(l), csv.reader(open(pairfile), delimiter="\t")) # open the file to put the feature vector in c = csv.writer(open(outputfile, "w"), delimiter=delim) - # open all the databases and put them in a dictionary - dbdict = {} - - v_print("Opening databases:") - for parser in self.parserinitlist: - dbdict[parser["output path"]] = openpairshelf(parser["output path"]) - v_print("\t {0} open".format(parser["output path"])) v_print("Checking feature sizes:") # check size of feature in each file # will be important later featuresizes = {} - for parser in self.parserinitlist: - k = dbdict[parser["output path"]].keys()[0] - featuresizes[parser["output path"]] = len(dbdict[parser["output path"]][k]) - v_print("\t Database {0} contains features of size {1}.".format(parser["output path"], - featuresizes[parser["output path"]])) + for parser, i in zip(self.parserlist, range(len(self.parserlist))): + #try to get an example feature + examplefeature = None + for pair in pairs: + try: + examplefeature = parser[pair] + except 
KeyError: + #keep trying + pass + #if examplefeature != None: + # break + #check we actually got an example feature + if examplefeature == None: + # should probably not include a feature that's going to be all missing values + del self.parserlist[i] + v_print("\t Feature from {0} does not map to these protein pairs.".format(parser.datadir)) + else: + #then we've got a feature so we should see what size it is + featuresizes[parser.datadir] = len(examplefeature) + v_print("\t Data source {0} produces features of size {1}.".format(parser.datadir, + featuresizes[parser.datadir])) if verbose: sys.stdout.write("Writing feature vectors") lcount = 0 # counters for each database reporting numbers of missing values mcount = {} - for parser in self.parserinitlist: - mcount[parser["output path"]] = 0 + for parser in self.parserlist: + mcount[parser.datadir] = 0 # then iterate through the pairs, querying all parser databases and building a list of rows rows = [] for pair in pairs: @@ -141,31 +151,35 @@ class FeatureVectorAssembler(): if len(lpair) == 1: lpair = lpair * 2 row = row + lpair - for parser in self.parserinitlist: + for parser in self.parserlist: # if there are features there then append them to the row try: - row = row + dbdict[parser["output path"]][pair] + row = row + parser[pair] except KeyError: - row = row + [missinglabel] * featuresizes[parser["output path"]] + row = row + [missinglabel] * featuresizes[parser.datadir] if verbose: - mcount[parser["output path"]] += 1 + mcount[parser.datadir] += 1 c.writerow(row) if verbose: lcount = lcount+1 - if lcount % 1000 == 0: + if lcount % 10000 == 0: sys.stdout.write(".") if verbose: sys.stdout.write("\n") print "Wrote {0} vectors.".format(lcount) - for parser in self.parserinitlist: - percentage_match = 100.0 - 100.0 * mcount[parser["output path"]] / lcount - print "Matched {0:.2f} % of protein pairs in {1} to {2}".format(percentage_match, + for parser in self.parserlist: + percentage_match = 100.0 - 100.0 * 
mcount[parser.datadir] / lcount + print "Matched {0:.2f} % of protein pairs in {1} to features from {2}".format(percentage_match, pairfile, - parser["output path"]) - # close all the databases - for parser in self.parserinitlist: - dbdict[parser["output path"]].close() + parser.datadir) + return None + def close(self, verbose=False): + v_print = verbosecheck(verbose) + for parser in self.parserlist: + if parser.db != None: + parser.close() + v_print("{0} closed".format(parser.outdir)) return None @@ -234,8 +248,10 @@ class ProteinPairParser(): f = open(generator) self.generator = pickle.load(f) f.close() + self.db = None else: - self.generator == None + #otherwise open the database that is assumed to exist + self.generator = None self.db = openpairshelf(self.outdir) return None @@ -243,7 +259,7 @@ class ProteinPairParser(): '''Regenerate the pair file from the data source if the data source is newer than the pair file''' v_print = verbosecheck(verbose) - if self.generator != None: + if self.generator == None: # so first check the ages of both files datamtime = os.stat(self.datadir)[-2] if os.path.isfile(self.outdir): @@ -268,9 +284,6 @@ class ProteinPairParser(): retcode = subprocess.call("python2 {0}".format(self.script), shell=True) v_print("Script returned: {0}".format(retcode)) - # first delete out of date file, if it's there - if os.path.isfile(self.outdir): - os.remove(self.outdir) # open the data file c = csv.reader(open(self.datadir), delimiter=self.csvdelim) # if the header should be ignored then ignore it @@ -303,8 +316,8 @@ class ProteinPairParser(): if verbose: sys.stdout.write("\n") print "Parsed {0} lines.".format(lcount) - else: - v_print("Custom generator function, no database to regenerate.") + else: + v_print("Custom generator function, no database to regenerate.") return None
Regenerating data source table to test with:
f = open("datasource.tab", "w")
# the HIPPIE feature
f.write("HIPPIE/hippie_current.txt"+"\t"+"HIPPIE/feature.HIPPIE.db"+"\t"+"protindexes=(1,3);valindexes=(4)"+"\n")
# the abundance feature
f.write("forGAVIN/pulldown_data/dataset/ppi_ab_entrez.csv"+"\t"+"forGAVIN/pulldown_data/dataset/abundance.Entrez.db"+"\t"+"ignoreheader=1"+"\n")
# the affinity feature
f.write("affinityresults/results2/unique_data_ppi_coor_C2S_entrez.csv"+"\t"+"affinityresults/results2/affinity.Entrez.db"+"\t"+""+"\n")
# gene ontology features
f.write("Gene_Ontology"+"\t"+"Gene_Ontology"+"\t"+"generator=geneontology/testgen.pickle")
f.close()
reload(ocbio.extract)
<module 'ocbio.extract' from '/home/gavin/Documents/MRes/opencast-bio/ocbio/extract.py'>
assembler = ocbio.extract.FeatureVectorAssembler("/home/gavin/Documents/MRes/datasource.tab")
assembler.regenerate(force=True, verbose=True)
Regenerating parsers: parser 0 Forcing regeneration of database /home/gavin/Documents/MRes/HIPPIE/feature.HIPPIE.db from data file /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt. Filling database......................................................................................................................................................................... Parsed 169626 lines. parser 1 Forcing regeneration of database /home/gavin/Documents/MRes/forGAVIN/pulldown_data/dataset/abundance.Entrez.db from data file /home/gavin/Documents/MRes/forGAVIN/pulldown_data/dataset/ppi_ab_entrez.csv. Ignoring header. Filling database................. Parsed 17777 lines. parser 2 Forcing regeneration of database /home/gavin/Documents/MRes/affinityresults/results2/affinity.Entrez.db from data file /home/gavin/Documents/MRes/affinityresults/results2/unique_data_ppi_coor_C2S_entrez.csv. Filling database....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... Parsed 1871145 lines. parser 3 Custom generator function, no database to regenerate.
assembler.assemble("DIP/human/training.nolabel.positive.Entrez.txt",
"features/training.nolabel.positive.Entrez.vectors.txt", verbose=True)
Reading pairfile: DIP/human/training.nolabel.positive.Entrez.txt Checking feature sizes: Data source /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt produces features of size 1. Data source /home/gavin/Documents/MRes/forGAVIN/pulldown_data/dataset/ppi_ab_entrez.csv produces features of size 1. Data source /home/gavin/Documents/MRes/affinityresults/results2/unique_data_ppi_coor_C2S_entrez.csv produces features of size 1. Data source /home/gavin/Documents/MRes/Gene_Ontology produces features of size 90. Writing feature vectors Wrote 4857 vectors. Matched 60.08 % of protein pairs in DIP/human/training.nolabel.positive.Entrez.txt to features from /home/gavin/Documents/MRes/HIPPIE/hippie_current.txt Matched 0.29 % of protein pairs in DIP/human/training.nolabel.positive.Entrez.txt to features from /home/gavin/Documents/MRes/forGAVIN/pulldown_data/dataset/ppi_ab_entrez.csv Matched 3.69 % of protein pairs in DIP/human/training.nolabel.positive.Entrez.txt to features from /home/gavin/Documents/MRes/affinityresults/results2/unique_data_ppi_coor_C2S_entrez.csv Matched 100.00 % of protein pairs in DIP/human/training.nolabel.positive.Entrez.txt to features from /home/gavin/Documents/MRes/Gene_Ontology
assembler.close()
These are the notes on the development of the code described in the Feature vector assembly page of the wiki. First, the protein pair parser class will be written and tested on a dataset that has already been extracted. This will be the HIPPIE dataset.
Initialisation currently involves loading in the three command line options and saving them to the object. It must also involve parsing of the options. Testing the initialisation:
import os, time, subprocess, csv, shelve
cd /home/gavin/Documents/MRes/HIPPIE/
/home/gavin/Documents/MRes/HIPPIE
#define the class
class ProteinPairParser():
    '''Does simple parsing on data files to produce protein pair files with feature values'''
    def __init__(self, datadir, outdir, protindexes=(1, 2), valindex=3,
                 script=None, csvdelim="\t"):
        # first, store the initialisation
        self.datadir = datadir          # path to the raw data file
        self.outdir = outdir            # path of the shelve database to produce
        self.protindexes = protindexes  # csv column indexes of the two protein ids
        self.valindex = valindex        # csv column index of the feature value
        self.script = script            # optional pre-processing script to run first
        self.csvdelim = csvdelim        # delimiter used in the raw data file
        return None

    def regenerate(self):
        '''Regenerate the pair file from the data source
        if the data source is newer than the pair file'''
        # so first check the ages of both files
        datamtime = os.stat(self.datadir)[-2]
        if os.path.isfile(self.outdir):
            pairmtime = os.stat(self.outdir)[-2]
        else:
            # bit of a hack: a missing output file forces regeneration
            pairmtime = 0
        # if the data modification time is greater than output modification time
        if datamtime > pairmtime:
            # now regenerate the data file according to the options defined above:
            print("data file is newer than pair file")
            # if there's a script to run
            if self.script is not None:
                # then execute the script
                retcode = subprocess.call("python2 %s" % self.script, shell=True)
            # first delete out of date file, if it's there
            if os.path.isfile(self.outdir):
                os.remove(self.outdir)
            # perform simple parsing to make a file of just protein pairs and
            # the value we care about, and save these using shelve
            db = openpairshelf(self.outdir)
            # open the data file explicitly so the handle can be closed
            # (fixes a file-handle leak in the original draft)
            datafile = open(self.datadir)
            try:
                c = csv.reader(datafile, delimiter=self.csvdelim)
                for line in c:
                    # each line use the protein pair as a key
                    # by formatting it as a frozenset
                    pair = frozenset([line[self.protindexes[0]], line[self.protindexes[1]]])
                    # and the value is indexed by valindex
                    db[pair] = line[self.valindex]
            finally:
                # close the database and data file even if parsing fails
                datafile.close()
                db.close()
        return None
We have to rerun the script to generate a pre-processed HIPPIE file:
%%bash
java -jar HIPPIE_NC.jar -i=../DIP/human/training.nolabel.positive.Entrez.txt -t=e -l=0 -o=prematch.positive.HIPPIE.txt
Then we can use the parser to quickly perform what is done in the notebook previously used to extract the features:
x = ProteinPairParser("prematch.positive.HIPPIE.txt","training.positive.HIPPIE.txt",protindexes=(1,3),valindex=4)
x.regenerate()
data file is newer than pair file
The function reports that the data file is newer than the pair file and regenerates the pair files as a persistent dictionary object. This is useful because it means that this can later be indexed quickly for building feature vectors.
We can open this database to show this functionality:
test = openpairshelf("training.positive.HIPPIE.txt")
If we look at the keys that this database uses we can see that it is using strings internally.
It could be useful to modify the .keys()
method of this class so that it would produce the list of frozenset keys instead.
test.keys()[0:10]
['10013\t55031', '10094\t2885', '10399\t8364', '10971\t8452', '1109111091', '11198\t142', '1326\t9020', '13917118\t7322', '1869\t142', '2\t2885']
And taking one of these keys and turning it into a frozenset, we can then index the database using that as a key.
testkey = test.keys()[0]
testkey = frozenset(testkey.split("\t"))
print testkey
test[testkey]
frozenset(['10013', '55031'])
'0.72'
test.close()
The problem is that a shelve (shelf?) database can't take the frozenset as a key. The recommended way to deal with this is to make a wrapper. As the database used by shelf is a class we can build a child class from this, modifying the functions to deal with protein pairs stored in frozensets as keys. This code will not deal with arbitrary frozensets as keys.
Essentially, it will use a string of the two protein identifiers separated by a tab as the key. To index, though, it will take a frozenset and convert it to two strings which are the two possible orderings of the two protein identifiers.
class ProteinPairDB(shelve.DbfilenameShelf):
    '''A simple database for protein pairs using shelve.

    Keys are frozensets of one or two protein identifier strings.
    Internally a pair is stored as "id1\tid2"; a self-interaction
    (single-element frozenset) is stored as the identifier
    concatenated with itself.
    '''
    def __setitem__(self, key, value):
        # key will be frozenset so make it a list first
        key = list(key)
        # then make it a string
        if len(key) == 1:
            # self-interaction: duplicate the single identifier
            key = key[0] * 2
        else:
            key = key[0] + "\t" + key[1]
        shelve.DbfilenameShelf.__setitem__(self, key, value)
        return None

    def __getitem__(self, key):
        # make two candidate strings from the key: the pair is unordered,
        # so it may have been stored in either orientation
        key = list(key)
        if len(key) == 1:
            key1 = key[0] * 2
            # only one stored form exists for a self-interaction; setting
            # key2 anyway fixes a NameError in the fallback branch below
            # when the key is absent (was previously left unassigned)
            key2 = key1
        else:
            key1 = key[0] + "\t" + key[1]
            key2 = key[1] + "\t" + key[0]
        # try the first one
        try:
            value = shelve.DbfilenameShelf.__getitem__(self, key1)
        except KeyError:
            # couldn't find the first key, try the second
            # if we don't find this one then error out as usual
            value = shelve.DbfilenameShelf.__getitem__(self, key2)
        return value
We also have to define a function to apply default arguments to have similar functionality to shelve:
def openpairshelf(filename, flag='c', protocol=None, writeback=False):
    '''Open a ProteinPairDB, mirroring the default arguments of shelve.open.'''
    db = ProteinPairDB(filename, flag=flag, protocol=protocol, writeback=writeback)
    return db
The next thing to do is write the code for the feature vector assembler. This is another class, with methods:
And at initialisation it is expected to do:
To do this it must parse the data source table. It assumes that the data source table is provided as a full path and that this path is the top directory for the data. ie all of the data paths in the data source will be relative to the path of the table itself. The table itself will have structure:
Data source directory | Output database directory | Options |
---|---|---|
/relative/path/to/data |
/relative/path/to/output/database |
protindexes=1,3;valindex=4;script=/path/to/script ;csvdelim=\t |
The available options are given in the options column of the table above.
To test initialisation a first draft of the code is given below:
class FeatureVectorAssembler():
    '''Assembles feature vectors from protein pair files, data source lists and gold standard protein pair lists.'''
    def __init__(self, sourcetab, goldpos, goldneg):
        # store the initialisation:
        # directories for positive and negative gold standard data files
        self.goldpos = goldpos
        self.goldneg = goldneg
        # check if the gold standard data directories exist
        #  throw an error if they don't (not implemented yet)
        # instantiate protein pair parsers:
        # store the directory of the table and its name
        self.sourcetabdir, self.tabfile = os.path.split(sourcetab)
        # open the table and parse each row for initialisation options
        reader = csv.reader(open(sourcetab), delimiter="\t")
        self.parserinitlist = []
        for row in reader:
            # gather this row's settings into a dictionary
            entry = {}
            entry["data path"] = row[0]
            entry["output path"] = row[1]
            # options live in a nested dictionary, parsed from
            # semicolon-separated "name=value" pairs
            entry["options"] = {}
            for option in row[2].split(";"):
                # split each option to find out which option it is;
                # invalid option names WILL NOT be detected here
                parts = option.split("=")
                entry["options"][parts[0]] = parts[1]
            # keep an independent copy of the settings
            self.parserinitlist.append(entry.copy())
        # then initialise each of these parsers and keep them in a list
        self.parserlist = [ProteinPairParser(entry["data path"],
                                             entry["output path"],
                                             **entry["options"])
                           for entry in self.parserinitlist]
        return None
Testing initialisation requires a data source table file so this file is created below at the top directory for the data files.
cd /home/gavin/Documents/MRes/
/home/gavin/Documents/MRes
f = open("datasource.tab", "w")
f.write("HIPPIE/prematch.positive.HIPPIE.txt" + "\t" + "HIPPIE/training.positive.HIPPIE.txt" + "\t" + "protindexes=(1,3);valindex=4")
f.close()
Checking this file has written properly:
%%bash
cat datasource.tab
HIPPIE/prematch.positive.HIPPIE.txt HIPPIE/training.positive.HIPPIE.txt protindexes=(1,3);valindex=4
Then attempt to initialise the above version from that. Notice here that the FeatureVectorAssembler requires gold standard data sources at initialisation in this version. This is removed in the second version as it made more sense to allow arbitrary protein pair lists in the feature vector assemble method.
test = FeatureVectorAssembler("datasource.tab",
"DIP/human/training.nolabel.positive.Entrez.txt",
"DIP/human/training.nolabel.negative.Entrez.txt")
It initialises ok, so the second draft of the code with the required methods is given below:
class FeatureVectorAssembler():
    '''Assembles feature vectors from protein pair files, data source lists and gold standard protein pair lists.'''
    def __init__(self, sourcetab):
        # instantiate protein pair parsers:
        # first parse the data source table;
        # store the directory of the table and its name
        self.sourcetabdir, self.tabfile = os.path.split(sourcetab)
        # open the table and parse for initialisation options
        c = csv.reader(open(sourcetab), delimiter="\t")
        # iterate over lines adding to list of protein pair parsers
        self.parserinitlist = []
        for line in c:
            # store the information in a dictionary
            d = {}
            # paths in the table are relative to the table's own directory
            d["data path"] = os.path.join(self.sourcetabdir, line[0])
            d["output path"] = os.path.join(self.sourcetabdir, line[1])
            # store options in a dictionary in the dictionary
            d["options"] = {}
            options = line[2].split(";")
            for x in options:
                # split each option to find out which option it is:
                x = x.split("=")
                # store it in the dictionary
                #  if there are invalid options this code WILL NOT DETECT THEM
                d["options"][x[0]] = x[1]
            # make the script path relative to the table directory too
            if "script" in d["options"].keys():
                d["options"]["script"] = os.path.join(self.sourcetabdir, d["options"]["script"])
            # copy the dictionary into the list
            self.parserinitlist.append(d.copy())
        # then initialise each of these parsers and keep them in a list
        self.parserlist = []
        for parser in self.parserinitlist:
            self.parserlist.append(ProteinPairParser(parser["data path"],
                                                     parser["output path"],
                                                     **parser["options"]))
        return None

    def regenerate(self):
        '''Calls all known protein parsers and gets them to regenerate their output, if they have to.'''
        for parser in self.parserlist:
            parser.regenerate()
        return None

    def assemble(self, pairfile, outputfile):
        '''Assembles a file of feature vectors for each protein pair in a protein pair file supplied.
        Assumes the pairfile is tab delimited.'''
        # first parse the pairfile into a list of frozensets
        pairs = map(lambda l: frozenset(l), csv.reader(open(pairfile), delimiter="\t"))
        # open the file to put the feature vector in; keep the handle so it
        # can be closed explicitly below (fixes an unclosed-file leak)
        outfile = open(outputfile, "w")
        c = csv.writer(outfile, delimiter="\t")
        # open all the databases and put them in a dictionary
        dbdict = {}
        for parser in self.parserinitlist:
            dbdict[parser["output path"]] = openpairshelf(parser["output path"])
        try:
            # then iterate through the pairs, querying all parser databases
            for pair in pairs:
                # row starts with the protein identifiers themselves
                row = list(pair)
                for parser in self.parserinitlist:
                    # NOTE(review): a pair missing from any database raises
                    # KeyError here; later versions substitute a missing label
                    row.append(dbdict[parser["output path"]][pair])
                c.writerow(row)
        finally:
            # close all the databases and the output file, even if a
            # lookup failed part-way through (fixes resource leak on error)
            for parser in self.parserinitlist:
                dbdict[parser["output path"]].close()
            outfile.close()
        return None
Testing initialisation of this code again:
test = FeatureVectorAssembler("/home/gavin/Documents/MRes/datasource.tab")
Regenerating the data sources. It does not report that any of the data sources require regeneration so nothing is done this time:
test.regenerate()
Then testing the assembly of a feature vector file:
test.assemble("DIP/human/training.nolabel.positive.Entrez.txt", "testoutput")
Looking at this file we can see that it has produced a feature vector with only one feature. It also reports the associated protein pairs next to this feature. This is removed from later versions as it makes later classification more complicated.
%%bash
head testoutput
4084 207 0.86 8360 8356 0.97 5914 9612 0.9 79833 6634 0.62 29102 4090 0 7074 6382 0 7159 22059 0 1869 7029 0.96 801 817 0 207 1786 0.7