import os
os.chdir("d:\\workspace\\ma")
from matplotlib.pyplot import plot
import pymongo
print "pymongo version", pymongo.version
pymongo version 2.4
def ma_collection():
    """Return the ``results`` collection of the ``ma`` MongoDB database.

    Connects to the MongoDB server listening on ``localhost:27018``. The
    connection is established eagerly, so this call blocks until the
    server answers or the driver's connect timeout expires.

    Returns:
        The ``ma.results`` pymongo collection object.
    """
    # pymongo 2.4 introduced MongoClient and deprecated Connection (which was
    # removed in pymongo 3.0). Prefer the new class and only fall back to the
    # legacy one on older drivers. MongoClient differs in defaulting to
    # acknowledged writes; the code visible in this notebook only reads.
    client_class = getattr(pymongo, "MongoClient", None) or pymongo.Connection
    con = client_class("localhost", 27018)
    return con.ma.results
col = ma_collection()
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-4-58188d38b7eb> in <module>() ----> 1 col = ma_collection() <ipython-input-3-ce6c00306540> in ma_collection() 1 def ma_collection(): ----> 2 con = pymongo.Connection("britanya409-2.tau.ac.il",27018) 3 col = con.ma.results 4 return col C:\Python27\lib\site-packages\pymongo\connection.pyc in __init__(self, host, port, max_pool_size, network_timeout, document_class, tz_aware, _connect, **kwargs) 176 177 super(Connection, self).__init__(host, port, --> 178 max_pool_size, document_class, tz_aware, _connect, **kwargs) 179 180 def __repr__(self): C:\Python27\lib\site-packages\pymongo\mongo_client.pyc in __init__(self, host, port, max_pool_size, document_class, tz_aware, _connect, **kwargs) 269 if _connect: 270 try: --> 271 self.__find_node(seeds) 272 except AutoReconnect, e: 273 # ConnectionFailure makes more sense here than AutoReconnect C:\Python27\lib\site-packages\pymongo\mongo_client.pyc in __find_node(self, seeds) 617 for candidate in candidates: 618 try: --> 619 node, ismaster, isdbgrid, res_time = self.__try_node(candidate) 620 self.__is_primary = ismaster 621 self.__is_mongos = isdbgrid C:\Python27\lib\site-packages\pymongo\mongo_client.pyc in __try_node(self, node) 527 528 # Call 'ismaster' directly so we can get a response time. 
--> 529 sock_info = self.__socket() 530 response, res_time = self.__simple_command(sock_info, 531 'admin', C:\Python27\lib\site-packages\pymongo\mongo_client.pyc in __socket(self) 660 self.start_request() 661 --> 662 sock_info = self.__pool.get_socket((host, port)) 663 except socket.error, why: 664 self.disconnect() C:\Python27\lib\site-packages\pymongo\pool.pyc in get_socket(self, pair) 265 self.lock.release() 266 except KeyError: --> 267 sock_info, from_pool = self.connect(pair), False 268 269 if from_pool: C:\Python27\lib\site-packages\pymongo\pool.pyc in connect(self, pair) 214 return_socket() when you're done with it. 215 """ --> 216 sock = self.create_connection(pair) 217 218 if self.use_ssl: C:\Python27\lib\site-packages\pymongo\pool.pyc in create_connection(self, pair) 193 sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) 194 sock.settimeout(self.conn_timeout or 20.0) --> 195 sock.connect(sa) 196 return sock 197 except socket.error, e: KeyboardInterrupt:
col.count()
Specs are the parameter sets. Currently most of the parameters are constants, and only s, U, and $\pi$ are variables.
# Collect every (s, U, pi) combination that actually occurs in the
# collection, narrowing the query one parameter at a time.
specs = []
for s in col.distinct('s'):
    by_s = {'s': s}
    for U in col.find(by_s).distinct('U'):
        by_s_and_U = {'s': s, 'U': U}
        for pi in col.find(by_s_and_U).distinct('pi'):
            specs.append({'s': s, 'U': U, 'pi': pi})
Get only the required data - the parameters and w, which is the mean fitness after the bottleneck (if b==1 then it is the fitness of the single individual after the bottleneck).
cur = col.find({},['w','tau','B','genes','epistasis','s','b','U','pop','pi'])
d is some record. We show the parameter values and the plot of the mean fitness. Note that w and _id do not count as parameters because the first is the fitness time series and the second is the Mongo ID of the record.
d = cur.next()
params = [(k,v) for k,v in d.items() if (k!='_id' and k!='w')]
for k,v in params: print k,'=',v
tau = 1 B = 300 genes = 100 pop = 100000000.0 epistasis = 1.0 s = 0.01 b = 1 U = 0.003 pi = 200
w = d['w']
plot(w);
First we convert a dict with a list value in key w to a list of dicts, each with a single value for key w.
# Expand the single record into one dict per time point; each dict carries
# that point's fitness under 'w' plus a copy of all the parameter values.
time_series = []
for t, fitness in enumerate(w):
    time_point = {'w': fitness}
    time_point.update(params)
    time_series.append(time_point)
import pandas as pd
print "pandas version", pd.__version__
pandas version 0.9.0
df = pd.DataFrame(time_series)
df.to_csv("test.csv")
Start by writing a function that does what we did before - convert a record from MongoDB to a list of dicts.
def record_to_data_list_of_dicts(record):
    """Expand one MongoDB record into a list of per-time-point dicts.

    Args:
        record: A dict with a sequence under the key ``'w'`` (the fitness
            time series) and any number of parameter fields. The ``'_id'``
            field, if present, is ignored.

    Returns:
        A list with one dict per element of ``record['w']``. Each dict holds
        that element under ``'w'``, its 1-based position under ``'t'``, and
        a copy of every parameter field of the record.
    """
    # Everything except the Mongo id and the time series is a parameter.
    params = dict(
        (k, v) for k, v in record.items() if k not in ('_id', 'w')
    )
    time_series = []
    # Read the series from the record itself; relying on a module-level `w`
    # would expand every record with the same (stale) series.
    for t, fitness in enumerate(record['w']):
        time_point = {'w': fitness, 't': t + 1}
        time_point.update(params)
        time_series.append(time_point)
    return time_series
Next, here is a function that takes a cursor and converts all the records to one big list of dicts. Use a limit of 10 when testing.
def cursor_to_data_frame(cur):
    """Flatten every record yielded by ``cur`` into one list of dicts.

    Despite the name, the result is a plain list (one dict per time point
    of each record, in cursor order), not a DataFrame.
    """
    return [
        time_point
        for record in cur
        for time_point in record_to_data_list_of_dicts(record)
    ]
This function creates a cursor limited to `limit` records (use 10 for testing), converts the records to a DataFrame and saves it to csv.
The csv file can be opened in R with the following:
data <- read.csv('test.csv')
dim(data) # should be 3000 12
If the filename ends with .gz the file will be compressed with gzip - this is transparent to R and sometimes allows R to load the file faster compared to a non-compressed csv file.
def mongo_to_csv(csv_fname, limit):
    """Export records from the ``ma.results`` collection to a CSV file.

    Args:
        csv_fname: Output path. Must end with ``.csv`` (plain text) or
            ``.csv.gz`` (gzip-compressed).
        limit: Maximum number of records to read; passed through to
            ``find`` as its ``limit`` argument.

    Raises:
        ValueError: If ``csv_fname`` has neither supported extension.
    """
    import gzip

    # Validate the target before querying, so a bad file name fails fast
    # instead of silently writing nothing after all records were read.
    is_plain = csv_fname.endswith(".csv")
    is_gzipped = csv_fname.endswith(".csv.gz")
    if not (is_plain or is_gzipped):
        raise ValueError(
            "csv_fname must end with .csv or .csv.gz: %r" % (csv_fname,)
        )

    fields = ['w', 'tau', 'B', 'genes', 'epistasis', 's', 'b', 'U', 'pop', 'pi']
    col = ma_collection()
    cur = col.find({}, fields, limit=limit)
    try:
        time_series = cursor_to_data_frame(cur)
    finally:
        # Release the server-side cursor even if the conversion fails.
        cur.close()

    df = pd.DataFrame(time_series)
    if is_plain:
        df.to_csv(csv_fname, index=False)
    else:
        # Closing the gzip handle flushes the compressor and writes the
        # trailer; without it the archive may be truncated.
        with gzip.open(csv_fname, 'wb') as fout:
            df.to_csv(fout, index=False)
mongo_to_csv("test.csv", 10)
mongo_to_csv("test.csv.gz", 10)
That's it, all the functions are in place, the above is a test case. To run on all the data just use:
mongo_to_csv("ma_output.csv.gz", 0)
Note: limit=0 gets all the records.
And you can run this in R to plot the mean fitness over time for a specific parameter set:
library(plyr)  # provides ddply
df <- read.csv('ma_output.csv.gz')
data <- ddply(df, .(s,U,pi,t), summarize, mean.w = mean(w), median.w = median(w))
subdata <- subset(data, s==0.01 & U==0.003 & pi==0)
plot(mean.w~t, data=subdata)