*Alternative versions: LabNotebook, nbviewer*
The ABIDE preprocessed connectome project (PCP) data can be downloaded programmatically with a little hackery. This helps to keep storage space to a minimum, as we can indicate with complete specificity which files we want, and download and keep (or not keep; just use temporarily) only those. Whilst there are some scripts and APIs already out there that greatly finesse the process of grabbing data from the database, there are a few specific functionalities lacking in these that made them insufficient for my own needs. The following notes are therefore a simple set of recipes for grabbing ABIDE PCP data in a way that matched my requirements. I believe it should also generalize to the other, non-ABIDE PCP databases.
Define some variables
# define system-specific filepaths etc
# NOTE(review): presumably populates the 'le' dict of local environment
# paths used throughout this notebook — confirm against set_localenv_vars.py
%run ~/set_localenv_vars.py
# output dir
outdir = le['data_dir'] + '/notebooks/downloading_abide_pcp_data'
!mkdir -p $outdir
nb_name = 'downloading_abide_pcp_data'
# stuff for analyses
# Base URL of the public ABIDE PCP S3 bucket (no credentials required)
s3_prefix = 'https://s3.amazonaws.com/fcp-indi/data/Projects/'\
'ABIDE_Initiative'
# Local root folder; downloads below mirror the S3 key layout under here
abide_dir = le['data_dir'] + '/PCP/ABIDE/downloaded'
# stuff for workdocs-cloudfiles
# NOTE(review): these look like placeholder credentials — never commit real keys
aws_key = 'drowssaperucesyreva'
aws_secret = '?teytidettopsuoyevah'
Importage
# Generic imports
from IPython.display import Image,display as d
from copy import deepcopy
import os,glob,sys
import numpy as np
import pandas as pd
import urllib
# workdocs-cloudfiles stuff
sys.path.append(le['ipynb_workdocs_dir'])
from new_utils import nb_fig,cloudfiles_nb
Initialize workdocs-cloudfiles folder
# Initialize the workdocs-cloudfiles folder for this notebook.
# cloudfiles_nb is a project helper (imported above from new_utils);
# behavior per its own API.
cnb = cloudfiles_nb('aws', [aws_key,aws_secret])
cnb.initialize_folder(nb_name)
Load calico document extensions
%%javascript
// Load the 'calico' notebook extensions (spell-check, document tools, cell tools)
IPython.load_extensions('calico-spell-check', 'calico-document-tools',
'calico-cell-tools');
Go to output folder
# Work from the notebook's output directory so relative paths land there
os.chdir(outdir)
Ok, let's get cracking...
# Download the phenotypic metadata file for the ABIDE PCP release
# into the local mirror directory.
md_file = 'Phenotypic_V1_0b_preprocessed1.csv'
url = s3_prefix + '/' + md_file
out_file = abide_dir + '/' + md_file
# urllib.urlretrieve was removed in Python 3 (moved to urllib.request);
# this shim keeps the cell working on both Python 2 and 3.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2
urlretrieve(url, out_file);
Take a look at metadata file
# Load the phenotypic metadata and preview a few columns of interest.
df_md = pd.read_csv(out_file)
cols = ['subject', 'SUB_ID', 'FILE_ID', 'AGE_AT_SCAN','SEX', 'DX_GROUP']
# .ix was deprecated and removed from pandas; for the default integer
# RangeIndex, .loc[0:5] is the exact equivalent (label-based, inclusive
# of row 5 — i.e. six rows, matching the original output).
d(df_md[cols].loc[0:5])
subject | SUB_ID | FILE_ID | AGE_AT_SCAN | SEX | DX_GROUP | |
---|---|---|---|---|---|---|
0 | 50002 | 50002 | no_filename | 16.77 | 1 | 1 |
1 | 50003 | 50003 | Pitt_0050003 | 24.45 | 1 | 1 |
2 | 50004 | 50004 | Pitt_0050004 | 19.09 | 1 | 1 |
3 | 50005 | 50005 | Pitt_0050005 | 13.73 | 2 | 1 |
4 | 50006 | 50006 | Pitt_0050006 | 13.37 | 1 | 1 |
5 | 50007 | 50007 | Pitt_0050007 | 17.78 | 1 | 1 |
# Parameters identifying a single file in the ABIDE PCP S3 bucket.
site = 'Caltech'
sub = '0051456'
pipeline = 'cpac'
strategy = 'filt_global'
derivative = 'func_preproc'
suffix = '.nii.gz'

# Relative S3 key:
#   /Outputs/<pipeline>/<strategy>/<derivative>/<site>_<sub>_<derivative><suffix>
path = '/Outputs/{0}/{1}/{2}/{3}_{4}_{2}{5}'.format(
    pipeline, strategy, derivative, site, sub, suffix)
d(path)
'/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
# Local destination mirrors the S3 key under abide_dir; make sure
# the parent directory exists before downloading into it.
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
if not os.path.isdir(out_folder):
    os.makedirs(out_folder)
This is where it's coming from
# Full source URL = bucket prefix + relative key built above
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
This is where it's going
# Local destination path (mirrors the S3 layout)
d(out_file)
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/cpac/filt_global/func_preproc/Caltech_0051456_func_preproc.nii.gz'
Now get it.
# Fetch the file. urllib.urlretrieve was removed in Python 3
# (moved to urllib.request); shim for 2/3 compatibility.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2
urlretrieve(url, out_file);
Check that file has arrived
# list the destination folder to confirm the download arrived
ls $out_folder/
Caltech_0051456_func_preproc.nii.gz Caltech_0051457_func_preproc.nii.gz
# Now a freesurfer output (surf/lh.orig) for the same site/subject.
fs_folder = 'surf'
fs_filename = 'lh.orig'

# Freesurfer outputs live under /Outputs/freesurfer/5.1/<site>_<sub>/...
path = '/Outputs/freesurfer/5.1/{0}_{1}/{2}/{3}'.format(
    site, sub, fs_folder, fs_filename)

# Destination mirrors the S3 key; create the parent folder if needed.
out_file = abide_dir + path
out_folder = os.path.dirname(out_file)
if not os.path.isdir(out_folder):
    os.makedirs(out_folder)
This is where it's coming from
# Source URL on S3 for the freesurfer file
url = s3_prefix + path
d(url)
'https://s3.amazonaws.com/fcp-indi/data/Projects/ABIDE_Initiative/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'
This is where it's going
# Local destination path for the freesurfer file
d(out_file)
'/alexandra/mcintosh_lab/john/Data/PCP/ABIDE/downloaded/Outputs/freesurfer/5.1/Caltech_0051456/surf/lh.orig'
Do it
# Fetch the freesurfer file. urllib.urlretrieve was removed in Python 3
# (moved to urllib.request); shim for 2/3 compatibility.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2
urlretrieve(url, out_file);
Check for outputs
# list the destination folder to confirm the download arrived
ls $out_folder/
lh.orig lh.thickness rh.orig rh.thickness lh.sphere lh.white rh.sphere rh.white
Now do the above for all the freesurfer files we need
# Freesurfer files wanted for each subject, expressed as paths relative
# to the subject's freesurfer directory ('surf/...' and 'mri/...').
# surf folder
surf_files = ['lh.orig', 'rh.orig', 'lh.white', 'rh.white', 'lh.sphere' ,'rh.sphere',
              'lh.thickness', 'rh.thickness']
# mri folder
mri_files = ['brain.mgz','brainmask.mgz', 'orig.mgz']
# Comprehensions instead of manual append loops (same order: surf then mri)
fs_paths = (['surf/%s' % f for f in surf_files] +
            ['mri/%s' % f for f in mri_files])
These are the files
# show the full relative-path list
d(fs_paths)
['surf/lh.orig', 'surf/rh.orig', 'surf/lh.white', 'surf/rh.white', 'surf/lh.sphere', 'surf/rh.sphere', 'surf/lh.thickness', 'surf/rh.thickness', 'mri/brain.mgz', 'mri/brainmask.mgz', 'mri/orig.mgz']
Grab em
# Download each freesurfer file for the current site/subject.
# urllib.urlretrieve was removed in Python 3; shim for 2/3 compatibility.
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2

for f in fs_paths:
    path = '/Outputs/freesurfer/5.1/%s_%s/%s' % (site, sub, f)
    url = s3_prefix + path
    out_file = abide_dir + path
    out_folder = os.path.dirname(out_file)
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)
    # Skip files already present locally, matching the batch loop further
    # down this notebook (avoids needless re-downloads).
    if not os.path.isfile(out_file):
        urlretrieve(url, out_file);
Ok, this should be clear enough now.
In practice, in general, we will probably want to be pulling entire groups in one go. This would be done with something like the following:
# Batch download: for each subject, grab the preprocessed functional data
# under all four preprocessing strategies, plus the freesurfer files
# listed in fs_paths. Existing local files are skipped.
pipeline = 'cpac'
derivative = 'func_preproc'
suffix = '.nii.gz'
subs = ['0051456', '0051457']
strategies = ['filt_global', 'filt_noglobal', 'nofilt_global', 'nofilt_noglobal']

# urllib.urlretrieve was removed in Python 3; shim for 2/3 compatibility
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve          # Python 2

for sub in subs:

    # Get functional data
    for strategy in strategies:
        path = '/Outputs/%s/%s/%s/%s_%s_%s%s' % (pipeline, strategy, derivative,
                                                 site, sub, derivative, suffix)
        url = s3_prefix + path
        out_file = abide_dir + path
        out_folder = os.path.dirname(out_file)
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
        # only fetch files we don't already have
        if not os.path.isfile(out_file):
            urlretrieve(url, out_file);
            #!wget $url -O $out_file # (alternative to urllib command)

    # Get freesurfer data
    for f in fs_paths:
        path = '/Outputs/freesurfer/5.1/%s_%s/%s' % (site, sub, f)
        url = s3_prefix + path
        out_file = abide_dir + path
        out_folder = os.path.dirname(out_file)
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
        if not os.path.isfile(out_file):
            urlretrieve(url, out_file);
            #!wget $url -O $out_file # (alternative to urllib command)
Ok, that's a wrap.
css styling
# Apply the notebook's CSS theme.
# NOTE(review): display_css is not imported above — presumably provided by
# the %run'd set_localenv_vars environment; confirm before reuse elsewhere.
display_css(le['work_folder']+'/masters/styles/CFDPython_css_modified_2.css')