(you do not need to do this if you have done it in the Interfacing_R notebook)
!rm sequence.index 2>/dev/null
!wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/historical_data/former_toplevel/sequence.index -O sequence.index
--2016-02-05 15:50:22-- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/historical_data/former_toplevel/sequence.index => 'sequence.index' Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8 Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected. Logging in as anonymous ... Logged in! ==> SYST ... done. ==> PWD ... done. ==> TYPE I ... done. ==> CWD (1) /vol1/ftp/historical_data/former_toplevel ... done. ==> SIZE sequence.index ... 67069489 ==> PASV ... done. ==> RETR sequence.index ... done. Length: 67069489 (64M) (unauthoritative) sequence.index 100%[=====================>] 63.96M 419KB/s in 2m 27s 2016-02-05 15:52:54 (445 KB/s) - 'sequence.index' saved [67069489]
This is different from the book.
import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
%load_ext rpy2.ipython
seq_data = %R read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE)
#This code was changed against the book as rpy2 changed
as_integer = robjects.r('as.integer')
match = robjects.r.match
seq_data['READ_COUNT'] = as_integer(seq_data['READ_COUNT'])
seq_data['BASE_COUNT'] = as_integer(seq_data['BASE_COUNT'])
%R -i seq_data
%R print(colnames(seq_data))
%R seq_data$CENTER_NAME <- toupper(seq_data$CENTER_NAME)
[1] "FASTQ_FILE" "MD5" "RUN_ID" [4] "STUDY_ID" "STUDY_NAME" "CENTER_NAME" [7] "SUBMISSION_ID" "SUBMISSION_DATE" "SAMPLE_ID" [10] "SAMPLE_NAME" "POPULATION" "EXPERIMENT_ID" [13] "INSTRUMENT_PLATFORM" "INSTRUMENT_MODEL" "LIBRARY_NAME" [16] "RUN_NAME" "RUN_BLOCK_NAME" "INSERT_SIZE" [19] "LIBRARY_LAYOUT" "PAIRED_FASTQ" "WITHDRAWN" [22] "WITHDRAWN_DATE" "COMMENT" "READ_COUNT" [25] "BASE_COUNT" "ANALYSIS_GROUP"
array(['BGI', 'BGI', 'BGI', ..., 'BI', 'BI', 'BI'], dtype='<U8')
%%R
seq_data <- seq_data[seq_data$WITHDRAWN==0, ]
seq_data <- seq_data[, c('STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', 'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', 'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', 'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP')]
%%R
bar <- ggplot(seq_data) + aes(factor(CENTER_NAME)) + geom_bar() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(bar)
%%R
seq_data$POPULATION <- as.factor(seq_data$POPULATION)
yri_ceu <- seq_data[seq_data$POPULATION %in% c("YRI", "CEU") & seq_data$BASE_COUNT < 2E9 & seq_data$READ_COUNT < 3E7, ]
%%R
scatter <- ggplot(yri_ceu, aes(x=BASE_COUNT, y=READ_COUNT, col=factor(ANALYSIS_GROUP), shape=POPULATION)) + geom_point()
print(scatter)
%%R
library(gridExtra)
library(grid)
g <- grid.arrange(bar, scatter, ncol=1)
g
TableGrob (2 x 1) "arrange": 2 grobs z cells name grob 1 1 (1-1,1-1) arrange gtable[layout] 2 2 (2-2,1-1) arrange gtable[layout]
%%R
png('fig.png')
g
dev.off()
png 2