GFFutils (provides the `gtf_extract` command used below) available in your `$PATH`.
%%bash
# Record when and where this notebook was run: date, OS release,
# hostname, CPU details and memory. Sections are separated by a dashed rule.
printf '%s\n' "TODAY'S DATE:"
date
printf '%s\n\n' "------------"
# Display operating system info
lsb_release -a
printf '\n%s\n' "------------"
printf '%s\n' "HOSTNAME: "
hostname
printf '\n%s\n' "------------"
printf '%s\n\n' "Computer Specs:"
lscpu
printf '\n%s\n\n' "------------"
printf '%s\n\n' "Memory Specs"
free -mh
`%env` indicates a bash (environment) variable;
without `%env`, the assignment is a Python variable.
# Set directories, input/output files
# Values assigned with %env are exported to the environment, which is how the
# %%bash cells in this notebook read them (e.g. "${data_dir}").
%env data_dir=/home/sam/data/C_virginica/genomes
%env analysis_dir=/home/sam/analyses/20220217-cvir-lncRNA_subsetting
# Python-side variable (no %env): holds only the analysis directory name.
# NOTE(review): shares its name with the env var above but has a different
# value; no %%bash cell can see it - confirm where (if anywhere) it is used.
analysis_dir="20220217-cvir-lncRNA_subsetting"
# Input files (from NCBI)
# The *_gz names are the archives as downloaded; the names without .gz are the
# files left after gunzip. The .fai index is written by samtools faidx.
%env ncbi_fasta=GCF_002022765.2_C_virginica-3.0_genomic.fna
%env ncbi_fasta_index=GCF_002022765.2_C_virginica-3.0_genomic.fna.fai
%env ncbi_fasta_gz=GCF_002022765.2_C_virginica-3.0_genomic.fna.gz
%env ncbi_gff=GCF_002022765.2_C_virginica-3.0_genomic.gff
%env ncbi_gff_gz=GCF_002022765.2_C_virginica-3.0_genomic.gff.gz
# URL to download files from NCBI
# (the gzipped file names above are appended to this base URL)
%env ncbi_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/022/765/GCF_002022765.2_C_virginica-3.0
# Output files
# All of these are written inside ${analysis_dir}.
%env lncRNA_bed=GCF_002022765.2_C_virginica-3.0_lncRNA.bed
%env lncRNA_gff=GCF_002022765.2_C_virginica-3.0_lncRNA.gff
%env lncRNA_gtf=GCF_002022765.2_C_virginica-3.0_lncRNA.gtf
%env lncRNA_fasta=GCF_002022765.2_C_virginica-3.0_lncRNA.fa
%env lncRNA_fasta_index=GCF_002022765.2_C_virginica-3.0_lncRNA.fa.fai
# Set program locations
# Absolute paths to the executables, invoked in bash cells as ${gffread} and ${samtools}.
%env gffread=/home/sam/programs/gffread-0.12.7.Linux_x86_64/gffread
%env samtools=/home/sam/programs/samtools-1.12/samtools
%%bash
# Create the analysis directory (and any missing parents); no-op if it already exists
mkdir -p "${analysis_dir}"
%%bash
# Download the gzipped NCBI GFF into the data directory and decompress it.
# Stop if the data directory is unset or cannot be entered, so nothing is
# downloaded or decompressed in the wrong location.
cd "${data_dir:?}" || exit 1
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading file if it's already been downloaded.
wget --quiet \
--continue \
"${ncbi_url}/${ncbi_gff_gz}" || exit 1
# Unzip downloaded GFF (gunzip replaces the .gz file with the decompressed file)
gunzip "${ncbi_gff_gz}" || exit 1
ls -ltrh "${ncbi_gff}"
%%bash
# Preview the first 20 lines of the downloaded NCBI GFF
head --lines=20 "${data_dir}/${ncbi_gff}"
%%bash
# Download the gzipped NCBI genome FastA into the data directory and decompress it.
# Stop if the data directory is unset or cannot be entered, so nothing is
# downloaded or decompressed in the wrong location.
cd "${data_dir:?}" || exit 1
# Download with wget.
# Use --quiet option to prevent wget output from printing too many lines to notebook
# Use --continue to prevent re-downloading file if it's already been downloaded.
wget --quiet \
--continue \
"${ncbi_url}/${ncbi_fasta_gz}" || exit 1
# Unzip downloaded FastA (gunzip replaces the .gz file with the decompressed file)
gunzip "${ncbi_fasta_gz}" || exit 1
ls -ltrh "${ncbi_fasta}"
%%bash
# Index the genome FastA with samtools faidx, then list the resulting index file.
# Stop if the data directory is unset or cannot be entered.
cd "${data_dir:?}" || exit 1
"${samtools}" faidx "${ncbi_fasta}" || exit 1
ls -ltrh "${ncbi_fasta_index}"
%%bash
# Show the first 10 lines of the genome FastA index
cd "${data_dir}"
head -n 10 "${ncbi_fasta_index}"
%%bash
# Build a lncRNA-only GFF in the analysis directory from the NCBI GFF.
# Stop if the data directory is unset or cannot be entered.
cd "${data_dir:?}" || exit 1
out_gff="${analysis_dir:?}/${lncRNA_gff}"
# All three pieces are written through a single redirect so the output file
# is created/truncated once and then filled in order.
{
  # Capture GFF header (first 7 lines) from NCBI gff
  head -n 7 "${ncbi_gff}"
  # Add note about modification (emitted as a "#!" line with the creation date)
  printf "#%s%s\n" "!" "lncRNA only - created by Sam White $(date)"
  # Finds lncRNAs in NCBI GFF
  gtf_extract \
    --feature lnc_RNA \
    --gff "${ncbi_gff}"
} > "${out_gff}" || exit 1
head "${out_gff}"
%%bash
# Convert the lncRNA GFF to BED with gffread (--bed); BED is written to stdout
# and redirected into the analysis directory.
# Stop if the data directory is unset or cannot be entered.
cd "${data_dir:?}" || exit 1
"${gffread}" --bed \
"${analysis_dir:?}/${lncRNA_gff}" \
> "${analysis_dir}/${lncRNA_bed}"
%%bash
# Preview the first 10 lines of the lncRNA BED file
# (path fully quoted so it survives spaces or glob characters)
head "${analysis_dir}/${lncRNA_bed}"
%%bash
# Convert the lncRNA GFF to GTF with gffread.
#   -E : report potential problems found in the input records (sent to stderr)
#   -T : write GTF instead of GFF3
# stdout becomes the GTF; stderr is kept in its own log file.
# Stop if the data directory is unset or cannot be entered.
cd "${data_dir:?}" || exit 1
"${gffread}" -E \
"${analysis_dir:?}/${lncRNA_gff}" -T \
1> "${analysis_dir}/${lncRNA_gtf}" \
2> "${analysis_dir}/gffread-lncRNA_gff-to-lncRNA_gtf.stderr"
%%bash
# Preview the first 10 lines of the lncRNA GTF file
# (path fully quoted so it survives spaces or glob characters)
head "${analysis_dir}/${lncRNA_gtf}"
Explanation of GffRead options used below:
-w
: specifies the output FastA file (spliced exon sequence for each transcript)
-W
: writes the coordinates of all spliced exons in the FastA deflines
-g
: specifies the input genome FastA (needs a corresponding FastA index file in the same directory)
%%bash
# Extract lncRNA sequences to FastA with gffread, using the lncRNA GTF and
# the genome FastA in the data directory. stderr is kept in its own log file.
# Stop if the data directory is unset or cannot be entered, since the genome
# FastA is referenced relative to it.
cd "${data_dir:?}" || exit 1
"${gffread}" -E \
-w "${analysis_dir:?}/${lncRNA_fasta}" -W \
-g "${ncbi_fasta}" \
"${analysis_dir}/${lncRNA_gtf}" \
2> "${analysis_dir}/gffread_lncRNA-fasta-extraction.stderr"
%%bash
# Preview the first 10 lines of the lncRNA FastA file
# (path fully quoted so it survives spaces or glob characters)
head "${analysis_dir}/${lncRNA_fasta}"
%%bash
# Index the lncRNA FastA with samtools faidx, then list the resulting index file.
# Stop if the analysis directory is unset or cannot be entered.
cd "${analysis_dir:?}" || exit 1
"${samtools}" faidx "${lncRNA_fasta}" || exit 1
ls -ltrh "${lncRNA_fasta_index}"
%%bash
# Show the first 10 lines of the lncRNA FastA index
cd "${analysis_dir}"
head -n 10 "${lncRNA_fasta_index}"
%%bash
# Write an MD5 checksum for every file in the analysis directory, echoing each
# line to the notebook and appending it to checksums.md5.
# Stop if the analysis directory is unset or cannot be entered, so checksums
# are never generated for (or written into) some other directory.
cd "${analysis_dir:?}" || exit 1
for file in *
do
  # Skip directories and the literal "*" left behind by an unmatched glob
  [[ -f "${file}" ]] || continue
  # Never checksum the checksum list itself (it exists on any re-run)
  [[ "${file}" == "checksums.md5" ]] && continue
  md5sum -- "${file}" | tee --append checksums.md5
done
%%bash
# Print gffread's built-in help for reference
$gffread -h
gtf_extract options
%%bash
# Print gtf_extract's built-in help for reference (the command must be on $PATH)
gtf_extract -h