#!/usr/bin/env bash
# coding: utf-8
#
# ## Identification and assembly practical
#
# How do we know what our sample is?
# There are a few easy ways ...
#
# - BLASTing reads
# - OneCodex
# - Kraken (What's in my Pot?)
#
# For E. coli we will use the file:
#
#     MAP006-1.pass.2D.poRe.fastq
#
# For the new genome use:
#
#     pc1_shigella.tar
#
# Start with E. coli, then move on to Shigella.
#
# Usage:
#   READS_FASTQ=<reads.fastq> bash <this-script>
#
# Environment:
#   READS_FASTQ  Nanopore reads in FASTQ format. Defaults to the E. coli file
#                named above, so running with no environment set reproduces
#                the original practical.

# Stop at the first failing command (including failures inside a pipeline)
# instead of carrying on with missing or truncated intermediate files.
set -euo pipefail

# Input reads; the FASTA name used by the hybrid assembly is derived from it
# by swapping the .fastq suffix for .fasta.
readonly reads_fastq="${READS_FASTQ:-MAP006-1.pass.2D.poRe.fastq}"
readonly reads_fasta="${reads_fastq%.fastq}.fasta"

# In[ ]:

# ## miniasm
# We will start by assembling E. coli with miniasm.

# In[ ]:

# Install minimap and miniasm (requiring gcc and zlib).
# The clone is skipped when the checkout already exists so that the script
# can be re-run; `git clone` refuses to write into a non-empty directory.
[[ -d minimap ]] || git clone https://github.com/lh3/minimap
(cd minimap && make)
[[ -d miniasm ]] || git clone https://github.com/lh3/miniasm
(cd miniasm && make)

# Overlap: all-vs-all mapping of the reads against themselves.
minimap/minimap -Sw5 -L100 -m0 -t8 "${reads_fastq}" "${reads_fastq}" \
  | gzip -1 > reads.paf.gz

# Layout
miniasm/miniasm -f "${reads_fastq}" reads.paf.gz > contigs.gfa

# We need to convert the unitigs file into a FASTA file:
# every GFA segment line ("S <name> <sequence>") becomes one FASTA record,
# and `fold` wraps the long sequence lines.

# In[ ]:

awk '/^S/{print ">"$2"\n"$3}' contigs.gfa | fold > contigs.fa

# Let's use QUAST to explore the assembly quality

# In[ ]:

wget "http://downloads.sourceforge.net/project/quast/quast-3.2.tar.gz?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Fquast%2Ffiles%2F&ts=1450193070&use_mirror=netcologne" -O quast.tar.gz

# In[ ]:

tar xvfz quast.tar.gz

# In[ ]:

# NOTE: apt-get asks for confirmation here, so this step is interactive.
sudo apt-get install python-matplotlib

# In[ ]:

quast-3.2/quast.py -R NC_000913.3.fa contigs.fa

# Download the PDF report from ``quast_results/latest``

# ## Hybrid assembly
# Convert the FASTQ file to FASTA (of course, you could have output FASTA
# via poretools or poRe)

# In[ ]:

seqtk seq -A "${reads_fastq}" > "${reads_fasta}"

# Run Spades using the nanopore data and Illumina data.

# In[ ]:

# SPAdes is started in the background, as in the original notebook cell, so
# the script returns while the assembly is still running. `set -e` does not
# cover background jobs: check the output under SPADES/ to confirm success.
spades.py --only-assembler -k 21,51,71 \
  -1 MiSeq/SRR2627019_1.fastq.gz \
  -2 MiSeq/SRR2627019_2.fastq.gz \
  --nanopore "${reads_fasta}" \
  -o SPADES &