#!/usr/bin/env python # coding: utf-8 # SAM # === # # This notebook explores parsing and understanding the SAM (Sequence Alignment/Map) and related BAM format. SAM is an extremely common format for representing read alignments. Most widely-used read alignment tools output SAM alignments. # # SAM is a text format. There is a closely related binary format called BAM. There are two types of BAM files: unsorted or sorted. Various tools, notably [SAMtools] and [Picard], can convert back and forth between SAM and BAM, and can sort an existing BAM file. When we say a BAM file is sorted we almost always mean that the alignments are sorted left-to-right along the reference genome. (It's also possible to sort a BAM file by read name, though that's only occassionally useful.) # # Once you have an interesting set of read alignments that you would like to keep for a while and perhaps analyze further, it's a good idea to keep them in *sorted BAM* files. This is because: # # 1. They will be well compressed. BAM files are smaller than corresponding SAM files, and *sorted* BAM files are smaller than corresponding *unsorted* BAM files. # 2. From a sorted BAM file, it's easy to extract just the alignments that overlap a specified stretch of the genome, making it easy to convert from sorted BAM to many other useful formats. # # That said, most alignment tools output SAM (not BAM), and the alignments come out in an arbitrary order -- not sorted. # # An authoritative and complete document describing the SAM and BAM formats is the [SAM specification]. This document is thorough, but, being a specification, it does not describe is the various "dialects" of legal SAM emitted by popular tools. I'll cover some of that here. # # [SAM specification]: http://samtools.sourceforge.net/SAMv1.pdf # [SAMtools]: http://samtools.sourceforge.net # [Picard]: http://picard.sourceforge.net # In[1]: # Here's a string representing a three-line SAM file. I'm temporarily # ignoring the fact that SAM files usually have several header lines at # the beginning. samStr = '''\ r1 0 gi|9626243|ref|NC_001416.1| 18401 42 122M * 0 0 TGAATGCGAACTCCGGGACGCTCAGTAATGTGACGATAGCTGAAAACTGTACGATAAACNGTACGCTGAGGGCAGAAAAAATCGTCGGGGACATTNTAAAGGCGGCGAGCGCGGCTTTTCCG +"@6<:27(F&5)9)"B:%B+A-%5A?2$HCB0B+0=D<7E/<.03#!.F77@6B==?C"7>;))%;,3-$.A06+<-1/@@?,26">=?*@'0;$:;??G+:#+(A?9+10!8!?()?7C> AS:i:-5 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:59G13G21G26 YT:Z:UU r2 0 gi|9626243|ref|NC_001416.1| 8886 42 275M * 0 0 NTTNTGATGCGGGCTTGTGGAGTTCAGCCGATCTGACTTATGTCATTACCTATGAAATGTGAGGACGCTATGCCTGTACCAAATCCTACAATGCCGGTGAAAGGTGCCGGGATCACCCTGTGGGTTTATAAGGGGATCGGTGACCCCTACGCGAATCCGCTTTCAGACGTTGACTGGTCGCGTCTGGCAAAAGTTAAAGACCTGACGCCCGGCGAACTGACCGCTGAGNCCTATGACGACAGCTATCTCGATGATGAAGATGCAGACTGGACTGC (#!!'+!$""%+(+)'%)%!+!(&++)''"#"#&#"!'!("%'""("+&%$%*%%#$%#%#!)*'(#")(($&$'&%+&#%*)*#*%*')(%+!%%*"$%"#+)$&&+)&)*+!"*)!*!("&&"*#+"&"'(%)*("'!$*!!%$&&&$!!&&"(*"$&"#&!$%'%"#)$#+%*+)!&*)+(""#!)!%*#"*)*')&")($+*%%)!*)!('(%""+%"$##"#+(('!*(($*'!"*('"+)&%#&$+('**$$&+*&!#%)')'(+(!%+ AS:i:-14 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:0A0C0G0A108C23G9T81T46 YT:Z:UU r3 16 gi|9626243|ref|NC_001416.1| 11599 42 338M * 0 0 GGGCGCGTTACTGGGATGATCGTGAAAAGGCCCGTCTTGCGCTTGAAGCCGCCCGAAAGAAGGCTGAGCAGCAGACTCAAGAGGAGAAAAATGCGCAGCAGCGGAGCGATACCGAAGCGTCACGGCTGAAATATACCGAAGAGGCGCAGAAGGCTNACGAACGGCTGCAGACGCCGCTGCAGAAATATACCGCCCGTCAGGAAGAACTGANCAAGGCACNGAAAGACGGGAAAATCCTGCAGGCGGATTACAACACGCTGATGGCGGCGGCGAAAAAGGATTATGAAGCGACGCTGTAAAAGCCGAAACAGTCCAGCGTGAAGGTGTCTGCGGGCGAT 7F$%6=$:9B@/F'>=?!D?@0(:A*)7/>9C>6#1<6:C(.CC;#.;>;2'$4D:?&B!>689?(0(G7+0=@37F)GG=>?958.D2E04CB>D-="C'B080E'5BH"77':"@70#4%A5=6.2/1>;9"&-H6)=$/0;5E:<8G!@::1?2DC7C*;@*#.1C0.D>H/20,!"C-#,6@%<+:?5"2?:G,F"D0B8D-6$65D 0: ret.append([0, run, ""]) elif md[i].isalpha(): # stretch of mismatches mmstr = "" while i < len(md) and md[i].isalpha(): mmstr += md[i] i += 1 assert len(mmstr) > 0 ret.append([1, len(mmstr), mmstr]) elif md[i] == "^": # read gap i += 1 # skip over ^ refstr = "" while i < len(md) and md[i].isalpha(): refstr += md[i] i += 1 # skip over inserted character assert len(refstr) > 0 ret.append([2, len(refstr), refstr]) else: raise RuntimeError('Unexpected character in MD:Z: "%d"' % md[i]) return ret # In[6]: # Each element in the list returned by this call is itself a list w/ 3 # elements. Element 1 is the MD:Z operation (0=match, 1=mismatch, # 2=deletion). Element 2 is the length and element 3 is the relevant # sequence of nucleotides from the reference. mdzToList('10A5^AC6') # Now we can write a fucntion that takes a read sequennce, a parsed CIGAR string, and a parse `MD:Z` string and combines information from all three to make what I call a "stacked alignment." # In[7]: def cigarMdzToStacked(seq, cgp, mdp_orig): ''' Takes parsed CIGAR and parsed MD:Z, generates a stacked alignment: a pair of strings with gap characters inserted (possibly) and where characters at at the same offsets are opposite each other in the alignment. Only knows how to handle CIGAR ops M=XDINSH right now. ''' mdp = mdp_orig[:] rds, rfs = [], [] mdo, rdoff = 0, 0 for c in cgp: op, run = c skipping = (op == 4 or op == 5) assert skipping or mdo < len(mdp) if op == 0: # CIGAR op M, = or X # Look for block matches and mismatches in MD:Z string mdrun = 0 runleft = run while runleft > 0 and mdo < len(mdp): op_m, run_m, st_m = mdp[mdo] run_comb = min(runleft, run_m) runleft -= run_comb assert op_m == 0 or op_m == 1 rds.append(seq[rdoff:rdoff + run_comb]) if op_m == 0: # match from MD:Z string rfs.append(seq[rdoff:rdoff + run_comb]) else: # mismatch from MD:Z string assert len(st_m) == run_comb rfs.append(st_m) mdrun += run_comb rdoff += run_comb # Stretch of matches in MD:Z could span M and I CIGAR ops if run_comb < run_m: assert op_m == 0 mdp[mdo][1] -= run_comb else: mdo += 1 elif op == 1: # CIGAR op I rds.append(seq[rdoff:rdoff + run]) rfs.append("-" * run) rdoff += run elif op == 2: # D op_m, run_m, st_m = mdp[mdo] assert op_m == 2 assert run == run_m assert len(st_m) == run mdo += 1 rds.append("-" * run) rfs.append(st_m) elif op == 3: # N rds.append("-" * run) rfs.append("-" * run) elif op == 4: # S rds.append(seq[rdoff:rdoff + run].lower()) rfs.append(' ' * run) rdoff += run elif op == 5: # H rds.append('!' * run) rfs.append(' ' * run) elif op == 6: # P raise RuntimeError("Don't know how to handle P in CIGAR") else: raise RuntimeError('Unexpected CIGAR op: %d' % op) assert mdo == len(mdp) return ''.join(rds), ''.join(rfs) # In[8]: # Following example includes gaps and mismatches cigarMdzToStacked('GGACGCTCAGTAGTGACGATAGCTGAAAACCCTGTACGATAAACC', cigarToList('12M2D17M2I14M'), mdzToList('12^AT30G0')) # In[9]: # Following example also includes soft clipping (CIGAR: S) # SAM spec: Soft clipping: "clipped sequences present in SEQ" # We print them in lowercase to emphasize their clippedness cigarMdzToStacked('GGACGCTCAGTAGTGACGATAGCTGAAAACCCTGTACGAGAAGCC', cigarToList('12M2D17M2I8M6S'), mdzToList('12^AT25')) # In[10]: # Following example also includes hard clipping (CIGAR: H) # SAM spec: Hard clipping: "clipped sequences NOT present in SEQ" cigarMdzToStacked('GGACGCTCAGTAGTGACGATAGCTGAAAACCCTGTACGAGAAGCC', cigarToList('12M2D17M2I8M6S3H'), mdzToList('12^AT25')) # Note: don't see hard clipping in practice much # In[11]: # Following example also includes skipping (CIGAR: N), as seen in # TopHat alignments cigarMdzToStacked('GGACGCTCAGTAGTGACGATAGCTGAAAACCCTGTACGAGAAGCC', cigarToList('12M2D10M10N7M2I8M6S3H'), mdzToList('12^AT25')) # From the stacked alignment, it's easy to do other things. E.g. we can turn a stacked alignment into a new CIGAR string that uses the `=` and `X` operations instead of the less specific `M` operation: # In[12]: def cigarize(rds, rfs): off = 0 oplist = [] lastc, cnt = '', 0 for i in range(len(rds)): c = None if rfs[i] == ' ': c = 'S' elif rds[i] == '-' and rfs[i] == '-': c = 'N' elif rds[i] == '-': c = 'D' elif rfs[i] == '-': c = 'I' elif rds[i] != rfs[i]: c = 'X' else: c = '=' if c == lastc: cnt += 1 else: if len(lastc) > 0: oplist.append((lastc, cnt)) lastc, cnt = c, 1 if len(lastc) > 0: oplist.append((lastc, cnt)) return ''.join(map(lambda x: str(x[1]) + x[0], oplist)) # In[13]: x, y = cigarMdzToStacked('ACGTACGT', cigarToList('8M'), mdzToList('4G3')) # In[14]: cigarize(x, y) # In[15]: x, y = cigarMdzToStacked('GGACGCTCAGTAGTGACGATAGCTGAAAACCCTGTACGAGAAGCC', cigarToList('12M2D10M10N7M2I8M6S3H'), mdzToList('12^AT25')) # In[16]: cigarize(x, y) # ### Other resources # # * [SAM specification] # * [SAMtools] (and associated [publication](http://bioinformatics.oxfordjournals.org/content/25/16/2078.full)) # * [pysam] (Python library) # * [Picard] (Java library) # * [Bamtools] (C++ library) # * [Rsamtools] (R library) # # [SAM specification]: http://samtools.sourceforge.net/SAMv1.pdf # [SAMtools]: http://samtools.sourceforge.net/ # [pysam]: http://code.google.com/p/pysam/ # [Picard]: http://picard.sourceforge.net/ # [Bamtools]: http://sourceforge.net/projects/bamtools/ # [Rsamtools]: http://www.bioconductor.org/packages/2.14/bioc/html/Rsamtools.html # # © Copyright [Ben Langmead](http://www.cs.jhu.edu/~langmea) 2014