import networkx as nx DG=nx.DiGraph() import random random.seed(0) myseq = "".join([random.choice(['a', 'c', 'g', 't']) for i in range(10000)]) print myseq[:60], "..." readlen = 30 genelen = 1000 starts = [] reads = [] for i in range(10): starts += [random.randint(i*genelen, (i+1)*genelen-readlen-1) for j in range(500)] reads = [myseq[start:start+readlen] for start in starts] print reads[0] coverage = [0] * len(myseq) for start in starts: for i in range(start, start+readlen): coverage[i] += 1 %matplotlib inline import matplotlib, numpy #matplotlib.use('Agg') import matplotlib.pyplot as plt fig=plt.figure(figsize=(4,2)) plt.plot(coverage) fig.tight_layout() k = 16 for read in reads: for i in range(len(read) - k - 1): kmer_a = read[i:i+k] kmer_b = read[i+1:i+k+1] DG.add_edge(kmer_a, kmer_b) components = nx.connected_components(DG.to_undirected()) len(components) map(len, components) placement = [0] * len(myseq) for i in range(len(components)): for read in components[i]: place = myseq.index(read) placement[place] = i placefig=plt.figure(figsize=(4,2)) plt.plot(placement) placefig.tight_layout() best = nx.subgraph(DG, components[0]) nx.draw(best, with_labels=False, node_size=300, pos=nx.random_layout(best)) def eulerian_path(G, source=None): """Find an Eulerian path in a weakly connected component, stolen from eulerian_circuit""" if not nx.is_weakly_connected(G): raise nx.NetworkXError("G is not connected.") g = G.__class__(G) # copy graph structure (not attributes) # set starting node if source is None: for v in g.nodes(): indeg = g.in_degree(v) outdeg = g.out_degree(v) if outdeg == (indeg + 1): break else: raise nx.NetworkXError("G doesn't have an Eulerian circuit.") else: v = source while g.out_degree(v) > 0: n = v bridge = None # sort nbrs here to provide stable ordering of alternate cycles nbrs = sorted([v for u,v in g.edges(n)]) for v in nbrs: g.remove_edge(n,v) bridge = not nx.is_connected(g.to_undirected()) if bridge: g.add_edge(n,v) # add this edge back and try another else: break # this edge is good, break the for loop if bridge: g.remove_edge(n,v) g.remove_node(n) yield (n,v) path = eulerian_path(best) nodepath = [u for u,v in path] seq = nodepath[0] + "".join([node[-1] for node in nodepath[1:]]) seq in myseq len(seq) def wrap(seq): start = 0 for i in range(60, len(seq), 60): print seq[start: i] start = i print seq[start:] wrap(seq) myseq.index(seq)