# Modified stylesheet for notebook.
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read()
    return HTML(styles)
css_styling()

import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=2)

from IPython.display import Image
import os, sys

Image('https://upload.wikimedia.org/wikipedia/commons/thumb/3/34/EKG_Complex_en.svg/300px-EKG_Complex_en.svg.png')  # via Wikipedia

# Not as important.
Image(url="http://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/QRS_complex.png/220px-QRS_complex.png")

sys.path.append(os.path.join('..', 'ecgtk'))
import wfdbtools

data_path = os.path.join('..', 'data')
os.listdir(data_path)

fpath = os.path.join(data_path, 'a02.hea')
print "Reading data header file {}".format(fpath)
print "-" * 10
with open(fpath, 'r') as f:
    for ii in range(5):
        print f.readline()

#!head 'data/a02.qrs'
#!head 'data/a02.apn'

# Strip the '.hea' extension to get the record path.
# (str.rstrip removes a character *set*, not a suffix, so use splitext instead.)
wfpath = os.path.splitext(fpath)[0]

# Here we use wfdbtools to load our data and metadata.
dat, info = wfdbtools.rdsamp(
    wfpath,
    start=0,  # define a time range instead of
    end=20,   # reading in the whole file.
)

# info is a dict of metadata.
for k, v in info.iteritems():
    print "{:<15} : {}".format(k, v)

# dat is our actual data.
print "\nData shape: {}".format(dat.shape)

# wfdbtools includes a simple way to plot our data.
wfdbtools.plot_data(dat, info)

import pandas as pd

# Let's reload the data.
# This time ALL the data, not just the first 20 seconds.
dat, info = wfdbtools.rdsamp(wfpath)
print dat.shape

# Load the numpy array into pandas and label the columns.
df = pd.DataFrame(dat, columns=['#', 'sec', 'EKG'])
# Set the index so the x-axis is properly labeled in our plot.
df.set_index('sec', drop=False, inplace=True)

# Just plot all the data.
# This is a lot of data: it takes a few seconds to plot, and looks messy.
df.EKG.plot()

# Plot customization.
# Use a slice of just the first few seconds of EKG data.
ekg_signal = df.EKG[:6]

def plot_ekg(ekg, title="EKG Plot, Customized"):
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 5))
    ekg.plot(marker='x', linestyle='', alpha=0.8, ax=ax)
    ekg.plot(c='b', linestyle='-', alpha=0.5, ax=ax)
    plt.title(title);

plot_ekg(ekg_signal)

window_size = 32
print "Window size:", window_size
step_size = window_size
print "Step size:", step_size

total_time_available = info['samp_count'] / info['samp_freq']  # in seconds
print "Total data available: {} seconds".format(total_time_available)
print "Sample frequency: {} samples per second".format(info['samp_freq'])

# Choose how much of the signal to use here:
training_size = 2000000
fullsignal = dat[:training_size, 2]  # select just the 3rd column:
# 1st column is the index, 2nd is time, 3rd is the EKG data.

print "Size of data being processed: {} observations".format(len(fullsignal))
print "\t({} seconds)".format(len(fullsignal) / info['samp_freq'])
print "Using about {:.02f}% of the total available signal".format(
    len(fullsignal) / float(info['samp_count']) * 100
)

window_list = []
for range_start in xrange(0, len(fullsignal), step_size):
    range_end = range_start + step_size
    if range_end > len(fullsignal):
        continue  # skip a final partial window
    # DEBUG: print "{} to {}".format(range_start, range_end)
    window = fullsignal[range_start:range_end]
    # Taper the window with a half period of sin() so each
    # segment starts and ends near zero.
    window = window * np.sin(np.linspace(0, np.pi, num=window_size))
    window_list.append(window)

print "\nFull signal of size {} was broken up into {} pieces of length {} each.".format(
    len(fullsignal), len(window_list), len(window_list[0])
)
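# Aside: since step_size equals window_size here, the same segmentation can
# be done without an explicit loop. A minimal sketch, equivalent to the loop
# above under that assumption; the make_windows helper name is our own, not
# part of wfdbtools:
def make_windows(signal, window_size):
    # Trim the tail so the length divides evenly, then reshape into
    # an (n_windows, window_size) array in one step.
    n = (len(signal) // window_size) * window_size
    windows = signal[:n].reshape(-1, window_size)
    # Apply the same half-period sine taper to every row at once.
    taper = np.sin(np.linspace(0, np.pi, num=window_size))
    return windows * taper

# Should produce the same number and length of windows as the loop:
print make_windows(fullsignal, window_size).shape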
# All windows should be the same size.
window_lengths = np.array([len(w) for w in window_list])
print "Sanity check: each window is the same size:",
print np.all(window_lengths == window_lengths[0])

from sklearn.cluster import MiniBatchKMeans

# Do the clustering here.
print "Number of windows:", len(window_list)
k = 200  # number of clusters
print "Number of clusters:", k

#testdata = window_list[:500]  # To help with testing, limit the input.

model = MiniBatchKMeans(
    n_clusters=k,
    verbose=1,
)
model.fit_predict(window_list)

## Set the k-means cluster centers as our basis dictionary.
basis_dict = model.cluster_centers_

# Visually examine the basis dictionary.
import math

ksmall = min(k, 15)
if k > ksmall:
    print "Large dictionary. Plotting just the first {}.".format(ksmall)
    print "... there are {} additional windows".format(k - ksmall)
    print "    in the dictionary that we will not plot."

ncols = 5
nrows = int(math.ceil(ksmall / 5.))
fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows,
    figsize=(3 * ncols, 3 * nrows)
)
axiter = axes.flat
for clust_idx in xrange(ksmall):
    axiter[clust_idx].plot(basis_dict[clust_idx])

from collections import Counter
clust_counter = Counter(model.labels_)

# These are the 5 largest clusters
# and the number of windows they contain.
print "Largest clusters:"
print "\t(cluster_index, n_windows_in_cluster)"
large_clusters = clust_counter.most_common(5)
print "\t", large_clusters

fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(15, 3))
axiter = axes.flat
for ii, clust_idx in enumerate([idx for idx, size in large_clusters]):
    axiter[ii].plot(basis_dict[clust_idx])

# These are the 5 smallest clusters
# and the number of windows they contain.
print "Smallest clusters:"
print "\t(cluster_index, n_windows_in_cluster)"
small_clusters = clust_counter.most_common()[:-6:-1]
print "\t", small_clusters

fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(15, 3))
axiter = axes.flat
for ii, clust_idx in enumerate([idx for idx, size in small_clusters]):
    axiter[ii].plot(basis_dict[clust_idx])

modelfilename = "model_trainsize={}_stepsize={}_windowsize={}_nclusters={}.pkl".format(
    len(fullsignal), step_size, window_size, k
)
from sklearn.externals import joblib
joblib.dump(model, modelfilename, compress=9)

from sklearn.metrics import mean_squared_error

## Reconstructing a single window.
def reconstruct_window(window, basis, plot=False):
    # Project the window onto every basis vector.
    proj = np.apply_along_axis(window.dot, 1, basis)
    # Find the closest basis vector.
    maxidx = np.abs(proj).argmax()
    if plot:
        print "Best projection:", proj[maxidx]
        # Visual inspection.
        fig, axes = plt.subplots(2, 1, figsize=(6, 6))
        axes[0].plot(window)
        axes[0].set_title('Original')
        axes[1].plot(basis[maxidx])
        axes[1].set_title('Reconstructed')
        # Report the error.
        print "Error:", mean_squared_error(window, basis[maxidx])
    return basis[maxidx]

# Select any window for testing.
some_window = window_list[100]
# Reconstruct this window.
basis_window = reconstruct_window(some_window, basis_dict, plot=True)

## Reconstruct a longer signal.
def reconstruct_signal(sig, basis, plot=False):
    dict_size, window_size = basis.shape
    n_segments = 2 * int(len(sig) / window_size) - 1
    rec = np.zeros(len(sig), dtype=float)
    print "n segments to use is:", n_segments
    print "dictionary (basis set) size is:", dict_size
    # Split the signal into half-overlapping windows.
    for ii in xrange(n_segments):
        # Obtain the index range.
        start_idx = int(ii * window_size / 2)
        end_idx = start_idx + window_size
        # Slice the signal over the range to get a window.
        s = sig[start_idx:end_idx]
        # Reconstruct this window.
        bvec = reconstruct_window(s, basis, plot=False)
        # Update the complete reconstruction.
        rec[start_idx:end_idx] += bvec
    if plot:
        print "window size is:", window_size
        print "Error:", mean_squared_error(sig, rec)
        fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(int(n_segments / 3), 4 * 3))
        axes[0].plot(sig)
        axes[0].plot(rec)
        axes[0].set_title('Original and reconstructed signal')
        ylims = axes[0].get_ylim()
        axes[1].plot(sig)
        axes[1].set_title('Original signal')
        axes[2].plot(rec)
        axes[2].set_title('Reconstructed signal')
        #axes[3].plot(sig - rec)
        #axes[3].set_ylim(ylims)
        #axes[3].set_title('Residual')
    return rec
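# Aside: reconstruct_window scans the dictionary with apply_along_axis, but
# the projections are plain dot products, so the nearest basis vector can be
# found with a single matrix-vector multiply. A minimal sketch of that
# equivalence; reconstruct_window_fast is our own illustrative name:
def reconstruct_window_fast(window, basis):
    # basis.dot(window) computes all dict_size projections at once,
    # equivalent to np.apply_along_axis(window.dot, 1, basis).
    proj = basis.dot(window)
    return basis[np.abs(proj).argmax()]

# Sanity check against the original implementation on one window:
print np.allclose(reconstruct_window(some_window, basis_dict),
                  reconstruct_window_fast(some_window, basis_dict))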
# Select a longer signal to reconstruct for testing.
signal = fullsignal[:800]
signal_reconstructed = reconstruct_signal(signal, basis_dict, plot=True)

np.set_printoptions(precision=2, linewidth=100)

# Explicitly calculate the residual here.
residual = signal - signal_reconstructed

# Bounds to look at:
start = 10; stop = 20
print signal[start:stop]
print signal_reconstructed[start:stop]
print residual[start:stop]
print '-'

# Sanity check on the error calculation.
tt = np.power(signal[start:stop] - signal_reconstructed[start:stop], 2)
print tt.mean()
print mean_squared_error(signal[start:stop], signal_reconstructed[start:stop])

plt.plot(residual)

# Warning: depending on the signal size, this can take a long time.
# Below we use the %%timeit cell magic to check the running time, for reference.
sig_size = 50000
long_signal = fullsignal[:sig_size]
long_signal_reconstructed = reconstruct_signal(long_signal, basis_dict, plot=False)

%%timeit
long_signal_reconstructed = reconstruct_signal(long_signal, basis_dict, plot=False)

print """
Parameters:
    Signal size to reconstruct = {}
    Size of basis dictionary   = {}
    Window size                = {}
    Step size                  = {}
Results in Mean Squared Error:  {}""".format(
    sig_size, k, window_size, step_size,
    mean_squared_error(long_signal, long_signal_reconstructed)
)

# Load a signal segment known to be anomalous.
anomalous_dataset = 'a01'
start = 4825
end = start + 15
wfpath = os.path.join(data_path, anomalous_dataset)
dat, info = wfdbtools.rdsamp(wfpath, start=start, end=end)
print info
print dat.shape
anomalous_ekg = dat[:, 2]

# Plot the anomalous signal.
plt.figure(figsize=(16, 3))
plt.plot(anomalous_ekg)
plt.title('{}, Start: {}'.format(wfpath, start))

reconstruct_signal(anomalous_ekg, basis_dict, plot=True)
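# One way to turn the reconstruction into an anomaly score: compare the
# per-window error on a segment against the rest of its error distribution.
# A minimal sketch; the anomaly_scores helper and the mean + 3 * std
# threshold are our own illustrative choices, not part of the notebook above:
def anomaly_scores(sig, rec, window_size):
    # Mean squared error of each non-overlapping window.
    n = (len(sig) // window_size) * window_size
    err = (sig[:n] - rec[:n]) ** 2
    return err.reshape(-1, window_size).mean(axis=1)

anomalous_rec = reconstruct_signal(anomalous_ekg, basis_dict, plot=False)
scores = anomaly_scores(anomalous_ekg, anomalous_rec, window_size)
threshold = scores.mean() + 3 * scores.std()
print "Windows flagged as anomalous:", np.where(scores > threshold)[0]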