# Modified stylesheet for notebook.
from IPython.core.display import HTML
def css_styling():
    styles = open("custom.css", "r").read()
    return HTML(styles)
css_styling()

import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=2)

from IPython.display import Image
import os, sys

Image('https://upload.wikimedia.org/wikipedia/commons/thumb/3/34/EKG_Complex_en.svg/300px-EKG_Complex_en.svg.png')  # via Wikipedia

# Not as important.
Image(url="http://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/QRS_complex.png/220px-QRS_complex.png")

sys.path.append(os.path.join('..', 'ecgtk'))
import wfdbtools

data_path = os.path.join('..', 'data')
os.listdir(data_path)

fpath = os.path.join(data_path, 'a02.hea')
print "Reading data header file {}".format(fpath)
print "-" * 10
with open(fpath, 'r') as f:
    for ii in range(5):
        print f.readline()

#!head 'data/a02.qrs'
#!head 'data/a02.apn'

# Strip the '.hea' extension to get the record path.
# (str.rstrip removes a character *set*, not a suffix, so use splitext instead.)
wfpath = os.path.splitext(fpath)[0]

# Here we use wfdbtools to load our data and metadata.
dat, info = wfdbtools.rdsamp(
    wfpath,
    start=0,  # define a time range instead of
    end=20,   # reading in the whole file.
)

# info is a dict of metadata.
for k, v in info.iteritems():
    print "{:<15} : {}".format(k, v)

# dat is our actual data.
print "\nData shape: {}".format(dat.shape)

# wfdbtools includes a simple way to plot our data.
wfdbtools.plot_data(dat, info)

import pandas as pd

# Let's reload the data.
# This time ALL the data, not just the first 20 seconds.
dat, info = wfdbtools.rdsamp(wfpath)
print dat.shape

# Load the numpy array into pandas and label the columns.
df = pd.DataFrame(dat, columns=['#', 'sec', 'EKG'])
# Set the index so the x-axis is properly labeled in our plot.
df.set_index('sec', drop=False, inplace=True)

# Just plot all the data.
# This is a lot of data: it takes a few seconds to plot, and looks messy.
df.EKG.plot()

# Plot customization.
# Use a slice of just the first few seconds of EKG data.
ekg_signal = df.EKG[:6]

def plot_ekg(ekg, title="EKG Plot, Customized"):
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 5))
    ekg.plot(marker='x', linestyle='', alpha=0.8, ax=ax)
    ekg.plot(c='b', linestyle='-', alpha=0.5, ax=ax)
    plt.title(title);

plot_ekg(ekg_signal)

window_size = 32
print "Window size:", window_size
step_size = window_size
print "Step size:", step_size

total_time_available = info['samp_count'] / info['samp_freq']  # in seconds
print "Total data available: {} seconds".format(total_time_available)
print "Sample frequency: {} samples per second".format(info['samp_freq'])

# Choose how much of the signal to use here:
training_size = 2000000
fullsignal = dat[:training_size, 2]  # select just the 3rd column:
# 1st column is the index, 2nd is time, 3rd is the EKG data.

print "Size of data being processed: {} observations".format(len(fullsignal))
print "\t({} seconds)".format(len(fullsignal) / info['samp_freq'])
print "Using about {:.02f}% of the total available signal".format(
    len(fullsignal) / float(info['samp_count']) * 100
)

window_list = []
for range_start in xrange(0, len(fullsignal), step_size):
    range_end = range_start + step_size
    if range_end > len(fullsignal):
        continue  # skip a final partial window
    # DEBUG: print "{} to {}".format(range_start, range_end)
    window = fullsignal[range_start:range_end]
    # Taper the window with a half period of sin() so each
    # segment starts and ends near zero.
    window = window * np.sin(np.linspace(0, np.pi, num=window_size))
    window_list.append(window)

print "\nFull signal of size {} was broken up into {} pieces of length {} each.".format(
    len(fullsignal), len(window_list), len(window_list[0])
)
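# Aside: since step_size equals window_size here, the same segmentation can
# be done without an explicit loop. A minimal sketch, equivalent to the loop
# above under that assumption; the make_windows helper name is our own, not
# part of wfdbtools:
def make_windows(signal, window_size):
    # Trim the tail so the length divides evenly, then reshape into
    # an (n_windows, window_size) array in one step.
    n = (len(signal) // window_size) * window_size
    windows = signal[:n].reshape(-1, window_size)
    # Apply the same half-period sine taper to every row at once.
    taper = np.sin(np.linspace(0, np.pi, num=window_size))
    return windows * taper

# Should produce the same number and length of windows as the loop:
print make_windows(fullsignal, window_size).shape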
# All windows should be the same size.
window_lengths = np.array([len(w) for w in window_list])
print "Sanity check: each window is the same size:",
print np.all(window_lengths == window_lengths[0])

from sklearn.cluster import MiniBatchKMeans

# Do the clustering here.
print "Number of windows:", len(window_list)
k = 200  # number of clusters
print "Number of clusters:", k

#testdata = window_list[:500]  # To help with testing, limit the input.

model = MiniBatchKMeans(
    n_clusters=k,
    verbose=1,
)
model.fit_predict(window_list)

## Set the k-means cluster centers as our basis dictionary.
basis_dict = model.cluster_centers_

# Visually examine the basis dictionary.
import math

ksmall = min(k, 15)
if k > ksmall:
    print "Large dictionary. Plotting just the first {}.".format(ksmall)
    print "... there are {} additional windows".format(k - ksmall)
    print "    in the dictionary that we will not plot."

ncols = 5
nrows = int(math.ceil(ksmall / 5.))
fig, axes = plt.subplots(
    ncols=ncols, nrows=nrows,
    figsize=(3 * ncols, 3 * nrows)
)
axiter = axes.flat
for clust_idx in xrange(ksmall):
    axiter[clust_idx].plot(basis_dict[clust_idx])

from collections import Counter
clust_counter = Counter(model.labels_)

# These are the 5 largest clusters
# and the number of windows they contain.
print "Largest clusters:"
print "\t(cluster_index, n_windows_in_cluster)"
large_clusters = clust_counter.most_common(5)
print "\t", large_clusters

fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(15, 3))
axiter = axes.flat
for ii, clust_idx in enumerate([idx for idx, size in large_clusters]):
    axiter[ii].plot(basis_dict[clust_idx])

# These are the 5 smallest clusters
# and the number of windows they contain.
print "Smallest clusters:"
print "\t(cluster_index, n_windows_in_cluster)"
small_clusters = clust_counter.most_common()[:-6:-1]
print "\t", small_clusters

fig, axes = plt.subplots(ncols=5, nrows=1, figsize=(15, 3))
axiter = axes.flat
for ii, clust_idx in enumerate([idx for idx, size in small_clusters]):
    axiter[ii].plot(basis_dict[clust_idx])

modelfilename = "model_trainsize={}_stepsize={}_windowsize={}_nclusters={}.pkl".format(
    len(fullsignal), step_size, window_size, k
)
from sklearn.externals import joblib
joblib.dump(model, modelfilename, compress=9)

from sklearn.metrics import mean_squared_error

## Reconstructing a single window.
def reconstruct_window(window, basis, plot=False):
    # Project the window onto every basis vector.
    proj = np.apply_along_axis(window.dot, 1, basis)
    # Find the closest basis vector.
    maxidx = np.abs(proj).argmax()
    if plot:
        print "Best projection:", proj[maxidx]
        # Visual inspection.
        fig, axes = plt.subplots(2, 1, figsize=(6, 6))
        axes[0].plot(window)
        axes[0].set_title('Original')
        axes[1].plot(basis[maxidx])
        axes[1].set_title('Reconstructed')
        # Report the error.
        print "Error:", mean_squared_error(window, basis[maxidx])
    return basis[maxidx]

# Select any window for testing.
some_window = window_list[100]
# Reconstruct this window.
basis_window = reconstruct_window(some_window, basis_dict, plot=True)

## Reconstruct a longer signal.
def reconstruct_signal(sig, basis, plot=False):
    dict_size, window_size = basis.shape
    n_segments = 2 * int(len(sig) / window_size) - 1
    rec = np.zeros(len(sig), dtype=float)
    print "n segments to use is:", n_segments
    print "dictionary (basis set) size is:", dict_size
    # Split the signal into half-overlapping windows.
    for ii in xrange(n_segments):
        # Obtain the index range.
        start_idx = int(ii * window_size / 2)
        end_idx = start_idx + window_size
        # Slice the signal over the range to get a window.
        s = sig[start_idx:end_idx]
        # Reconstruct this window.
        bvec = reconstruct_window(s, basis, plot=False)
        # Update the complete reconstruction.
        rec[start_idx:end_idx] += bvec
    if plot:
        print "window size is:", window_size
        print "Error:", mean_squared_error(sig, rec)
        fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(int(n_segments / 3), 4 * 3))
        axes[0].plot(sig)
        axes[0].plot(rec)
        axes[0].set_title('Original and reconstructed signal')
        ylims = axes[0].get_ylim()
        axes[1].plot(sig)
        axes[1].set_title('Original signal')
        axes[2].plot(rec)
        axes[2].set_title('Reconstructed signal')
        #axes[3].plot(sig - rec)
        #axes[3].set_ylim(ylims)
        #axes[3].set_title('Residual')
    return rec
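# Aside: reconstruct_window scans the dictionary with apply_along_axis, but
# the projections are plain dot products, so the nearest basis vector can be
# found with a single matrix-vector multiply. A minimal sketch of that
# equivalence; reconstruct_window_fast is our own illustrative name:
def reconstruct_window_fast(window, basis):
    # basis.dot(window) computes all dict_size projections at once,
    # equivalent to np.apply_along_axis(window.dot, 1, basis).
    proj = basis.dot(window)
    return basis[np.abs(proj).argmax()]

# Sanity check against the original implementation on one window:
print np.allclose(reconstruct_window(some_window, basis_dict),
                  reconstruct_window_fast(some_window, basis_dict))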
# Select a longer signal to reconstruct for testing.
signal = fullsignal[:800]
signal_reconstructed = reconstruct_signal(signal, basis_dict, plot=True)

np.set_printoptions(precision=2, linewidth=100)

# Explicitly calculate the residual here.
residual = signal - signal_reconstructed

# Bounds to look at:
start = 10; stop = 20
print signal[start:stop]
print signal_reconstructed[start:stop]
print residual[start:stop]
print '-'

# Sanity check on the error calculation.
tt = np.power(signal[start:stop] - signal_reconstructed[start:stop], 2)
print tt.mean()
print mean_squared_error(signal[start:stop], signal_reconstructed[start:stop])

plt.plot(residual)

# Warning: depending on the signal size, this can take a long time.
# Below we use the %%timeit cell magic to check the running time, for reference.
sig_size = 50000
long_signal = fullsignal[:sig_size]
long_signal_reconstructed = reconstruct_signal(long_signal, basis_dict, plot=False)

%%timeit
long_signal_reconstructed = reconstruct_signal(long_signal, basis_dict, plot=False)

print """
Parameters:
    Signal size to reconstruct = {}
    Size of basis dictionary   = {}
    Window size                = {}
    Step size                  = {}
Results in Mean Squared Error:  {}""".format(
    sig_size, k, window_size, step_size,
    mean_squared_error(long_signal, long_signal_reconstructed)
)

# Load a signal segment known to be anomalous.
anomalous_dataset = 'a01'
start = 4825
end = start + 15
wfpath = os.path.join(data_path, anomalous_dataset)
dat, info = wfdbtools.rdsamp(wfpath, start=start, end=end)
print info
print dat.shape
anomalous_ekg = dat[:, 2]

# Plot the anomalous signal.
plt.figure(figsize=(16, 3))
plt.plot(anomalous_ekg)
plt.title('{}, Start: {}'.format(wfpath, start))

reconstruct_signal(anomalous_ekg, basis_dict, plot=True)
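# One way to turn the reconstruction into an anomaly score: compare the
# per-window error on a segment against the rest of its error distribution.
# A minimal sketch; the anomaly_scores helper and the mean + 3 * std
# threshold are our own illustrative choices, not part of the notebook above:
def anomaly_scores(sig, rec, window_size):
    # Mean squared error of each non-overlapping window.
    n = (len(sig) // window_size) * window_size
    err = (sig[:n] - rec[:n]) ** 2
    return err.reshape(-1, window_size).mean(axis=1)

anomalous_rec = reconstruct_signal(anomalous_ekg, basis_dict, plot=False)
scores = anomaly_scores(anomalous_ekg, anomalous_rec, window_size)
threshold = scores.mean() + 3 * scores.std()
print "Windows flagged as anomalous:", np.where(scores > threshold)[0]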