# Modified stylesheet for the notebook.
from IPython.core.display import HTML


def css_styling():
    """Load the notebook's custom CSS and return it wrapped as HTML."""
    # Context manager ensures the file handle is closed promptly
    # (the original leaked the handle via open(...).read()).
    with open("custom.css", "r") as f:
        styles = f.read()
    return HTML(styles)


css_styling()

import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

np.random.seed(seed=111111)
np.set_printoptions(precision=3, suppress=True)

import sys
sys.path.append('../src/')
from change_detector import ChangeDetector, OnlineSimulator

# Create a seasonal signal:
# a metric that rises from 0 to 5 each calendar month.
sig = np.linspace(0, 5, num=30)
# range() replaces Python 2's xrange(), which is a NameError on Python 3.
sig = np.concatenate([sig for _ in range(12)])

# Add a jump
jump_size = 5
sig[250:] = sig[250:] + jump_size

# Noise
noise = np.random.normal(size=sig.shape, scale=jump_size * 0.02)

plt.figure(figsize=(15, 5))
plt.plot(sig + noise, 'b.', linestyle='')
plt.plot(sig + noise, 'b-', alpha=0.15)
plt.ylim(0, 15)
plt.xlim(0, 365)
plt.title("Imaginary Seasonal signal")

# The builtin reload() was removed in Python 3; use importlib.reload().
import importlib
import change_detector
importlib.reload(change_detector)
from change_detector import ChangeDetector, OnlineSimulator
from collections import deque


class ZScoreDetector(ChangeDetector):
    """A change detection algorithm based on z-scores.

    1. Welford's algorithm collects the mean and variance of the full
       stream incrementally.
    2. In parallel, a window of recent observations is kept (all values
       stored) and local statistics are computed on it directly.
    3. A z-score compares the sample (window) distribution against the
       population (global) distribution; the stopping rule fires on the
       magnitude of that z-score.
    """

    def __init__(self, window_size=100, threshold=0.05):
        """
        Parameters
        ----------
        window_size : int
            Number of recent observations kept for the local statistics.
        threshold : float
            Stopping rule triggers once |z-score| exceeds this value.
        """
        super(ZScoreDetector, self).__init__()

        # hyper-parameters
        self.threshold = threshold
        self.window_size = window_size

        # Interim and calculated values
        self.k = 0         # number of samples seen so far (global)
        self.g_mean = 0.0  # global (stream) mean
        self.s = 0.0       # Welford accumulator; variance = s / (k + 1)

        # Sliding window of the most recent observations; the deque
        # automatically drops the oldest value once maxlen is reached.
        self.window = deque(maxlen=window_size)

        # ... and, finally, our residuals
        self.z_score_ = np.nan
        self.z_array = []

    def update_residuals(self, new_signal_value):
        """Fold one new observation into the global/window statistics
        and recompute the z-score residual."""
        self._update_base_residuals(new_signal_value)
        x = new_signal_value

        # Add new value to the local window (deque drops the oldest
        # value automatically to maintain the window size).
        self.window.append(x)

        # --- Global statistics via Welford's method ---
        oldm = self.g_mean
        newm = oldm + (x - oldm) / (self.k + 1)
        s = self.s + (x - newm) * (x - oldm)
        g_mean = newm                        # global mean
        g_std = np.sqrt(s / (self.k + 1))    # global (population) std

        # --- Local statistics on the stored window ---
        w_mean = np.mean(self.window)        # window mean

        # Standard error of the window mean under the global std.
        # NOTE(review): on the very first sample g_std is 0, so the
        # z-score is 0/0 -> nan; nan compares False in the stopping
        # rule, so no spurious trigger occurs.
        SE = g_std / np.sqrt(self.window_size)

        # Z-score (residual).
        # (The original also computed std_diff, mean_diff and w_std,
        # none of which were used; they are removed here — mean_diff
        # even risked a divide-by-zero when g_mean == 0.)
        self.z_score_ = (w_mean - g_mean) / SE
        self.z_array.append(self.z_score_)

        # Store attributes
        self.g_mean = g_mean
        self.s = s

        # Update k (size of the global sample count). This must be done
        # at the end: the Welford update above assumes k is the count
        # BEFORE this observation.
        self.k += 1

    def check_stopping_rules(self, new_signal_value):
        """Trigger the detector once |z-score| exceeds the threshold."""
        if np.abs(self.z_score_) > self.threshold:
            self.rules_triggered = True
# Our first, most basic, signal: a large step down at index 100.
basic_sig = np.ones(150)
basic_sig[:100] *= 50
basic_sig[100:] *= 4

# On Signal 1
detector = ZScoreDetector(window_size=10, threshold=10.0)
OnlineSimulator(detector, basic_sig).run(
    signal_name='Original first signal, sig1'
)

plt.figure(figsize=(15, 5))
plt.plot(sig + noise, 'b.', linestyle='')
plt.plot(sig + noise, 'b-', alpha=0.15)
plt.ylim(0, 15)
plt.xlim(0, 365)
plt.title("Imaginary Seasonal signal")

detector = ZScoreDetector(window_size=30, threshold=10.0)
OnlineSimulator(detector, sig).run(signal_name="Imaginary Seasonal Signal")

# Second seasonal signal: 8 repeats of a 20-sample ramp, with a jump.
jump_size = 30
sig2 = np.linspace(40, 50, num=20)
# range() replaces Python 2's xrange(), which is a NameError on Python 3.
sig2 = np.concatenate([sig2 for _ in range(8)])

# Add the change we want to detect.
sig2[120:] = sig2[120:] + jump_size
noise = np.random.normal(size=sig2.shape, scale=jump_size * 0.05)
sig2 = sig2 + noise

detector = ZScoreDetector(window_size=20, threshold=10.0)
OnlineSimulator(detector, sig2).run(
    signal_name="Second Imaginary Seasonal Signal"
)

jump_size = 20
signal_size = 300

# Create a signal with a small jump (change) at the midpoint.
sig3 = np.ones(signal_size)
sig3[:150] = 30
sig3[150:] = 30 + jump_size

# Add noise
noise = np.random.normal(scale=0.10 * jump_size, size=signal_size)
sig3 = sig3 + noise

# Add outliers: a few values pinned to 0 and a few to 90.
outlier_idx = np.random.choice(signal_size, size=3)
sig3[outlier_idx] = 0
outlier_idx = np.random.choice(signal_size, size=2)
sig3[outlier_idx] = 90

# What does this look like?
plt.figure(figsize=(15, 5))
plt.plot(sig3, 'b.')
plt.plot(sig3, '-', alpha=0.15)
plt.ylim(0, 100)

# On Signal 3
detector = ZScoreDetector(window_size=40, threshold=8.0)
OnlineSimulator(detector, sig3).run(
    signal_name="Signal with noise and outlier"
)

# Note [Aman Ahuja]:
# So far this demonstrates the need for windows, but it still has a
# dependency on hyperparameters:
# - How would we know what to pick for the z-score threshold?
# - We also now have a new hyperparameter: the window size.
This discussion leads to the Page-Hinkley stopping rule. The likelihood ratio — the ratio of the Gaussian probability densities — can be used to detect the time at which the change occurred, provided the magnitude of the jump (the change in mean) is known a priori. Otherwise, the maximum-likelihood estimate of the point (time) of change can be used, which requires a threshold value lambda for the size of the change in mean. This leads to the Page-Hinkley stopping rule (E. S. Page, 1954) and the cumulative sum algorithm (CUSUM). There is a recursive method to compute the stopping rule.