''' debinningWithNewStatistics.ipynb This file is an ipython notebook designed to be converted to reveal.js slides via this command: ipython nbconvert debinningWithNewStatistics.ipynb --to=slides --post=serve [--config slides_config.py] This file also contains the main body of debinning work in the code blocks below. Copyright (C) 2015 Abram Krislock Talks given from ~= this version of the slides: "Debinning: Data Smoothing with a New Probability Calculus" - Fysisk institutt, Universitetet i Oslo, February 25, 2015 - Mitchell Institute for Fundamental Physics and Astronomy, Texas A&M University, March 13, 2015 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.txt. ''' import sys sys.path[0] = '..' from __future__ import division import numpy as np from time import time import bisect import operator as op from utilities import settings import utilities.sampleFunctions as funcs import utilities.probCalc as pc import utilities.plotting as debinPlot import utilities.minimizers as miniz reload(funcs) reload(pc) reload(debinPlot) reload(miniz) %matplotlib inline from matplotlib import pyplot as plt from IPython.display import HTML funcs.randomGenerator.setSeed('seedOne') settings['simpleSortedOutput'] = True nPulls = 400 basePars = (0.00006, 10., 200., 110., 1.5, 50., 140.) baseGuess = (140., -2., -0.1, 10.) def fitTwoHistograms(percentile): data = funcs.functionToData(funcs.easyEndpoint, nPulls, 0.5, (0,200), basePars) percentieth = data[data > np.percentile(data, percentile)] # The histograms are filled with the same data, only different binning. binContentsOne, binEdgesOne = np.histogram(percentieth, np.arange(3., 204., 10.)) binContentsTwo, binEdgesTwo = np.histogram(percentieth, np.arange(0., 201., 5.)) # The fit ranges are defined to be from the value of the 70th percentile data point # to "the last non-zero bin", blindly. fitBinsOne = np.flatnonzero(binContentsOne[binContentsOne.argmax():]) + binContentsOne.argmax() fitBinsTwo = np.flatnonzero(binContentsTwo[binContentsTwo.argmax():]) + binContentsTwo.argmax() ### FITTING METHOD: Binned Maximum Likelihood ### fitGuessOne = np.array([value * funcs.randomGenerator.normal(1., 0.1) for value in baseGuess]) fitGuessTwo = fitGuessOne.copy() def chiSqOne(pars): # return funcs.chiSqSimple(binContentsOne, binEdgesOne, fitBinsOne, funcs.theKink, pars) return -2. * funcs.theKink_BinnedLikelihood(binContentsOne, binEdgesOne, fitBinsOne, pars) def chiSqTwo(pars): # return funcs.chiSqSimple(binContentsTwo, binEdgesTwo, fitBinsTwo, funcs.theKink, pars) return -2. * funcs.theKink_BinnedLikelihood(binContentsTwo, binEdgesTwo, fitBinsTwo, pars) fitOne = miniz.nelderMead(chiSqOne, fitGuessOne, xtol=0.01, ftol=0.05, disp=0) fitTwo = miniz.nelderMead(chiSqTwo, fitGuessTwo, xtol=0.01, ftol=0.05, disp=0) return (data, fitOne, fitTwo) xKinkOne = [] xKinkTwo = [] xKinkDiff = [] percentile = 60 nTrials = 1000 for i in xrange(nTrials): data, fitOne, fitTwo = fitTwoHistograms(percentile) xKinkOne.append(fitOne[0]) xKinkTwo.append(fitTwo[0]) xKinkDiff.append(baseGuess[0] + fitTwo[0] - fitOne[0]) def plotHistoFitCompare(): fig, axes = plt.subplots(figsize=(11,7), facecolor='#ffffff') # plt.hist(xKinkOne, range=(80,180), color='#66a61e', bins=np.arange(80, 181, 2), alpha=0.6, zorder=2) # plt.hist(xKinkTwo, range=(80,180), color='#0088ff', bins=np.arange(80, 181, 2), alpha=0.6, zorder=2) plt.hist(xKinkDiff, range=(80,180), color='#0088ff', bins=np.arange(80, 181, 2), alpha=0.8, zorder=1) axes.set_xticks(np.arange(80, 181, 10)) axes.set_xticks(np.arange(80, 181, 2), minor=True) axes.set_xticklabels([r'$%i$' % int(num) for num in np.arange(80., 181., 10.)], fontsize=20) axes.set_yticks(np.arange(0, 250, 50)) axes.set_yticks(np.arange(0, 250, 10), minor=True) axes.set_yticklabels([r'$%i$' % num for num in np.arange(0, 250, 50)], fontsize=20) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() axes.set_xlabel('True Kink (140) + Kink Fit Difference', labelpad=5, fontsize=22) axes.set_ylabel('Number of Counts (400 total)', labelpad=5, fontsize=22) axes.annotate(r'larger $x_{\rm kink2}$', xy=(140.5,200), xycoords='data', xytext=(50,-5), textcoords='offset points', size=22, arrowprops=dict(arrowstyle='<-', color='#e7298a', linewidth=3)) axes.annotate(r'larger $x_{\rm kink1}$', xy=(139.5,200), xycoords='data', xytext=(-180,-5), textcoords='offset points', size=22, arrowprops=dict(arrowstyle='<-', color='#66a61e', linewidth=3)) axes.annotate('', xy=(140.15,250), xycoords='data', xytext=(0,-100), textcoords='offset points', arrowprops=dict(arrowstyle='-')) print plotHistoFitCompare() reload(funcs) def quickData(): ''' Shorthand for getting easyEndpoint data (384 x values in [8.5,200]) - set nPulls before calling - see utilities.sampleFunctions.functionToData for more details ''' return funcs.functionToData(funcs.easyEndpoint, nPulls, 0.5, np.arange(8.5, 200.2, 0.5)) def quickBG(): ''' Shorthand for getting endpointBG data (384 x values in [8.5,200]) - set nPulls before calling - see utilities.sampleFunctions.functionToData for more details ''' return funcs.functionToData(funcs.endpointBG, nPulls, 0.5, np.arange(8.5, 200.2, 0.5)) nPulls = 10 pullData = {index:[] for index in xrange(nPulls)} settings['simpleSortedOutput'] = True for iteration in xrange(2000): for i,v in enumerate(quickData()): pullData[i].append(v) settings['simpleSortedOutput'] = False x, f, F, data = quickData() bgx, bgf, bgF, bgdata = quickBG() f /= F[-1] bgf /= F[-1] bgF /= bgF[-1] F /= F[-1] ithPDF = {} for i in xrange(1, nPulls+1): ithPDF[i-1] = np.array([f[j] * F[j]**(i-1) * (1 - F[j])**(nPulls-i) * pc.nKr(nPulls, i) for j in xrange(len(x))]) def makeHist(i=0): fig, axes = plt.subplots(figsize=(9,5), facecolor='#ffffff') plt.plot(x, f, '--', color='#0088ff', alpha=0.6, linewidth=3) plt.plot(x, bgf, '-.', color='#0088ff', alpha=0.6, linewidth=3) plt.hist(pullData[i], bins=np.arange(0, 201, 5), alpha=0.4, color='#66a61e', normed=True) plt.plot(x, ithPDF[i], '#964a01', linewidth=3) axes.set_xticks(np.arange(0, 201, 50)) axes.set_xticks(np.arange(0, 201, 10), minor=True) axes.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], fontsize=20) axes.set_ylim(bottom=-0.0001, top=0.0401) axes.set_yticks(np.arange(0, 0.04, 0.01)) axes.set_yticklabels([r'$%0.3f$' % num for num in np.arange(0, 0.04, 0.01)], fontsize=20) axes.set_yticks(np.arange(0, 0.04, 0.01/5), minor=True) axes.set_xlabel('Physical Observable', labelpad=5, fontsize=22) axes.set_ylabel('Probability', labelpad=5, fontsize=22) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() makeHist(0) makeHist(1) makeHist(2) makeHist(3) makeHist(4) makeHist(5) makeHist(6) makeHist(7) makeHist(8) makeHist(9) settings['simpleSortedOutput'] = False nPulls = 96 nFictitious=96 x, f, F, data = quickData() x, bgf, bgF, bgData = quickBG() f /= F[-1] F /= F[-1] bgf /= bgF[-1] bgF /= bgF[-1] def mapDataToX(dindex): # maps elements of data to elements of x # bisect_left(x, datum) locates the left-most entry in x which is >= datum xindex = bisect.bisect_left(x, data[dindex]) if xindex > 0 and x[xindex] - data[dindex] >= data[dindex] - x[xindex-1]: return xindex - 1 return xindex dataToX = np.array([mapDataToX(j) for j in xrange(len(data))]) def vecG_rth(rindex, xindex): r = rindex+1 return pc.nKr(nFictitious, r) * bgF[xindex]**(r-1) * (1 - bgF[xindex])**(nFictitious - r) vecG_rx = np.array([[vecG_rth(rindex, xindex) for xindex in xrange(len(x))] for rindex in xrange(nFictitious)]) sumG_j = vecG_rx.take(dataToX, axis=1).sum(axis=1) debinningData = bgf * np.dot(sumG_j, vecG_rx) / (nFictitious*nPulls) # One more pull for plotting purposes x, f, F, data = quickData() x, bgf, bgF, bgData = quickBG() f /= F[-1] bgf /= F[-1] bgF /= F[-1] F /= F[-1] def makeDebinningPlot(): fig, axes = plt.subplots(figsize=(11,7), facecolor='#ffffff') plt.plot(x, f, '--', color='#0088ff', alpha=0.6, linewidth=3) plt.plot(x, bgf, '-.', color='#0088ff', alpha=0.6, linewidth=3) plt.hist(data, bins=np.arange(0, 201, 5), alpha=0.4, color='#66a61e', normed=True) plt.plot(x, debinningData, '#964a01', linewidth=3) axes.set_xticks(np.arange(0, 201, 50)) axes.set_xticks(np.arange(0, 201, 10), minor=True) axes.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], fontsize=20) axes.set_ylim(bottom=-0.0001, top=0.0201) axes.set_yticks(np.arange(0, 0.02, 0.005)) axes.set_yticklabels([r'$%0.3f$' % num for num in np.arange(0, 0.02, 0.005)], fontsize=20) axes.set_yticks(np.arange(0, 0.02, 0.005/5), minor=True) axes.set_xlabel('Physical Observable', labelpad=5, fontsize=22) axes.set_ylabel('Probability', labelpad=5, fontsize=22) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() makeDebinningPlot() fwhmValues=None # Let's try some GPU programming: import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule from string import Template settings['simpleSortedOutput'] = False nPulls = 96 nFictitious = 96 # Multiple of 32 for GPU programming ease. lenXdomain = len(np.arange(8.5, 200.2, 0.5)) mod_template = Template(""" __global__ void vecG(double *vecG_rx, double *bgF, double *lognKr) { const int idr = threadIdx.y + blockDim.y * blockIdx.y; const int idx = threadIdx.x + blockDim.x * blockIdx.x; const int idrx = idx + idr*${lenX}; const int r = (idr + 1) < (${nFic} - idr) ? (idr + 1) : ${nFic} - idr; if (bgF[idx] == 0 || (1 - bgF[idx]) == 0) vecG_rx[idrx] = 0; else vecG_rx[idrx] = exp(lognKr[r-1] + idr * log(bgF[idx]) + (${nFic} - idr - 1) * log(1 - bgF[idx])); } """) mod = SourceModule(mod_template.substitute(nFic=nFictitious, lenX=lenXdomain)) timer = time() # initialize the loop lognKr = pc.get_lognKrArray(nFictitious) chisq = [] bgchisq = [] ks = [] bgks = [] for trial in xrange(100000): x, f, F, data = quickData() x, bgf, bgF, bgData = quickBG() f /= F[-1] F /= F[-1] bgf /= bgF[-1] bgF /= bgF[-1] def mapDataToX(dindex): # maps elements of data to elements of x # bisect_left(x, datum) locates the left-most entry in x which is >= datum xindex = bisect.bisect_left(x, data[dindex]) if xindex > 0 and x[xindex] - data[dindex] >= data[dindex] - x[xindex-1]: return xindex - 1 return xindex dataToX = np.array([mapDataToX(j) for j in xrange(len(data))]) # Use the GPU for the most expensive / paralellizable part of the calculation: # ===== # Empty numpy array to hold result of vecG (row, column = nFictitious, len(x)) vecG_rx = np.empty([nFictitious, len(x)]) # Allocate memory on the GPU for vecG calculation vecG_rx_gpu = cuda.mem_alloc(vecG_rx.nbytes) bgF_gpu = cuda.mem_alloc(bgF.nbytes) lognKr_gpu = cuda.mem_alloc(lognKr.nbytes) # Transfer data to the GPU cuda.memcpy_htod(bgF_gpu, bgF) cuda.memcpy_htod(lognKr_gpu, lognKr) # Get a reference to the GPU module function and do the calculation # NOTE: 16*24 = 384 = len(x), and 4*24 = 96 = nFictitious vecG_func = mod.get_function('vecG') vecG_func(vecG_rx_gpu, bgF_gpu, lognKr_gpu, block=(16, 4, 1), grid=(24, 24)) # Get the result back from the GPU and use it! cuda.memcpy_dtoh(vecG_rx, vecG_rx_gpu) sumG_j = vecG_rx.take(dataToX, axis=1).sum(axis=1) debinningData = bgf * np.dot(sumG_j, vecG_rx) / (nFictitious*nPulls) # Chisq test.... if type(fwhmValues) == np.ndarray: chisq.append(sum(4.*(debinningData - f)**2 / fwhmValues**2) / len(fwhmValues)) bgchisq.append(sum(4.*(debinningData - bgf)**2 / fwhmValues**2) / len(fwhmValues)) # KS test... debinningCDF = pc.A(debinningData, 0.5) ks.append(max(abs(debinningCDF - F))) bgks.append(max(abs(debinningCDF - bgF))) if trial == 0: # Infinity sigma upper and lower bands... debinningUpper = debinningData.copy() debinningLower = debinningData.copy() # Uncertainty PDFs for each x value... uncertaintyHistos = [debinPlot.HistoContainer(np.arange(0., 0.024, 0.0002), debinningData[i]) for i in xrange(len(x))] else: debinningUpper = np.clip(debinningData, debinningUpper, 20.) debinningLower = np.clip(debinningData, -1., debinningLower) for i in xrange(len(x)): try: uncertaintyHistos[i].addValue(debinningData[i]) except ValueError as e: print e.message print 'xindex = ' + str(i) + ', debinningData[xindex] = ' + str(debinningData[i]) if not type(fwhmValues) == np.ndarray: fwhmValues = np.array([np.diff(uncertaintyHistos[i].fwhm()) for i in xrange(len(x))]).flatten() print "Run this again for the Chisq test data..." print time() - timer # Try one more sample, but with a HUGE data set: nPulls = 10000 timer = time() x, f, F, data = quickData() x, bgf, bgF, bgData = quickBG() f /= F[-1] F /= F[-1] bgf /= bgF[-1] bgF /= bgF[-1] def mapDataToX(dindex): # maps elements of data to elements of x # bisect_left(x, datum) locates the left-most entry in x which is >= datum xindex = bisect.bisect_left(x, data[dindex]) if xindex > 0 and x[xindex] - data[dindex] >= data[dindex] - x[xindex-1]: return xindex - 1 return xindex dataToX = np.array([mapDataToX(j) for j in xrange(len(data))]) # Use the GPU for the most expensive / paralellizable part of the calculation: # ===== # Empty numpy array to hold result of vecG (row, column = nFictitious, len(x)) vecG_rx = np.empty([nFictitious, len(x)]) # Allocate memory on the GPU for vecG calculation vecG_rx_gpu = cuda.mem_alloc(vecG_rx.nbytes) bgF_gpu = cuda.mem_alloc(bgF.nbytes) lognKr_gpu = cuda.mem_alloc(lognKr.nbytes) # Transfer data to the GPU cuda.memcpy_htod(bgF_gpu, bgF) cuda.memcpy_htod(lognKr_gpu, lognKr) # Get a reference to the GPU module function and do the calculation # NOTE: 16*24 = 384 = len(x), and 4*24 = 96 = nFictitious vecG_func = mod.get_function('vecG') vecG_func(vecG_rx_gpu, bgF_gpu, lognKr_gpu, block=(16, 4, 1), grid=(24, 24)) # Get the result back from the GPU and use it! cuda.memcpy_dtoh(vecG_rx, vecG_rx_gpu) sumG_j = vecG_rx.take(dataToX, axis=1).sum(axis=1) BIGdebinningData = bgf * np.dot(sumG_j, vecG_rx) / (nFictitious*nPulls) print time() - timer # One more pull for plotting purposes x, f, F, data = quickData() x, bgf, bgF, bgData = quickBG() f /= F[-1] bgf /= F[-1] bgF /= F[-1] F /= F[-1] # Pretty plot fig, axes = plt.subplots(figsize=(11,7), facecolor='#ffffff') plt.fill_between(x, debinningUpper, debinningLower, color='#0088ff', alpha=0.3, zorder=1) plt.plot(x, f, color='#0088ff', linewidth=3, zorder=3) plt.plot(x, bgf, color='#0088ff', linestyle='-.', linewidth=3, zorder=3) plt.plot(x, BIGdebinningData, color='black', linestyle='--', linewidth=3, zorder=4) colors = np.array([(0.8,1.,0.4), (0.8,0.3,0.), (1.,1.,1.), (0.,0.,1.)]) for i, v in enumerate(x): debinPlot.verticalColorBand(uncertaintyHistos[i], v, 0.5, colors) axes.set_xticks(np.arange(0, 201, 50)) axes.set_xticks(np.arange(0, 201, 10), minor=True) axes.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], fontsize=20) axes.set_ylim(bottom=-0.0001, top=0.0201) axes.set_yticks(np.arange(0, 0.02, 0.005)) axes.set_yticklabels([r'$%0.3f$' % num for num in np.arange(0, 0.02, 0.005)], fontsize=20) axes.set_yticks(np.arange(0, 0.02, 0.005/5), minor=True) axes.set_xlabel('Physical Observable', labelpad=5, fontsize=22) axes.set_ylabel('Probability', labelpad=5, fontsize=22) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() print fig, axes = plt.subplots(figsize=(11,7), facecolor='#ffffff') plt.hist(chisq, bins=np.arange(0, 7.01, .05), color='#0088ff', edgecolor='#555555') plt.hist(bgchisq, bins=np.arange(0, 7.01, .05), color='#66a61e', edgecolor='#555555', alpha=0.5) axes.set_xticks(np.arange(0, 7.01, 1)) axes.set_xticks(np.arange(0, 7.01, .25), minor=True) axes.set_xticklabels([r'$%1.1f$' % num for num in np.arange(0, 7.01, 1)], fontsize=20) axes.set_ylim(bottom=-0.0001, top=6000) axes.set_yticks(np.arange(0, 6001, 1000)) axes.set_yticks(np.arange(0, 6001, 250), minor=True) axes.set_yticklabels([r'$%i$' % int(num) for num in np.arange(0, 6001, 1000)], fontsize=20) axes.set_xlabel(r'$\chi^2$ per d.o.f.', labelpad=5, fontsize=22) axes.set_ylabel('Number of Counts', labelpad=5, fontsize=22) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() print fig, axes = plt.subplots(figsize=(11,7), facecolor='#ffffff') plt.hist(ks, bins=np.arange(0, 0.3, .002), color='#0088ff', edgecolor='#555555') plt.hist(bgks, bins=np.arange(0, 0.3, .002), color='#66a61e', edgecolor='#555555', alpha=0.5) axes.set_xticks(np.arange(0, 0.26, .05)) axes.set_xticks(np.arange(0, 0.26, .01), minor=True) axes.set_xticklabels([r'$%0.2f$' % num for num in np.arange(0, 0.26, .05)], fontsize=20) axes.set_ylim(bottom=-0.0001, top=5000) axes.set_yticks(np.arange(0, 5001, 1000)) axes.set_yticks(np.arange(0, 5001, 250), minor=True) axes.set_yticklabels([r'$%i$' % int(num) for num in np.arange(0, 5001, 1000)], fontsize=20) axes.set_xlabel('KS test value', labelpad=5, fontsize=22) axes.set_ylabel('Number of Counts', labelpad=5, fontsize=22) axes.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) axes.tick_params(which='minor', length=5, width=1.2, zorder=11) axes.minorticks_on() print from IPython.display import HTML HTML('')