"""Compare AUC estimates from irregularly sampled measurements.

We are interested in a process in which data is collected (patients visiting
a doctor) at random intervals, with a measurement taken at every visit.  The
measurements are assumed to be normally distributed (mean 5) and the visit
days are assumed to be random too.

Each simulated patient is followed over a 120 day process with guaranteed
measurements on the first and the last day (days 0 and 120 in the code) and
random visit days in between.  Patients do not share the same measurements.
Every patient has 8 - 12 measurements in that time (including the start and
end measurements).

The script computes the area under the curve (AUC) of each patient's
piecewise-linear measurement curve, recomputes it after resampling the curve
on a regular grid of days, and uses t-tests to compare the resulting AUC
distributions.
"""

import numpy as np
import scipy as sp
from scipy import stats
from scipy import interpolate


def get_data(n_patients=1000):
    """Simulate visit days and measurements for a cohort of patients.

    Args:
        n_patients: Number of patients to simulate (1000 by default, the
            cohort size used by the original analysis).

    Returns:
        A list with one ``[days, measurements]`` pair per patient.  ``days``
        is a sorted list of unique visit days that always contains 0 and 120;
        ``measurements`` holds one draw from a normal distribution
        (loc=5, scale=10) per visit day.
    """
    # NOTE(review): the interior days are drawn from 1..118, so day 119 is
    # never sampled -- confirm whether that exclusion is intentional.
    day_range = np.arange(1, 119)
    data = []
    for _ in range(n_patients):
        # How many interior days to sample?  np.random.randint excludes its
        # upper bound, so (6, 11) yields 6..10 interior visits, i.e. the
        # documented 8 - 12 measurements once the two fixed days are added.
        sample_size = np.random.randint(6, 11)
        interior = np.random.choice(day_range, size=sample_size, replace=False)
        # The first and last day are always present.
        days = sorted([0, 120] + [int(day) for day in interior])
        measurements = list(
            stats.norm.rvs(loc=5, scale=10, size=sample_size + 2)
        )
        data.append([days, measurements])
    return data


def calculate_auc(data, start_day, end_day):
    """Integrate every patient's piecewise-linear measurement curve.

    Args:
        data: Iterable of ``(days, measurements)`` pairs; ``days`` must be
            strictly increasing.
        start_day: Lower integration limit.
        end_day: Upper integration limit.

    Returns:
        A list with one AUC value per patient, in input order.
    """
    auc = []
    for days, measurements in data:
        # k=1 with s=0 gives a linear spline that passes through every point.
        tck = interpolate.splrep(days, measurements, s=0, k=1)
        auc.append(interpolate.splint(start_day, end_day, tck))
    return auc


def calculate_interval_auc(data, start_day, end_day, interval):
    """Compute AUCs after resampling each curve on a regular grid of days.

    Each patient's linear interpolant is evaluated every ``interval`` days
    from ``start_day`` (``end_day`` is always included as the last grid
    point), and the AUC is taken from the linear interpolant through those
    resampled values.

    Args:
        data: Iterable of ``(days, measurements)`` pairs; ``days`` must be
            strictly increasing.
        start_day: First grid day and lower integration limit.
        end_day: Last grid day and upper integration limit.
        interval: Spacing in days between consecutive grid points.

    Returns:
        A list with one AUC value per patient, in input order.
    """
    # The grid does not depend on the patient, so build it once.
    new_days = list(range(start_day, end_day, interval))
    new_days.append(end_day)

    auc = []
    for days, measurements in data:
        tck = interpolate.splrep(days, measurements, s=0, k=1)
        # Evaluate the interpolant on the whole grid in one call.
        new_measurements = interpolate.splev(new_days, tck)
        new_tck = interpolate.splrep(new_days, new_measurements, s=0, k=1)
        auc.append(interpolate.splint(start_day, end_day, new_tck))
    return auc


if __name__ == "__main__":
    # pyplot was used without ever being imported (presumably it was supplied
    # by an interactive %pylab session), so import it explicitly here.
    from matplotlib import pyplot

    # Single-argument print calls behave identically under Python 2 and 3.

    # First let's do a t-test with two independent sets of data with the same
    # mean and variance.
    np.random.seed(12345678)
    a = stats.norm.rvs(loc=5, scale=10, size=1000)
    b = stats.norm.rvs(loc=5, scale=10, size=1000)

    # Now do t-test
    result = stats.ttest_ind(a, b)
    p_value = result[-1]
    print("p-value:  %s" % p_value)
    # As we can see, the p-value is greater than 0.05, meaning there is no
    # significant evidence that the means of the two samples actually differ.
    # So we can't reject the hypothesis that the means are the same.

    process_data = get_data()

    # Now let's calculate the AUC for each data point.
    original_process_auc = calculate_auc(process_data, 0, 120)

    # Let's look at the distribution of the AUC.
    pyplot.figure()
    pyplot.hist(original_process_auc)

    # The original passed the undefined name `data` to the interval calls
    # below; the simulated cohort lives in `process_data`.
    interval_process_auc = calculate_interval_auc(process_data, 0, 120, 30)
    pyplot.figure()
    pyplot.hist(interval_process_auc)

    # Now do t-test for 30 days
    t_result = stats.ttest_ind(
        original_process_auc, interval_process_auc, equal_var=False
    )
    t_p_value = t_result[-1]
    print(t_p_value)

    interval_60_process_auc = calculate_interval_auc(process_data, 0, 120, 60)
    pyplot.figure()
    pyplot.hist(interval_60_process_auc)

    # Now do t-test for 60 days (unequal variances, like the other interval
    # comparisons).
    t_result = stats.ttest_ind(
        original_process_auc, interval_60_process_auc, equal_var=False
    )
    t_p_value = t_result[-1]
    print(t_p_value)

    interval_120_process_auc = calculate_interval_auc(
        process_data, 0, 120, 120
    )
    pyplot.figure()
    pyplot.hist(interval_120_process_auc)

    # Now do t-test for 120 days
    t_result = stats.ttest_ind(
        original_process_auc, interval_120_process_auc, equal_var=False
    )
    t_p_value = t_result[-1]
    print("p-value 120 day intervals:  %s" % t_p_value)

    interval_15_process_auc = calculate_interval_auc(process_data, 0, 120, 15)
    pyplot.figure()
    pyplot.hist(interval_15_process_auc)

    # Now do t-test for 15 days
    t_result = stats.ttest_ind(
        original_process_auc, interval_15_process_auc, equal_var=False
    )
    t_p_value = t_result[-1]
    print("p-value 15 day intervals:  %s" % t_p_value)

    # Sanity check: the original AUCs compared against themselves.
    t_result = stats.ttest_ind(
        original_process_auc, original_process_auc, equal_var=False
    )
    t_p_value = t_result[-1]
    print("p-value Original:  %s" % t_p_value)

    # Display the histograms when run as a plain script.
    pyplot.show()