#!/usr/bin/env python
# coding: utf-8

# # Web browsers use the network too sparingly
#
# Author: alcidesv@shimmercat.com
#
# ## Abstract
#
# The typical HTTP request/response model makes it difficult for browsers to use their available bandwidth to fetch a website faster. Here we analyze how fast a web page could be fetched.

# ## What is in the dataset
#
# The dataset contains page-load times, with individual resource timings, for each of 1300 page loads. The 1300 sites were submitted by performance-conscious site operators who were evaluating their site's performance. The important bits of the dataset, anonymized, [are available](https://github.com/shimmercat/art_timings/raw/master/data/clean_dataset.json.xz) for you to make your own measurements.

# ## Notebook initialization and loading the dataset

# In[1]:


# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.mpl
import bokeh.plotting

# Seaborn, useful for graphics
import seaborn as sns

import matplotlib
#matplotlib.style.use('ggplot')
rc = {'lines.linewidth': 2,
      'axes.labelsize': 14,
      'axes.titlesize': 14,
      'axes.facecolor': 'DFDFE5',
      'patch.facecolor': 'F37626'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

get_ipython().run_line_magic('pylab', '')
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_formats = {'png', 'retina'}")
# %config InlineBackend.figure_formats = {'svg',}

# Set up Bokeh for inline viewing
#bokeh.io.output_notebook()


# In[2]:


import numpy as np
import pandas as pd

from glob import glob
import os.path
import json
import lzma
import re


# In[3]:


with lzma.open("data/clean_dataset.json.xz", 'rb') as fin:
    dataset_s = fin.read().decode('ascii')
dataset = json.loads(dataset_s)


# ### Parsing the date-times

# This is how the starting time of a request looks in the HAR records.

# In[4]:


dtsample = dataset[0]['entries'][0]
dtsample


# Let's parse startedDateTime by hand...

# In[5]:


rgx_ = re.compile(r'T([0-9]{2}):([0-9]{2}):([0-9]{2}\.[0-9]*)Z')
ms_ = 1000

def dt2milliseconds(dtval):
    # Convert the time-of-day part of an ISO-8601 timestamp to milliseconds
    # since midnight (UTC).
    mo = re.search(rgx_, dtval)
    milliseconds = \
        int(mo.group(1)) * 3600 * ms_ + \
        int(mo.group(2)) * 60 * ms_ + \
        float(mo.group(3)) * ms_
    return milliseconds
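# As a quick check (the timestamp below is a made-up example, not one taken from the dataset), the helper should return milliseconds since midnight UTC: 10:02:30.5 corresponds to 10\*3600\*1000 + 2\*60\*1000 + 30.5\*1000 = 36150500 ms.

# In[ ]:


dt2milliseconds('2016-01-01T10:02:30.5Z')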
# ### Decorating the entries with the relative start-end time of the request

# In[6]:


def decorate_relative_timings(timings, starts_dtval, first_request_start):
    # Uses the timings entry of each fetch and its starting time; the relative
    # timings computed here are used later to deduce a set of "void" intervals
    # and a set of "populated" intervals.
    mseconds_start = dt2milliseconds(starts_dtval) - first_request_start
    connect = timings.get('connect', 0)
    dns = timings.get('dns', 0)
    blocked = timings.get('blocked', 0)
    # Chrome uses '-1' to signal that the timing doesn't apply
    if connect < 0:
        connect = 0
    if dns < 0:
        dns = 0
    ssl = timings.get('ssl', 0)
    if ssl < 0:
        ssl = 0
    send = timings.get('send')
    wait = timings.get('wait')
    receive = timings.get('receive')
    starts_receiving = mseconds_start + connect + dns + ssl + send + wait
    ends_receiving = starts_receiving + receive
    timings['rel_start'] = mseconds_start
    timings['starts_receiving'] = starts_receiving
    timings['ends_receiving'] = ends_receiving

def decorate_all_timing_entries(fetch_timings):
    first_fetch = fetch_timings['entries'][0]
    first_request_start = dt2milliseconds(first_fetch['startedDateTime'])
    for entry in fetch_timings['entries']:
        decorate_relative_timings(entry['timings'],
                                  entry['startedDateTime'],
                                  first_request_start)

for fetch_timings in dataset:
    decorate_all_timing_entries(fetch_timings)


# An entry looks like this now:

# In[7]:


dataset[0]['entries'][0]


# ### Sets of intervals
#
# Sets of intervals are cool! We can do all sorts of interesting things with them!

# In[8]:


def merge_sets_step(s1):
    # Take the first interval
    fi = s1[0]
    fi_start, fi_end = fi
    r_start = fi_start
    r_end = fi_end
    # Take the other intervals, and see if we can merge them
    for (_i, ai) in enumerate(s1[1:]):
        ai_start, ai_end = ai
        if ai_start <= r_end:
            # A merge is possible; use max so that an interval fully contained
            # in the current one does not shrink it.
            r_end = max(r_end, ai_end)
        else:
            # A merge is not possible. Since the intervals
            # are sorted by their starting point, the next
            # interval will start at a more distant place.
            # Finish by returning the new merged big interval
            # and the rest of the sorted set....
            return ((r_start, r_end), s1[_i+1:])
    # If I arrive here, everything merged into one big interval
    return ((r_start, r_end), [])

def merge_interval_sets(iterable_of_intervals):
    # Sort the intervals ...
    s1 = sorted(iterable_of_intervals, key=(lambda i: i[0]))
    si = s1
    disjoint_intervals = []
    disjoint_voids = []
    mi_end_prev = None
    while len(si) > 0:
        (merged_interval, si_next) = merge_sets_step(si)
        mi_start, mi_end = merged_interval
        if mi_end_prev is not None:
            disjoint_voids.append((mi_end_prev, mi_start))
        mi_end_prev = mi_end
        disjoint_intervals.append(merged_interval)
        si = si_next
    return (disjoint_intervals, disjoint_voids)
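# One extra edge case worth checking by hand (this input is my own, it is not part of the tests below): an interval fully contained inside an earlier one must not shrink the merged result. The expected answer is ([(0, 10), (11, 12)], [(10, 11)]).

# In[ ]:


# Hypothetical input: (1, 2) lies entirely inside (0, 10)
merge_interval_sets([(0, 10), (1, 2), (11, 12)])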
# Let's test the interval-merging functions a little.

# In[9]:


merge_sets_step([(0, 1), (2, 3)])


# In[10]:


merge_sets_step([(0, 1), (0.5, 1.5), (2, 3)])


# In[11]:


merge_sets_step([(0, 1), (0.5, 2.5), (2, 3)])


# In[12]:


merge_interval_sets([(0, 1), (0.5, 2.5), (2, 3)])


# In[13]:


merge_interval_sets([(0, 1), (0.5, 1.5), (2, 3), (3, 4), (4.01, 6)])


# In[14]:


def adv_state(state, token):
    # Small state machine used to subtract interval sets:
    #   S0: outside both sets, S1: inside `a` only,
    #   S2: inside both, S3: inside `b` only.
    # Tokens B1/E1 open/close an interval of `a`, B2/E2 one of `b`.
    if (state, token) == ("S0", "B1"): return "S1"
    if (state, token) == ("S0", "B2"): return "S3"
    if (state, token) == ("S1", "E1"): return "S0"
    if (state, token) == ("S1", "B2"): return "S2"
    if (state, token) == ("S2", "E2"): return "S1"
    if (state, token) == ("S2", "E1"): return "S3"
    if (state, token) == ("S3", "B1"): return "S2"
    if (state, token) == ("S3", "E2"): return "S0"
    raise AssertionError("Must-not-happen: " + str((state, token)))

def diff_interval_sets(a_set, b_set):
    r"""Computes the set difference a \ b."""
    all_intervals = []
    for (a, b) in a_set:
        all_intervals.append(("B1", a))
        all_intervals.append(("E1", b))
    for (a, b) in b_set:
        all_intervals.append(("B2", a))
        all_intervals.append(("E2", b))
    # Now sort by position
    all_intervals.sort(key=(lambda x: x[1]))
    state = 'S0'
    prev_position = None
    for (change, position) in all_intervals:
        #print((state, change, position, prev_position))
        new_state = adv_state(state, change)
        if state == 'S1' and new_state != 'S1':
            # We were inside `a` and outside `b`; emit that stretch.
            yield (prev_position, position)
        state = new_state
        prev_position = position


# In[15]:


list(diff_interval_sets(
    [(0, 1), (1.2, 3.4)],
    [(0.2, 0.5), (1.1, 3.0)]
))


# In[16]:


list(diff_interval_sets(
    [(-1, 2), (2.4, 8.4)],
    [(2.2, 3.5), (3.6, 3.7)]
))


# Looks good, let's continue.

# Number of data points
# ------------------------
#
# That is, how many files there are in our little DB.

# In[17]:


len(dataset)


# ## Number of requests per page

# In[18]:


counts = pd.DataFrame(
    list(len(d['entries']) for d in dataset),
    columns=['asset_count']
)


# In[19]:


counts[:5]


# In[20]:


counts_clean = counts.query('asset_count > 5')
counts_clean['asset_count'].describe()


# In[21]:


counts_clean.plot(kind='hist')


# In[22]:


fig = plt.figure()
ax = fig.add_subplot(111)
(a0, a1, patches) = ax.hist(counts_clean.as_matrix(),
                            bins=np.arange(0, 400, 25),
                            normed=True)
a1.sum()
ax.set_xlabel("Number of requests")
ax.set_ylabel("Normalized density")
ax.set_title("Distribution of requests per page")


# The number above should be used as our total number of data points:

# 1233
# ----------------------

# ## When the first request can actually start

# In[23]:


def extract_stoppers(entries_list):
    # The parts of the first request's timings that delay the first byte:
    # connection establishment, DNS, SSL handshake and server wait.
    first_entry = entries_list[0]['timings']
    connect_time = first_entry['connect']
    dns_time = first_entry['dns']
    ssl = first_entry.get('ssl', 0)
    wait = first_entry.get('wait')
    return {
        'connect': connect_time,
        'dns': dns_time,
        'ssl': ssl,
        'wait': wait
    }


# In[24]:


stoppers = list([extract_stoppers(d['entries']) for d in dataset])
stoppers_df = pd.DataFrame(stoppers)
stoppers_df[:5]


# In[25]:


stoppers_df_clean = stoppers_df.query('connect > 0')
stoppers_df_clean[:5]


# ### How the bare connection times are distributed
#
# This does not include DNS or SSL times.

# In[26]:


stoppers_df_clean['connect'].plot(kind='hist', bins=np.arange(0, 1000, 25))


# In[27]:


_data = stoppers_df_clean['connect'].as_matrix()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(_data, normed=True, bins=np.arange(0, 1000, 25))
ax.set_title("Connection time")
ax.set_xlabel("Milliseconds")
ax.set_ylabel("Normalized density")


# In[28]:


stoppers_df_clean.describe()


# There seem to be two modes. My guess is that some sites are in the U.S. (the same place where the browser that made the measurements is located) and some are in Europe.
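# To see the two modes more clearly, a kernel density estimate can be laid over the same data (a small sketch using the seaborn import from the top of the notebook; the bandwidth is left at its default).

# In[ ]:


fig = plt.figure()
ax = fig.add_subplot(111)
sns.kdeplot(stoppers_df_clean['connect'], ax=ax)
ax.set_title("Connection time, kernel density estimate")
ax.set_xlabel("Milliseconds")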
# ### How the total times are distributed for connections without SSL

# In[29]:


stoppers_nossl_clean = stoppers_df_clean.query('ssl == -1')
stoppers_nossl_clean[:5]


# In[30]:


full_nossl_stoppers = (
    stoppers_nossl_clean['connect'] + stoppers_nossl_clean['dns']
)
full_nossl_stoppers[:5]


# In[31]:


full_nossl_stoppers.describe(percentiles=[0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95])


# In[32]:


_data = full_nossl_stoppers.as_matrix()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(_data, normed=True, bins=np.arange(0, 1000, 25))
ax.set_title("Time for first byte of requests, no SSL")
ax.set_xlabel("Milliseconds")
ax.set_ylabel("Normalized density")


# ### How the total times are distributed for connections with SSL

# In[33]:


stoppers_ssl_clean = stoppers_df_clean.query('ssl > 0')
stoppers_ssl_clean[:5]


# #### SSL connection time

# In[34]:


stoppers_ssl_clean['ssl'].plot(kind='hist', bins=np.arange(0, 1000, 25))


# In[35]:


stoppers_ssl_clean['ssl'].describe()


# #### All the time that clients communicating via SSL need to wait before issuing the first request

# In[36]:


full_stoppers = (
    stoppers_ssl_clean['ssl'] + stoppers_ssl_clean['connect'] + stoppers_ssl_clean['dns']
)
full_stoppers[:5]


# In[37]:


full_stoppers.describe(percentiles=[0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95])


# That is, the mean time is well over half a second. The difference in means with respect to the connections that don't use SSL is 673 - 459 = 214 ms, very close to the mean SSL connection time.

# In[38]:


full_stoppers.plot(kind='hist', bins=np.arange(0, 1500, 25))


# In[39]:


_data = full_stoppers.as_matrix()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(_data, normed=True, bins=np.arange(0, 1000, 25))
ax.set_title("Time for first byte of requests, SSL")
ax.set_xlabel("Milliseconds")
ax.set_ylabel("Normalized density")


# ## How much time the browser waits

# In[40]:


first_entries_file = dataset[0]


# In[41]:


sum(e['timings']['wait'] for e in first_entries_file['entries'][20:])


# ### How much waiting time is in the first three seconds

# In[42]:


def total_waiting_for_entries(entries, up_to):
    """
    @param up_to: if the relative starting time of a request is greater
                  than this, that entry is skipped.
    """
    result = 0
    for e in entries:
        if e['timings']['rel_start'] < up_to:
            result += e['timings']['wait']
        else:
            # print(e['timings'])
            pass
    return result


# The '3000' below is given in milliseconds. That is, we total the "wait" time of the requests that start within the first three seconds.

# In[43]:


waitings = pd.Series([
    total_waiting_for_entries(entries['entries'], 3000)
    for entries in dataset
])


# In[44]:


waitings.describe()


# Remember, this waiting time is counted in parallel: if the browser issues 20 requests in the first three seconds, and each request has a waiting time of half a second, the result will be 20\*500 = 10000 milliseconds.

# What about the average waiting time?

# In[45]:


def average_waiting_for_entries(entries, up_to):
    """
    @param up_to: if the relative starting time of a request is greater
                  than this, that entry is skipped.
    """
    total = 0
    c = 0
    for e in entries:
        if e['timings']['rel_start'] < up_to:
            total += e['timings']['wait']
            c += 1
    return total / float(c)


# In[46]:


avg_waitings = pd.Series([
    average_waiting_for_entries(entries['entries'], 3000)
    for entries in dataset
])


# In[47]:


avg_waitings.describe()


# The number above is the average waiting time per request.
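# To make the total-versus-average distinction concrete, here is a tiny synthetic example (these entries are made up for illustration; they are not part of the dataset): three requests that all start inside the three-second window and each wait 500 ms total 1500 ms of waiting, while the average stays at 500 ms.

# In[ ]:


_fake_entries = [  # hypothetical entries with only the fields the helpers read
    {'timings': {'rel_start': 0,   'wait': 500}},
    {'timings': {'rel_start': 100, 'wait': 500}},
    {'timings': {'rel_start': 200, 'wait': 500}},
]
(total_waiting_for_entries(_fake_entries, 3000),
 average_waiting_for_entries(_fake_entries, 3000))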
# ### How much waiting time is there until the "load" event

# In[48]:


dataset[0]['pageTimings']


# In[49]:


def average_waiting_until_load(dts):
    total = 0
    c = 0
    up_to = dts['pageTimings']['onLoad']
    if up_to is None:
        return None
    for e in dts['entries']:
        if e['timings']['ends_receiving'] < up_to:
            total += e['timings']['wait']
            c += 1
    if c == 0:
        # Happens sometimes, if the onLoad event for some reason is triggered
        # before the page is finished fetching. Can happen for special responses,
        # e.g. redirects and the like.
        return None
    return total / float(c)

def total_waiting_time_until_load(dts):
    total = 0
    up_to = dts['pageTimings']['onLoad']
    if up_to is None:
        return None
    for e in dts['entries']:
        if e['timings']['ends_receiving'] < up_to:
            total += e['timings']['wait']
    return total


# In[50]:


avg_wait_till_load = pd.Series([average_waiting_until_load(dts)
                                for dts in dataset
                                if average_waiting_until_load(dts) is not None])


# In[51]:


avg_wait_till_load.describe()


# In[52]:


_data = avg_wait_till_load.as_matrix()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(_data, normed=True, bins=np.arange(0, 1000, 25))
ax.set_title("Average time waiting")
ax.set_xlabel("Milliseconds")
ax.set_ylabel("Normalized density")


# In[53]:


tot_wait_till_load = pd.Series([total_waiting_time_until_load(dts) for dts in dataset])


# In[54]:


tot_wait_till_load.describe()


# ### How much time the browser is exclusively waiting

# In[55]:


def wait_intervals_until_load(dts):
    up_to = dts['pageTimings']['onLoad']
    if up_to is None:
        return
    for e in dts['entries']:
        timings = e['timings']
        if timings['ends_receiving'] < up_to:
            connect = timings.get('connect', 0)
            dns = timings.get('dns', 0)
            blocked = timings.get('blocked', 0)
            rel_start = timings.get('rel_start')
            # Chrome uses '-1' to signal that the timing doesn't apply
            if connect < 0:
                connect = 0
            if dns < 0:
                dns = 0
            ssl = timings.get('ssl', 0)
            if ssl < 0:
                ssl = 0
            send = timings.get('send')
            wait = timings.get('wait')
            starts_waiting = rel_start + connect + dns + ssl + send
            ends_waiting = starts_waiting + wait
            yield (starts_waiting, ends_waiting)

def disjoint_wait_intervals_until_load(dts):
    intervals = list(wait_intervals_until_load(dts))
    merged_intervals = merge_interval_sets(intervals)
    # The second part of the answer contains the voids; here we are interested
    # only in the parts where there was waiting...
    return merged_intervals[0]
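# On made-up intervals (my own numbers, just for illustration), the "exclusively waiting" idea is the set difference that `diff_interval_sets` computes: if the browser waits during (0, 300) and (500, 900) but receives data during (250, 600), it is only waiting during (0, 250) and (600, 900).

# In[ ]:


list(diff_interval_sets([(0, 300), (500, 900)], [(250, 600)]))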
# What about the intervals where data traffic happens?

# In[56]:


def deduce_interval_optimistic(timings, starts_dtval):
    """For a given request/response pair, returns the interval
    where data is being transferred to the browser.
    """
    mseconds_start = timings['rel_start']
    connect = timings.get('connect', 0)
    dns = timings.get('dns', 0)
    blocked = timings.get('blocked', 0)
    # Chrome uses '-1' to signal that the timing doesn't apply
    if connect < 0:
        connect = 0
    if dns < 0:
        dns = 0
    ssl = timings.get('ssl', 0)
    if ssl < 0:
        ssl = 0
    send = timings.get('send')
    wait = timings.get('wait')
    receive = timings.get('receive')
    starts_receiving = mseconds_start + connect + dns + ssl + send + wait
    ends_receiving = starts_receiving + receive
    return (starts_receiving, ends_receiving)

def summarize_intervals_from_entries(dts):
    """
    Returns a set of merged intervals where there is data transfer from the
    server to the browser.
    """
    entries = dts['entries']
    if len(entries) <= 2:
        return None
    # Deduce one receive interval per entry, then merge the overlapping ones.
    all_file_intervals = [
        deduce_interval_optimistic(entry['timings'], entry['startedDateTime'])
        for entry in entries
    ]
    # Two complementary sets of non-overlapping intervals:
    # the first set represents when data is being received from the server,
    # the second set represents when no data is being received.
    data_traffic_intervals, void_intervals = merge_interval_sets(all_file_intervals)
    return data_traffic_intervals


# Let's compute the intervals for the first dataset entry, just to see that the calculations make sense:

# In[57]:


wait_intervals = disjoint_wait_intervals_until_load(dataset[0])
wait_intervals[:4]


# In[58]:


data_transfer_intervals = summarize_intervals_from_entries(dataset[0])
data_transfer_intervals[:4]


# In[59]:


list(diff_interval_sets(wait_intervals, data_transfer_intervals))[:4]


# Exclusively-waiting total (before the load event):

# In[60]:


def exclusively_waiting_total(dts):
    wait_intervals = disjoint_wait_intervals_until_load(dts)
    data_transfer_intervals = summarize_intervals_from_entries(dts)
    if data_transfer_intervals is None:
        # Pages with too few entries have no summary; keep them out of the stats.
        return float('nan')
    exclusively_waiting_intervals = diff_interval_sets(wait_intervals,
                                                       data_transfer_intervals)
    s = 0.0
    for (a, b) in exclusively_waiting_intervals:
        s += (b - a)
    return s


# In[61]:


exclusively_waiting_total(dataset[5])


# Now we can check the distribution for everybody:

# In[62]:


waiting_series = pd.Series([exclusively_waiting_total(dts) for dts in dataset])


# In[63]:


waiting_series.describe()


# How big is that waiting time compared to the load time?

# In[84]:


def proportion_just_waiting(dts):
    exclusively_waiting = exclusively_waiting_total(dts)
    load_time = dts['pageTimings']['onLoad']
    if load_time is None:
        return float('nan')
    return exclusively_waiting / load_time

proportion_just_waiting(dataset[12])


# In[86]:


waiting_proportion = pd.Series([proportion_just_waiting(dts) for dts in dataset])
waiting_proportion.describe()


# In[98]:


_data = waiting_proportion.as_matrix()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.hist(_data, normed=True, bins=np.arange(0, 1, 0.05))
ax.set_title("Time exclusively waiting")
ax.set_xlabel("Proportion of load time")
ax.set_ylabel("Normalized density")


# ## How much remains unused
#
# This section explores which part of the loading time of a web page goes unused, meaning that no data is being received by the browser.

# ### Deducing the intervals from the timings
#
# Using the 'timings' key of each dataset entry, together with its start time, we deduce an interval as a tuple with start and end times in milliseconds.

# The function `summarize_intervals_from_entries` above takes all those intervals and merges them. However, instead of looking at the time when data is being received, we can also look at the parts when data is **not** being received. We call those intervals, when the browser is not receiving data from the network, *voids*.

# This is how it looks for a single entry in the dataset:

# In[67]:


file_entries = dataset[0]
summary = summarize_intervals_from_entries(file_entries)
summary[:5]


# In the list above, the numbers represent milliseconds since the start of the page fetch. Each tuple is a separate interval where the browser is receiving data.
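# The voids themselves come from the second element of the pair that `merge_interval_sets` returns. As a quick illustration (a small sketch reusing the helpers above on the same first dataset entry), this is the total void time for that page:

# In[ ]:


_all_intervals = [deduce_interval_optimistic(e['timings'], e['startedDateTime'])
                  for e in file_entries['entries']]
_, _voids = merge_interval_sets(_all_intervals)
sum(b - a for (a, b) in _voids)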
# ## Shimmercat data

# In[87]:


def clean_entry(src_entry):
    # Keep only the fields that the analysis above needs.
    trg_entry = {}
    trg_entry['timings'] = src_entry['timings']
    trg_entry['startedDateTime'] = src_entry['startedDateTime']
    trg_entry['transferSize'] = src_entry['response']['_transferSize']
    return trg_entry

def clean_har_record(har_record):
    entries = har_record['entries']
    clean_entries = []
    is_first_entry = True
    for e in entries:
        ee = clean_entry(e)
        if is_first_entry:
            # Take the page timings from the page the first entry refers to.
            timing_ref = e['pageref']
            for page in har_record['pages']:
                if page['id'] == timing_ref:
                    timing_data = page['pageTimings']
            is_first_entry = False
        clean_entries.append(ee)
    obj = {
        'entries': clean_entries,
        'pageTimings': timing_data
    }
    return obj

def fetch_and_standardize_entries(filename):
    with open(filename) as fin:
        data = json.load(fin)['log']
    decorate_all_timing_entries(data)
    o_data = clean_har_record(data)
    return o_data


# In[88]:


shimmercat_data = fetch_and_standardize_entries('data/www.shimmercat.com.har')
average_waiting_until_load(shimmercat_data)


# In[89]:


proportion_just_waiting(shimmercat_data)


# ## Data for some other sites

# In[90]:


wikipedia_data = fetch_and_standardize_entries('data/en.wikipedia.org.har')
proportion_just_waiting(wikipedia_data)


# In[91]:


so_data = fetch_and_standardize_entries('data/stackoverflow.com.har')
proportion_just_waiting(so_data)


# In[92]:


mozilla_data = fetch_and_standardize_entries('data/developer.mozilla.org.har')
proportion_just_waiting(mozilla_data)


# In[93]:


exclusively_waiting_total(mozilla_data)


# In[94]:


exclusively_waiting_total(shimmercat_data)


# In[95]:


exclusively_waiting_total(so_data)


# In[96]:


exclusively_waiting_total(wikipedia_data)


# In[ ]:
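# To compare the four sites side by side, the same two metrics can be collected into a single frame (a small convenience sketch, not part of the original analysis; the variable names are the ones defined in the cells above).

# In[ ]:


pd.DataFrame({
    name: {'proportion_just_waiting': proportion_just_waiting(d),
           'exclusively_waiting_ms': exclusively_waiting_total(d)}
    for name, d in [('www.shimmercat.com', shimmercat_data),
                    ('en.wikipedia.org', wikipedia_data),
                    ('stackoverflow.com', so_data),
                    ('developer.mozilla.org', mozilla_data)]
}).T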