#!/usr/bin/env python
# coding: utf-8

# # Optimizing JupyterHub startup time
#
# This is some analysis of timings for optimizing JupyterHub startup time, while avoiding shifting too much of that time onto individual subsequent requests.
#
# Specifically for [jupyterhub#4496](https://github.com/jupyterhub/jupyterhub/pull/4496)

# In[118]:


import pandas as pd
import matplotlib.pyplot as plt


# Each run is stored in a CSV file, with four measurements:
#
# - startup: the total startup time of JupyterHub
# - spawn: the time for init_spawners only (typically dominates startup)
# - first_users: the time to complete the first GET /api/users request
# - second_users: the time to complete the second GET /api/users request,
#   which should be typical of subsequent requests, e.g. by the idle culler

# In[49]:


stable = pd.read_csv('7ae1b0b.csv')


# In[50]:


stable.tail()


# The `generate-data` script runs `startup-perf` to gather data for each commit in the PR.

# In[119]:


import os
from subprocess import check_output

jupyterhub_dir = os.path.expanduser("~/dev/jpy/jupyterhub")
BASE_REF = "origin/main"
BRANCH = "quicker-startup"

all_refs = [
    check_output(
        ["git", "show", "-s", "--format=%H", BASE_REF],
        cwd=jupyterhub_dir,
        text=True,
    ).strip()
]
all_refs.extend(
    check_output(
        ["git", "rev-list", f"{BASE_REF}..{BRANCH}"],
        cwd=jupyterhub_dir,
        text=True,
    ).split()[::-1]
)
# ignore some commits that have bugs preventing them from producing valid data
broken = set()
all_refs


# In[120]:


csvs = [f"{ref[:7]}.csv" for ref in all_refs if ref not in broken]
csvs


# Load the CSV files into data frames

# In[121]:


names = []
frames = []
for csv in csvs:
    if os.path.exists(csv):
        print(f"Loading {csv}")
        name, _ = os.path.splitext(csv)
        names.append(name)
        frames.append(pd.read_csv(csv))
stable = frames[0]
latest = frames[-1]


# We can plot our three metrics for a given commit:

# In[122]:


def plot_active(df, active=0):
    df = df[df.active == active]
    # df.plot(x='users', y='startup')
    # plt.title("Startup times")
    df.plot(
        x='users',
        y=['startup', 'first_users', 'second_users'],
        secondary_y=['startup'],
        style='-o',
    )
    # plt.title("GET /api/users")


plot_active(stable)


# Plot startup time vs active users. This is the biggest thing we are changing.

# In[123]:


def plot_vs_running(df):
    user_counts = df.users.unique()
    for u in user_counts:
        data = df[df.users == u]
        plt.semilogx(data.running + 1, data.startup, '-o', label=f'{u} users')
    plt.legend(loc=0)
    plt.title("Startup time")
    plt.xlabel("active users")
    plt.ylabel("Seconds")


# In 4.1, startup has a high cost per user (up to 50 seconds for 5000 active users!),
# as well as per active user.

# In[124]:


plot_vs_running(stable)


# With the latest update, startup is much faster, and especially quick when few users were running at startup time (typically the case).

# In[83]:


plot_vs_running(frames[-1])


# Now we can try to compare our three metrics (startup time, first GET /api/users, second GET /api/users) across the commits.
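# (A minimal sketch, not part of the original analysis: one convenient way to line the commits up is to stack the per-commit frames into a single long DataFrame keyed by a `commit` column, using the `names` and `frames` lists built above.)

# In[ ]:


# illustrative only: one long frame with a commit column, for cross-commit grouping
combined = pd.concat(
    [df.assign(commit=name) for name, df in zip(names, frames)],
    ignore_index=True,
)
# e.g. worst-case startup time per commit
combined.groupby("commit")["startup"].max()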
# In[125]:


import altair as alt

df = frames[-1]
alt.Chart(df).mark_line(
    point={
        "filled": False,
        "fill": "white",
    }
).encode(
    x=alt.X("running", title="% active"),  # scale=alt.Scale(type="log")
    y=alt.Y("startup", title="startup time (s)"),
    color="users:O",
)


# In[127]:


column_labels = {
    "spawn": "Spawner initialization",
}


# In[128]:


def plot_compare(frames, labels, y):
    baseline = frames[0]
    baseline_name = labels[0]
    labels = labels[1:]
    if not isinstance(y, list):
        y = [y]
    for col in y:
        for active in baseline.active.unique():
            relative = baseline[['users', 'active', 'running']].copy()
            for df, name in zip(frames[1:], labels):
                relative[name] = baseline[col] / df[col]
            relative[relative.active == active].plot(x='users', y=labels)
            ax = plt.gca()
            ax.set_yscale('log')
            column_label = column_labels.get(col, col)
            plt.title(f"{column_label} speedup with {active:.0%} active")
            plt.ylabel(f"speedup vs {baseline_name}")


plot_compare(frames, names, ['startup', 'spawn'])


# In[129]:


def bar_compare(frames, names, col, users=5000):
    data = []
    for name, df in zip(names, frames):
        df = df[df.users == users]
        data.append(
            [
                name,
                df[df.active == 0][col].max(),
                df[df.active == 0.25][col].max(),
                df[df.active == 1][col].max(),
            ]
        )
    df = pd.DataFrame(data, columns=['commit', '0% active', '25% active', '100% active'])
    ax = plt.gca()
    df.plot.bar(ax=ax, x='commit')
    ax.set_yscale('log')
    label = column_labels.get(col, col)
    ax.set_title(f"{label} time for {users} users")
    ax.set_ylabel("seconds")
    ymin, ymax = ax.get_ylim()
    ax.set_ylim(min(ymin, 1e-1), max(ymax, 1e1))


# In[130]:


bar_compare(frames, names, 'startup')
plt.figure()
bar_compare(frames, names, 'spawn')
# plt.figure()
# bar_compare(frames, names, 'first_users')
# plt.figure()
# bar_compare(frames, names, 'second_users')


# Summary of results:
#
# The benchmarks were run with a 1ms penalty for each db execution, to simulate database queries over a network:
#
# - O(N) queries can be quite costly
# - we've removed O(N) queries during startup,
#   which were both in role assignment (O(total users)) and spawner initialization (O(running servers))
# - Startup time for 5000 users with none running is reduced from ~20 seconds to less than 1 second
# - Startup time for 5000 _active_ users is reduced from ~40 seconds to ~2 seconds
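# (A hedged sketch, not code from this analysis: the 1ms-per-query penalty described above could be injected with a SQLAlchemy event hook that sleeps before every statement; the engine URL below is a hypothetical stand-in for the hub database.)

# In[ ]:


import time

from sqlalchemy import create_engine, event

# hypothetical engine standing in for JupyterHub's database
engine = create_engine("sqlite:///jupyterhub.sqlite")


@event.listens_for(engine, "before_cursor_execute")
def _add_latency(conn, cursor, statement, parameters, context, executemany):
    # simulate ~1ms of network latency per database execution
    time.sleep(0.001)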