My scratch notebook for exploring the mybinder analytics archive
Download raw events with gsutil (very quick after first run, ~2 minutes to start):
mkdir -p ./events/
gsutil -m rsync -r gs://binder-events-archive/ ./events/
Repack data with pandas for faster re-loading (~3 minutes for fresh data):
Load files with pandas and plot (~1s)
import datetime
import pandas as pd
import altair as alt
from pathlib import Path
from functools import lru_cache
from urllib.request import urlretrieve
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
# Columns with a small fixed set of values — stored as pandas categoricals.
category_columns = ["schema", "version", "provider", "status", "origin"]

def categoricalize(df):
    """Cast the known low-cardinality columns to ``category`` dtype.

    Categorical storage is cheaper in memory and faster for most
    processing.  Returns *df* unchanged when none of the known columns
    are present.
    """
    to_cast = {col: "category" for col in category_columns if col in df.columns}
    return df.astype(to_cast) if to_cast else df
def uncategoricalize(df):
    """Cast categorical columns back to their underlying category dtype.

    groupby is _super_ slow on categoricals
    (https://github.com/pandas-dev/pandas/issues/32976), so callers
    revert before grouping.  Returns *df* unchanged when there are no
    categorical columns.
    """
    to_cast = {
        col: dtype.categories.dtype
        for col, dtype in df.dtypes.items()
        if isinstance(dtype, pd.CategoricalDtype)
    }
    return df.astype(to_cast) if to_cast else df
@lru_cache(maxsize=60)
def _read_df(path):
    """Load one jsonl events file as a categorical DataFrame, cached by path."""
    return categoricalize(pd.read_json(path, lines=True))
@lru_cache(maxsize=10)
def _read_dfs(*paths):
    """Load and merge one or more event files, cached on the path tuple."""
    frames = [_read_df(p) for p in paths]
    if len(frames) == 1:
        return frames[0]
    merged = pd.concat(frames, ignore_index=True)
    # re-apply categoricals so categories new to any one frame are kept,
    # then put events back in chronological order
    return categoricalize(merged).sort_values(["timestamp"]).reset_index(drop=True)
%%time
# Layout: raw jsonl under ./events, aggregates under ./aggregated/<window>/<agg>/
events_dir = Path("events")
agg_dir = Path("aggregated")
agg_dir.mkdir(exist_ok=True)
daily_dir = agg_dir / "daily"
weekly_dir = agg_dir / "weekly"
monthly_dir = agg_dir / "monthly"
# each window gets by-hour and by-day resolutions (e.g. daily-by-hour,
# weekly-by-day)
for window_dir in (daily_dir, weekly_dir, monthly_dir):
    window_dir.mkdir(exist_ok=True)
    (window_dir / "by-hour").mkdir(exist_ok=True)
    (window_dir / "by-day").mkdir(exist_ok=True)
# strftime templates for per-date file names
jsonl_fmt = f"{events_dir}/events-%Y-%m-%d.jsonl"
daily_fmt = f"{daily_dir}/daily-%Y-%m-%d.feather"
weekly_fmt = f"{weekly_dir}/weekly-%Y-%m-%d"
def rounded_count(df, freq="H"):
    """Count events per rounded timestamp, grouped on every column but `ref`.

    Timestamps are rounded to *freq* ("H" hourly, "D" daily) and the
    result has one row per distinct combination with its count in `n`.
    """
    # work on a copy so the caller's frame doesn't grow an `n` column
    counted = df.copy()
    counted["n"] = 1
    counted["timestamp"] = counted.timestamp.dt.round(freq)
    # group on everything except the counter column and `ref`
    group_cols = list(set(counted.columns).difference({"ref", "n"}))
    # drop categoricals first: groupby is crazy slow on categoricals —
    # much faster to copy the whole df multiple times!
    return uncategoricalize(counted).groupby(group_cols).n.count().reset_index()
def _agg_and_save(src_list, window, date_str, debug=False):
    """Aggregate the events in `src_list` into hourly and daily count files.

    Writes two feather files under aggregated/<window>/{by-hour,by-day}/,
    skipping work when both outputs already exist or when any source file
    is missing (an incomplete window).

    Parameters:
        src_list: jsonl source paths covering the window
        window: output subdirectory name ("daily", "weekly", "monthly")
        date_str: date label embedded in the output file names
        debug: print progress information
    """
    for src in src_list:
        # Path.exists, not os.path.exists: `os` is never imported in this
        # notebook, so the original raised NameError when a file was missing
        if not Path(src).exists():
            print(f"Missing file to aggregate by {window} for {date_str}: {src}")
            return
    dest_fmt = str(agg_dir.joinpath("{window}/{agg}/{window}-{agg}-{date_str}.feather"))
    dest_hourly = dest_fmt.format(window=window, agg="by-hour", date_str=date_str)
    dest_daily = dest_fmt.format(window=window, agg="by-day", date_str=date_str)
    # skip windows that were already aggregated on a previous run
    if Path(dest_hourly).exists() and Path(dest_daily).exists():
        if debug:
            print(f"already have {dest_hourly} and {dest_daily}")
        return
    df = _read_dfs(*src_list)
    if debug:
        # fixed: the second placeholder used to repeat dest_hourly
        print(f"Aggregating {len(df)} rows to {dest_hourly} and {dest_daily}")
    h = rounded_count(df, freq="H")
    h.to_feather(dest_hourly)
    if debug:
        print(
            f"Wrote {len(h)}/{len(df)} ({len(h) / len(df):.0%}) rows to {dest_hourly}"
        )
    d = rounded_count(df, freq="D")
    d.to_feather(dest_daily)
    if debug:
        print(f"Wrote {len(d)}/{len(df)} ({len(d) / len(df):.0%}) rows to {dest_daily}")
def aggregate_day(day):
    """Aggregate one calendar day of events into hourly/daily counts."""
    date_str = day.strftime("%Y-%m-%d")
    # one source file: that day's jsonl event log
    _agg_and_save([day.strftime(jsonl_fmt)], "daily", date_str)
def aggregate_week(day):
    """Aggregate the ISO week containing *day* into hourly/daily counts."""
    iso = day.isocalendar()
    # walk back to the ISO Monday of this week (weekday 1 == Monday)
    monday = day - datetime.timedelta(days=iso.weekday - 1)
    # NOTE(review): %W is the Monday-based week-of-year, which can differ
    # from the ISO week number around new year — confirm the label is intended
    date_str = day.strftime("%Y-w%W")
    src_list = []
    for offset in range(7):
        current = monday + datetime.timedelta(days=offset)
        # sanity check: all seven days fall in the same ISO week
        assert current.isocalendar().week == iso.week
        src_list.append(current.strftime(jsonl_fmt))
    _agg_and_save(src_list, "weekly", date_str)
def aggregate_month(day):
    """Aggregate the calendar month containing *day* into hourly/daily counts."""
    month = day.month
    # start from the first of the month and step forward one day at a time
    current = datetime.date(year=day.year, month=month, day=1)
    date_str = current.strftime("%Y-%m")
    src_list = []
    while current.month == month:
        src_list.append(current.strftime(jsonl_fmt))
        current += datetime.timedelta(days=1)
    _agg_and_save(src_list, "monthly", date_str)
def aggregate(start_date=datetime.date(2019, 1, 1), end_date=None):
    """Walk every day in [start_date, end_date) and write daily, weekly,
    and monthly aggregate files (weekly on Sundays, monthly on last days).

    Parameters:
        start_date: first day to aggregate
        end_date: one past the last day; defaults to today *at call time*

    Already-aggregated windows are skipped by _agg_and_save, so this is
    cheap to re-run.
    """
    if end_date is None:
        # the original `end_date=datetime.date.today()` default was
        # evaluated once at definition time and went stale in a
        # long-lived session; resolve it per call instead
        end_date = datetime.date.today()
    day = start_date
    # date subtraction yields a timedelta with an exact .days count
    total_days = (end_date - start_date).days
    days = tqdm(unit="day", desc="days", total=total_days)
    weeks = tqdm(unit="week", desc="weeks", total=total_days // 7)
    months = tqdm(unit="month", desc="months", total=total_days // 31)
    while day < end_date:
        aggregate_day(day)
        # Sunday (ISO weekday 7) closes out a week
        if day.isocalendar().weekday == 7:
            aggregate_week(day)
            weeks.update(1)
        # last day of the month closes out a month
        if (day + datetime.timedelta(days=1)).month != day.month:
            aggregate_month(day)
            months.update(1)
        day += datetime.timedelta(days=1)
        days.update(1)
    days.close()
    weeks.close()
    months.close()
aggregate()
days: 0%| | 0/1592 [00:00<?, ?day/s]
weeks: 0%| | 0/227 [00:00<?, ?week/s]
months: 0%| | 0/51 [00:00<?, ?month/s]
Missing file to aggregate by daily for 2023-05-11: events/events-2023-05-11.jsonl CPU times: user 61.5 ms, sys: 66.2 ms, total: 128 ms Wall time: 127 ms
!du -hs events
7.3G events
!du -hs aggregated/*
333M aggregated/daily 270M aggregated/monthly 283M aggregated/weekly
import matplotlib.pyplot as plt
%%time
def get_monthly_data(by="day"):
    """Load all monthly aggregates (resolution "day" or "hour") as one frame."""
    parts = [pd.read_feather(path) for path in monthly_dir.glob(f"by-{by}/*.feather")]
    combined = pd.concat(parts).sort_values("timestamp").reset_index(drop=True)
    return categoricalize(combined)
def get_weekly_data(by="day"):
    """Load all weekly aggregates (resolution "day" or "hour") as one frame."""
    parts = [pd.read_feather(path) for path in weekly_dir.glob(f"by-{by}/*.feather")]
    combined = pd.concat(parts).sort_values("timestamp").reset_index(drop=True)
    return categoricalize(combined)
df = get_weekly_data()
# Events predating the `origin` field came from the original GKE deployment.
# Chained `df.origin.fillna(..., inplace=True)` is the deprecated
# chained-assignment pattern (a no-op under pandas copy-on-write);
# assign the column back instead.
df["origin"] = df["origin"].fillna("gke.mybinder.org")
CPU times: user 1.59 s, sys: 298 ms, total: 1.89 s Wall time: 1.9 s
len(df)
2399997
df.n.sum()
29639742
%%time
uncategoricalize(df).groupby("provider").n.sum()
CPU times: user 194 ms, sys: 31.8 ms, total: 225 ms Wall time: 225 ms
provider Dataverse 972 Figshare 345 Gist 423476 Git 357918 GitHub 28725578 GitLab 127988 Hydroshare 539 Zenodo 2926 Name: n, dtype: int64
# Aliases for each federation member's canonical hostname.
origins = {
    'binder.mybinder.turing.ac.uk': "turing.mybinder.org",
    "binder.mybinder.ovh": "ovh.mybinder.org",
    "ovh2.mybinder.org": "ovh.mybinder.org",
    "ovh-test.mybinder.org": "ovh.mybinder.org",
    "notebooks.gesis.org": "gesis.mybinder.org",
    "notebooks-test.gesis.org": "gesis.mybinder.org",
    "gke2.mybinder.org": "gke.mybinder.org",
    "gke1.mybinder.org": "gke.mybinder.org",
}

def _canonical_origin(origin):
    """Collapse a raw origin onto its canonical hostname (unknowns pass through)."""
    return origins.get(origin, origin)

# federation member is the first hostname component, e.g. "gke", "ovh"
df["federation"] = df.origin.apply(_canonical_origin).str.split(".").str[0]
list(df.origin.unique()), list(df.federation.unique())
(['gke.mybinder.org', 'ovh.mybinder.org', 'binder.mybinder.ovh', 'notebooks.gesis.org', 'gke.mybinder.org:443', 'turing.mybinder.org', 'binder.mybinder.turing.ac.uk', 'gesis.mybinder.org', 'ovh.mybinder.org:8893', 'gke2.mybinder.org', 'gke1.mybinder.org', 'binder-staging.mybinder.turing.ac.uk', 'ovh-test.mybinder.org', 'ovh2.mybinder.org', 'notebooks-test.gesis.org'], ['gke', 'ovh', 'gesis', 'turing', 'binder-staging'])
jovian.ml flooded the data with auto-generated specs that look unique but are not really distinct repos, so collapse them to a single placeholder:
df.loc[df.spec.str.contains("jovian.ml"), "spec"] = "jovian.ml/$ref"
%%time
uncategoricalize(df).groupby("provider").n.sum().sort_values(ascending=False)
CPU times: user 222 ms, sys: 31 ms, total: 253 ms Wall time: 251 ms
provider GitHub 28725578 Gist 423476 Git 357918 GitLab 127988 Zenodo 2926 Dataverse 972 Hydroshare 539 Figshare 345 Name: n, dtype: int64
print(f"{df.n.sum():,d}")
29,639,742
monthly_count = df.groupby([df.timestamp.dt.strftime("%Y-%m")]).n.sum().cumsum().reset_index()
monthly_count.head()
timestamp | n | |
---|---|---|
0 | 2018-12 | 2364 |
1 | 2019-01 | 345906 |
2 | 2019-02 | 735415 |
3 | 2019-03 | 1169049 |
4 | 2019-04 | 1594940 |
# line chart of the cumulative session count over time
alt.Chart(
    monthly_count,
    title="Total user sessions on mybinder.org",
    width=300,
    height=300,
).mark_line().encode(
    x=alt.X("timestamp:T", title="date"),
    y=alt.Y("n", title="sessions"),
)
Create derivative 'repo' column, stripping unresolved ref from spec
# start from the full spec, then strip ref components per provider
df["repo"] = df.spec
# NOTE(review): assumes Git/GitLab specs put the repo in the first
# "/"-separated segment (repo id, then ref) — confirm against the launcher
strip_spec = df.provider.isin({"Git", "GitLab"})
df.loc[strip_spec, ["repo"]] = df[strip_spec].spec.str.split("/", n=1).str[0]
# GitHub/Gist specs are owner/repo/ref — keep the first two components
github = df.provider.isin({"GitHub", "Gist"})
df.loc[github, ["repo"]] = df[github].spec.str.split("/", n=2).str[:2].str.join("/")
print(f"Total unique repos ever: {len(df.repo.unique()):,d}")
Total unique repos ever: 144,700
print(f"Total unique repos in 2022: {len(df[df.timestamp.dt.year==2022].repo.unique()):,d}")
Total unique repos in 2022: 45,176
len(df[["provider", "repo"]].apply(lambda row: f"{row.provider}:{row.repo}", axis=1).unique())
144839
df.provider.value_counts()
GitHub 2222811 Git 105067 GitLab 38030 Gist 30902 Zenodo 1993 Dataverse 745 Hydroshare 277 Figshare 172 Name: provider, dtype: int64
df.repo.value_counts().head(10)
jovian.ml 46855 jupyterlab/jupyterlab-demo 10891 ipython/ipython-in-depth 10137 fonsp/pluto-on-binder 8576 binder-examples/requirements 7552 binder-examples/r 7313 scikit-learn/scikit-learn 6355 explosion/spacy-io-binder 4677 RubyData/binder 4557 QuantStack/xeus-cling 4515 Name: repo, dtype: int64
#without_jovyan = df[~df.spec.str.contains("jovian.ml")]
#d = without_jovyan
monthly = df.groupby([df.timestamp.dt.strftime("%Y-%m"), 'repo']).n.sum().reset_index().groupby('timestamp').repo.count().reset_index()
monthly.tail()
timestamp | repo | |
---|---|---|
49 | 2023-01 | 8852 |
50 | 2023-02 | 8860 |
51 | 2023-03 | 8929 |
52 | 2023-04 | 8471 |
53 | 2023-05 | 1728 |
# bar chart of unique repos per month; drop the last (partial) month
alt.Chart(monthly[:-1], title="Monthly unique repositories").mark_bar().encode(x="timestamp:T", y="repo:Q")
# same aggregation, bucketed by year
yearly = df.groupby([df.timestamp.dt.strftime("%Y"), 'repo']).n.sum().reset_index().groupby('timestamp').repo.count().reset_index()
alt.Chart(yearly).mark_bar().encode(x="timestamp", y="repo")
%%time
# session counts per timestamp per federation member
counts = (
    uncategoricalize(df).groupby(["timestamp", "federation"]).n.sum()
)
# pivot federation members into columns (one column per member)
counts = counts.unstack() # .fillna(0)
# 7-day rolling mean smooths out the weekly usage cycle
seven_day_counts = counts.rolling("7D").mean()
CPU times: user 321 ms, sys: 60.3 ms, total: 381 ms Wall time: 381 ms
# one line per federation member
seven_day_counts.plot()
plt.title("Daily user sessions (7-day average)")
# counts.plot(kind="area", stacked=True)
Text(0.5, 1.0, 'Daily user sessions (7-day average)')
# stacked area: total height is the federation-wide daily total
seven_day_counts.plot(kind="area", stacked=True)
plt.title("Daily user sessions (cumulative)")
Text(0.5, 1.0, 'Daily user sessions (cumulative)')