#!/usr/bin/env python
# coding: utf-8

# In[1]:


import keyring


# In[2]:


from getpass import getpass

# load the API token from the system keyring, prompting (and saving) on first run
token = keyring.get_password("readthedocs.org", "_api")
if not token:
    token = getpass("readthedocs.org token: ")
    keyring.set_password("readthedocs.org", "_api", token)


# In[3]:


import requests
import requests_cache
from yarl import URL


# In[4]:


# cached session, so re-running the notebook doesn't re-fetch every API page
s = requests_cache.CachedSession()
s.headers["Authorization"] = f"Token {token}"
readthedocs_api = URL("https://readthedocs.org/api/v3/")


# In[5]:


def list_projects(url=readthedocs_api / "projects"):
    """Yield all projects, following pagination via the 'next' link"""
    r = s.get(url)
    r.raise_for_status()
    resp = r.json()
    for project in resp["results"]:
        yield project
    if resp["next"]:
        yield from list_projects(resp["next"])


projects = list(list_projects())


# In[6]:


len(projects)


# In[16]:


projects[0]


# In[17]:


project_names = [p["slug"] for p in projects]


# In[9]:


from pathlib import Path
import json

from playwright.async_api import async_playwright

cookie_path = Path("cookies.json")
login_url = "https://readthedocs.org/accounts/login/?next=/dashboard/"
dashboard_url = "https://readthedocs.org/dashboard/"


async def login() -> list[dict]:
    """Login to readthedocs.org and save cookies"""
    if cookie_path.exists():
        with cookie_path.open() as f:
            # TODO: check if still valid (see the cookies_look_valid sketch below)
            return json.load(f)
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()
        await page.goto(login_url)
        print("Login to readthedocs.org ...")
        # wait (up to 2 minutes) for the user to finish logging in interactively
        await page.wait_for_url(dashboard_url + "*", timeout=120_000)
        cookies = await browser.contexts[0].cookies()
    print(f"Saving cookies to {cookie_path}")
    with cookie_path.open("w") as f:
        json.dump(cookies, f)
    return cookies


# In[10]:


cookies = await login()


# In[11]:


project_list = [
    "jupyterhub",
    "oauthenticator",
    "zero-to-jupyterhub",
    "jupyterhub-kubespawner",
]


# In[22]:


stats_dir = Path("stats")
stats_dir.mkdir(exist_ok=True)


async def download_stats(project_name: str):
    """Download the traffic and search analytics data for one project"""
    cookies = await login()
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=False)
        page = await browser.new_page()
        await browser.contexts[0].add_cookies(cookies)
        for kind in ("traffic", "search"):
            url = URL(dashboard_url) / project_name / f"{kind}-analytics"
            await page.goto(str(url))
            btn = page.get_by_text("Download all data", exact=True)
            async with page.expect_download() as download_info:
                await btn.click()
            download = await download_info.value
            dest = stats_dir / download.suggested_filename
            print(f"Downloading {dest}")
            await download.save_as(dest)


# In[23]:


# manually derived from the project list above
project_names = [
    "binderhub",
    "ipykernel",
    "ipyparallel",
    "ipython",
    "ipywidgets",
    "jupyter",
    "jupyter-client",
    "jupyter-console",
    "jupyter-core",
    "jupyter-docker-stacks",
    "jupyterhub",
    "jupyterhub-deploy-teaching",
    "jupyterhub-dockerspawner",
    "jupyterhub-grafana",
    "jupyterhub-kubespawner",
    "jupyterhub-python-repo-template",
    "jupyterhub-team-compass",
    "jupyterhub-traefik-proxy",
    "jupyterhub-tutorial",
    "jupyter-notebook",
    "jupyter-server",
    "jupyter-server-proxy",
    "jupyter-software-steering-council-team-compass",
    "ltiauthenticator",
    "mybinder-sre",
    "nbconvert",
    "nbdime",
    "nbformat",
    "nbgitpuller",
    "oauthenticator",
    "pytest-jupyterhub",
    "qtconsole",
    "repo2docker",
    "the-littlest-jupyterhub",
    "traitlets",
    "zero-to-jupyterhub",
]


# In[24]:


import asyncio

concurrency = 5


async def concurrent_call(semaphore, f, *args, **kwargs):
    """Limit concurrency, because too many playwrights can crash"""
    async with semaphore:
        return await f(*args, **kwargs)


# for project_name in project_names:
#     await download_stats(project_name)

# semaphore = asyncio.Semaphore(concurrency)
# await asyncio.gather(*[concurrent_call(semaphore, download_stats, project_name) for project_name in project_names])
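# In[ ]:


# A sketch for the "check if valid" TODO in login() above, not part of the
# original flow: a hypothetical helper that treats saved cookies as stale once
# any persistent cookie has passed its expiry. Playwright cookie dicts carry
# an "expires" unix timestamp, with -1 meaning a session cookie.

import time


def cookies_look_valid(cookies: list[dict]) -> bool:
    """Best-effort check: False if any persistent cookie has expired"""
    now = time.time()
    return all(c.get("expires", -1) == -1 or c["expires"] > now for c in cookies)

# login() could re-run the browser flow instead of returning the saved
# cookies whenever cookies_look_valid(saved) is False.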
# In[25]:


# download serially
for project_name in project_names:
    await download_stats(project_name)


# In[26]:


get_ipython().system('open stats')
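# In[ ]:


# A rough sketch of a next step: load everything just downloaded into one
# DataFrame. This assumes the downloads are CSV files (the notebook doesn't
# show their schema, so inspect a file before relying on particular columns).

import pandas as pd

frames = []
for path in sorted(stats_dir.glob("*.csv")):
    df = pd.read_csv(path)
    # remember which download each row came from
    df["source_file"] = path.name
    frames.append(df)

if frames:
    all_stats = pd.concat(frames, ignore_index=True)
    print(all_stats.head())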