Via: https://www.england.nhs.uk/statistics/statistical-work-areas/covid-19-daily-deaths/
Daily reports are published as an Excel spreadhseet linked from the following page:
# Reporting page
url = 'https://www.england.nhs.uk/statistics/statistical-work-areas/covid-19-daily-deaths/'
Load the page:
import requests
page = requests.get(url)
Get the HTML page data into a form we can scrape it:
from bs4 import BeautifulSoup, SoupStrainer
soup = BeautifulSoup(page.text)
Get the relevant links to the daily spreadseets:
links = {}
for link in soup.find("article", {"class": "rich-text"}).find_all('a'):
if link.text.startswith('COVID 19 daily announced deaths'):
if link.text not in links:
links[link.text] = link.get('href')
elif link.text.startswith('COVID 19 total announced deaths'):
totals_link = link.get('href')
links
{'COVID 19 daily announced deaths 28 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-28-April-2020.xlsx', 'COVID 19 daily announced deaths 27 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-27-April-2020.xlsx', 'COVID 19 daily announced deaths 26 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-26-April-2020.xlsx', 'COVID 19 daily announced deaths 25 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-25-April-2020.xlsx', 'COVID 19 daily announced deaths 24 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-24-April-2020.xlsx', 'COVID 19 daily announced deaths 23 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-23-April-2020.xlsx', 'COVID 19 daily announced deaths 22 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-22-April-2020.xlsx', 'COVID 19 daily announced deaths 21 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-21-April-2020-1.xlsx', 'COVID 19 daily announced deaths 20 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-20-April-2020.xlsx', 'COVID 19 daily announced deaths 19 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-19-April-2020.xlsx', 'COVID 19 daily announced deaths 18 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-18-April-2020.xlsx', 'COVID 19 daily announced deaths 17 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-17-April-2020.xlsx', 'COVID 19 daily announced deaths 16 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-16-April-2020.xlsx', 'COVID 19 daily announced deaths 15 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-15-April-2020.xlsx', 'COVID 19 daily announced deaths 14 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-14-April-2020.xlsx', 'COVID 19 daily announced deaths 13 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-13-April-2020.xlsx', 'COVID 19 daily announced deaths 12 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-12-April-2020.xlsx', 'COVID 19 daily announced deaths 11 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-11-April-2020.xlsx', 'COVID 19 daily announced deaths 10 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-10-April-2020.xlsx', 'COVID 19 daily announced deaths 9 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-9-April-2020.xlsx', 'COVID 19 daily announced deaths 8 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-8-April-2020.xlsx', 'COVID 19 daily announced deaths 7 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-7-April-2020.xlsx', 'COVID 19 daily announced deaths 6 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-6-April-2020.xlsx', 'COVID 19 daily announced deaths 5 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-5-April-2020.xlsx', 'COVID 19 daily announced deaths 4 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-4-April-2020.xlsx', 'COVID 19 daily announced deaths 3 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-3-April-2020.xlsx', 'COVID 19 daily announced deaths 2 April 2020': 'https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2020/04/COVID-19-daily-announced-deaths-2-April-2020-1.xlsx'}
import pandas as pd
Start to sketch out how we can parse the data out of one of the spreadsheets. The following has been arrivied though a little bit of iteration an previewing of the data:
sheets = pd.read_excel(links['COVID 19 daily announced deaths 9 April 2020'],
sheet_name=None)
# What sheets are available in the spreadsheet
sheet_names = sheets.keys()
sheet_names
odict_keys(['COVID19 daily deaths by region', 'COVID19 daily deaths by age', 'COVID19 daily deaths by trust'])
The spreadsheet contains the following sheets:
COVID19 daily deaths by region
COVID19 daily deaths by age
COVID19 daily deaths by trust
Clean the sheets to get the actual data:
sheet = 'COVID19 daily deaths by region'
sheets[sheet].head(15)
Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 32 | Unnamed: 33 | Unnamed: 34 | Unnamed: 35 | Unnamed: 36 | Unnamed: 37 | Unnamed: 38 | Unnamed: 39 | Unnamed: 40 | Unnamed: 41 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Title: | COVID-19 daily deaths announcement | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | Summary: | Deaths of patients who have died in hospitals ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | Period: | 5pm 07 April 2020 - 5pm 08 April 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | Source: | COVID-19 Patient Notification System | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | Basis: | Provider | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | NaN | Published: | 2020-04-09 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | NaN | Revised: | - | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | NaN | Status: | Published | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | NaN | Contact: | england.nhsdata@nhs.net | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 | NaN | Regional summary | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | NaN | NHS England Region | NaN | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13 | NaN | England | NaN | 1 | 2 | 1 | 2 | 1 | 2 | 3 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
14 | NaN | NaN | NaN | ) | ) | ) | ) | ) | ) | ) | ... | 104.0 | 105.0 | 106.0 | 107.0 | 108.0 | 109.0 | 110.0 | 111.0 | 112.0 | 113.0 |
15 rows × 42 columns
We don't necessarily know how much metadata there is at the start of the sheet so we need to emply heuristics. If NHS England Region is used consistently as a column heading, we can use that as a crib:
import numpy as np
rows, cols = np.where(sheets[sheet] == 'NHS England Region')
rows, cols
/usr/local/lib/python3.7/site-packages/pandas/core/ops/__init__.py:1115: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison result = method(y)
(array([12]), array([1]))
colnames = sheets[sheet].iloc[12]
sheets[sheet] = sheets[sheet].iloc[15:]
sheets[sheet].columns = colnames
sheets[sheet].dropna(axis=1, how='all', inplace=True)
sheets[sheet].dropna(axis=0, how='all', inplace=True)
#sheets[sheet].dropna(axis=0, subset=[sheets[sheet].columns[0]], inplace=True)
sheets[sheet].head()
12 | NHS England Region | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | 2020-03-27 00:00:00 | 2020-03-28 00:00:00 | ... | 2020-04-01 00:00:00 | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | Awaiting verification | Total |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | East Of England | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 7 | 2 | 8 | 15 | 15 | 42 | 20 | 0 | 113 |
16 | London | 1 | 2 | 0 | 2 | 1 | 2 | 2 | 6 | 5 | ... | 7 | 10 | 15 | 16 | 12 | 25 | 61 | 29 | 0 | 213 |
17 | Midlands | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 8 | 3 | 5 | 7 | 13 | 22 | 41 | 17 | 0 | 124 |
18 | North East And Yorkshire | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 4 | 6 | 48 | 32 | 0 | 92 |
19 | North West | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 4 | 6 | 4 | 2 | 6 | 35 | 18 | 0 | 77 |
5 rows × 23 columns
The ages data is structured differently, but we can perhaps use Age Group as a crib?
sheet = 'COVID19 daily deaths by age'
sheets[sheet].head(25)
Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 17 | Unnamed: 18 | Unnamed: 19 | Unnamed: 20 | Unnamed: 21 | Unnamed: 22 | Unnamed: 23 | Unnamed: 24 | Unnamed: 25 | Unnamed: 26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Title: | COVID-19 daily deaths announcement | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | AE_Closures | NaN | NaN | NaN | NaN | NaN |
1 | NaN | Summary: | Deaths of patients who have died in hospitals ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | Period: | 5pm 07 April 2020 - 5pm 08 April 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | Source: | COVID-19 Patient Notification System | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | NaN | Basis: | Provider | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | NaN | Published: | 2020-04-09 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | NaN | Revised: | - | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | NaN | Status: | Published | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | NaN | Contact: | england.nhsdata@nhs.net | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | NaN | Breakdown by age group | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13 | NaN | Age group | NaN | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | ... | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | NaN | Awaiting verification | NaN | Total |
14 | NaN | Total | NaN | 1 | 2 | 1 | 2 | 1 | 2 | 3 | ... | 31 | 47 | 57 | 100 | 284 | 140 | NaN | 0 | NaN | 765 |
15 | NaN | NaN | NaN | ) | ) | ) | ) | ) | ) | ) | ... | ) | ) | ) | ) | ) | ) | 95.0 | NaN | 95.0 | NaN |
16 | NaN | 0-19 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | NaN | 0 | NaN | 0 |
17 | NaN | 20-39 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 4 | 0 | NaN | 0 | NaN | 6 |
18 | NaN | 40-59 | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 5 | 3 | 5 | 16 | 18 | NaN | 0 | NaN | 54 |
19 | NaN | 60-79 | NaN | 1 | 2 | 1 | 0 | 0 | 1 | 0 | ... | 17 | 19 | 21 | 38 | 112 | 42 | NaN | 0 | NaN | 290 |
20 | NaN | 80+ | NaN | 0 | 0 | 0 | 2 | 1 | 1 | 3 | ... | 14 | 23 | 33 | 57 | 152 | 80 | NaN | 0 | NaN | 415 |
21 | NaN | TBC | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | NaN | 0 | NaN | 0 |
22 rows × 27 columns
We can extract the published date to provide an additional metadata column:
rows, cols = np.where(sheets[sheet] == 'Published:')
published_date = sheets[sheet].iat[rows[0], cols[0]+1]
published_date
Timestamp('2020-04-09 00:00:00')
Try the crib:
rows, cols = np.where(sheets[sheet] == 'Age group')
rows, cols
(array([13]), array([1]))
Does the same cleaning pattern work?
colnames = sheets[sheet].iloc[13]
sheets[sheet] = sheets[sheet].iloc[16:]
sheets[sheet].columns = colnames
sheets[sheet].dropna(axis=0, how='all', inplace=True)
sheets[sheet].dropna(axis=1, how='all', inplace=True)
#sheets[sheet].dropna(axis=0, subset=[sheets[sheet].columns[0]], inplace=True)
sheets[sheet].head()
13 | Age group | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | 2020-03-27 00:00:00 | 2020-03-28 00:00:00 | ... | 2020-04-01 00:00:00 | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | Awaiting verification | Total |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16 | 0-19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
17 | 20-39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 6 |
18 | 40-59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | ... | 0 | 2 | 0 | 5 | 3 | 5 | 16 | 18 | 0 | 54 |
19 | 60-79 | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 3 | ... | 7 | 15 | 17 | 19 | 21 | 38 | 112 | 42 | 0 | 290 |
20 | 80+ | 0 | 0 | 0 | 2 | 1 | 1 | 3 | 4 | 3 | ... | 10 | 13 | 14 | 23 | 33 | 57 | 152 | 80 | 0 | 415 |
5 rows × 23 columns
Again, NHS England Region may be a handy crib in the following sheet:
sheet = 'COVID19 daily deaths by trust'
sheets[sheet].head(15)
Unnamed: 0 | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 19 | Unnamed: 20 | Unnamed: 21 | Unnamed: 22 | Unnamed: 23 | Unnamed: 24 | Unnamed: 25 | Unnamed: 26 | Unnamed: 27 | Unnamed: 28 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | Title: | COVID-19 daily deaths announcement | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | AE_Closures | NaN | NaN | NaN | NaN | NaN |
1 | NaN | Summary: | Deaths of patients who have died in hospitals ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | Period: | 5pm 07 April 2020 - 5pm 08 April 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | Source: | COVID-19 Patient Notification System | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | Basis: | Provider | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | NaN | Published: | 2020-04-09 00:00:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | NaN | Revised: | - | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | NaN | Status: | Published | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | NaN | Contact: | england.nhsdata@nhs.net | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 | NaN | Trust level data | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | NaN | NHS England Region | NaN | Code | Name | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | ... | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | NaN | Awaiting verification | NaN | Total |
13 | NaN | - | NaN | - | ENGLAND | 1 | 2 | 1 | 2 | 1 | ... | 31 | 47 | 57 | 100 | 284 | 140 | NaN | 0 | NaN | 765 |
14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
15 rows × 29 columns
The same cleaning pattern we used before seems to work fine:
colnames = sheets[sheet].iloc[12]
sheets[sheet] = sheets[sheet].iloc[15:]
sheets[sheet].columns = colnames
sheets[sheet].dropna(axis=1, how='all', inplace=True)
sheets[sheet].dropna(axis=0, how='all', inplace=True)
#sheets[sheet].dropna(axis=0, subset=[sheets[sheet].columns[0]], inplace=True)
sheets[sheet].head()
12 | NHS England Region | Code | Name | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | ... | 2020-04-01 00:00:00 | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | Awaiting verification | Total |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - CLACTON HOS... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
16 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - FRYATT HOSP... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
17 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 2 | 3 | 7 | 5 | 3 | 0 | 24 |
18 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 2 |
19 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
5 rows × 25 columns
The following tries to clean things automatically - we drop the national aggregate values:
def cleaner(sheets):
for sheet in sheets:
if 'chart' in sheet:
continue
rows, cols = np.where(sheets[sheet] == 'Published:')
published_date = sheets[sheet].iat[rows[0], cols[0]+1]
if 'age' in sheet:
rows, cols = np.where(sheets[sheet] == 'Age group')
#print((rows, cols))
_ix= rows[0]
elif 'ethnicity' in sheet:
rows, cols = np.where(sheets[sheet] == 'Ethnic group')
#print((rows, cols))
_ix= rows[0]
else:
rows, cols = np.where(sheets[sheet] == 'NHS England Region')
#print((sheet, rows, cols))
_ix= rows[0] #ix[sheet][0]
colnames = sheets[sheet].iloc[_ix]
sheets[sheet] = sheets[sheet].iloc[_ix+3:]
sheets[sheet].columns = colnames
sheets[sheet].dropna(axis=1, how='all', inplace=True)
sheets[sheet].dropna(axis=0, how='all', inplace=True)
sheets[sheet] = sheets[sheet].loc[:, sheets[sheet].columns.notnull()]
sheets[sheet]['Published'] = published_date
#sheets[sheet].dropna(axis=0, subset=[sheets[sheet].columns[0]], inplace=True)
return sheets
Grab all the daily reports:
data = {}
for link in links:
print(link)
sheets = pd.read_excel(links[link], sheet_name=None)
sheets = cleaner(sheets)
data[link] = sheets
COVID 19 daily announced deaths 28 April 2020 COVID 19 daily announced deaths 27 April 2020 COVID 19 daily announced deaths 26 April 2020 COVID 19 daily announced deaths 25 April 2020 COVID 19 daily announced deaths 24 April 2020 COVID 19 daily announced deaths 23 April 2020 COVID 19 daily announced deaths 22 April 2020 COVID 19 daily announced deaths 21 April 2020 COVID 19 daily announced deaths 20 April 2020 COVID 19 daily announced deaths 19 April 2020 COVID 19 daily announced deaths 18 April 2020 COVID 19 daily announced deaths 17 April 2020 COVID 19 daily announced deaths 16 April 2020 COVID 19 daily announced deaths 15 April 2020 COVID 19 daily announced deaths 14 April 2020 COVID 19 daily announced deaths 13 April 2020 COVID 19 daily announced deaths 12 April 2020 COVID 19 daily announced deaths 11 April 2020 COVID 19 daily announced deaths 10 April 2020 COVID 19 daily announced deaths 9 April 2020 COVID 19 daily announced deaths 8 April 2020 COVID 19 daily announced deaths 7 April 2020 COVID 19 daily announced deaths 6 April 2020 COVID 19 daily announced deaths 5 April 2020 COVID 19 daily announced deaths 4 April 2020 COVID 19 daily announced deaths 3 April 2020 COVID 19 daily announced deaths 2 April 2020
data.keys()
dict_keys(['COVID 19 daily announced deaths 28 April 2020', 'COVID 19 daily announced deaths 27 April 2020', 'COVID 19 daily announced deaths 26 April 2020', 'COVID 19 daily announced deaths 25 April 2020', 'COVID 19 daily announced deaths 24 April 2020', 'COVID 19 daily announced deaths 23 April 2020', 'COVID 19 daily announced deaths 22 April 2020', 'COVID 19 daily announced deaths 21 April 2020', 'COVID 19 daily announced deaths 20 April 2020', 'COVID 19 daily announced deaths 19 April 2020', 'COVID 19 daily announced deaths 18 April 2020', 'COVID 19 daily announced deaths 17 April 2020', 'COVID 19 daily announced deaths 16 April 2020', 'COVID 19 daily announced deaths 15 April 2020', 'COVID 19 daily announced deaths 14 April 2020', 'COVID 19 daily announced deaths 13 April 2020', 'COVID 19 daily announced deaths 12 April 2020', 'COVID 19 daily announced deaths 11 April 2020', 'COVID 19 daily announced deaths 10 April 2020', 'COVID 19 daily announced deaths 9 April 2020', 'COVID 19 daily announced deaths 8 April 2020', 'COVID 19 daily announced deaths 7 April 2020', 'COVID 19 daily announced deaths 6 April 2020', 'COVID 19 daily announced deaths 5 April 2020', 'COVID 19 daily announced deaths 4 April 2020', 'COVID 19 daily announced deaths 3 April 2020', 'COVID 19 daily announced deaths 2 April 2020'])
Just as an aside, we can informally extract the publication date of a spreadheet from the associated link text on the original web page (trusting that the link does refer to the correctly linked document):
from parse import parse
import dateparser
def getLinkDate(link):
"""Get date from link text."""
_date = parse('COVID 19 daily announced deaths {date}', link)['date']
return dateparser.parse(_date)
#Test the date extractor
getLinkDate('COVID 19 daily announced deaths 15 April 2020')
datetime.datetime(2020, 4, 15, 0, 0)
data['COVID 19 daily announced deaths 9 April 2020'].keys()
odict_keys(['COVID19 daily deaths by region', 'COVID19 daily deaths by age', 'COVID19 daily deaths by trust'])
Preview what sort of data we've got:
df = data['COVID 19 daily announced deaths 9 April 2020']['COVID19 daily deaths by trust']
df
12 | NHS England Region | Code | Name | 2020-03-16 00:00:00 | 2020-03-19 00:00:00 | 2020-03-20 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | 2020-03-24 00:00:00 | 2020-03-26 00:00:00 | ... | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | 2020-04-04 00:00:00 | 2020-04-05 00:00:00 | 2020-04-06 00:00:00 | 2020-04-07 00:00:00 | 2020-04-08 00:00:00 | Awaiting verification | Total | Published |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - CLACTON HOS... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2020-04-09 |
16 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - FRYATT HOSP... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2020-04-09 |
17 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0 | 2 | 3 | 7 | 5 | 3 | 0 | 24 | 2020-04-09 |
18 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 2 | 2020-04-09 |
19 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2020-04-09 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
181 | South West | RA9 | TORBAY AND SOUTH DEVON NHS FOUNDATION TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2020-04-09 |
182 | South West | RA7 | UNIVERSITY HOSPITALS BRISTOL NHS FOUNDATION TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 4 | 2020-04-09 |
183 | South West | RK9 | UNIVERSITY HOSPITALS PLYMOUTH NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 2020-04-09 |
184 | South West | RA3 | WESTON AREA HEALTH NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2020-04-09 |
185 | South West | RA4 | YEOVIL DISTRICT HOSPITAL NHS FOUNDATION TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 2 | 2020-04-09 |
171 rows × 26 columns
Preview a specific area, albeit with quite an informal search term:
df = data['COVID 19 daily announced deaths 4 April 2020']['COVID19 daily deaths by trust']
df[df['Name'].str.contains('WIGHT')]
12 | NHS England Region | Code | Name | 2020-03-03 00:00:00 | 2020-03-16 00:00:00 | 2020-03-18 00:00:00 | 2020-03-20 00:00:00 | 2020-03-21 00:00:00 | 2020-03-22 00:00:00 | 2020-03-23 00:00:00 | ... | 2020-03-28 00:00:00 | 2020-03-29 00:00:00 | 2020-03-30 00:00:00 | 2020-03-31 00:00:00 | 2020-04-01 00:00:00 | 2020-04-02 00:00:00 | 2020-04-03 00:00:00 | Awaiting verification | Total | Published |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
132 | South East | R1F | ISLE OF WIGHT NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2020-04-04 |
1 rows × 24 columns
Grab the totals:
totals_xl = pd.read_excel(totals_link, sheet_name=None)
totals_xl.keys()
odict_keys(['COVID19 total deaths chart', 'COVID19 total deaths by trust', 'COVID19 total deaths by region', 'Deaths by region- negative test', 'COVID19 total deaths by age', 'COVID19 total by ethnicity'])
totals_xl = cleaner(totals_xl)
totals_xl.keys()
odict_keys(['COVID19 total deaths chart', 'COVID19 total deaths by trust', 'COVID19 total deaths by region', 'Deaths by region- negative test', 'COVID19 total deaths by age', 'COVID19 total by ethnicity'])
dfs = totals_xl['COVID19 total deaths by trust']
dfs[dfs['Name'].str.contains('WIGHT')]
14 | NHS England Region | Code | Name | Up to 01-Mar-20 | 2020-03-01 00:00:00 | 2020-03-02 00:00:00 | 2020-03-03 00:00:00 | 2020-03-04 00:00:00 | 2020-03-05 00:00:00 | 2020-03-06 00:00:00 | ... | 2020-04-21 00:00:00 | 2020-04-22 00:00:00 | 2020-04-23 00:00:00 | 2020-04-24 00:00:00 | 2020-04-25 00:00:00 | 2020-04-26 00:00:00 | 2020-04-27 00:00:00 | Awaiting verification | Total | Published |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
179 | South East | R1F | ISLE OF WIGHT NHS TRUST | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 22 | 2020-04-28 |
1 rows × 65 columns
The data is perhaps most easily managed in a long form. We could normalise the data properly across several tables, or for mow we can just grab perhaps slightly denormalised tables for the dates and separate tables for totals and result awaiting verification:
df_dailies = df.drop(columns=['Awaiting verification', 'Total'])
tmp = df_dailies.melt(id_vars=['NHS England Region','Code','Name', 'Published'],
var_name='Date',
value_name='value')
tmp.head()
NHS England Region | Code | Name | Published | Date | value | |
---|---|---|---|---|---|---|
0 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 2020-04-04 | 2020-03-03 | 0 |
1 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 2020-04-04 | 2020-03-03 | 0 |
2 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 2020-04-04 | 2020-03-03 | 0 |
3 | East Of England | RWH | EAST AND NORTH HERTFORDSHIRE NHS TRUST | 2020-04-04 | 2020-03-03 | 0 |
4 | East Of England | RDE | EAST SUFFOLK AND NORTH ESSEX NHS FOUNDATION TRUST | 2020-04-04 | 2020-03-03 | 0 |
Find the days lag between published and strike date:
tmp['Date'] = pd.to_datetime(tmp['Date'])
tmp['lag'] = tmp['Published'] - tmp['Date']
tmp.head()
NHS England Region | Code | Name | Published | Date | value | lag | |
---|---|---|---|---|---|---|---|
0 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 2020-04-04 | 2020-03-03 | 0 | 32 days |
1 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 2020-04-04 | 2020-03-03 | 0 | 32 days |
2 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 2020-04-04 | 2020-03-03 | 0 | 32 days |
3 | East Of England | RWH | EAST AND NORTH HERTFORDSHIRE NHS TRUST | 2020-04-04 | 2020-03-03 | 0 | 32 days |
4 | East Of England | RDE | EAST SUFFOLK AND NORTH ESSEX NHS FOUNDATION TRUST | 2020-04-04 | 2020-03-03 | 0 | 32 days |
Create a simple SQLite database:
import sqlite_utils
!rm nhs_dailies.db
DB = sqlite_utils.Database("nhs_dailies.db")
Add the daily data to the db:
idx = {'trust': ['NHS England Region','Code','Name', 'Published'],
'age': ['Age group', 'Published'],
'region': ['NHS England Region', 'Published'] }
# TO DO - ethnicity
for daily in data.keys():
#print(daily)
#linkDate = getLinkDate(daily)
# TO DO - get data from excluded sheets
for sheet in data[daily].keys():
if 'chart' in sheet or 'ethnicity' in sheet or 'negative test ' in sheet:
continue
#print(sheet)
table = parse('COVID19 daily deaths by {table}', sheet)['table']
df_dailies = data[daily][sheet].drop(columns=['Awaiting verification', 'Total'])
#df_dailies['Link_date'] = linkDate
idx_cols = idx[table]#+['Link_date']
df_long = df_dailies.melt(id_vars=idx_cols,
var_name='Date',
value_name='value')
df_long['Date'] = pd.to_datetime(df_long['Date'])
df_long['lag'] = (df_long['Published'] - df_long['Date']).dt.days
_table = f'nhs_dailies_{table}'
df_long.to_sql(_table, DB.conn, index=False, if_exists='append')
cols = idx[table] + ['Awaiting verification', 'Total']
data[daily][sheet][cols].to_sql(f'{_table}_summary',
DB.conn, index=False, if_exists='append')
/usr/local/lib/python3.7/site-packages/pandas/core/generic.py:2712: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores. method=method,
Dummy query on age
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_age LIMIT 25", DB.conn)
Age group | Published | Date | value | lag | |
---|---|---|---|---|---|
0 | 0-19 | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
1 | 20-39 | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
2 | 40-59 | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
3 | 60-79 | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 1 | 40 |
4 | 80+ | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
5 | TBC | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
6 | 0-19 | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
7 | 20-39 | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
8 | 40-59 | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
9 | 60-79 | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
10 | 80+ | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 1 | 39 |
11 | TBC | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
12 | 0-19 | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
13 | 20-39 | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
14 | 40-59 | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
15 | 60-79 | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 1 | 37 |
16 | 80+ | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 1 | 37 |
17 | TBC | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
18 | 0-19 | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
19 | 20-39 | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
20 | 40-59 | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
21 | 60-79 | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
22 | 80+ | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 1 | 36 |
23 | TBC | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
24 | 0-19 | 2020-04-28 00:00:00 | 2020-03-24 00:00:00 | 0 | 35 |
Dummy query on age_summary
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_age_summary LIMIT 5", DB.conn)
Age group | Published | Awaiting verification | Total | |
---|---|---|---|---|
0 | 0-19 | 2020-04-28 00:00:00 | 0 | 0 |
1 | 20-39 | 2020-04-28 00:00:00 | 0 | 4 |
2 | 40-59 | 2020-04-28 00:00:00 | 0 | 41 |
3 | 60-79 | 2020-04-28 00:00:00 | 0 | 216 |
4 | 80+ | 2020-04-28 00:00:00 | 0 | 285 |
Dummy query on trust
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_trust LIMIT 5", DB.conn)
NHS England Region | Code | Name | Published | Date | value | lag | |
---|---|---|---|---|---|---|---|
0 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - CLACTON HOS... | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
1 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - FRYATT HOSP... | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
2 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
3 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
4 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
Dummy query on trust_summary
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_trust_summary LIMIT 5", DB.conn)
NHS England Region | Code | Name | Published | Awaiting verification | Total | |
---|---|---|---|---|---|---|
0 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - CLACTON HOS... | 2020-04-28 00:00:00 | 0 | 0 |
1 | East Of England | NQ1 | ANGLIAN COMMUNITY ENTERPRISE CIC - FRYATT HOSP... | 2020-04-28 00:00:00 | 0 | 0 |
2 | East Of England | RDD | BASILDON AND THURROCK UNIVERSITY HOSPITALS NHS... | 2020-04-28 00:00:00 | 0 | 1 |
3 | East Of England | RC1 | BEDFORD HOSPITAL NHS TRUST | 2020-04-28 00:00:00 | 0 | 6 |
4 | East Of England | RGT | CAMBRIDGE UNIVERSITY HOSPITALS NHS FOUNDATION ... | 2020-04-28 00:00:00 | 0 | 2 |
Dummy query on region
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_region LIMIT 25", DB.conn)
NHS England Region | Published | Date | value | lag | |
---|---|---|---|---|---|
0 | East Of England | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
1 | London | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 1 | 40 |
2 | Midlands | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
3 | North East And Yorkshire | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
4 | North West | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
5 | South East | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
6 | South West | 2020-04-28 00:00:00 | 2020-03-19 00:00:00 | 0 | 40 |
7 | East Of England | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
8 | London | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
9 | Midlands | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 1 | 39 |
10 | North East And Yorkshire | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
11 | North West | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
12 | South East | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
13 | South West | 2020-04-28 00:00:00 | 2020-03-20 00:00:00 | 0 | 39 |
14 | East Of England | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
15 | London | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 1 | 37 |
16 | Midlands | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
17 | North East And Yorkshire | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 1 | 37 |
18 | North West | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
19 | South East | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
20 | South West | 2020-04-28 00:00:00 | 2020-03-22 00:00:00 | 0 | 37 |
21 | East Of England | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
22 | London | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 1 | 36 |
23 | Midlands | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
24 | North East And Yorkshire | 2020-04-28 00:00:00 | 2020-03-23 00:00:00 | 0 | 36 |
Dummy query on region_summary
sheet:
pd.read_sql("SELECT * FROM nhs_dailies_region_summary LIMIT 25", DB.conn)
NHS England Region | Published | Awaiting verification | Total | |
---|---|---|---|---|
0 | East Of England | 2020-04-28 00:00:00 | 0 | 72 |
1 | London | 2020-04-28 00:00:00 | 0 | 88 |
2 | Midlands | 2020-04-28 00:00:00 | 0 | 99 |
3 | North East And Yorkshire | 2020-04-28 00:00:00 | 0 | 87 |
4 | North West | 2020-04-28 00:00:00 | 0 | 102 |
5 | South East | 2020-04-28 00:00:00 | 0 | 72 |
6 | South West | 2020-04-28 00:00:00 | 0 | 26 |
7 | East Of England | 2020-04-27 00:00:00 | 0 | 34 |
8 | London | 2020-04-27 00:00:00 | 0 | 87 |
9 | Midlands | 2020-04-27 00:00:00 | 0 | 55 |
10 | North East And Yorkshire | 2020-04-27 00:00:00 | 0 | 55 |
11 | North West | 2020-04-27 00:00:00 | 0 | 41 |
12 | South East | 2020-04-27 00:00:00 | 0 | 38 |
13 | South West | 2020-04-27 00:00:00 | 0 | 19 |
14 | East Of England | 2020-04-26 00:00:00 | 0 | 49 |
15 | London | 2020-04-26 00:00:00 | 0 | 64 |
16 | Midlands | 2020-04-26 00:00:00 | 0 | 57 |
17 | North East And Yorkshire | 2020-04-26 00:00:00 | 0 | 66 |
18 | North West | 2020-04-26 00:00:00 | 0 | 46 |
19 | South East | 2020-04-26 00:00:00 | 0 | 43 |
20 | South West | 2020-04-26 00:00:00 | 0 | 11 |
21 | East Of England | 2020-04-25 00:00:00 | 0 | 88 |
22 | London | 2020-04-25 00:00:00 | 0 | 116 |
23 | Midlands | 2020-04-25 00:00:00 | 0 | 145 |
24 | North East And Yorkshire | 2020-04-25 00:00:00 | 0 | 117 |
totals_xl.keys()
odict_keys(['COVID19 total deaths chart', 'COVID19 total deaths by trust', 'COVID19 total deaths by region', 'Deaths by region- negative test', 'COVID19 total deaths by age', 'COVID19 total by ethnicity'])
for sheet in totals_xl.keys():
if 'chart' in sheet or 'ethnicity' in sheet or 'negative test' in sheet:
continue
table = parse('COVID19 total deaths by {table}', sheet)['table']
df_totals = totals_xl[sheet].drop(columns=['Awaiting verification', 'Total', 'Up to 01-Mar-20'])
idx_cols = idx[table]
df_long = df_totals.melt(id_vars=idx_cols,
var_name='Date',
value_name='value')
df_long['Date'] = pd.to_datetime(df_long['Date'])
df_long['lag'] = (df_long['Published'] - df_long['Date']).dt.days
_table = f'nhs_totals_{table}'
df_long.to_sql(_table, DB.conn, index=False, if_exists='append')
cols = idx_cols + ['Up to 01-Mar-20', 'Awaiting verification', 'Total']
totals_xl[sheet][cols].to_sql(f'{_table}_summary',
DB.conn, index=False, if_exists='append')
pd.read_sql("SELECT * FROM nhs_totals_region_summary LIMIT 25", DB.conn)
NHS England Region | Published | Up to 01-Mar-20 | Awaiting verification | Total | |
---|---|---|---|---|---|
0 | East Of England | 2020-04-28 00:00:00 | 0 | 0 | 2210 |
1 | London | 2020-04-28 00:00:00 | 0 | 0 | 4781 |
2 | Midlands | 2020-04-28 00:00:00 | 0 | 0 | 3794 |
3 | North East And Yorkshire | 2020-04-28 00:00:00 | 0 | 0 | 2550 |
4 | North West | 2020-04-28 00:00:00 | 0 | 1 | 2851 |
5 | South East | 2020-04-28 00:00:00 | 0 | 0 | 2226 |
6 | South West | 2020-04-28 00:00:00 | 0 | 0 | 883 |
Let's try some basic charts. For example,
zz = pd.read_sql("SELECT * FROM nhs_totals_region WHERE `NHS England Region`='London' and Date=DATETIME('2020-04-09')", DB.conn)
zz
NHS England Region | Published | Date | value | lag | |
---|---|---|---|---|---|
0 | London | 2020-04-28 00:00:00 | 2020-04-09 00:00:00 | 187 | 19 |
How long does it take for a particular hospital to report deaths (i.e. what's the lag distribution between the publication date and the strike date?)?
The following chart sums the number of deaths reported relative to the delay in reporting them:
pd.read_sql("SELECT value, lag FROM nhs_totals_trust WHERE Name='WEST HERTFORDSHIRE HOSPITALS NHS TRUST'", DB.conn).groupby(['lag']).sum().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x11b446b50>
#via https://stackoverflow.com/questions/61415090/python-pandas-handling-of-308-request
import requests
import io
def get_308_csv(url):
datastr = requests.get(url, allow_redirects=True).text
data_file = io.StringIO(datastr)
_df = pd.read_csv(data_file)
_df['Specimen date'] = pd.to_datetime(_df['Specimen date'])
return _df
phe_cases_url = 'https://coronavirus.data.gov.uk/downloads/csv/coronavirus-cases_latest.csv'
phe_cases_df = get_308_csv(phe_cases_url)
_table = f'phe_cases'
phe_cases_df.to_sql(_table, DB.conn, index=False, if_exists='append')
phe_cases_df.head()
/usr/local/lib/python3.7/site-packages/pandas/core/generic.py:2712: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores. method=method,
Area name | Area code | Area type | Specimen date | Daily lab-confirmed cases | Previously reported daily cases | Change in daily cases | Cumulative lab-confirmed cases | Previously reported cumulative cases | Change in cumulative cases | |
---|---|---|---|---|---|---|---|---|---|---|
0 | England | E92000001 | Nation | 2020-04-27 | 103.0 | 0.0 | 103.0 | 114456 | 112467.0 | 1989.0 |
1 | South West | E12000009 | Region | 2020-04-27 | 4.0 | NaN | NaN | 5986 | NaN | NaN |
2 | South East | E12000008 | Region | 2020-04-27 | 7.0 | NaN | NaN | 16323 | NaN | NaN |
3 | London | E12000007 | Region | 2020-04-27 | 6.0 | NaN | NaN | 23979 | NaN | NaN |
4 | East of England | E12000006 | Region | 2020-04-27 | 1.0 | NaN | NaN | 9907 | NaN | NaN |
pd.read_sql("SELECT * FROM phe_cases LIMIT 3", DB.conn)
Area name | Area code | Area type | Specimen date | Daily lab-confirmed cases | Previously reported daily cases | Change in daily cases | Cumulative lab-confirmed cases | Previously reported cumulative cases | Change in cumulative cases | |
---|---|---|---|---|---|---|---|---|---|---|
0 | England | E92000001 | Nation | 2020-04-27 00:00:00 | 103.0 | 0.0 | 103.0 | 114456 | 112467.0 | 1989.0 |
1 | South West | E12000009 | Region | 2020-04-27 00:00:00 | 4.0 | NaN | NaN | 5986 | NaN | NaN |
2 | South East | E12000008 | Region | 2020-04-27 00:00:00 | 7.0 | NaN | NaN | 16323 | NaN | NaN |
phe_deaths_url = 'https://coronavirus.data.gov.uk/downloads/csv/coronavirus-deaths_latest.csv'
phe_deaths_df = get_308_csv(phe_cases_url)
_table = f'phe_deaths'
phe_cases_df.to_sql(_table, DB.conn, index=False, if_exists='append')
phe_deaths_df.head()
Area name | Area code | Area type | Specimen date | Daily lab-confirmed cases | Previously reported daily cases | Change in daily cases | Cumulative lab-confirmed cases | Previously reported cumulative cases | Change in cumulative cases | |
---|---|---|---|---|---|---|---|---|---|---|
0 | England | E92000001 | Nation | 2020-04-27 | 103.0 | 0.0 | 103.0 | 114456 | 112467.0 | 1989.0 |
1 | South West | E12000009 | Region | 2020-04-27 | 4.0 | NaN | NaN | 5986 | NaN | NaN |
2 | South East | E12000008 | Region | 2020-04-27 | 7.0 | NaN | NaN | 16323 | NaN | NaN |
3 | London | E12000007 | Region | 2020-04-27 | 6.0 | NaN | NaN | 23979 | NaN | NaN |
4 | East of England | E12000006 | Region | 2020-04-27 | 1.0 | NaN | NaN | 9907 | NaN | NaN |
pd.read_sql("SELECT * FROM phe_deaths LIMIT 3", DB.conn)
Area name | Area code | Area type | Specimen date | Daily lab-confirmed cases | Previously reported daily cases | Change in daily cases | Cumulative lab-confirmed cases | Previously reported cumulative cases | Change in cumulative cases | |
---|---|---|---|---|---|---|---|---|---|---|
0 | England | E92000001 | Nation | 2020-04-27 00:00:00 | 103.0 | 0.0 | 103.0 | 114456 | 112467.0 | 1989.0 |
1 | South West | E12000009 | Region | 2020-04-27 00:00:00 | 4.0 | NaN | NaN | 5986 | NaN | NaN |
2 | South East | E12000008 | Region | 2020-04-27 00:00:00 | 7.0 | NaN | NaN | 16323 | NaN | NaN |
Death registrations, 2020: https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/causesofdeath/datasets/deathregistrationsandoccurrencesbylocalauthorityandhealthboard
Weekly Death registrations (provisional): https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales
ons_weekly_url = 'https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fweeklyprovisionalfiguresondeathsregisteredinenglandandwales%2f2020/publishedweek1620201.xlsx'
r = requests.get(ons_weekly_url)
fn = ons_weekly_url.split('/')[-1]
with open(fn, 'wb') as f:
f.write(r.content)
ons_sheets = pd.read_excel(fn, sheet_name=None)
# What sheets are available in the spreadsheet
ons_sheet_names = ons_sheets.keys()
ons_sheet_names
odict_keys(['Analysis', 'Contents', 'Information', 'Terms and conditions', 'Weekly figures 2020', 'Covid-19 - Weekly registrations', 'Covid-19 - Weekly occurrences', 'UK - Covid-19 - Weekly reg', 'Covid-19 - E&W comparisons', 'Covid-19 - England comparisons', 'Covid-19 - Wales comparison', 'Covid-19 - Place of occurrence ', 'Related publications'])
ons_weekly_reg = ons_sheets['Covid-19 - Weekly registrations']
ons_weekly_reg.head()
Contents | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 45 | Unnamed: 46 | Unnamed: 47 | Unnamed: 48 | Unnamed: 49 | Unnamed: 50 | Unnamed: 51 | Unnamed: 52 | Unnamed: 53 | Unnamed: 54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Weekly provisional figures on deaths registere... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Note that up-to-date counts of the total numbe... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Week number | NaN | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
4 | Week ended | NaN | 2020-01-03 00:00:00 | 2020-01-10 00:00:00 | 2020-01-17 00:00:00 | 2020-01-24 00:00:00 | 2020-01-31 00:00:00 | 2020-02-07 00:00:00 | 2020-02-14 00:00:00 | 2020-02-21 00:00:00 | ... | 2020-10-30 00:00:00 | 2020-11-06 00:00:00 | 2020-11-13 00:00:00 | 2020-11-20 00:00:00 | 2020-11-27 00:00:00 | 2020-12-04 00:00:00 | 2020-12-11 00:00:00 | 2020-12-18 00:00:00 | 2020-12-25 00:00:00 | 2021-01-01 00:00:00 |
5 rows × 55 columns
ons_weekly_occ = ons_sheets['Covid-19 - Weekly occurrences']
ons_weekly_occ.head()
Contents | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 46 | Unnamed: 47 | Unnamed: 48 | Unnamed: 49 | Unnamed: 50 | Unnamed: 51 | Unnamed: 52 | Unnamed: 53 | Unnamed: 54 | Unnamed: 55 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Weekly provisional figures on death occurrence... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Note that up-to-date counts of the total numbe... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Week number | NaN | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
4 | Week ended | NaN | 2020-01-03 00:00:00 | 2020-01-10 00:00:00 | 2020-01-17 00:00:00 | 2020-01-24 00:00:00 | 2020-01-31 00:00:00 | 2020-02-07 00:00:00 | 2020-02-14 00:00:00 | 2020-02-21 00:00:00 | ... | 2020-10-30 00:00:00 | 2020-11-06 00:00:00 | 2020-11-13 00:00:00 | 2020-11-20 00:00:00 | 2020-11-27 00:00:00 | 2020-12-04 00:00:00 | 2020-12-11 00:00:00 | 2020-12-18 00:00:00 | 2020-12-25 00:00:00 | 2021-01-01 00:00:00 |
5 rows × 56 columns
def ons_weeklies(ons_weekly, typ):
ons_weekly_long = {}
rows, cols = np.where(ons_weekly == 'Week ended')
colnames = ons_weekly.iloc[rows[0]].tolist()
colnames[1] = 'Age'
rows, cols = np.where(ons_weekly == 'Deaths by age group')
_rows, _ = np.where(ons_weekly == '90+')
_ix = rows[0]
tables = []
#Get the first three tables - for Persons, Males and Females
for r, c in zip(rows, cols):
tables.append(ons_weekly.iloc[r-1, c].split()[0])
for r, _r, t in zip(rows, _rows, tables):
ons_weekly_long[t] = ons_weekly.iloc[r+1: _r+1]
ons_weekly_long[t].columns = colnames
ons_weekly_long[t].dropna(axis=1, how='all', inplace=True)
if 'Year to date' in ons_weekly_long[t].columns:
ons_weekly_long[t].drop(columns=['Year to date'], inplace=True)
ons_weekly_long[t] = ons_weekly_long[t].melt(id_vars=['Age'], var_name='Date', value_name='value')
ons_weekly_long[t]['measure'] = typ
ons_weekly_long[t]['Date'] = pd.to_datetime(ons_weekly_long[t]['Date'])
ons_weekly_long['Any'] = pd.DataFrame()
for t in tables:
ons_weekly_long[t]['Group'] = t
ons_weekly_long['Any'] = pd.concat([ons_weekly_long['Any'], ons_weekly_long[t]])
ons_weekly_long['Any'].reset_index(inplace=True, drop=True)
return ons_weekly_long
ons_weekly_reg_long = ons_weeklies(ons_weekly_reg, 'Weekly registrations')
ons_weekly_reg_long['Females']
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Age | Date | value | measure | Group | |
---|---|---|---|---|---|
0 | <1 | 2020-01-03 | 0 | Weekly registrations | Females |
1 | 1-4 | 2020-01-03 | 0 | Weekly registrations | Females |
2 | 5-9 | 2020-01-03 | 0 | Weekly registrations | Females |
3 | 10-14 | 2020-01-03 | 0 | Weekly registrations | Females |
4 | 15-19 | 2020-01-03 | 0 | Weekly registrations | Females |
... | ... | ... | ... | ... | ... |
315 | 70-74 | 2020-04-17 | 298 | Weekly registrations | Females |
316 | 75-79 | 2020-04-17 | 465 | Weekly registrations | Females |
317 | 80-84 | 2020-04-17 | 652 | Weekly registrations | Females |
318 | 85-89 | 2020-04-17 | 781 | Weekly registrations | Females |
319 | 90+ | 2020-04-17 | 954 | Weekly registrations | Females |
320 rows × 5 columns
ons_weekly_reg_long['Any']
Age | Date | value | measure | Group | |
---|---|---|---|---|---|
0 | <1 | 2020-01-03 | 0 | Weekly registrations | Persons |
1 | 1-4 | 2020-01-03 | 0 | Weekly registrations | Persons |
2 | 5-9 | 2020-01-03 | 0 | Weekly registrations | Persons |
3 | 10-14 | 2020-01-03 | 0 | Weekly registrations | Persons |
4 | 15-19 | 2020-01-03 | 0 | Weekly registrations | Persons |
... | ... | ... | ... | ... | ... |
955 | 70-74 | 2020-04-17 | 298 | Weekly registrations | Females |
956 | 75-79 | 2020-04-17 | 465 | Weekly registrations | Females |
957 | 80-84 | 2020-04-17 | 652 | Weekly registrations | Females |
958 | 85-89 | 2020-04-17 | 781 | Weekly registrations | Females |
959 | 90+ | 2020-04-17 | 954 | Weekly registrations | Females |
960 rows × 5 columns
ons_weekly_occ_long = ons_weeklies(ons_weekly_occ, 'Weekly occurrences')
ons_weekly_occ_long['Males']
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /usr/local/lib/python3.7/site-packages/pandas/core/frame.py:4117: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy errors=errors,
Age | Date | value | measure | Group | |
---|---|---|---|---|---|
0 | <1 | 2020-01-03 | 0 | Weekly occurrences | Males |
1 | 1-4 | 2020-01-03 | 0 | Weekly occurrences | Males |
2 | 5-9 | 2020-01-03 | 0 | Weekly occurrences | Males |
3 | 10-14 | 2020-01-03 | 0 | Weekly occurrences | Males |
4 | 15-19 | 2020-01-03 | 0 | Weekly occurrences | Males |
... | ... | ... | ... | ... | ... |
315 | 70-74 | 2020-04-17 | 459 | Weekly occurrences | Males |
316 | 75-79 | 2020-04-17 | 597 | Weekly occurrences | Males |
317 | 80-84 | 2020-04-17 | 803 | Weekly occurrences | Males |
318 | 85-89 | 2020-04-17 | 793 | Weekly occurrences | Males |
319 | 90+ | 2020-04-17 | 627 | Weekly occurrences | Males |
320 rows × 5 columns
ons_weekly_all = ons_sheets['Weekly figures 2020']
ons_weekly_all.head()
Contents | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | Unnamed: 8 | Unnamed: 9 | ... | Unnamed: 45 | Unnamed: 46 | Unnamed: 47 | Unnamed: 48 | Unnamed: 49 | Unnamed: 50 | Unnamed: 51 | Unnamed: 52 | Unnamed: 53 | Unnamed: 54 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Weekly provisional figures on deaths registere... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Note that up-to-date counts of the total numbe... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | Week number | NaN | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 |
4 | Week ended | NaN | 2020-01-03 00:00:00 | 2020-01-10 00:00:00 | 2020-01-17 00:00:00 | 2020-01-24 00:00:00 | 2020-01-31 00:00:00 | 2020-02-07 00:00:00 | 2020-02-14 00:00:00 | 2020-02-21 00:00:00 | ... | 2020-10-30 00:00:00 | 2020-11-06 00:00:00 | 2020-11-13 00:00:00 | 2020-11-20 00:00:00 | 2020-11-27 00:00:00 | 2020-12-04 00:00:00 | 2020-12-11 00:00:00 | 2020-12-18 00:00:00 | 2020-12-25 00:00:00 | 2021-01-01 00:00:00 |
5 rows × 55 columns
ons_weekly_all_long = ons_weeklies(ons_weekly_all, 'Weekly all mortality')
ons_weekly_all_long['Males']
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Age | Date | value | measure | Group | |
---|---|---|---|---|---|
0 | <1 | 2020-01-03 | 30 | Weekly all mortality | Males |
1 | 1-4 | 2020-01-03 | 5 | Weekly all mortality | Males |
2 | 5-9 | 2020-01-03 | 2 | Weekly all mortality | Males |
3 | 10-14 | 2020-01-03 | 1 | Weekly all mortality | Males |
4 | 15-19 | 2020-01-03 | 5 | Weekly all mortality | Males |
... | ... | ... | ... | ... | ... |
315 | 70-74 | 2020-04-17 | 1280 | Weekly all mortality | Males |
316 | 75-79 | 2020-04-17 | 1635 | Weekly all mortality | Males |
317 | 80-84 | 2020-04-17 | 2100 | Weekly all mortality | Males |
318 | 85-89 | 2020-04-17 | 2111 | Weekly all mortality | Males |
319 | 90+ | 2020-04-17 | 1879 | Weekly all mortality | Males |
320 rows × 5 columns
Add to database...
_table = 'ons_deaths'
ons_weekly_occ_long['Any'].to_sql(_table, DB.conn, index=False, if_exists='append')
ons_weekly_reg_long['Any'].to_sql(_table, DB.conn, index=False, if_exists='append')
ons_weekly_all_long['Any'].to_sql(_table, DB.conn, index=False, if_exists='append')
ons_death_reg_url = 'https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fhealthandsocialcare%2fcausesofdeath%2fdatasets%2fdeathregistrationsandoccurrencesbylocalauthorityandhealthboard%2f2020/lahbtablesweek16.xlsx'
r = requests.get(ons_death_reg_url)
fn = ons_death_reg_url.split('/')[-1]
with open(fn, 'wb') as f:
f.write(r.content)
ons_reg_sheets = pd.read_excel(fn, sheet_name=None)
# What sheets are available in the spreadsheet
ons_reg_sheet_names = ons_reg_sheets.keys()
ons_reg_sheet_names
odict_keys(['Contents', 'Information', 'Terms and conditions', 'Registrations - All data', 'Registrations - Pivot table', 'Occurrences - All data', 'Occurrences - Pivot table'])
ons_death_reg = ons_reg_sheets['Registrations - All data']
ons_death_reg_metadata = ons_death_reg.iloc[0, 0]
ons_death_reg_metadata
'Deaths (numbers) by local authority and cause of death, registered up to the 17th April 2020, England and Wales'
from parse import parse
import dateparser
upto = parse('Deaths (numbers) by local authority and cause of death, registered up to the {date}, England and Wales',
ons_death_reg_metadata)['date']
upto = dateparser.parse(upto)
rows, cols = np.where(ons_death_reg == 'Area code')
colnames = ons_death_reg.iloc[rows[0]].tolist()
ons_death_reg = ons_death_reg.iloc[rows[0]+1:].reset_index(drop=True)
ons_death_reg.columns = colnames
ons_death_reg['Registered up to'] = upto
ons_death_reg
Area code | Geography type | Area name | Cause of death | Week number | Place of death | Number of deaths | Registered up to | |
---|---|---|---|---|---|---|---|---|
0 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Care home | 8 | 2020-04-17 |
1 | E06000001 | Local Authority | Hartlepool | COVID 19 | 1 | Care home | 0 | 2020-04-17 |
2 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Elsewhere | 0 | 2020-04-17 |
3 | E06000001 | Local Authority | Hartlepool | COVID 19 | 1 | Elsewhere | 0 | 2020-04-17 |
4 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Home | 2 | 2020-04-17 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
64766 | W11000030 | Health Board | Cwm Taf Morgannwg University Health Board | COVID 19 | 16 | Home | 4 | 2020-04-17 |
64767 | W11000030 | Health Board | Cwm Taf Morgannwg University Health Board | COVID 19 | 16 | Hospital | 53 | 2020-04-17 |
64768 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Care home | 6 | 2020-04-17 |
64769 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Hospital | 47 | 2020-04-17 |
64770 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Other communal establishment | 6 | 2020-04-17 |
64771 rows × 8 columns
ons_death_occ = ons_reg_sheets['Occurrences - All data']
ons_death_occ_metadata = ons_death_occ.iloc[0, 0]
ons_death_occ_metadata
uptos = parse('Deaths (numbers) by local authority and cause of death, for deaths that occurred up to {date_occ} but were registered up to the {date_reg}, England and Wales',
ons_death_occ_metadata)
upto_occ = uptos['date_occ']
if '2020' not in upto_occ: upto_occ = f'{upto_occ} 2020'
upto_reg = uptos['date_reg']
if '2020' not in upto_occ: upto_occ = f'{upto_reg} 2020'
upto_occ = dateparser.parse(upto_occ)
upto_reg = dateparser.parse(upto_reg)
rows, cols = np.where(ons_death_occ == 'Area code')
colnames = ons_death_occ.iloc[rows[0]].tolist()
ons_death_occ = ons_death_occ.iloc[rows[0]+1:].reset_index(drop=True)
ons_death_occ.columns = colnames
ons_death_occ['Occurred up to'] = upto_occ
ons_death_occ['Registered up to'] = upto_reg
ons_death_occ
Area code | Geography type | Area name | Cause of death | Week number | Place of death | Number of deaths | Occurred up to | Registered up to | |
---|---|---|---|---|---|---|---|---|---|
0 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Care home | 3 | 2020-04-17 | 2020-04-25 |
1 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Elsewhere | 0 | 2020-04-17 | 2020-04-25 |
2 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Home | 7 | 2020-04-17 | 2020-04-25 |
3 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Hospice | 1 | 2020-04-17 | 2020-04-25 |
4 | E06000001 | Local Authority | Hartlepool | All causes | 1 | Hospital | 13 | 2020-04-17 | 2020-04-25 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
66427 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Elsewhere | 0 | 2020-04-17 | 2020-04-25 |
66428 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Home | 1 | 2020-04-17 | 2020-04-25 |
66429 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Hospice | 0 | 2020-04-17 | 2020-04-25 |
66430 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Hospital | 40 | 2020-04-17 | 2020-04-25 |
66431 | W11000031 | Health Board | Swansea Bay University Health Board | COVID 19 | 16 | Other communal establishment | 4 | 2020-04-17 | 2020-04-25 |
66432 rows × 9 columns
_table = 'ons_deaths_reg'
ons_death_reg.to_sql(_table, DB.conn, index=False, if_exists='append')
_table = 'ons_deaths_reg_occ'
ons_death_occ.to_sql(_table, DB.conn, index=False, if_exists='append')
datasette publish fly nhs_dailies.db --app="nhs-orgs"
# It takes tiny amounts of code to post s/thing from a notebook to an API and display a result
# Create some magic to call and API
from IPython.core.magic import register_cell_magic, register_line_magic
import requests
import pandas as pd
from urllib.parse import urlencode
_datasette_url = 'https://nhs-orgs.fly.dev/nhs_dailies/phe_cases.csv?{}'
@register_line_magic
def phe_cases(line):
"Query datasette."
payload = {'_sort': 'rowid',
'Area name__contains': line,
'_size': 'max'}
_url = _datasette_url.format(urlencode(payload))
return pd.read_csv( _url)
# Pass a string to the API via some magic and display the result
%phe_cases isle of wight
#https://techoverflow.net/2018/01/16/downloading-reading-a-zip-file-in-memory-using-python/
import zipfile
def download_extract_xml(url):
"""
Download a ZIP file and extract its contents in memory
yields (filename, file-like object) pairs
"""
response = requests.get(url)
with zipfile.ZipFile(io.BytesIO(response.content)) as thezip:
for zipinfo in thezip.infolist():
with thezip.open(zipinfo) as thefile:
yield zipinfo.filename, thefile
r = download_extract_xml(ons_weekly_url)
for f in r:
print(f)