Skip to content

Instantly share code, notes, and snippets.

@tomjemmett
Last active November 14, 2020 14:12
Show Gist options
  • Select an option

  • Save tomjemmett/21b26c862f8cede617537da15ad9f5e4 to your computer and use it in GitHub Desktop.

Select an option

Save tomjemmett/21b26c862f8cede617537da15ad9f5e4 to your computer and use it in GitHub Desktop.
Downloads and tidies the A&E stats from NHS England
library(tidyverse)
library(lubridate)
library(readxl)
library(httr)
library(rvest)
library(glue)
# We want to download all of the monthly excel files from Nhs England.
# These are all found from the following location
baseurl <- glue("https://www.england.nhs.uk/statistics/statistical-work-areas/",
"ae-waiting-times-and-activity/")
files <- c(
# 2016/17 files are stored at slightly different location to 2017/18+
glue("{baseurl}statistical-work-areas",
"ae-waiting-times-and-activity",
"ae-attendances-and-emergency-admissions-2016-17/"),
# 2017/18 onwards use a consistent url scheme
map_chr(2017:2019,
~glue("{baseurl}ae-attendances-and-emergency-admissions-",
"{.x}-{(.x+1)%%1000}/"))
) %>%
# first, get all of the links to the excel files
map(function(url) {
# grab the page, look for the link for the file
webpage <- read_html(url)
nodes <- html_nodes(webpage, "a")
files <- nodes[str_detect(html_text(nodes),
"Monthly A&E \\w+ \\d{4} (\\(.*\\) ?)?\\(XLS")] %>%
html_attr("href")
return(files)
}) %>%
# flatten to a single list
flatten_chr() %>%
# for each file, download the excel file to a temporary location
# return the list of the temporary files created
map_chr(function(url) {
GET(url, write_disk(tf <- tempfile(fileext = ".xls")))
return (tf)
})
# Process the files
read_ae_file <- function(tf) {
col_names <- c(
"org_code",
"parent_org",
"org_name",
"attendances_1",
"attendances_2",
"attendances_other",
"_attendances_total",
"in4hours_1",
"in4hours_2",
"in4hours_other",
"_in4hours_total",
"breaches_1",
"breaches_2",
"breaches_other",
"_breaches_total",
"pcnt_total",
"pcnt_1",
"pcnt_2",
"pcnt_other",
"admissions_1",
"admissions_2",
"admissions_other",
"_admissions_total",
"other_admissions",
"total_admisisons",
"dta_4_12",
"dta_12p"
)
# find the name of the sheet - this changed at somepoint, but it's always the
# first sheet of the file that we are interested in
sheet <- excel_sheets(tf)[[1]]
# get the period from the file and convert it to a date
period <- dmy(
paste("1",
read_xls(tf,
range=glue("'{sheet}'!C6"),
col_names = "period")[[1]])
)
# prior to April 2018 the files didn't include seperated Percentage columns
if (period < ymd(20180401)) {
col_names <- col_names[c(1:15, 17, 16, 20:27)]
}
# prior to April 2017 the files didn't include the columns for patients seen
# in 4 hours, so drop these from the column names
if (period < ymd(20170401)) {
col_names <- col_names[-(8:11)]
}
# the range of data is dependant on how many column names are listed in the
# variable col_names
range <- glue("'{sheet}'!B19:{c(LETTERS,'AA','AB')[[length(col_names)+1]]}65536")
# read the excel file
df <- read_xls(tf,
range = range,
col_names = col_names,
# Any column with a "-" value should be treated as NA
na=c("-")) %>%
# add in an index (used for joining data back together later) and the period
# value that we derived above
mutate(ix = row_number(),
period = period) %>%
# reorder the columns
select(ix, period, everything())
# to tidy the data we are going to split it into different groups:
# * the columns that identify the organisation
# * the attendance columns
# * the breaches columns
# * the admissions columns
# we will process these columns individually, then join them back together
# after (hence the need for creating an index column).
df <- inner_join(
# select the columns that identify the organisation
select(df, ix, period:org_name),
# for each of the 3 groups of columns
map(c("attendances",
"breaches",
"admissions"),
# select the index and the columns that start with the column group name
~df %>%
select(ix, starts_with(.x)) %>%
# gather the columns (except the index column) into "type" and the name
# of the column group that we are currently using, e.g. "attendances"
gather(type, !!sym(.x), -ix) %>%
# strip the text from the "type" column before the _ character
mutate_at(vars(type), str_replace, "\\w+_", "")
) %>%
# we are left with a list of dataframes for each of the groups, so we can
# "reduce" the lists by performing an inner_join on the first 2, then an
# inner_join on the first 2 and the next...
reduce(inner_join, by = c("ix", "type")),
# finally join back to the columns that identify the organisation from above
by = "ix"
) %>%
# get rid of the index column
select(-ix) %>%
# ensure that the attendances, breaches and admissions columns are numeric
mutate_at(vars(attendances:admissions), as.numeric) %>%
# remove any row which has 0 attendances
filter(attendances > 0)
# return the data
return (df)
}
# get all of the files
df <- map_dfr(files, read_ae_file)
# show the data
df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment