Last active
November 14, 2020 14:12
-
-
Save tomjemmett/21b26c862f8cede617537da15ad9f5e4 to your computer and use it in GitHub Desktop.
Downloads and tidies the A&E stats from NHS England
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyverse) | |
| library(lubridate) | |
| library(readxl) | |
| library(httr) | |
| library(rvest) | |
| library(glue) | |
| # We want to download all of the monthly excel files from Nhs England. | |
| # These are all found from the following location | |
| baseurl <- glue("https://www.england.nhs.uk/statistics/statistical-work-areas/", | |
| "ae-waiting-times-and-activity/") | |
| files <- c( | |
| # 2016/17 files are stored at slightly different location to 2017/18+ | |
| glue("{baseurl}statistical-work-areas", | |
| "ae-waiting-times-and-activity", | |
| "ae-attendances-and-emergency-admissions-2016-17/"), | |
| # 2017/18 onwards use a consistent url scheme | |
| map_chr(2017:2019, | |
| ~glue("{baseurl}ae-attendances-and-emergency-admissions-", | |
| "{.x}-{(.x+1)%%1000}/")) | |
| ) %>% | |
| # first, get all of the links to the excel files | |
| map(function(url) { | |
| # grab the page, look for the link for the file | |
| webpage <- read_html(url) | |
| nodes <- html_nodes(webpage, "a") | |
| files <- nodes[str_detect(html_text(nodes), | |
| "Monthly A&E \\w+ \\d{4} (\\(.*\\) ?)?\\(XLS")] %>% | |
| html_attr("href") | |
| return(files) | |
| }) %>% | |
| # flatten to a single list | |
| flatten_chr() %>% | |
| # for each file, download the excel file to a temporary location | |
| # return the list of the temporary files created | |
| map_chr(function(url) { | |
| GET(url, write_disk(tf <- tempfile(fileext = ".xls"))) | |
| return (tf) | |
| }) | |
| # Process the files | |
| read_ae_file <- function(tf) { | |
| col_names <- c( | |
| "org_code", | |
| "parent_org", | |
| "org_name", | |
| "attendances_1", | |
| "attendances_2", | |
| "attendances_other", | |
| "_attendances_total", | |
| "in4hours_1", | |
| "in4hours_2", | |
| "in4hours_other", | |
| "_in4hours_total", | |
| "breaches_1", | |
| "breaches_2", | |
| "breaches_other", | |
| "_breaches_total", | |
| "pcnt_total", | |
| "pcnt_1", | |
| "pcnt_2", | |
| "pcnt_other", | |
| "admissions_1", | |
| "admissions_2", | |
| "admissions_other", | |
| "_admissions_total", | |
| "other_admissions", | |
| "total_admisisons", | |
| "dta_4_12", | |
| "dta_12p" | |
| ) | |
| # find the name of the sheet - this changed at somepoint, but it's always the | |
| # first sheet of the file that we are interested in | |
| sheet <- excel_sheets(tf)[[1]] | |
| # get the period from the file and convert it to a date | |
| period <- dmy( | |
| paste("1", | |
| read_xls(tf, | |
| range=glue("'{sheet}'!C6"), | |
| col_names = "period")[[1]]) | |
| ) | |
| # prior to April 2018 the files didn't include seperated Percentage columns | |
| if (period < ymd(20180401)) { | |
| col_names <- col_names[c(1:15, 17, 16, 20:27)] | |
| } | |
| # prior to April 2017 the files didn't include the columns for patients seen | |
| # in 4 hours, so drop these from the column names | |
| if (period < ymd(20170401)) { | |
| col_names <- col_names[-(8:11)] | |
| } | |
| # the range of data is dependant on how many column names are listed in the | |
| # variable col_names | |
| range <- glue("'{sheet}'!B19:{c(LETTERS,'AA','AB')[[length(col_names)+1]]}65536") | |
| # read the excel file | |
| df <- read_xls(tf, | |
| range = range, | |
| col_names = col_names, | |
| # Any column with a "-" value should be treated as NA | |
| na=c("-")) %>% | |
| # add in an index (used for joining data back together later) and the period | |
| # value that we derived above | |
| mutate(ix = row_number(), | |
| period = period) %>% | |
| # reorder the columns | |
| select(ix, period, everything()) | |
| # to tidy the data we are going to split it into different groups: | |
| # * the columns that identify the organisation | |
| # * the attendance columns | |
| # * the breaches columns | |
| # * the admissions columns | |
| # we will process these columns individually, then join them back together | |
| # after (hence the need for creating an index column). | |
| df <- inner_join( | |
| # select the columns that identify the organisation | |
| select(df, ix, period:org_name), | |
| # for each of the 3 groups of columns | |
| map(c("attendances", | |
| "breaches", | |
| "admissions"), | |
| # select the index and the columns that start with the column group name | |
| ~df %>% | |
| select(ix, starts_with(.x)) %>% | |
| # gather the columns (except the index column) into "type" and the name | |
| # of the column group that we are currently using, e.g. "attendances" | |
| gather(type, !!sym(.x), -ix) %>% | |
| # strip the text from the "type" column before the _ character | |
| mutate_at(vars(type), str_replace, "\\w+_", "") | |
| ) %>% | |
| # we are left with a list of dataframes for each of the groups, so we can | |
| # "reduce" the lists by performing an inner_join on the first 2, then an | |
| # inner_join on the first 2 and the next... | |
| reduce(inner_join, by = c("ix", "type")), | |
| # finally join back to the columns that identify the organisation from above | |
| by = "ix" | |
| ) %>% | |
| # get rid of the index column | |
| select(-ix) %>% | |
| # ensure that the attendances, breaches and admissions columns are numeric | |
| mutate_at(vars(attendances:admissions), as.numeric) %>% | |
| # remove any row which has 0 attendances | |
| filter(attendances > 0) | |
| # return the data | |
| return (df) | |
| } | |
| # get all of the files | |
| df <- map_dfr(files, read_ae_file) | |
| # show the data | |
| df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment