tomjemmett · November 14, 2020 14:12
diff --git a/nhse_ae_tidy_csv.R b/nhse_ae_tidy_csv.R
 library(tidyverse)
 library(lubridate)
 library(readxl)
 library(httr)
 library(rvest)
 library(glue)

 # We want to download all of the monthly excel files from Nhs England.
 # These are all found from the following location
 baseurl <- glue("https://www.england.nhs.uk/statistics/statistical-work-areas/",
                "ae-waiting-times-and-activity/")

 files <- c(
  # 2016/17 files are stored at slightly different location to 2017/18+
  glue("{baseurl}statistical-work-areas",
       "ae-waiting-times-and-activity",
       "ae-attendances-and-emergency-admissions-2016-17/"),
  # 2017/18 onwards use a consistent url scheme
  map_chr(2017:2019,
          ~glue("{baseurl}ae-attendances-and-emergency-admissions-",
                "{.x}-{(.x+1)%%1000}/"))
 ) %>%
  # first, get all of the links to the excel files
  map(function(url) {
    # grab the page, look for the link for the file
    webpage <- read_html(url)
    nodes <- html_nodes(webpage, "a")
    files <- nodes[str_detect(html_text(nodes),
                              "Monthly A&E \\w+ \\d{4} (\\(.*\\) ?)?\\(XLS")] %>%
      html_attr("href")
    return(files)
  }) %>%
  # flatten to a single list
  flatten_chr() %>%
  # for each file, download the excel file to a temporary location
  # return the list of the temporary files created
  map_chr(function(url) {
    GET(url, write_disk(tf <- tempfile(fileext = ".xls")))
    return (tf)
  })

 # Process the files
 read_ae_file <- function(tf) {
  col_names <- c(
    "org_code",
    "parent_org",
    "org_name",
    "attendances_1",
    "attendances_2",
    "attendances_other",
    "_attendances_total",
    "in4hours_1",
    "in4hours_2",
    "in4hours_other",
    "_in4hours_total",
    "breaches_1",
    "breaches_2",
    "breaches_other",
    "_breaches_total",
    "pcnt_total",
    "pcnt_1",
    "pcnt_2",
    "pcnt_other",
    "admissions_1",
    "admissions_2",
    "admissions_other",
    "_admissions_total",
    "other_admissions",
    "total_admisisons",
    "dta_4_12",
    "dta_12p"
  )
  
  # find the name of the sheet - this changed at somepoint, but it's always the
  # first sheet of the file that we are interested in
  sheet <- excel_sheets(tf)[[1]]
  
  # get the period from the file and convert it to a date
  period <- dmy(
    paste("1",
          read_xls(tf,
                   range=glue("'{sheet}'!C6"),
                   col_names = "period")[[1]])
  )
  # prior to April 2018 the files didn't include seperated Percentage columns
  if (period < ymd(20180401)) {
    col_names <- col_names[c(1:15, 17, 16, 20:27)]
  }
  # prior to April 2017 the files didn't include the columns for patients seen
  # in 4 hours, so drop these from the column names
  if (period < ymd(20170401)) {
    col_names <- col_names[-(8:11)]
  }
  # the range of data is dependant on how many column names are listed in the
  # variable col_names
  range <- glue("'{sheet}'!B19:{c(LETTERS,'AA','AB')[[length(col_names)+1]]}65536")
  
  # read the excel file
  df <- read_xls(tf,
                 range = range,
                 col_names = col_names,
                 # Any column with a "-" value should be treated as NA
                 na=c("-")) %>%
    # add in an index (used for joining data back together later) and the period
    # value that we derived above
    mutate(ix = row_number(),
           period = period) %>%
    # reorder the columns
    select(ix, period, everything())
  
  # to tidy the data we are going to split it into different groups:
  #  * the columns that identify the organisation
  #  * the attendance columns
  #  * the breaches columns
  #  * the admissions columns
  # we will process these columns individually, then join them back together
  # after (hence the need for creating an index column).
  df <- inner_join(
    # select the columns that identify the organisation
    select(df, ix, period:org_name),
    # for each of the 3 groups of columns
    map(c("attendances",
          "breaches",
          "admissions"),
        # select the index and the columns that start with the column group name
        ~df %>%
          select(ix, starts_with(.x)) %>%
          # gather the columns (except the index column) into "type" and the name
          # of the column group that we are currently using, e.g. "attendances"
          gather(type, !!sym(.x), -ix) %>%
          # strip the text from the "type" column before the _ character
          mutate_at(vars(type), str_replace, "\\w+_", "")
    ) %>%
      # we are left with a list of dataframes for each of the groups, so we can
      # "reduce" the lists by performing an inner_join on the first 2, then an
      # inner_join on the first 2 and the next...
      reduce(inner_join, by = c("ix", "type")),
    # finally join back to the columns that identify the organisation from above
    by = "ix"
  ) %>%
    # get rid of the index column
    select(-ix) %>%
    # ensure that the attendances, breaches and admissions columns are numeric
    mutate_at(vars(attendances:admissions), as.numeric) %>%
    # remove any row which has 0 attendances
    filter(attendances > 0)
  
  # return the data
  return (df)
 }

 # get all of the files
 df <- map_dfr(files, read_ae_file)

 # show the data
 df
	library(tidyverse)
	library(lubridate)
	library(readxl)
	library(httr)
	library(rvest)
	library(glue)

	# We want to download all of the monthly excel files from Nhs England.
	# These are all found from the following location
	baseurl <- glue("https://www.england.nhs.uk/statistics/statistical-work-areas/",
	"ae-waiting-times-and-activity/")

	files <- c(
	# 2016/17 files are stored at slightly different location to 2017/18+
	glue("{baseurl}statistical-work-areas",
	"ae-waiting-times-and-activity",
	"ae-attendances-and-emergency-admissions-2016-17/"),
	# 2017/18 onwards use a consistent url scheme
	map_chr(2017:2019,
	~glue("{baseurl}ae-attendances-and-emergency-admissions-",
	"{.x}-{(.x+1)%%1000}/"))
	) %>%
	# first, get all of the links to the excel files
	map(function(url) {
	# grab the page, look for the link for the file
	webpage <- read_html(url)
	nodes <- html_nodes(webpage, "a")
	files <- nodes[str_detect(html_text(nodes),
	"Monthly A&E \\w+ \\d{4} (\\(.*\\) ?)?\\(XLS")] %>%
	html_attr("href")
	return(files)
	}) %>%
	# flatten to a single list
	flatten_chr() %>%
	# for each file, download the excel file to a temporary location
	# return the list of the temporary files created
	map_chr(function(url) {
	GET(url, write_disk(tf <- tempfile(fileext = ".xls")))
	return (tf)
	})

	# Process the files
	read_ae_file <- function(tf) {
	col_names <- c(
	"org_code",
	"parent_org",
	"org_name",
	"attendances_1",
	"attendances_2",
	"attendances_other",
	"_attendances_total",
	"in4hours_1",
	"in4hours_2",
	"in4hours_other",
	"_in4hours_total",
	"breaches_1",
	"breaches_2",
	"breaches_other",
	"_breaches_total",
	"pcnt_total",
	"pcnt_1",
	"pcnt_2",
	"pcnt_other",
	"admissions_1",
	"admissions_2",
	"admissions_other",
	"_admissions_total",
	"other_admissions",
	"total_admisisons",
	"dta_4_12",
	"dta_12p"
	)

	# find the name of the sheet - this changed at somepoint, but it's always the
	# first sheet of the file that we are interested in
	sheet <- excel_sheets(tf)[[1]]

	# get the period from the file and convert it to a date
	period <- dmy(
	paste("1",
	read_xls(tf,
	range=glue("'{sheet}'!C6"),
	col_names = "period")[[1]])
	)
	# prior to April 2018 the files didn't include seperated Percentage columns
	if (period < ymd(20180401)) {
	col_names <- col_names[c(1:15, 17, 16, 20:27)]
	}
	# prior to April 2017 the files didn't include the columns for patients seen
	# in 4 hours, so drop these from the column names
	if (period < ymd(20170401)) {
	col_names <- col_names[-(8:11)]
	}
	# the range of data is dependant on how many column names are listed in the
	# variable col_names
	range <- glue("'{sheet}'!B19:{c(LETTERS,'AA','AB')[[length(col_names)+1]]}65536")

	# read the excel file
	df <- read_xls(tf,
	range = range,
	col_names = col_names,
	# Any column with a "-" value should be treated as NA
	na=c("-")) %>%
	# add in an index (used for joining data back together later) and the period
	# value that we derived above
	mutate(ix = row_number(),
	period = period) %>%
	# reorder the columns
	select(ix, period, everything())

	# to tidy the data we are going to split it into different groups:
	# * the columns that identify the organisation
	# * the attendance columns
	# * the breaches columns
	# * the admissions columns
	# we will process these columns individually, then join them back together
	# after (hence the need for creating an index column).
	df <- inner_join(
	# select the columns that identify the organisation
	select(df, ix, period:org_name),
	# for each of the 3 groups of columns
	map(c("attendances",
	"breaches",
	"admissions"),
	# select the index and the columns that start with the column group name
	~df %>%
	select(ix, starts_with(.x)) %>%
	# gather the columns (except the index column) into "type" and the name
	# of the column group that we are currently using, e.g. "attendances"
	gather(type, !!sym(.x), -ix) %>%
	# strip the text from the "type" column before the _ character
	mutate_at(vars(type), str_replace, "\\w+_", "")
	) %>%
	# we are left with a list of dataframes for each of the groups, so we can
	# "reduce" the lists by performing an inner_join on the first 2, then an
	# inner_join on the first 2 and the next...
	reduce(inner_join, by = c("ix", "type")),
	# finally join back to the columns that identify the organisation from above
	by = "ix"
	) %>%
	# get rid of the index column
	select(-ix) %>%
	# ensure that the attendances, breaches and admissions columns are numeric
	mutate_at(vars(attendances:admissions), as.numeric) %>%
	# remove any row which has 0 attendances
	filter(attendances > 0)

	# return the data
	return (df)
	}

	# get all of the files
	df <- map_dfr(files, read_ae_file)

	# show the data
	df
No results found