mhoehle · April 8, 2021 21:21 · mhoehle · Apr 8, 2021
diff --git a/voc.R b/voc.R
 # Quick mockup model to get expo-growth from variants based on
 # a representative (?) sample of subtypes.
 # Compare, e.g., with https://twitter.com/and_hi_/status/1370474726620008452/photo/1
 #
 # Author: Michael Höhle <https://www.math.su.se/~hoehle>
 #
 # Description:
 # Data from Tab. 2 in https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/DESH/Bericht_VOC_2021-03-10.pdf?__blob=publicationFile
 # as of 2021-03-11. VOC = B117
 #
 # History:
 # - 2021-03-23 original version
 # - 2021-04-08 improved code s.t. code really generates prediction intervals
 # - 2021-04-09 added AR(1) version for residuals based on suggestion in https://twitter.com/johannesbracher/status/1380066350111002624?s=20

 voc <- tibble(kw=2:9, n_test_sample=c(49,
                                      3344,
                                      30449,
                                      26849,
                                      33943,
                                      29770,
                                      45581,
                                      35755),
              n_b117_sample = c(1,
                                122,
                                1441,
                                1931,
                                5978,
                                7698,
                                18224,
                                19472),
              # Survstat incidences - queried from https://survstat.rki.de/Content/Query/Create.aspx
              inc=c(144.70,116.23, 95.19,78.62,61.85,
                    63.84,68.67,71.03)) %>%
  mutate(n_other_sample = n_test_sample - n_b117_sample,
         prop_b117 = n_b117_sample / n_test_sample,
         inc_b117 = prop_b117 * inc,
         inc_other = (1-prop_b117)*inc)

 # Make a long version
 voc_df_long <- voc %>% select(-n_test_sample,-prop_b117) %>% rename(inc_total = inc) %>%
  pivot_longer( c(inc_b117, inc_other), names_to = "type", values_to="inc") %>%
  mutate(predict=FALSE)
 # Compute CI for the incidence based on the sampling - rather narrow, so do not show it to make figure easier to read
 # mutate(predict=FALSE,
 #        a = 1/2 + if_else(type=="inc_voc", n_b117_sample, n_other_sample),
 #        b = 1/2 + if_else(type=="inc_voc", n_other_sample, n_b117_sample),
 #        lower = inc_total * qbeta(0.025, a, b),
 #        upper = inc_total * qbeta(0.975, a, b)) %>%
 # select(kw, predict, type, inc, lower, upper)


 # Fit Linear model with log-link (in order to get exponential growth in the expectation).
 m_voc <- glm( inc ~ (1 + kw)*type, family=gaussian(link="log"), data=voc_df_long)

 # Extrapolate model some weeks into the future. Danger!
 # Assumptions:
 #  - VOC and non-VOC are completely separate/independent processes
 #  - Things stay as-is for the next X weeks, i.e. no change in contacts (unlikely!)
 #  - Sequenced cases make up a simple random sample of all cases

 # This code apparently does not generate a prediction interval for the incidence but only a
 # CI for the mean.
 pred_old <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
  mutate(inc = predict(m_voc, newdata=., type="response"),
         se = predict(m_voc, newdata=., type="response", se=TRUE)$se,
         lower = inc - qnorm(0.975)*se,
         upper = inc + qnorm(0.975)*se,
         predict=TRUE)

 # Take estimated dispersion into account (homoscedastic) by a simple plug-in prediction
 pred_old_improved <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
  mutate(inc = predict(m_voc, newdata=., type="response"),
         se_mean = predict(m_voc, newdata=., type="response", se=TRUE)$se,
         se_obs  = sqrt(summary(m_voc)$dispersion),
         se = sqrt(se_mean^2 + se_obs^2),
         lower = inc - qnorm(0.975)*se,
         upper = inc + qnorm(0.975)*se,
         predict=TRUE)

 # Better: Fit a linear model to log(incidence).
 m_voclog <- lm( log(inc) ~ (1 + kw)*type, data=voc_df_long)

 # Generate proper prediction intervals based on back-transforming quantiles of
 # predictive on log-scale
 pred <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
  mutate(pred = predict(m_voclog, newdata=., type="response", interval="prediction") %>% exp(),
         fit = pred[,"fit"],  #note: this is the median of the forecast distrib
         lower = pred[,"lwr"],
         upper =  pred[,"upr"],
         predict=TRUE) %>%
  select(-pred)

 # Prepare data frame for plotting
 make_df_plot <- function(pred) {
  df <- bind_rows(voc_df_long, pred)
  df_total <- df %>% group_by(kw) %>%
    summarise(inc = sum(inc), lower=sum(lower), upper=sum(upper), predict=max(predict)) %>%
    mutate(type="inc_total")

  df_plot <- bind_rows(df, df_total) %>% mutate(predict=factor(predict))
  return(df_plot)
 }

 #Compute all 3 versions
 df_plot_old          <- make_df_plot(pred_old)
 df_plot_old_improved <- make_df_plot(pred_old_improved)
 df_plot              <- make_df_plot(pred)

 # Compare
 left_join(df_plot, df_plot_old, by=c("kw","type"), suffix=c(".new", ".old")) %>%
  left_join(df_plot_old_improved, by=c("kw","type")) %>%
  filter(predict.new==1) %>%
  rename(lower.old_improved = lower, upper.old_improved = upper) %>%
  select("kw", "type","lower.old", "lower.old_improved", "lower.new", "upper.old", "upper.old_improved", "upper.new")

 #Plot the new version
 ggplot(df_plot, aes(x=kw, y=inc, color=type, linetype=predict)) +
  geom_line() +
  geom_point() +
  xlab("Woche") + ylab("7T Melde-Inzidenz") +
  geom_errorbar(aes(ymin=lower, ymax=upper)) +
  scale_color_brewer(palette = "Dark2") +
  scale_x_continuous(breaks=2:13) +
  theme_minimal()

 ggsave("exp.png", width=8, height=4, dpi=150)

 ##################################################################
 ## Compare results with AR(model) based on
 ## See https://otexts.com/fpp3/nonlinear-regression.html
 ## and https://otexts.com/fpp3/non-seasonal-arima.html
 ##################################################################

 library(fpp3)
 voc_ts <- voc_df_long %>% as_tsibble(index = "kw", key="type")

 # Fit models to each 'type' time series
 fit_trends <- voc_ts %>%
  model(
    `log(inc) ~ 1 + t + iid` = TSLM(log(inc) ~ 1 + trend()),
    `log(inc) ~ 1 + t + AR(1)` = ARIMA(log(inc) ~ 1 + trend() + pdq(1,0,0))
  )

 # Make a 4 week prediction
 fc_trends <- fit_trends %>% forecast(h = 3)
 fc <- fc_trends  %>% hilo(level = c(95)) %>% unpack_hilo(cols="95%")

 # Show results
 afit <- augment(fit_trends)
 afit %>%
  ggplot(aes(x = kw)) +
  geom_line(data=voc_ts, aes(y = inc, colour = type)) +
  geom_point(data=voc_ts, aes(y = inc, colour = type, shape="Data"), size=2) +
  geom_line(aes(y = .fitted, colour = type), linetype=2) +
  geom_point(aes(y = .fitted, colour = type, shape=.model), size=2) +
  geom_errorbar(data=fc, aes(ymin=`95%_lower`,ymax= `95%_upper`,color=type)) +
  geom_point(data=fc, aes(y=.mean, color=type, shape=.model), size=2) +
  geom_point(data=fc, aes(y=`95%_lower`,color=type, shape=.model), size=2) +
  geom_point(data=fc, aes(y=`95%_upper`,color=type, shape=.model), size=2) +
  geom_vline(xintercept=9.5,lty=2, color="darkgray") +
  scale_color_brewer(palette = "Dark2") +
  guides(colour = guide_legend(title = "Type"),  size = FALSE, shape=guide_legend(title="Model") ) +
  xlab("Woche") + ylab("7T Melde-Inzidenz") +
  scale_x_continuous(breaks=2:13) +
  theme_minimal() +
  theme(legend.position="bottom")


 ggsave("exp-expar.png", width=8, height=4, dpi=150)
	# Quick mockup model to get expo-growth from variants based on
	# a representative (?) sample of subtypes.
	# Compare, e.g., with https://twitter.com/and_hi_/status/1370474726620008452/photo/1
	#
	# Author: Michael Höhle <https://www.math.su.se/~hoehle>
	#
	# Description:
	# Data from Tab. 2 in https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/DESH/Bericht_VOC_2021-03-10.pdf?__blob=publicationFile
	# as of 2021-03-11. VOC = B117
	#
	# History:
	# - 2021-03-23 original version
	# - 2021-04-08 improved code s.t. code really generates prediction intervals
	# - 2021-04-09 added AR(1) version for residuals based on suggestion in https://twitter.com/johannesbracher/status/1380066350111002624?s=20

	voc <- tibble(kw=2:9, n_test_sample=c(49,
	3344,
	30449,
	26849,
	33943,
	29770,
	45581,
	35755),
	n_b117_sample = c(1,
	122,
	1441,
	1931,
	5978,
	7698,
	18224,
	19472),
	# Survstat incidences - queried from https://survstat.rki.de/Content/Query/Create.aspx
	inc=c(144.70,116.23, 95.19,78.62,61.85,
	63.84,68.67,71.03)) %>%
	mutate(n_other_sample = n_test_sample - n_b117_sample,
	prop_b117 = n_b117_sample / n_test_sample,
	inc_b117 = prop_b117 * inc,
	inc_other = (1-prop_b117)*inc)

	# Make a long version
	voc_df_long <- voc %>% select(-n_test_sample,-prop_b117) %>% rename(inc_total = inc) %>%
	pivot_longer( c(inc_b117, inc_other), names_to = "type", values_to="inc") %>%
	mutate(predict=FALSE)
	# Compute CI for the incidence based on the sampling - rather narrow, so do not show it to make figure easier to read
	# mutate(predict=FALSE,
	# a = 1/2 + if_else(type=="inc_voc", n_b117_sample, n_other_sample),
	# b = 1/2 + if_else(type=="inc_voc", n_other_sample, n_b117_sample),
	# lower = inc_total * qbeta(0.025, a, b),
	# upper = inc_total * qbeta(0.975, a, b)) %>%
	# select(kw, predict, type, inc, lower, upper)


	# Fit Linear model with log-link (in order to get exponential growth in the expectation).
	m_voc <- glm( inc ~ (1 + kw)*type, family=gaussian(link="log"), data=voc_df_long)

	# Extrapolate model some weeks into the future. Danger!
	# Assumptions:
	# - VOC and non-VOC are completely separate/independent processes
	# - Things stay as-is for the next X weeks, i.e. no change in contacts (unlikely!)
	# - Sequenced cases make up a simple random sample of all cases

	# This code apparently does not generate a prediction interval for the incidence but only a
	# CI for the mean.
	pred_old <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
	mutate(inc = predict(m_voc, newdata=., type="response"),
	se = predict(m_voc, newdata=., type="response", se=TRUE)$se,
	lower = inc - qnorm(0.975)*se,
	upper = inc + qnorm(0.975)*se,
	predict=TRUE)

	# Take estimated dispersion into account (homoscedastic) by a simple plug-in prediction
	pred_old_improved <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
	mutate(inc = predict(m_voc, newdata=., type="response"),
	se_mean = predict(m_voc, newdata=., type="response", se=TRUE)$se,
	se_obs = sqrt(summary(m_voc)$dispersion),
	se = sqrt(se_mean^2 + se_obs^2),
	lower = inc - qnorm(0.975)*se,
	upper = inc + qnorm(0.975)*se,
	predict=TRUE)

	# Better: Fit a linear model to log(incidence).
	m_voclog <- lm( log(inc) ~ (1 + kw)*type, data=voc_df_long)

	# Generate proper prediction intervals based on back-transforming quantiles of
	# predictive on log-scale
	pred <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
	mutate(pred = predict(m_voclog, newdata=., type="response", interval="prediction") %>% exp(),
	fit = pred[,"fit"], #note: this is the median of the forecast distrib
	lower = pred[,"lwr"],
	upper = pred[,"upr"],
	predict=TRUE) %>%
	select(-pred)

	# Prepare data frame for plotting
	make_df_plot <- function(pred) {
	df <- bind_rows(voc_df_long, pred)
	df_total <- df %>% group_by(kw) %>%
	summarise(inc = sum(inc), lower=sum(lower), upper=sum(upper), predict=max(predict)) %>%
	mutate(type="inc_total")

	df_plot <- bind_rows(df, df_total) %>% mutate(predict=factor(predict))
	return(df_plot)
	}

	#Compute all 3 versions
	df_plot_old <- make_df_plot(pred_old)
	df_plot_old_improved <- make_df_plot(pred_old_improved)
	df_plot <- make_df_plot(pred)

	# Compare
	left_join(df_plot, df_plot_old, by=c("kw","type"), suffix=c(".new", ".old")) %>%
	left_join(df_plot_old_improved, by=c("kw","type")) %>%
	filter(predict.new==1) %>%
	rename(lower.old_improved = lower, upper.old_improved = upper) %>%
	select("kw", "type","lower.old", "lower.old_improved", "lower.new", "upper.old", "upper.old_improved", "upper.new")

	#Plot the new version
	ggplot(df_plot, aes(x=kw, y=inc, color=type, linetype=predict)) +
	geom_line() +
	geom_point() +
	xlab("Woche") + ylab("7T Melde-Inzidenz") +
	geom_errorbar(aes(ymin=lower, ymax=upper)) +
	scale_color_brewer(palette = "Dark2") +
	scale_x_continuous(breaks=2:13) +
	theme_minimal()

	ggsave("exp.png", width=8, height=4, dpi=150)

	##################################################################
	## Compare results with AR(model) based on
	## See https://otexts.com/fpp3/nonlinear-regression.html
	## and https://otexts.com/fpp3/non-seasonal-arima.html
	##################################################################

	library(fpp3)
	voc_ts <- voc_df_long %>% as_tsibble(index = "kw", key="type")

	# Fit models to each 'type' time series
	fit_trends <- voc_ts %>%
	model(
	`log(inc) ~ 1 + t + iid` = TSLM(log(inc) ~ 1 + trend()),
	`log(inc) ~ 1 + t + AR(1)` = ARIMA(log(inc) ~ 1 + trend() + pdq(1,0,0))
	)

	# Make a 4 week prediction
	fc_trends <- fit_trends %>% forecast(h = 3)
	fc <- fc_trends %>% hilo(level = c(95)) %>% unpack_hilo(cols="95%")

	# Show results
	afit <- augment(fit_trends)
	afit %>%
	ggplot(aes(x = kw)) +
	geom_line(data=voc_ts, aes(y = inc, colour = type)) +
	geom_point(data=voc_ts, aes(y = inc, colour = type, shape="Data"), size=2) +
	geom_line(aes(y = .fitted, colour = type), linetype=2) +
	geom_point(aes(y = .fitted, colour = type, shape=.model), size=2) +
	geom_errorbar(data=fc, aes(ymin=`95%_lower`,ymax= `95%_upper`,color=type)) +
	geom_point(data=fc, aes(y=.mean, color=type, shape=.model), size=2) +
	geom_point(data=fc, aes(y=`95%_lower`,color=type, shape=.model), size=2) +
	geom_point(data=fc, aes(y=`95%_upper`,color=type, shape=.model), size=2) +
	geom_vline(xintercept=9.5,lty=2, color="darkgray") +
	scale_color_brewer(palette = "Dark2") +
	guides(colour = guide_legend(title = "Type"), size = FALSE, shape=guide_legend(title="Model") ) +
	xlab("Woche") + ylab("7T Melde-Inzidenz") +
	scale_x_continuous(breaks=2:13) +
	theme_minimal() +
	theme(legend.position="bottom")


	ggsave("exp-expar.png", width=8, height=4, dpi=150)
No results found