Skip to content

Instantly share code, notes, and snippets.

@mhoehle
Last active April 8, 2021 21:21
Show Gist options
  • Select an option

  • Save mhoehle/f6e0bc459f48a0f884d137b17619a5b1 to your computer and use it in GitHub Desktop.

Select an option

Save mhoehle/f6e0bc459f48a0f884d137b17619a5b1 to your computer and use it in GitHub Desktop.
Sum of two exponential growths
# Quick mockup model to get expo-growth from variants based on
# a representative (?) sample of subtypes.
# Compare, e.g., with https://twitter.com/and_hi_/status/1370474726620008452/photo/1
#
# Author: Michael Höhle <https://www.math.su.se/~hoehle>
#
# Description:
# Data from Tab. 2 in https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/DESH/Bericht_VOC_2021-03-10.pdf?__blob=publicationFile
# as of 2021-03-11. VOC = B117
#
# History:
# - 2021-03-23 original version
# - 2021-04-08 improved code s.t. code really generates prediction intervals
# - 2021-04-09 added AR(1) version for residuals based on suggestion in https://twitter.com/johannesbracher/status/1380066350111002624?s=20
voc <- tibble(kw=2:9, n_test_sample=c(49,
3344,
30449,
26849,
33943,
29770,
45581,
35755),
n_b117_sample = c(1,
122,
1441,
1931,
5978,
7698,
18224,
19472),
# Survstat incidences - queried from https://survstat.rki.de/Content/Query/Create.aspx
inc=c(144.70,116.23, 95.19,78.62,61.85,
63.84,68.67,71.03)) %>%
mutate(n_other_sample = n_test_sample - n_b117_sample,
prop_b117 = n_b117_sample / n_test_sample,
inc_b117 = prop_b117 * inc,
inc_other = (1-prop_b117)*inc)
# Make a long version
voc_df_long <- voc %>% select(-n_test_sample,-prop_b117) %>% rename(inc_total = inc) %>%
pivot_longer( c(inc_b117, inc_other), names_to = "type", values_to="inc") %>%
mutate(predict=FALSE)
# Compute CI for the incidence based on the sampling - rather narrow, so do not show it to make figure easier to read
# mutate(predict=FALSE,
# a = 1/2 + if_else(type=="inc_voc", n_b117_sample, n_other_sample),
# b = 1/2 + if_else(type=="inc_voc", n_other_sample, n_b117_sample),
# lower = inc_total * qbeta(0.025, a, b),
# upper = inc_total * qbeta(0.975, a, b)) %>%
# select(kw, predict, type, inc, lower, upper)
# Fit Linear model with log-link (in order to get exponential growth in the expectation).
m_voc <- glm( inc ~ (1 + kw)*type, family=gaussian(link="log"), data=voc_df_long)
# Extrapolate model some weeks into the future. Danger!
# Assumptions:
# - VOC and non-VOC are completely separate/independent processes
# - Things stay as-is for the next X weeks, i.e. no change in contacts (unlikely!)
# - Sequenced cases make up a simple random sample of all cases
# This code apparently does not generate a prediction interval for the incidence but only a
# CI for the mean.
pred_old <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
mutate(inc = predict(m_voc, newdata=., type="response"),
se = predict(m_voc, newdata=., type="response", se=TRUE)$se,
lower = inc - qnorm(0.975)*se,
upper = inc + qnorm(0.975)*se,
predict=TRUE)
# Take estimated dispersion into account (homoscedastic) by a simple plug-in prediction
pred_old_improved <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
mutate(inc = predict(m_voc, newdata=., type="response"),
se_mean = predict(m_voc, newdata=., type="response", se=TRUE)$se,
se_obs = sqrt(summary(m_voc)$dispersion),
se = sqrt(se_mean^2 + se_obs^2),
lower = inc - qnorm(0.975)*se,
upper = inc + qnorm(0.975)*se,
predict=TRUE)
# Better: Fit a linear model to log(incidence).
m_voclog <- lm( log(inc) ~ (1 + kw)*type, data=voc_df_long)
# Generate proper prediction intervals based on back-transforming quantiles of
# predictive on log-scale
pred <- expand_grid(kw=10:12, type=c("inc_b117", "inc_other")) %>%
mutate(pred = predict(m_voclog, newdata=., type="response", interval="prediction") %>% exp(),
fit = pred[,"fit"], #note: this is the median of the forecast distrib
lower = pred[,"lwr"],
upper = pred[,"upr"],
predict=TRUE) %>%
select(-pred)
# Prepare data frame for plotting
make_df_plot <- function(pred) {
df <- bind_rows(voc_df_long, pred)
df_total <- df %>% group_by(kw) %>%
summarise(inc = sum(inc), lower=sum(lower), upper=sum(upper), predict=max(predict)) %>%
mutate(type="inc_total")
df_plot <- bind_rows(df, df_total) %>% mutate(predict=factor(predict))
return(df_plot)
}
#Compute all 3 versions
df_plot_old <- make_df_plot(pred_old)
df_plot_old_improved <- make_df_plot(pred_old_improved)
df_plot <- make_df_plot(pred)
# Compare
left_join(df_plot, df_plot_old, by=c("kw","type"), suffix=c(".new", ".old")) %>%
left_join(df_plot_old_improved, by=c("kw","type")) %>%
filter(predict.new==1) %>%
rename(lower.old_improved = lower, upper.old_improved = upper) %>%
select("kw", "type","lower.old", "lower.old_improved", "lower.new", "upper.old", "upper.old_improved", "upper.new")
#Plot the new version
ggplot(df_plot, aes(x=kw, y=inc, color=type, linetype=predict)) +
geom_line() +
geom_point() +
xlab("Woche") + ylab("7T Melde-Inzidenz") +
geom_errorbar(aes(ymin=lower, ymax=upper)) +
scale_color_brewer(palette = "Dark2") +
scale_x_continuous(breaks=2:13) +
theme_minimal()
ggsave("exp.png", width=8, height=4, dpi=150)
##################################################################
## Compare results with AR(model) based on
## See https://otexts.com/fpp3/nonlinear-regression.html
## and https://otexts.com/fpp3/non-seasonal-arima.html
##################################################################
library(fpp3)
voc_ts <- voc_df_long %>% as_tsibble(index = "kw", key="type")
# Fit models to each 'type' time series
fit_trends <- voc_ts %>%
model(
`log(inc) ~ 1 + t + iid` = TSLM(log(inc) ~ 1 + trend()),
`log(inc) ~ 1 + t + AR(1)` = ARIMA(log(inc) ~ 1 + trend() + pdq(1,0,0))
)
# Make a 4 week prediction
fc_trends <- fit_trends %>% forecast(h = 3)
fc <- fc_trends %>% hilo(level = c(95)) %>% unpack_hilo(cols="95%")
# Show results
afit <- augment(fit_trends)
afit %>%
ggplot(aes(x = kw)) +
geom_line(data=voc_ts, aes(y = inc, colour = type)) +
geom_point(data=voc_ts, aes(y = inc, colour = type, shape="Data"), size=2) +
geom_line(aes(y = .fitted, colour = type), linetype=2) +
geom_point(aes(y = .fitted, colour = type, shape=.model), size=2) +
geom_errorbar(data=fc, aes(ymin=`95%_lower`,ymax= `95%_upper`,color=type)) +
geom_point(data=fc, aes(y=.mean, color=type, shape=.model), size=2) +
geom_point(data=fc, aes(y=`95%_lower`,color=type, shape=.model), size=2) +
geom_point(data=fc, aes(y=`95%_upper`,color=type, shape=.model), size=2) +
geom_vline(xintercept=9.5,lty=2, color="darkgray") +
scale_color_brewer(palette = "Dark2") +
guides(colour = guide_legend(title = "Type"), size = FALSE, shape=guide_legend(title="Model") ) +
xlab("Woche") + ylab("7T Melde-Inzidenz") +
scale_x_continuous(breaks=2:13) +
theme_minimal() +
theme(legend.position="bottom")
ggsave("exp-expar.png", width=8, height=4, dpi=150)
@mhoehle
Copy link
Author

mhoehle commented Apr 8, 2021

2021-03-23 Original model (update: now as log(inc) ~ 1 + t)
grafik

2021-04-08 For comparison added a time series based approach, which allows the residuals of the log(inc) ~ 1 + t to be auto-correlated according to an AR(1) process.
grafik

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment