Skip to content

Instantly share code, notes, and snippets.

@n8thangreen
Last active May 28, 2018 01:53
Show Gist options
  • Select an option

  • Save n8thangreen/05bcc2674c0be3b7dfe8 to your computer and use it in GitHub Desktop.

Select an option

Save n8thangreen/05bcc2674c0be3b7dfe8 to your computer and use it in GitHub Desktop.
empirical and model based (logistic) training sample adjustment
covariateShift <- function(data, resla, riskfac, ssize=10000){
## importance sampling approach
## when different distributions for the
## training and test data
require(plyr)
Natsal.riskfac.table <- DistnTable(data, riskfac)
Natsal.riskfac.table <- colNameReplace(Natsal.riskfac.table, "(all)", "Natsalfreq")
res.df <- ldply(resla, data.frame)
LA.riskfac.table <- DistnTable(res.df, riskfac)
LA.riskfac.table <- colNameReplace(LA.riskfac.table, "(all)", "LAfreq")
data.freq <- merge(LA.riskfac.table, Natsal.riskfac.table)
data.freq <- transform(data.freq, ratio = LAfreq/Natsalfreq)
data.freq$ratio[is.na(data.freq$ratio)] <- 0
datat <- merge(data, data.freq)
set.seed(1968)
sampleRows <- sample(1:nrow(datat), prob=datat$ratio, replace=TRUE, size=ssize)
data.adj <- datat[sampleRows,]
rownames(data.adj) <- NULL
data.adj
}
covariateShift.glm <- function(data, resla, riskfac, ssize=10000){
## alternative model-based approach:
## could fit a logistic regression to estimate the ratio of probabilities of each data set
## and then predict for (all) permutations
## http://blog.smola.org/post/4110255196/real-simple-covariate-shift-correction
require(plyr)
res.df <- ldply(resla, data.frame)
res.df <- cbind(res.df, out=0)
data <- cbind(data, out=1)
rdata <- rbind(data[,c(riskfac,"out")], res.df[,c(riskfac,"out")])
formula <- as.formula(paste("out ~ ", paste(riskfac, collapse="+"), sep=""))
wt <- c(rep(1/nrow(data), nrow(data)), rep(1/nrow(res.df), nrow(res.df)))
fit <- glm(formula, family=binomial, data=rdata, weight=wt)
# grid <- expand.grid(apply(rdata[,riskfac], 2, unique))
odds <- exp(predict(fit, newdata=data, type="link"))
set.seed(1968)
sampleRows <- sample(1:nrow(data), prob=odds, replace=TRUE, size=ssize)
data.adj <- data[sampleRows,]
rownames(data.adj) <- NULL
data.adj
}
colNameReplace <- function(array, name.before, name.after){
names(array)[names(array)==name.before] <- name.after
array
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment