It will be useful to store variable like this, so when we try to generalize it into an app, it is more straightforward.
number_of_variables <- 3
form <- "Age~Weight+Oxygen"
n <- 200
data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")
This is a useful helper function. The inputs are the coefficients of the bootstraps and outputs a matrix, whose ith column represents the ith covariate, the 1st row, the bottom 2.5% quantiles, and the 2nd the top 97.5% quartile. In otherwords, the ith column is the 95% confidence.
get_quantiles <- function(coeff, num_var){
mat <- matrix(0L, nrow = 2, ncol = num_var)
for(i in 1:num_var){
mat[,i]<-matrix(quantile(coeff[,i], probs = c(0.025,0.975)))
}
#print(mat)
return(mat)
}
baselineBootstrap <- function(inputData, num_var,formula, nBoots){
for(i in 1:nBoots){
#randomly sample data
bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
bootLM <- lm(formula, data = bootData)
# store the coefs
#for optimization put
if(i == 1){
bootResults <- matrix(coef(bootLM), ncol = num_var)
} else {
bootResults<- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
}
} # end of i loop
return(bootResults)
}
Get the coefficients for a linear model trained on the declared variables above and print quantiles
set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
num_var = number_of_variables,
formula = form,
nBoots = n)
get_quantiles(coefficients1,number_of_variables)
## [,1] [,2] [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897 0.05641607 0.1042451
What needed to improve:
Instantiate matrix with all zeros instead of rbind hell.
Improve random sampling bottleneck by removing list enumeration in sample function.
speedyBoot <- function(inputData, num_var,formula, nBoots){
mat <- matrix(0L, nrow = nBoots, ncol = num_var)
for(i in 1:nBoots){
bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
bootLM <- lm(formula, data = bootData)
# store the coefs
mat[i,] <- coef(bootLM)
} # end of i loop
return(mat)
}
set.seed(9)
coefficients2 <-speedyBoot(inputData = data,
num_var = number_of_variables,
formula = form,
nBoots = n)
get_quantiles(coefficients2, number_of_variables)
## [,1] [,2] [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897 0.05641607 0.1042451
library(microbenchmark)
set.seed(8)
n<- 100
microbenchmark(
baselineBootstrap(inputData = data,
num_var = number_of_variables,
formula = form,
nBoots = n),
speedyBoot(inputData = data,
num_var = number_of_variables,
formula = form,
nBoots = n)
)
## Unit: milliseconds
## expr
## baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## speedyBoot(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 77.36575 84.01449 108.4726 95.54701 121.0147 338.2292 100
## 76.29310 85.74675 106.1637 99.09461 115.1908 240.7033 100
Define a function to call in each loop of parallelization
speedy4 <- function(inputData, num_var,formula){
bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
return(coef(bootLM))
}
Define function
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
set.seed(9)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
bootCoefs <- foreach(i = 1:n, .combine =rbind) %dopar% speedy4(inputData = data,
num_var = number_of_variables,
formula = form)
get_quantiles(bootCoefs, number_of_variables)
## [,1] [,2] [,3]
## [1,] 53.91828 -0.39241458 -0.67908479
## [2,] 101.87962 0.03942624 0.00105602
stopCluster(cl)
Check speed
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000
microbenchmark(
coefficients1 <- baselineBootstrap(inputData = data,
num_var = number_of_variables,
formula = form,
nBoots = n),
bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
inputData = data,
num_var = number_of_variables,
formula = form)
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data, num_var = number_of_variables, formula = form)
## min lq mean median uq max neval
## 761.7036 885.5867 1012.1729 929.7713 1037.1346 2113.989 100
## 722.8394 841.7968 935.1599 878.2646 940.3058 1693.999 100
stopCluster(cl)
library(boot)
# https://www.statmethods.net/advstats/bootstrapping.html
# function to obtain regression weights
bs <- function(formula, data, indices) {
d <- data[indices,] # allows boot to select sample
fit <- lm(formula, data=d)
return(coef(fit))
}
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000
microbenchmark(
# bootstrapping with 1000 replications
results <- boot(data=data, statistic=bs,
R=n, formula=form),
bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
inputData = data,
num_var = number_of_variables,
formula = form)
)
## Unit: milliseconds
## expr
## results <- boot(data = data, statistic = bs, R = n, formula = form)
## bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data, num_var = number_of_variables, formula = form)
## min lq mean median uq max neval
## 721.6557 788.6952 949.3243 855.9705 1053.994 1805.171 100
## 685.1906 809.0650 963.2686 858.2443 1081.957 1648.772 100
stopCluster(cl)
The function is approximately the same speed as the boot library
A good model defined by AIC is Oxygen ~ Age + RunTime + RunPulse + MaxPulse. The number of variables is 5 and lets perform 1000 bootstraps.
set.seed(9)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
#num of boots here
bootCoefs <- foreach(i = 1:1000, .combine =rbind) %dopar% speedy4(inputData = data,
num_var = 5,
formula = "Oxygen ~Age+RunTime+RunPulse+MaxPulse")
stopCluster(cl)
We can get the mean estimates for the coefficients by finding column means
colMeans(bootCoefs)
## (Intercept) Age RunTime RunPulse MaxPulse
## 98.6583337 -0.1935990 -2.7688876 -0.3197957 0.2387201
Let’s compare it to the coefficients of classical regression
model <- lm(Oxygen ~Age+RunTime+RunPulse+MaxPulse, data=data)
coef(model)
## (Intercept) Age RunTime RunPulse MaxPulse
## 98.1478880 -0.1977347 -2.7675788 -0.3481079 0.2705130
Very good. We can get coefficients using get quantiles on the bootCoefs.
get_quantiles(bootCoefs, 5)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 79.90637 -0.38360975 -3.384129 -0.54040555 -0.06072477
## [2,] 117.92533 -0.01735126 -2.202520 -0.07697345 0.48743461