It is useful to store the settings in variables like this, so that generalizing the analysis into an app later is more straightforward.
# Analysis settings, gathered in one place.
# Model to refit on every bootstrap sample.
form <- Age ~ Weight + Oxygen
# Passed as `num_var`: the column count of the coefficient matrices.
number_of_variables <- 3
# Number of bootstrap replicates.
n <- 200
# Input data set read from a CSV file with a header row.
data <- read.csv(
  file = "~/Desktop/5763GroupProject/data/fitness.csv",
  header = TRUE,
  sep = ","
)
# Print and collect the 2.5% / 97.5% quantiles of the first `num_var`
# columns of `coeff`.
#
# coeff:   matrix (or data frame) with one column per coefficient and one
#          row per bootstrap replicate.
# num_var: number of leading columns to summarise.
#
# Each column's bounds are printed exactly as before. In addition the bounds
# are returned invisibly as a `num_var` x 2 matrix (columns "2.5%" and
# "97.5%"), so a single bound can be accessed as e.g. `res[i, "2.5%"]`.
get_quantiles <- function(coeff, num_var) {
  probs <- c(0.025, 0.975)
  bounds <- matrix(
    NA_real_,
    nrow = num_var,
    ncol = length(probs),
    dimnames = list(NULL, c("2.5%", "97.5%"))
  )
  # seq_len() yields an empty sequence for num_var == 0, whereas 1:0 would
  # iterate over c(1, 0).
  for (i in seq_len(num_var)) {
    column_bounds <- quantile(coeff[, i], probs = probs)
    print(column_bounds)
    bounds[i, ] <- column_bounds
  }
  invisible(bounds)
}
# Baseline (deliberately unoptimised) bootstrap of linear-model coefficients.
#
# inputData: data frame to resample rows from.
# num_var:   number of coefficients the model produces (columns of the result).
# formula:   model formula handed to lm().
# nBoots:    number of bootstrap replicates.
#
# Returns an nBoots x num_var matrix with one row of coefficients per
# replicate (a 0 x num_var matrix when nBoots is 0).
#
# The result is grown with rbind() on purpose: this function is the reference
# point that the faster variants are benchmarked against.
baselineBootstrap <- function(inputData, num_var, formula, nBoots) {
  # Start from an empty matrix so that nBoots == 0 returns a well-formed result.
  bootResults <- matrix(numeric(0), nrow = 0, ncol = num_var)
  # seq_len() runs zero iterations for nBoots == 0; 1:0 would run two.
  for (i in seq_len(nBoots)) {
    # Randomly resample the rows with replacement.
    bootData <- inputData[
      sample(seq_len(nrow(inputData)), nrow(inputData), replace = TRUE),
    ]
    bootLM <- lm(formula, data = bootData)
    # Append this replicate's coefficients as a new row.
    bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
  }
  bootResults
}
Get the coefficients for a linear model trained on the declared variables above and print quantiles
# Fix the RNG seed so the bootstrap draws are reproducible.
set.seed(9)
# Run the baseline bootstrap with the settings declared earlier.
coefficients1 <- baselineBootstrap(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
# Print the 2.5% / 97.5% quantiles of each coefficient column.
get_quantiles(coefficients1, number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
What I tried to improve:
Instantiate matrix with all zeros instead of rbind hell.
An experiment is still needed with instantiating a NaN matrix instead. (10/15) Result: failure. Instantiating with NaNs slows performance on all metrics except the median.
# Bootstrap of linear-model coefficients with a preallocated result matrix.
#
# inputData: data frame to resample rows from.
# num_var:   number of coefficients the model produces (columns of the result).
# formula:   model formula handed to lm().
# nBoots:    number of bootstrap replicates.
#
# Returns an nBoots x num_var numeric matrix, one row per replicate
# (a 0 x num_var matrix when nBoots is 0).
speedyBoot <- function(inputData, num_var, formula, nBoots) {
  # The row count does not change between replicates, so compute it once.
  n_obs <- nrow(inputData)
  # Preallocate with double zeros: an integer (0L) matrix would be coerced to
  # double on the first coefficient assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)
  # seq_len() is empty for nBoots == 0; 1:0 would index past the matrix.
  for (i in seq_len(nBoots)) {
    bootData <- inputData[sample(seq_len(n_obs), n_obs, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Store this replicate's coefficients in its own row.
    mat[i, ] <- coef(bootLM)
  }
  mat
}
Get coefficients and assess quantiles
# Same seed as the baseline run, so both draw identical bootstrap samples.
set.seed(9)
# Bootstrap with the preallocating implementation.
coefficients2 <- speedyBoot(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
# Print the 2.5% / 97.5% quantiles of each coefficient column.
get_quantiles(coefficients2, number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
One wants n large so that the gap in performance is larger and more evident. If n is too small, then the performance improvement will be negligible.
set.seed(9)
library(microbenchmark)
# Time the rbind-growing baseline against the preallocating variant.
# The expressions are left textually unchanged so the benchmark labels match.
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients2 <- speedyBoot(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 127.9100 143.2442 160.9455 152.6946 169.2403 279.2433 100
## 127.7749 140.5288 163.4521 150.8295 176.2851 282.8920 100
There is a slight improvement in the median. However, the difference is so small that it could be entirely due to random variation.
Try to improve the random-sampling bottleneck by removing the index-vector enumeration (`1:nrow(inputData)`) in the call to `sample`.
# Bootstrap of linear-model coefficients with a preallocated result matrix,
# sampling row indices directly from the row count instead of from an
# enumerated index vector.
#
# inputData: data frame to resample rows from.
# num_var:   number of coefficients the model produces (columns of the result).
# formula:   model formula handed to lm().
# nBoots:    number of bootstrap replicates.
#
# Returns an nBoots x num_var numeric matrix, one row per replicate
# (a 0 x num_var matrix when nBoots is 0).
speedyBoot2 <- function(inputData, num_var, formula, nBoots) {
  # The row count does not change between replicates, so compute it once.
  n_obs <- nrow(inputData)
  # Preallocate with double zeros: an integer (0L) matrix would be coerced to
  # double on the first coefficient assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)
  # seq_len() is empty for nBoots == 0; 1:0 would index past the matrix.
  for (i in seq_len(nBoots)) {
    # sample.int(n, n) draws from 1..n without building the index vector.
    bootData <- inputData[sample.int(n_obs, n_obs, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Store this replicate's coefficients in its own row.
    mat[i, ] <- coef(bootLM)
  }
  mat
}
# Same seed as the earlier runs, so the quantiles are directly comparable.
set.seed(9)
# Bootstrap with the variant that samples straight from the row count.
coefficients3 <- speedyBoot2(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
# Print the 2.5% / 97.5% quantiles of each coefficient column.
get_quantiles(coefficients3, number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
set.seed(8)
# Use more replicates for this timing run.
n <- 1000
# Time the baseline against speedyBoot2.
# The expressions are left textually unchanged so the benchmark labels match.
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients3 <- speedyBoot2(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 668.9691 703.3552 748.3622 732.5565 785.3489 939.1890 100
## 645.2541 688.7216 740.4761 722.2839 768.6516 966.2532 100
# Fit one bootstrap replicate: resample the rows of `inputData` with
# replacement, fit the linear model, and return its coefficients.
#
# inputData: data frame to resample rows from.
# num_var:   unused; kept so existing callers that pass it keep working.
# formula:   model formula handed to lm().
#
# Returns the named coefficient vector of the fitted model.
speedy4 <- function(inputData, num_var, formula) {
  n_obs <- nrow(inputData)
  # sample.int(n, n) draws from 1..n without building the index vector.
  resampled <- inputData[sample.int(n_obs, n_obs, replace = TRUE), ]
  coef(lm(formula, data = resampled))
}
make function
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
n <- 200
# Seeds the RNG of this (master) session only; the worker streams are seeded
# by clusterSetRNGStream() below.
set.seed(9)
cores <- detectCores()
# Leave one core free so the machine is not overloaded, but always start at
# least one worker: detectCores() can return 1 (giving 0 workers) or NA.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, 9)
# One bootstrap replicate per foreach task; rbind stacks the coefficient
# vectors into a matrix with one row per replicate.
bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
  inputData = data,
  num_var = number_of_variables,
  formula = form
)
get_quantiles(bootCoefs, number_of_variables)
## 2.5% 97.5%
## 53.11104 99.92509
## 2.5% 97.5%
## -0.36888342 0.03656095
## 2.5% 97.5%
## -0.67394348 -0.01255995
# Shut down the worker processes started for the parallel bootstrap.
stopCluster(cl = cl)
Check speed
cores <- detectCores()
# Leave one core free so the machine is not overloaded, but always start at
# least one worker: detectCores() can return 1 (giving 0 workers) or NA.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, 9)
n <- 1000
# Time the sequential baseline against the foreach-based parallel bootstrap.
# The expressions are left textually unchanged so the benchmark labels match.
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
    inputData = data,
    num_var = number_of_variables,
    formula = form
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data, num_var = number_of_variables, formula = form)
## min lq mean median uq max neval
## 652.6307 712.9005 745.5873 735.5217 761.9037 1042.9266 100
## 628.8516 706.4043 735.3229 734.1860 762.5622 968.0211 100
# Shut down the worker processes used for the parallel benchmark.
stopCluster(cl = cl)