It is useful to store these settings as variables, so that generalizing the analysis into an app later is more straightforward.
# Shared settings used by every bootstrap run below.

# Model refit on each resample: Age regressed on Weight and Oxygen.
form <- Age ~ Weight + Oxygen

# Number of coefficient columns to store per fit.
number_of_variables <- 3

# Number of bootstrap replicates per run.
n <- 200

# Comma-separated fitness data with a header row.
data <- read.csv(
  file = "~/Desktop/5763GroupProject/data/fitness.csv",
  header = TRUE,
  sep = ","
)
TODO: work out how to index the quantile results so that they can be stored rather than only printed.
# Print the 2.5% / 97.5% percentile interval of each coefficient column and
# hand the intervals back so callers can store them.
#
# Args:
#   coeff:   matrix of coefficients; column j holds the draws summarised
#            by interval j.
#   num_var: number of leading columns of `coeff` to summarise.
#
# Returns (invisibly): a 2 x num_var numeric matrix with rows "2.5%" and
#   "97.5%"; column j is the interval of coeff[, j].
get_quantiles <- function(coeff, num_var) {
  bounds <- c(0.025, 0.975)
  intervals <- matrix(
    NA_real_,
    nrow = length(bounds),
    ncol = num_var,
    dimnames = list(c("2.5%", "97.5%"), NULL)
  )
  # seq_len() makes num_var = 0 a no-op; 1:num_var would iterate over c(1, 0).
  for (j in seq_len(num_var)) {
    q <- quantile(coeff[, j], probs = bounds)
    print(q)
    intervals[, j] <- q
  }
  # Invisible so that a bare top-level call prints exactly what it used to.
  invisible(intervals)
}
# Baseline bootstrap: refit `formula` on `nBoots` resamples (with
# replacement) of `inputData` and collect the coefficients.
#
# The result is deliberately grown one row at a time with rbind(); this is
# the naive reference implementation the faster variants are timed against.
#
# Args:
#   inputData: data frame containing the variables named in `formula`.
#   num_var:   number of coefficients the model produces (columns of result).
#   formula:   model formula passed to lm().
#   nBoots:    number of bootstrap replicates.
#
# Returns: an nBoots x num_var numeric matrix, one row of coefficients per
#   replicate (0 rows when nBoots is 0).
baselineBootstrap <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData)
  # Start from an empty 0 x num_var matrix so nBoots = 0 yields a well-formed
  # empty result instead of looping over 1:0.
  bootResults <- matrix(numeric(0), nrow = 0, ncol = num_var)
  for (i in seq_len(nBoots)) {
    # Randomly resample the rows with replacement.
    picked <- sample(seq_len(n_rows), n_rows, replace = TRUE)
    bootLM <- lm(formula, data = inputData[picked, ])
    # Append this replicate's coefficients as a new row.
    bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
  }
  bootResults
}
Get the coefficients for a linear model trained on the declared variables above and print quantiles
# Fix the RNG so the resamples (and therefore the quantiles) are repeatable.
set.seed(9)

# Bootstrap the model coefficients with the baseline implementation.
coefficients1 <- baselineBootstrap(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)

# Print the 2.5% / 97.5% bounds of each coefficient column.
get_quantiles(coeff = coefficients1, num_var = number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
What I tried to improve:
Preallocate the result matrix with zeros instead of growing it with repeated rbind() calls.
A follow-up experiment preallocated the matrix with NaN instead. Update (10/15): this failed. Preallocating with NaN was slower on every benchmark metric except the median.
# Bootstrap with a preallocated result matrix: refit `formula` on `nBoots`
# resamples (with replacement) of `inputData` and write each replicate's
# coefficients into its own row.
#
# Args:
#   inputData: data frame containing the variables named in `formula`.
#   num_var:   number of coefficients the model produces (columns of result).
#   formula:   model formula passed to lm().
#   nBoots:    number of bootstrap replicates.
#
# Returns: an nBoots x num_var numeric matrix, one row per replicate
#   (0 rows when nBoots is 0).
speedyBoot <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData)
  # Preallocate as double (0, not 0L): coefficients are doubles, and an
  # integer matrix would be converted wholesale on the first row assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)
  # seq_len() skips the loop for nBoots = 0; 1:nBoots would try rows 1 and 0.
  for (i in seq_len(nBoots)) {
    # This variant passes an explicit index vector to sample().
    picked <- sample(seq_len(n_rows), n_rows, replace = TRUE)
    bootLM <- lm(formula, data = inputData[picked, ])
    # Store the coefficients.
    mat[i, ] <- coef(bootLM)
  }
  mat
}
Get coefficients and assess quantiles
# Same seed as the baseline run, so the two sets of quantiles are comparable.
set.seed(9)

# Bootstrap the coefficients with the preallocating implementation.
coefficients2 <- speedyBoot(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)

# Print the 2.5% / 97.5% bounds of each coefficient column.
get_quantiles(coeff = coefficients2, num_var = number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
One wants n large so that the gap in performance is larger and more evident. If n is too small, then the performance improvement will be negligible.
# Time the baseline against the preallocating version on identical inputs.
library(microbenchmark)
set.seed(9)
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients2 <- speedyBoot(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 133.5789 143.7758 193.6508 154.8828 190.3599 795.6000 100
## 136.0232 145.8165 208.1908 161.2652 227.2137 602.1474 100
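The two versions time almost identically; in this run speedyBoot is in fact slightly slower on every summary statistic except the maximum. A difference this small could be entirely run-to-run noise. Let’s test!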
# Repeat the same comparison under a different seed to see whether the
# ordering of the two implementations is stable.
library(microbenchmark)
set.seed(498)
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients2 <- speedyBoot(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 133.2648 146.8824 168.7516 157.4868 188.1007 235.9295 100
## 133.1541 144.0724 159.2174 150.5516 169.6844 227.0567 100
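It is stochastic: with a different seed the ordering flips and speedyBoot comes out slightly faster, so the difference between the two is noise.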
Try to speed up the random-sampling step by passing nrow(inputData) directly to sample() instead of building the index vector 1:nrow(inputData).
# Bootstrap with a preallocated result matrix, sampling row numbers by count:
# refit `formula` on `nBoots` resamples (with replacement) of `inputData` and
# write each replicate's coefficients into its own row.
#
# Args:
#   inputData: data frame containing the variables named in `formula`.
#   num_var:   number of coefficients the model produces (columns of result).
#   formula:   model formula passed to lm().
#   nBoots:    number of bootstrap replicates.
#
# Returns: an nBoots x num_var numeric matrix, one row per replicate
#   (0 rows when nBoots is 0).
speedyBoot2 <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData)
  # Preallocate as double (0, not 0L): coefficients are doubles, and an
  # integer matrix would be converted wholesale on the first row assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)
  # seq_len() skips the loop for nBoots = 0; 1:nBoots would try rows 1 and 0.
  for (i in seq_len(nBoots)) {
    # sample(n, ...) draws from 1..n without an explicit index vector.
    bootData <- inputData[sample(n_rows, n_rows, replace = TRUE), ]
    # Store the coefficients.
    mat[i, ] <- coef(lm(formula, data = bootData))
  }
  mat
}
# Same seed as the earlier runs, so the quantiles can be compared directly.
set.seed(9)

# Bootstrap the coefficients with the count-based sampling variant.
coefficients3 <- speedyBoot2(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)

# Print the 2.5% / 97.5% bounds of each coefficient column.
get_quantiles(coeff = coefficients3, num_var = number_of_variables)
## 2.5% 97.5%
## 49.23226 98.76897
## 2.5% 97.5%
## -0.37527171 0.05641607
## 2.5% 97.5%
## -0.6136807 0.1042451
# Time the baseline against the count-based sampling variant.
set.seed(8)
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients3 <- speedyBoot2(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 132.9637 141.9761 163.4414 150.6011 176.0582 246.3348 100
## 134.0594 140.9567 163.4510 150.5066 183.1295 258.2571 100
Check for Stochasticity
# Re-run the baseline vs. speedyBoot2 timing under another seed.
set.seed(23892)
# Replicate count for this run (same value as the shared setting).
n <- 200
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients3 <- speedyBoot2(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 132.8960 142.5517 160.1374 148.6452 176.8258 237.4288 100
## 132.7335 141.4793 162.7981 151.7555 183.3191 225.4164 100
Parallelization
library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
# Start a worker cluster and register it as the foreach backend.
cores <- detectCores()
# Leave one core free so the machine stays responsive, but always ask for at
# least one worker: detectCores() can return 1 (or NA), and makeCluster()
# fails when given fewer than one.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Parallel bootstrap: refit `formula` on `nBoots` resamples (with
# replacement) of `inputData`, one replicate per foreach task, and return the
# coefficients as a matrix.
#
# Requires a registered foreach backend (e.g. via registerDoParallel()).
# set.seed() in the calling session does not seed the workers; seed the
# cluster itself (e.g. parallel::clusterSetRNGStream()) for seeded resamples.
#
# Args:
#   inputData: data frame containing the variables named in `formula`.
#   num_var:   number of coefficients the model produces (columns of result).
#   formula:   model formula passed to lm().
#   nBoots:    number of bootstrap replicates.
#
# Returns: an nBoots x num_var numeric matrix, one row per replicate
#   (0 rows when nBoots is 0).
speedy3 <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData)
  # Each %dopar% task runs on a worker with its own copy of the variables, so
  # assigning into a preallocated matrix inside the loop never reaches the
  # caller (the old version returned all zeros). Return the coefficients from
  # each task instead and assemble the matrix here.
  boot_coefs <- foreach(i = seq_len(nBoots)) %dopar% {
    bootLM <- lm(
      formula,
      data = inputData[sample(n_rows, n_rows, replace = TRUE), ]
    )
    coef(bootLM)
  }
  # foreach returns a list in task order; lay it out one replicate per row.
  # as.numeric() keeps the nBoots = 0 case (empty list) a valid 0-row matrix.
  matrix(
    as.numeric(unlist(boot_coefs, use.names = FALSE)),
    nrow = nBoots,
    ncol = num_var,
    byrow = TRUE
  )
}
set.seed(9)
# set.seed() only seeds this session. The resampling runs on the cluster
# workers, so seed their RNG streams as well. Exact repeatability can still
# depend on how tasks are scheduled across the workers.
clusterSetRNGStream(cl, iseed = 9)

# Bootstrap the coefficients in parallel.
coefficients4 <- speedy3(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)

# Print the 2.5% / 97.5% bounds of each coefficient column.
get_quantiles(coefficients4, number_of_variables)
## 2.5% 97.5%
## 0 0
## 2.5% 97.5%
## 0 0
## 2.5% 97.5%
## 0 0
stopCluster(cl)
test against baseline
# Time the baseline against the parallel version on a fresh cluster.
set.seed(8)
cores <- detectCores()
# Leave one core free, but never request fewer than one worker:
# detectCores() can return 1 (or NA), which would make makeCluster() fail.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients4 <- speedy3(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients4 <- speedy3(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 135.1830 146.5930 190.3372 156.7029 200.5170 609.2315 100
## 134.5109 157.2091 185.5377 169.2796 201.8788 492.7157 100
stopCluster(cl)
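No: the parallel version is not meaningfully faster than the baseline, and the all-zero quantiles above show that speedy3 never stored the coefficients.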
# Fit `formula` on a single bootstrap resample (rows drawn with replacement)
# of `inputData` and return the fitted coefficients.
#
# Args:
#   inputData: data frame containing the variables named in `formula`.
#   num_var:   not used by this function; kept so existing calls that pass
#              it keep working.
#   formula:   model formula passed to lm().
#
# Returns: the named coefficient vector from coef() for this one resample.
speedy4 <- function(inputData, num_var, formula) {
  n_rows <- nrow(inputData)
  # TRUE rather than T: T is an ordinary variable that can be reassigned,
  # which would silently turn this into sampling without replacement.
  resampled <- inputData[sample(n_rows, n_rows, replace = TRUE), ]
  coef(lm(formula, data = resampled))
}
make function
set.seed(9)
cores <- detectCores()
# Leave one core free, but never request fewer than one worker:
# detectCores() can return 1 (or NA), which would make makeCluster() fail.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
# set.seed() only seeds this session. The resampling runs on the workers, so
# seed their RNG streams too. Exact repeatability can still depend on how
# tasks are scheduled across the workers.
clusterSetRNGStream(cl, iseed = 9)

# Stack the replicates as ROWS (rbind). get_quantiles() summarises columns,
# so each column must hold one coefficient across all replicates. With cbind
# the matrix is coefficients x replicates, and the "quantiles" mix the
# intercept and slopes of single fits.
bootCoefs <- foreach(i = seq_len(n), .combine = rbind) %dopar% speedy4(
  inputData = data,
  num_var = number_of_variables,
  formula = form
)

# Print the 2.5% / 97.5% bounds of each coefficient column.
get_quantiles(bootCoefs, number_of_variables)
## 2.5% 97.5%
## -0.4909766 93.9616886
## 2.5% 97.5%
## -0.2436453 73.1234787
## 2.5% 97.5%
## -0.3984445 73.9164522
stopCluster(cl)
Check speed
# Time the baseline against the foreach + speedy4 version on a fresh cluster.
cores <- detectCores()
# Leave one core free, but never request fewer than one worker:
# detectCores() can return 1 (or NA), which would make makeCluster() fail.
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  # rbind, not cbind: one replicate per row, matching the layout the other
  # bootstrap functions return and the layout get_quantiles() expects.
  bootCoefs <- foreach(i = seq_len(n), .combine = rbind) %dopar% speedy4(
    inputData = data,
    num_var = number_of_variables,
    formula = form
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(inputData = data, num_var = number_of_variables, formula = form)
## min lq mean median uq max neval
## 156.5179 175.8639 202.4928 196.8214 224.2124 309.5326 100
## 165.6321 172.3279 199.7886 186.9666 218.0335 367.9900 100
stopCluster(cl)
I am not sure whether these results are correct. The timings look about the same as the baseline, possibly slightly better, but I cannot tell for certain.