It is useful to store these settings in variables, so that generalizing the analysis into an app later is relatively simple.
# Shared settings for the bootstrap runs further down.
# Model to fit on every bootstrap sample.
form <- Age ~ Weight + Oxygen
# Passed as `num_var`: the column count of the coefficient matrices.
number_of_variables <- 3
# Passed as `nBoots`: the number of bootstrap replicates.
n <- 3
# header = TRUE and sep = "," are already the defaults of read.csv().
data <- read.csv("~/Desktop/5763GroupProject/data/fitness.csv")
Open question: how can the quantile results be indexed so that they can be stored rather than only printed?
#' Print and collect the 2.5% / 97.5% quantiles of bootstrapped coefficients
#'
#' @param coeff Matrix-like object of bootstrap draws; column `i` holds the
#'   draws for coefficient `i`.
#' @param num_var Number of leading columns of `coeff` to summarise.
#' @return Invisibly, a numeric matrix with `num_var` rows and the columns
#'   "2.5%" and "97.5%"; row `i` holds the quantiles of column `i` of `coeff`.
#'   Each pair of quantiles is also printed as a side effect.
get_quantiles <- function(coeff, num_var) {
  probs <- c(0.025, 0.975)
  bounds <- matrix(
    NA_real_,
    nrow = num_var,
    ncol = length(probs),
    dimnames = list(NULL, paste0(100 * probs, "%"))
  )
  # seq_len() skips the loop when num_var is 0; 1:num_var would visit 1 and 0.
  for (i in seq_len(num_var)) {
    q <- quantile(coeff[, i], probs = probs)
    print(q)
    # Keep the values so callers can index them, e.g. result[i, "2.5%"].
    bounds[i, ] <- q
  }
  invisible(bounds)
}
#' Bootstrap linear-model coefficients, growing the result with rbind()
#'
#' Baseline implementation: each replicate resamples the rows of `inputData`
#' with replacement, fits `lm()` and appends the coefficients as a new row.
#'
#' @param inputData Data frame containing the variables used in `formula`.
#' @param num_var Column count of the result; assumed to equal the number of
#'   coefficients `lm()` returns for `formula` (this is not checked).
#' @param formula Model formula passed to `lm()`.
#' @param nBoots Number of bootstrap replicates.
#' @return Numeric matrix with one row per replicate and `num_var` columns;
#'   a zero-row matrix when `nBoots` is 0.
baselineBootstrap <- function(inputData, num_var, formula, nBoots) {
  # Hoisted: the row count does not change between replicates.
  n_obs <- nrow(inputData)
  # Start from zero rows so nBoots = 0 gives an empty result; the previous
  # 1:nBoots loop ran for i = 1 and i = 0 and returned two rows in that case.
  bootResults <- matrix(numeric(0), nrow = 0, ncol = num_var)
  for (i in seq_len(nBoots)) {
    # Randomly resample the rows with replacement.
    bootData <- inputData[sample.int(n_obs, n_obs, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Row-by-row growth is kept on purpose: it is the behaviour this
    # baseline exists to demonstrate.
    bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
  }
  bootResults
}
Get the bootstrapped coefficients of a linear model fitted with the variables declared above, and print their 2.5% and 97.5% quantiles.
# Run the rbind-based bootstrap with the shared settings, then print the
# quantiles of every coefficient column.
coefficients1 <- baselineBootstrap(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
get_quantiles(coefficients1, number_of_variables)
## 2.5% 97.5%
## 81.87461 92.04718
## 2.5% 97.5%
## -0.3409903 -0.2351599
## 2.5% 97.5%
## -0.3715402 -0.3521323
What I tried to improve:
Preallocate the result matrix with zeros instead of growing it with `rbind()` on every iteration.
An experiment with preallocating a NaN matrix was also planned. Update (10/15): the result is a failure. Instantiating with NaNs slows performance on every metric except the median.
#' Bootstrap linear-model coefficients into a preallocated matrix
#'
#' Each replicate resamples the rows of `inputData` with replacement, fits
#' `lm()` and writes the coefficients into row `i` of the result.
#'
#' @param inputData Data frame containing the variables used in `formula`.
#' @param num_var Column count of the result; must equal the number of
#'   coefficients `lm()` returns for `formula`.
#' @param formula Model formula passed to `lm()`.
#' @param nBoots Number of bootstrap replicates.
#' @return Numeric matrix with `nBoots` rows and `num_var` columns; a zero-row
#'   matrix when `nBoots` is 0.
speededBootstrap <- function(inputData, num_var, formula, nBoots) {
  # Hoisted: the row count does not change between replicates.
  n_obs <- nrow(inputData)
  # Preallocate with double zeros. An integer (0L) matrix would be converted
  # to double, and therefore copied, on the first coefficient assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)
  # seq_len() makes nBoots = 0 a no-op; 1:nBoots would visit rows 1 and 0 and
  # fail with a subscript-out-of-bounds error on the empty matrix.
  for (i in seq_len(nBoots)) {
    bootData <- inputData[sample.int(n_obs, n_obs, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Store the coefficients of this replicate.
    mat[i, ] <- coef(bootLM)
  }
  mat
}
Get coefficients and assess quantiles
# Run the preallocating bootstrap with the same settings, then print the
# quantiles of every coefficient column.
coefficients2 <- speededBootstrap(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
get_quantiles(coefficients2, number_of_variables)
## 2.5% 97.5%
## 75.4392 101.2135
## 2.5% 97.5%
## -0.34943364 -0.03351741
## 2.5% 97.5%
## -0.5584332 -0.5179003
One wants n large so that the gap in performance is larger and more evident. If n is too small, then the performance improvement will be negligible.
# Use more replicates for the timing comparison.
n <- 100
library(microbenchmark)
# Time both implementations on identical arguments. The arguments stay named
# so the expression labels in the benchmark output are unchanged.
microbenchmark(
  coefficients1 <- baselineBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  ),
  coefficients2 <- speededBootstrap(
    inputData = data,
    num_var = number_of_variables,
    formula = form,
    nBoots = n
  )
)
## Unit: milliseconds
## expr
## coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## coefficients2 <- speededBootstrap(inputData = data, num_var = number_of_variables, formula = form, nBoots = n)
## min lq mean median uq max neval
## 65.35562 67.95314 72.59092 69.53808 72.74806 121.9472 100
## 62.85916 67.05448 72.24298 68.49712 72.38403 136.4283 100