Declare Variables and Import Data

It will be useful to store the variables like this, so that when we try to generalize the code into an app, the change is relatively simple.

number_of_variables <- 3        # number of model coefficients (intercept + Weight + Oxygen)
form <- Age ~ Weight + Oxygen   # model formula
n <- 3                          # number of bootstrap resamples

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")
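
As a quick sanity check on the import (a minimal sketch; it assumes fitness.csv contains the Age, Weight and Oxygen columns used in form):

# confirm the columns referenced by the model formula are present
stopifnot(all(c("Age", "Weight", "Oxygen") %in% names(data)))
str(data)  # inspect column types and row count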

Get quantiles

Open question: how to index the quantile output so it can be stored rather than just printed (see the sketch after the function below).

get_quantiles <- function(coeff, num_var){
  for(i in 1:num_var){
    # print the 2.5% and 97.5% quantiles of each coefficient column
    # (still unclear how to access/index the quantile output so it can be stored)
    print(quantile(coeff[,i], probs = c(0.025,0.975)))
  }
}
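
To store the quantiles rather than only print them, one option is to collect them column-wise with apply(). A minimal sketch (the get_quantile_matrix name is illustrative, not part of the project code):

get_quantile_matrix <- function(coeff){
  # returns a 2 x ncol(coeff) matrix with rows "2.5%" and "97.5%",
  # one column per bootstrapped coefficient
  apply(coeff, 2, quantile, probs = c(0.025, 0.975))
}

Indexing is then straightforward, e.g. get_quantile_matrix(coefficients1)[, 1] gives the interval for the first coefficient.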

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var, formula, nBoots){
  
  for(i in 1:nBoots){
    # randomly resample the rows with replacement
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefficients; rbind grows the results matrix on every
    # iteration, which is the main target for optimization later
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the bootstrap coefficients for a linear model fitted with the variables declared above and print the quantiles; the 2.5% and 97.5% quantiles give a 95% percentile bootstrap interval for each coefficient.

coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)
##     2.5%    97.5% 
## 81.87461 92.04718 
##       2.5%      97.5% 
## -0.3409903 -0.2351599 
##       2.5%      97.5% 
## -0.3715402 -0.3521323

Speeded-Up Version 1

What I tried to improve:

Instantiate the results matrix with all zeros up front instead of growing it with repeated rbind calls.

I also experimented with instantiating a NaN-filled matrix - (10/15) result: failure. Preallocating with NaNs slowed performance on every metric except the median (a sketch of that variant follows the function below).

speededBootstrap <- function(inputData, num_var, formula, nBoots){
  # preallocate the full results matrix once instead of growing it with rbind
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    # randomly resample the rows with replacement
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefficients in row i
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}
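
For reference, the NaN-preallocation experiment mentioned above only changes the matrix() call. A minimal sketch of that variant (the nanBootstrap name is illustrative, not from the project code):

nanBootstrap <- function(inputData, num_var, formula, nBoots){
  # same as speededBootstrap, but preallocate with NaN instead of 0L;
  # in the (10/15) experiment this was slower on every metric except the median
  mat <- matrix(NaN, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    mat[i,] <- coef(lm(formula, data = bootData))
  }
  return(mat)
}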

Get coefficients and assess quantiles

coefficients2 <- speededBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients2, number_of_variables)
##     2.5%    97.5% 
##  75.4392 101.2135 
##        2.5%       97.5% 
## -0.34943364 -0.03351741 
##       2.5%      97.5% 
## -0.5584332 -0.5179003

Comparison of algorithms

We want n (the number of bootstrap iterations) to be large so that the performance gap between the two implementations is clearly visible; if n is too small, the improvement from preallocation is negligible.

n <- 100

library(microbenchmark)

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speededBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##   coefficients2 <- speededBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  65.35562 67.95314 72.59092 69.53808 72.74806 121.9472   100
##  62.85916 67.05448 72.24298 68.49712 72.38403 136.4283   100
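
As a correctness check (a minimal sketch, not part of the timings above; the seed value and result names are arbitrary): both functions draw their resamples from the same RNG stream, so resetting the seed before each call should make their outputs identical.

set.seed(5763)
resultsBaseline <- baselineBootstrap(inputData = data, num_var = number_of_variables,
                                     formula = form, nBoots = n)
set.seed(5763)
resultsSpeeded <- speededBootstrap(inputData = data, num_var = number_of_variables,
                                   formula = form, nBoots = n)
all.equal(resultsBaseline, resultsSpeeded)  # should be TRUE: same resamples, same fits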