Declare Variables and Import Data

It will be useful to store variable like this, so when we try to generalize it into an app, it is more straightforward.

number_of_variables <- 3
form <- Age~Weight+Oxygen
n <- 200

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")

Get quantiles

Need to ask how to index such things so as to store them.

get_quantiles <- function(coeff, num_var){
  for(i in 1:num_var){
    #get quantiles
    #how does one accesss the index of a quantile
    print(quantile(coeff[,i], probs = c(0.025,0.975)))
  }
}

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var,formula, nBoots){
  
  for(i in 1:nBoots){
    #randomly sample data
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    #for optimization put 
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults<- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the coefficients for a linear model trained on the declared variables above and print quantiles

set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

SpeededUp 1

What I tried to improve:

Instantiate matrix with all zeros instead of rbind hell.

An experiment will need to occur with instantiating a nan matrix - (10/15) Result is failure. Instantiating with NaNs slows performance on all metrics but Median

speedyBoot <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

Get coefficients and assess quantiles

set.seed(9)

coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients2, number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

Comparison of algorithms

One wants n large so that the gap in performance is larger and more evident. If n is too small, then the performance improvement will be negligible.

set.seed(9)

library(microbenchmark)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  133.5789 143.7758 193.6508 154.8828 190.3599 795.6000   100
##  136.0232 145.8165 208.1908 161.2652 227.2137 602.1474   100

There is improvement. However the difference is so nominal that it could be completely stochastic. Let’s test!

set.seed(498)

library(microbenchmark)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  133.2648 146.8824 168.7516 157.4868 188.1007 235.9295   100
##  133.1541 144.0724 159.2174 150.5516 169.6844 227.0567   100

It is stochastic!

Speedy 2

Try to improve random sampling bottleneck by removing list enumeration in sample

speedyBoot2 <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

set.seed(9)

coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients3, number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451
set.seed(8)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  132.9637 141.9761 163.4414 150.6011 176.0582 246.3348   100
##  134.0594 140.9567 163.4510 150.5066 183.1295 258.2571   100

Check for Stochasticity

set.seed(23892)
n<- 200
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  132.8960 142.5517 160.1374 148.6452 176.8258 237.4288   100
##  132.7335 141.4793 162.7981 151.7555 183.3191 225.4164   100

Experiment 3

Parallelization

library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

speedy3 <- function(inputData, num_var,formula, nBoots){

  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  foreach(i = 1:nBoots) %dopar% {
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
    # store the coefs
    mat[i,] <- coef(bootLM)
  }
  return(mat)
  
}

set.seed(9)


coefficients4 <- speedy3(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients4, number_of_variables)
##  2.5% 97.5% 
##     0     0 
##  2.5% 97.5% 
##     0     0 
##  2.5% 97.5% 
##     0     0
stopCluster(cl)

test against baseline

set.seed(8)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients4 <- speedy3(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##            coefficients4 <- speedy3(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  135.1830 146.5930 190.3372 156.7029 200.5170 609.2315   100
##  134.5109 157.2091 185.5377 169.2796 201.8788 492.7157   100
stopCluster(cl)

no

experiment 4

speedy4 <- function(inputData, num_var,formula){
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
     return(coef(bootLM))
}

make function

set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)


bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)


get_quantiles(bootCoefs, number_of_variables)
##       2.5%      97.5% 
## -0.4909766 93.9616886 
##       2.5%      97.5% 
## -0.2436453 73.1234787 
##       2.5%      97.5% 
## -0.3984445 73.9164522
stopCluster(cl)

Check speed

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )  
## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq      max neval
##  156.5179 175.8639 202.4928 196.8214 224.2124 309.5326   100
##  165.6321 172.3279 199.7886 186.9666 218.0335 367.9900   100
stopCluster(cl)

IDK about errors. It seems ok or the same. Potentially better than before. IDK