Declare Variables and Import Data

It will be useful to store these settings in variables like this, so that when we generalise the code into an app the changes needed are minimal.

number_of_variables <- 3
form <- Age~Weight+Oxygen
n <- 200

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")
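
Since the plan is to generalise this into an app, a quick defensive check that every variable named in form actually exists in the imported data might look like the sketch below (base R only; the error message text is illustrative):

needed_vars <- all.vars(form)                      # c("Age", "Weight", "Oxygen")
missing_vars <- setdiff(needed_vars, names(data))
if(length(missing_vars) > 0){
  stop("Missing columns in fitness.csv: ", paste(missing_vars, collapse = ", "))
}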

Get quantiles

get_quantiles <- function(coeff, num_var){
  for(i in 1:num_var){
    # print the 2.5% and 97.5% quantiles of the bootstrap
    # distribution for the i-th coefficient
    print(quantile(coeff[,i], probs = c(0.025, 0.975)))
  }
}
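
If this ends up in an app it may be handier to return the quantiles rather than print them; a minimal sketch using apply() (the name get_quantile_matrix is just illustrative):

get_quantile_matrix <- function(coeff){
  # one row of (2.5%, 97.5%) quantiles per coefficient column
  t(apply(coeff, 2, quantile, probs = c(0.025, 0.975)))
}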

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var, formula, nBoots){
  
  for(i in 1:nBoots){
    # randomly resample the rows with replacement
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs; growing the matrix with rbind on every iteration is the
    # main inefficiency targeted by the later versions
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the bootstrap coefficients for the linear model declared above and print their quantiles.

set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

Speedy 1

What I tried to improve:

Pre-allocate the results matrix with zeros instead of growing it with rbind on every iteration.

I also experimented with pre-allocating the matrix with NaNs instead of zeros - (10/15) result: failure. The NaN version was slower on every metric except the median.
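
For reference, a reconstructed sketch of what the NaN-pre-allocated variant looks like (naBoot is an illustrative name; the experiment's original code is not shown here):

naBoot <- function(inputData, num_var, formula, nBoots){
  # identical to speedyBoot below except the matrix is pre-filled with NaN;
  # in the 10/15 experiment this was slower on every metric except the median
  mat <- matrix(NaN, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    mat[i,] <- coef(lm(formula, data = bootData))
  }
  return(mat)
}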

speedyBoot <- function(inputData, num_var, formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

Get coefficients and assess quantiles

set.seed(9)

coefficients2 <- speedyBoot(inputData = data,
                            num_var = number_of_variables,
                            formula = form,
                            nBoots = n)
get_quantiles(coefficients2, number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451
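
Both runs use set.seed(9) and draw resamples with the same sample(1:nrow(inputData), ...) call, so the two coefficient matrices should be numerically identical, which the matching quantiles above already suggest. A quick check:

all.equal(coefficients1, coefficients2)  # should be TRUE: same seed, same resamples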

Comparison of algorithms

We want the number of bootstrap iterations n to be large so that any gap in performance is clearly visible; if n is too small, the improvement will be lost in the noise.

set.seed(9)

library(microbenchmark)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  127.9100 143.2442 160.9455 152.6946 169.2403 279.2433   100
##  127.7749 140.5288 163.4521 150.8295 176.2851 282.8920   100

There is an improvement, but the difference is so small that it could be entirely down to run-to-run noise.
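
One way to judge whether that gap is real is to keep the microbenchmark object and compare the full timing distributions instead of just the summary table; a sketch:

mb <- microbenchmark(
  baseline = baselineBootstrap(data, number_of_variables, form, n),
  speedy   = speedyBoot(data, number_of_variables, form, n)
)
boxplot(mb)  # heavily overlapping boxes would suggest the gap is within run-to-run noise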

Speedy 2

Try to reduce the sampling overhead by passing nrow(inputData) directly to sample() rather than enumerating the vector 1:nrow(inputData).
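
sample() called with a single number m already samples from 1:m, so the two forms draw exactly the same indices; whether dropping the explicit 1:m vector saves anything can be checked in isolation, for example:

r <- nrow(data)
set.seed(1); a <- sample(1:r, r, replace = TRUE)
set.seed(1); b <- sample(r, r, replace = TRUE)
identical(a, b)  # should be TRUE: same draws either way
microbenchmark(sample(1:r, r, replace = TRUE),
               sample(r, r, replace = TRUE))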

speedyBoot2 <- function(inputData, num_var, formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

set.seed(9)

coefficients3 <- speedyBoot2(inputData = data,
                             num_var = number_of_variables,
                             formula = form,
                             nBoots = n)
get_quantiles(coefficients3, number_of_variables)
##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451
set.seed(8)
n <- 1000
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )  
## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  668.9691 703.3552 748.3622 732.5565 785.3489 939.1890   100
##  645.2541 688.7216 740.4761 722.2839 768.6516 966.2532   100

Experiment 4

speedy4 <- function(inputData, num_var, formula){
  # one bootstrap replicate: fit the model on a resample and return its coefficients
  # (num_var is unused here but kept so the call matches the other versions)
  bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
  return(coef(bootLM))
}
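
Before parallelising, a quick serial sanity check that speedy4 returns one coefficient vector per call, so binding the results row-wise gives the same shape of matrix as before:

set.seed(9)
t(replicate(5, speedy4(inputData = data,
                       num_var = number_of_variables,
                       formula = form)))  # 5 x 3 matrix of bootstrap coefficients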

Set up a parallel backend and run the bootstrap with foreach

library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
n <- 200
set.seed(9)

cores <- detectCores()
cl <- makeCluster(cores[1] - 1) # leave one core free so as not to overload the machine
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, iseed = 9) # give each worker its own RNG stream; the resamples
                                        # (and hence the quantiles below) differ from the serial set.seed(9) runs

bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,
                                                                num_var = number_of_variables,
                                                                formula = form)


get_quantiles(bootCoefs, number_of_variables)
##     2.5%    97.5% 
## 53.11104 99.92509 
##        2.5%       97.5% 
## -0.36888342  0.03656095 
##        2.5%       97.5% 
## -0.67394348 -0.01255995
stopCluster(cl)

Check speed

cores <- detectCores()
cl <- makeCluster(cores[1] - 1) # again leave one core free
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, iseed = 9)
n <- 1000

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )  
## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq       max neval
##  652.6307 712.9005 745.5873 735.5217 761.9037 1042.9266   100
##  628.8516 706.4043 735.3229 734.1860 762.5622  968.0211   100
stopCluster(cl)
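
The parallel run is no faster than the serial baseline, most likely because each task is a single cheap lm fit and the per-task dispatch overhead dominates. A possible follow-up (a sketch only, not benchmarked here): give each worker a whole chunk of replicates, reusing speedyBoot2 as the per-worker routine, and rbind the chunks.

cores <- detectCores()
cl <- makeCluster(cores[1] - 1)
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, iseed = 9)

workers <- getDoParWorkers()
chunks <- rep(n %/% workers, workers)
chunks[1] <- chunks[1] + n %% workers  # put any remainder in the first chunk

bootCoefsChunked <- foreach(nb = chunks, .combine = rbind) %dopar%
  speedyBoot2(inputData = data, num_var = number_of_variables,
              formula = form, nBoots = nb)

get_quantiles(bootCoefsChunked, number_of_variables)
stopCluster(cl)

With only a handful of tasks instead of n, the foreach overhead should be negligible, though whether this actually beats the serial version still needs measuring.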