Experimentation Log

Declare Variables and Import Data

It will be useful to store variable like this, so when we try to generalize it into an app, it is more straightforward.

number_of_variables <- 3
form <- Age~Weight+Oxygen
n <- 200

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")

Get quantiles

Need to ask how to index such things so as to store them.

get_quantiles <- function(coeff, num_var){
  for(i in 1:num_var){
    #get quantiles
    #how does one accesss the index of a quantile
    print(quantile(coeff[,i], probs = c(0.025,0.975)))
  }
}

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var,formula, nBoots){
  
  for(i in 1:nBoots){
    #randomly sample data
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    #for optimization put 
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults<- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the coefficients for a linear model trained on the declared variables above and print quantiles

set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

SpeededUp 1

What I tried to improve:

Instantiate matrix with all zeros instead of rbind hell.

An experiment will need to occur with instantiating a nan matrix - (10/15) Result is failure. Instantiating with NaNs slows performance on all metrics but Median

speedyBoot <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

Get coefficients and assess quantiles

set.seed(9)

coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients2, number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

Comparison of algorithms

One wants n large so that the gap in performance is larger and more evident. If n is too small, then the performance improvement will be negligible.

set.seed(9)

library(microbenchmark)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  133.5789 143.7758 193.6508 154.8828 190.3599 795.6000   100
##  136.0232 145.8165 208.1908 161.2652 227.2137 602.1474   100

There is improvement. However the difference is so nominal that it could be completely stochastic. Let’s test!

set.seed(498)

library(microbenchmark)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  133.2648 146.8824 168.7516 157.4868 188.1007 235.9295   100
##  133.1541 144.0724 159.2174 150.5516 169.6844 227.0567   100

It is stochastic!

Speedy 2

Try to improve random sampling bottleneck by removing list enumeration in sample

speedyBoot2 <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

set.seed(9)

coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients3, number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

set.seed(8)
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  132.9637 141.9761 163.4414 150.6011 176.0582 246.3348   100
##  134.0594 140.9567 163.4510 150.5066 183.1295 258.2571   100

Check for Stochasticity

set.seed(23892)
n<- 200
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  132.8960 142.5517 160.1374 148.6452 176.8258 237.4288   100
##  132.7335 141.4793 162.7981 151.7555 183.3191 225.4164   100

Experiment 3

Parallelization

library(foreach)
library(doParallel)

## Loading required package: iterators

## Loading required package: parallel

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

speedy3 <- function(inputData, num_var,formula, nBoots){

  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  foreach(i = 1:nBoots) %dopar% {
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
    # store the coefs
    mat[i,] <- coef(bootLM)
  }
  return(mat)
  
}

set.seed(9)


coefficients4 <- speedy3(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients4, number_of_variables)

##  2.5% 97.5% 
##     0     0 
##  2.5% 97.5% 
##     0     0 
##  2.5% 97.5% 
##     0     0

stopCluster(cl)

test against baseline

set.seed(8)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients4 <- speedy3(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##            coefficients4 <- speedy3(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  135.1830 146.5930 190.3372 156.7029 200.5170 609.2315   100
##  134.5109 157.2091 185.5377 169.2796 201.8788 492.7157   100

stopCluster(cl)

experiment 4

speedy4 <- function(inputData, num_var,formula){
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
     return(coef(bootLM))
}

make function

set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)


bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)


get_quantiles(bootCoefs, number_of_variables)

##       2.5%      97.5% 
## -0.4909766 93.9616886 
##       2.5%      97.5% 
## -0.2436453 73.1234787 
##       2.5%      97.5% 
## -0.3984445 73.9164522

stopCluster(cl)

Check speed

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )

## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = cbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq      max neval
##  156.5179 175.8639 202.4928 196.8214 224.2124 309.5326   100
##  165.6321 172.3279 199.7886 186.9666 218.0335 367.9900   100

stopCluster(cl)

IDK about errors. It seems ok or the same. Potentially better than before. IDK