Experimentation Log

Declare Variables and Import Data

It is useful to store these settings in variables, so that generalizing the code into an app later is more straightforward.

# Load the fitness data set (comma-separated, first row holds column names).
data <- read.csv("~/Desktop/5763GroupProject/data/fitness.csv")

# Model specification and bootstrap settings shared by the experiments below.
form <- Age ~ Weight + Oxygen
number_of_variables <- 3 # coefficient columns stored per fitted model
n <- 200 # number of bootstrap replicates

Get quantiles

# Print the 2.5% / 97.5% percentile interval of each coefficient column.
#
# Arguments:
#   coeff   - matrix of bootstrap coefficients, one row per replicate and one
#             column per coefficient.
#   num_var - number of leading columns of `coeff` to summarise.
#
# Returns (invisibly) a 2 x num_var matrix whose rows are named "2.5%" and
# "97.5%", so a single bound can be accessed by name, e.g.
# `res["2.5%", 1]`. Each interval is still printed, as before.
get_quantiles <- function(coeff, num_var) {
  probs <- c(0.025, 0.975)
  intervals <- matrix(NA_real_, nrow = length(probs), ncol = num_var)
  rownames(intervals) <- c("2.5%", "97.5%")

  # seq_len() yields an empty sequence for num_var = 0, whereas 1:num_var
  # would iterate over c(1, 0) and print spurious intervals.
  for (i in seq_len(num_var)) {
    interval <- quantile(coeff[, i], probs = probs)
    print(interval)
    intervals[, i] <- interval
  }

  invisible(intervals)
}

Original Boot Data Function (Slightly Modified)

# Baseline (deliberately unoptimised) bootstrap of linear-model coefficients.
#
# Arguments:
#   inputData - data frame to resample rows from.
#   num_var   - number of coefficients the model produces; used as the
#               column count of the result.
#   formula   - model formula passed to lm().
#   nBoots    - number of bootstrap replicates.
#
# Returns an nBoots x num_var matrix with one row of coefficients per
# replicate (a 0 x num_var matrix when nBoots is 0).
baselineBootstrap <- function(inputData, num_var, formula, nBoots) {
  # Start from an empty matrix so that nBoots = 0 returns a well-formed
  # result instead of failing on an undefined variable.
  bootResults <- matrix(numeric(0), nrow = 0, ncol = num_var)

  # seq_len() skips the loop for nBoots = 0; 1:nBoots would run twice.
  for (i in seq_len(nBoots)) {
    # Resample the rows with replacement.
    bootData <- inputData[sample(seq_len(nrow(inputData)), nrow(inputData),
                                 replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Growing the result with rbind() is intentional: this function is the
    # reference implementation that the faster variants are timed against.
    bootResults <- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
  }

  bootResults
}

Get the coefficients for a linear model trained on the declared variables above and print quantiles

# Reproducible baseline run: fix the seed, draw the bootstrap coefficients,
# then print the percentile interval of every coefficient.
set.seed(9)
coefficients1 <- baselineBootstrap(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)

get_quantiles(coefficients1, number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

SpeededUp 1

What I tried to improve:

Preallocate the result matrix with zeros instead of growing it with repeated rbind() calls.

Follow-up experiment: preallocate the matrix with NaN instead of zeros. (10/15) Result: no improvement. Initializing with NaNs was slower on every metric except the median.

# Bootstrap of linear-model coefficients with a preallocated result matrix.
#
# Arguments:
#   inputData - data frame to resample rows from.
#   num_var   - number of coefficients the model produces; used as the
#               column count of the result.
#   formula   - model formula passed to lm().
#   nBoots    - number of bootstrap replicates.
#
# Returns an nBoots x num_var matrix with one row of coefficients per
# replicate (a 0 x num_var matrix when nBoots is 0).
speedyBoot <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData) # loop-invariant, so computed once

  # Preallocate with double zeros: the coefficients are doubles, so an
  # integer (0L) matrix would be converted on the first row assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)

  # seq_len() skips the loop for nBoots = 0; 1:nBoots would try to write
  # row 1 of a 0-row matrix and fail.
  for (i in seq_len(nBoots)) {
    # Resample the rows with replacement.
    bootData <- inputData[sample(seq_len(n_rows), n_rows, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Store the coefficients of this replicate.
    mat[i, ] <- coef(bootLM)
  }

  mat
}

Get coefficients and assess quantiles

# Same seed as the baseline run, so the two sets of intervals can be compared
# directly.
set.seed(9)

coefficients2 <- speedyBoot(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
get_quantiles(coefficients2, number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

Comparison of algorithms

n should be large so that the performance gap is clearly visible. If n is too small, the improvement will be negligible.

# Seed the RNG before timing so the resampling inside the benchmark is
# reproducible.
set.seed(9)

library(microbenchmark)
# Time the rbind()-based baseline against the preallocated version on
# identical arguments. The call is left exactly as written because
# microbenchmark prints each expression as its row label in the output.
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients2 <- speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         coefficients2 <- speedyBoot(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  127.9100 143.2442 160.9455 152.6946 169.2403 279.2433   100
##  127.7749 140.5288 163.4521 150.8295 176.2851 282.8920   100

The median improves slightly, but the mean is actually slower, and the difference is so small that it could be entirely due to run-to-run noise.

Speedy 2

Try to reduce the random-sampling cost by passing nrow(inputData) directly to sample() instead of building the index vector 1:nrow(inputData).

# Bootstrap of linear-model coefficients with a preallocated result matrix
# and row indices drawn via sample(n, ...) instead of sample(1:n, ...).
#
# Arguments:
#   inputData - data frame to resample rows from.
#   num_var   - number of coefficients the model produces; used as the
#               column count of the result.
#   formula   - model formula passed to lm().
#   nBoots    - number of bootstrap replicates.
#
# Returns an nBoots x num_var matrix with one row of coefficients per
# replicate (a 0 x num_var matrix when nBoots is 0).
speedyBoot2 <- function(inputData, num_var, formula, nBoots) {
  n_rows <- nrow(inputData) # loop-invariant, so computed once

  # Preallocate with double zeros: the coefficients are doubles, so an
  # integer (0L) matrix would be converted on the first row assignment.
  mat <- matrix(0, nrow = nBoots, ncol = num_var)

  # seq_len() skips the loop for nBoots = 0; 1:nBoots would try to write
  # row 1 of a 0-row matrix and fail.
  for (i in seq_len(nBoots)) {
    # Resample the rows with replacement.
    bootData <- inputData[sample(n_rows, n_rows, replace = TRUE), ]
    bootLM <- lm(formula, data = bootData)
    # Store the coefficients of this replicate.
    mat[i, ] <- coef(bootLM)
  }

  mat
}

# Same seed as the earlier runs, so the intervals can be compared directly.
set.seed(9)

coefficients3 <- speedyBoot2(
  inputData = data,
  num_var = number_of_variables,
  formula = form,
  nBoots = n
)
get_quantiles(coefficients3, number_of_variables)

##     2.5%    97.5% 
## 49.23226 98.76897 
##        2.5%       97.5% 
## -0.37527171  0.05641607 
##       2.5%      97.5% 
## -0.6136807  0.1042451

# Seed the RNG and raise the replicate count to 1000 for this comparison.
set.seed(8)
n<- 1000
# Time the baseline against speedyBoot2 on identical arguments. The call is
# left exactly as written because microbenchmark prints each expression as
# its row label in the output.
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  coefficients3 <- speedyBoot2(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                                  expr
##  coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##        coefficients3 <- speedyBoot2(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##       min       lq     mean   median       uq      max neval
##  668.9691 703.3552 748.3622 732.5565 785.3489 939.1890   100
##  645.2541 688.7216 740.4761 722.2839 768.6516 966.2532   100

experiment 4

# Fit one bootstrap replicate and return its coefficients.
#
# Arguments:
#   inputData - data frame to resample rows from.
#   num_var   - not used by this function; kept so existing calls that pass
#               it keep working.
#   formula   - model formula passed to lm().
#
# Returns the coefficient vector of lm() fitted to one resample (rows drawn
# with replacement) of `inputData`.
speedy4 <- function(inputData, num_var, formula) {
  n_rows <- nrow(inputData)
  # TRUE rather than T: T is an ordinary variable that can be reassigned,
  # which would silently turn this into sampling without replacement.
  bootData <- inputData[sample(n_rows, n_rows, replace = TRUE), ]
  coef(lm(formula, data = bootData))
}

make function

library(foreach)
library(doParallel)

## Loading required package: iterators

## Loading required package: parallel

n <- 200
# NOTE(review): set.seed() seeds this (master) session only; the random
# streams used on the workers are set by clusterSetRNGStream() below.
set.seed(9)

# Leave one core free for the rest of the system, but always start at least
# one worker: detectCores() can return 1 (or NA when the count is unknown),
# and a cluster cannot be created with zero workers.
cores <- detectCores()
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, 9)

# One bootstrap replicate per foreach task; the coefficient vectors are
# stacked row-wise into a matrix.
bootCoefs <- foreach(i = seq_len(n), .combine = rbind) %dopar% speedy4(
  inputData = data,
  num_var = number_of_variables,
  formula = form
)


get_quantiles(bootCoefs, number_of_variables)

##     2.5%    97.5% 
## 53.11104 99.92509 
##        2.5%       97.5% 
## -0.36888342  0.03656095 
##        2.5%       97.5% 
## -0.67394348 -0.01255995

# Shut down the cluster created with makeCluster() above.
stopCluster(cl)

Check speed

# Leave one core free for the rest of the system, but always start at least
# one worker: detectCores() can return 1 (or NA when the count is unknown),
# and a cluster cannot be created with zero workers.
cores <- detectCores()
cl <- makeCluster(max(1L, cores[1] - 1L, na.rm = TRUE))
registerDoParallel(cl)
clusterSetRNGStream(cl = cl, 9)
n <- 1000

# Time the sequential baseline against the foreach/%dopar% version. The call
# is left exactly as written because microbenchmark prints each expression as
# its row label in the output.
# NOTE(review): every foreach task fits a single model, so per-task dispatch
# overhead may offset the parallel gain - consider running several
# replicates per task and confirm with a new benchmark.
microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )

## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq       max neval
##  652.6307 712.9005 745.5873 735.5217 761.9037 1042.9266   100
##  628.8516 706.4043 735.3229 734.1860 762.5622  968.0211   100

# Shut down the cluster created with makeCluster() above.
stopCluster(cl)