Declare Variables and Import Data

It will be useful to store variable like this, so when we try to generalize it into an app, it is more straightforward.

number_of_variables <- 3
form <- "Age~Weight+Oxygen"
n <- 200

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")

Get quantiles

This is a useful helper function. The inputs are the coefficients of the bootstraps and outputs a matrix, whose ith column represents the ith covariate, the 1st row, the bottom 2.5% quantiles, and the 2nd the top 97.5% quartile. In otherwords, the ith column is the 95% confidence.

get_quantiles <- function(coeff, num_var){
  mat <- matrix(0L, nrow = 2, ncol = num_var)
  for(i in 1:num_var){
    mat[,i]<-matrix(quantile(coeff[,i], probs = c(0.025,0.975)))
  }
  #print(mat)
  return(mat)
}

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var,formula, nBoots){
  
  for(i in 1:nBoots){
    #randomly sample data
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    #for optimization put 
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults<- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the coefficients for a linear model trained on the declared variables above and print quantiles

set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)
##          [,1]        [,2]       [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897  0.05641607  0.1042451

Speeded Up Code

What needed to improve:

  1. Instantiate matrix with all zeros instead of rbind hell.

  2. Improve random sampling bottleneck by removing list enumeration in sample function.

speedyBoot <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

set.seed(9)
coefficients2 <-speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients2, number_of_variables)
##          [,1]        [,2]       [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897  0.05641607  0.1042451
library(microbenchmark)
set.seed(8)
n<- 100
microbenchmark(
  baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  ) 
## Unit: milliseconds
##                                                                                                 expr
##  baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         speedyBoot(inputData = data, num_var = number_of_variables, formula = form,      nBoots = n)
##       min       lq     mean   median       uq      max neval
##  77.36575 84.01449 108.4726 95.54701 121.0147 338.2292   100
##  76.29310 85.74675 106.1637 99.09461 115.1908 240.7033   100

Post Parallelization

Define a function to call in each loop of parallelization

speedy4 <- function(inputData, num_var,formula){
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
     return(coef(bootLM))
}

Define function

library(foreach)
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)

bootCoefs <- foreach(i = 1:n, .combine =rbind) %dopar% speedy4(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)


get_quantiles(bootCoefs, number_of_variables)
##           [,1]        [,2]        [,3]
## [1,]  53.91828 -0.39241458 -0.67908479
## [2,] 101.87962  0.03942624  0.00105602
stopCluster(cl)

Check speed

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )  
## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq      mean   median        uq      max neval
##  761.7036 885.5867 1012.1729 929.7713 1037.1346 2113.989   100
##  722.8394 841.7968  935.1599 878.2646  940.3058 1693.999   100
stopCluster(cl)

Comparison against boot

library(boot)
# https://www.statmethods.net/advstats/bootstrapping.html
# function to obtain regression weights
bs <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data=d)
  return(coef(fit))
}

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000

microbenchmark(
  # bootstrapping with 1000 replications
  results <- boot(data=data, statistic=bs,
   R=n, formula=form), 
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )  
## Unit: milliseconds
##                                                                                                                                   expr
##                                                                    results <- boot(data = data, statistic = bs, R = n, formula = form)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq      max neval
##  721.6557 788.6952 949.3243 855.9705 1053.994 1805.171   100
##  685.1906 809.0650 963.2686 858.2443 1081.957 1648.772   100
stopCluster(cl)

The function is approximately the same speed as the boot library

Worked Example

A good model defined by AIC is Oxygen ~ Age + RunTime + RunPulse + MaxPulse. The number of variables is 5 and lets perform 1000 bootstraps.

set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
                 #num of boots here
bootCoefs <- foreach(i = 1:1000, .combine =rbind) %dopar% speedy4(inputData = data,
                                   num_var = 5,
                                   formula = "Oxygen ~Age+RunTime+RunPulse+MaxPulse")


stopCluster(cl)

We can get the mean estimates for the coefficients by finding column means

colMeans(bootCoefs)
## (Intercept)         Age     RunTime    RunPulse    MaxPulse 
##  98.6583337  -0.1935990  -2.7688876  -0.3197957   0.2387201

Let’s compare it to the coefficients of classical regression

model <- lm(Oxygen ~Age+RunTime+RunPulse+MaxPulse, data=data)
coef(model)
## (Intercept)         Age     RunTime    RunPulse    MaxPulse 
##  98.1478880  -0.1977347  -2.7675788  -0.3481079   0.2705130

Very good. We can get coefficients using get quantiles on the bootCoefs.

get_quantiles(bootCoefs, 5)
##           [,1]        [,2]      [,3]        [,4]        [,5]
## [1,]  79.90637 -0.38360975 -3.384129 -0.54040555 -0.06072477
## [2,] 117.92533 -0.01735126 -2.202520 -0.07697345  0.48743461