Experimentation Log

Declare Variables and Import Data

It will be useful to store variable like this, so when we try to generalize it into an app, it is more straightforward.

number_of_variables <- 3
form <- "Age~Weight+Oxygen"
n <- 200

data <- read.csv(file="~/Desktop/5763GroupProject/data/fitness.csv", header=TRUE, sep=",")

Get quantiles

This is a useful helper function. The inputs are the coefficients of the bootstraps and outputs a matrix, whose ith column represents the ith covariate, the 1st row, the bottom 2.5% quantiles, and the 2nd the top 97.5% quartile. In otherwords, the ith column is the 95% confidence.

get_quantiles <- function(coeff, num_var){
  mat <- matrix(0L, nrow = 2, ncol = num_var)
  for(i in 1:num_var){
    mat[,i]<-matrix(quantile(coeff[,i], probs = c(0.025,0.975)))
  }
  #print(mat)
  return(mat)
}

Original Boot Data Function (Slightly Modified)

baselineBootstrap <- function(inputData, num_var,formula, nBoots){
  
  for(i in 1:nBoots){
    #randomly sample data
    bootData <- inputData[sample(1:nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    #for optimization put 
    if(i == 1){
      bootResults <- matrix(coef(bootLM), ncol = num_var)
    } else {
      bootResults<- rbind(bootResults, matrix(coef(bootLM), ncol = num_var))
    }
  } # end of i loop
  return(bootResults)
}

Get the coefficients for a linear model trained on the declared variables above and print quantiles

set.seed(9)
coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)

get_quantiles(coefficients1,number_of_variables)

##          [,1]        [,2]       [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897  0.05641607  0.1042451

Speeded Up Code

What needed to improve:

Instantiate matrix with all zeros instead of rbind hell.
Improve random sampling bottleneck by removing list enumeration in sample function.

speedyBoot <- function(inputData, num_var,formula, nBoots){
  mat <- matrix(0L, nrow = nBoots, ncol = num_var)
  for(i in 1:nBoots){
    bootData <- inputData[sample(nrow(inputData), nrow(inputData), replace = T),]
    bootLM <- lm(formula, data = bootData)
    # store the coefs
    mat[i,] <- coef(bootLM)
  } # end of i loop
  return(mat)
}

set.seed(9)
coefficients2 <-speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
get_quantiles(coefficients2, number_of_variables)

##          [,1]        [,2]       [,3]
## [1,] 49.23226 -0.37527171 -0.6136807
## [2,] 98.76897  0.05641607  0.1042451

library(microbenchmark)
set.seed(8)
n<- 100
microbenchmark(
  baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  speedyBoot(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n)
  
  )

## Unit: milliseconds
##                                                                                                 expr
##  baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##         speedyBoot(inputData = data, num_var = number_of_variables, formula = form,      nBoots = n)
##       min       lq     mean   median       uq      max neval
##  77.36575 84.01449 108.4726 95.54701 121.0147 338.2292   100
##  76.29310 85.74675 106.1637 99.09461 115.1908 240.7033   100

Post Parallelization

Define a function to call in each loop of parallelization

speedy4 <- function(inputData, num_var,formula){
    bootLM <- lm(formula, data = inputData[sample(nrow(inputData), nrow(inputData), replace = T),])
     return(coef(bootLM))
}

Define function

library(foreach)
library(doParallel)

## Loading required package: iterators

## Loading required package: parallel

set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)

bootCoefs <- foreach(i = 1:n, .combine =rbind) %dopar% speedy4(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)


get_quantiles(bootCoefs, number_of_variables)

##           [,1]        [,2]        [,3]
## [1,]  53.91828 -0.39241458 -0.67908479
## [2,] 101.87962  0.03942624  0.00105602

stopCluster(cl)

Check speed

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000

microbenchmark(
  coefficients1 <- baselineBootstrap(inputData = data,
                                   num_var = number_of_variables,
                                   formula = form,
                                   nBoots = n),
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )

## Unit: milliseconds
##                                                                                                                                   expr
##                   coefficients1 <- baselineBootstrap(inputData = data, num_var = number_of_variables,      formula = form, nBoots = n)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq      mean   median        uq      max neval
##  761.7036 885.5867 1012.1729 929.7713 1037.1346 2113.989   100
##  722.8394 841.7968  935.1599 878.2646  940.3058 1693.999   100

stopCluster(cl)

Comparison against boot

library(boot)
# https://www.statmethods.net/advstats/bootstrapping.html
# function to obtain regression weights
bs <- function(formula, data, indices) {
  d <- data[indices,] # allows boot to select sample
  fit <- lm(formula, data=d)
  return(coef(fit))
}

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
n<-1000

microbenchmark(
  # bootstrapping with 1000 replications
  results <- boot(data=data, statistic=bs,
   R=n, formula=form), 
  
  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(
                                    inputData = data,
                                   num_var = number_of_variables,
                                   formula = form)

  )

## Unit: milliseconds
##                                                                                                                                   expr
##                                                                    results <- boot(data = data, statistic = bs, R = n, formula = form)
##  bootCoefs <- foreach(i = 1:n, .combine = rbind) %dopar% speedy4(inputData = data,      num_var = number_of_variables, formula = form)
##       min       lq     mean   median       uq      max neval
##  721.6557 788.6952 949.3243 855.9705 1053.994 1805.171   100
##  685.1906 809.0650 963.2686 858.2443 1081.957 1648.772   100

stopCluster(cl)

The function is approximately the same speed as the boot library

Worked Example

A good model defined by AIC is Oxygen ~ Age + RunTime + RunPulse + MaxPulse. The number of variables is 5 and lets perform 1000 bootstraps.

set.seed(9)

cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)
clusterSetRNGStream(cl=cl,9)
                 #num of boots here
bootCoefs <- foreach(i = 1:1000, .combine =rbind) %dopar% speedy4(inputData = data,
                                   num_var = 5,
                                   formula = "Oxygen ~Age+RunTime+RunPulse+MaxPulse")


stopCluster(cl)

We can get the mean estimates for the coefficients by finding column means

colMeans(bootCoefs)

## (Intercept)         Age     RunTime    RunPulse    MaxPulse 
##  98.6583337  -0.1935990  -2.7688876  -0.3197957   0.2387201

Let’s compare it to the coefficients of classical regression

model <- lm(Oxygen ~Age+RunTime+RunPulse+MaxPulse, data=data)
coef(model)

## (Intercept)         Age     RunTime    RunPulse    MaxPulse 
##  98.1478880  -0.1977347  -2.7675788  -0.3481079   0.2705130

Very good. We can get coefficients using get quantiles on the bootCoefs.

get_quantiles(bootCoefs, 5)

##           [,1]        [,2]      [,3]        [,4]        [,5]
## [1,]  79.90637 -0.38360975 -3.384129 -0.54040555 -0.06072477
## [2,] 117.92533 -0.01735126 -2.202520 -0.07697345  0.48743461