R Markdown

For more details on using R Markdown see http://rmarkdown.rstudio.com.

#  Phil presumes this is the code from "Week Two Study Guide.Html"

# my data file is in:
#    C:\Users\Phil\Desktop\Milner recovered data 100920\Cal So Un.....
#     or set working directory to the above
     getwd() #  shows your working directory
## [1] "C:/Users/Phil/Desktop/Milner recovered data 100920/Cal So University DBA/SLU/AA 5300 (ML)/2 week/uploads-week-2/uploads-week-2"
#     setwd(dir)  #  sets the working directory to the value of dir, in double quotes with forward slashes
#     OR, in RStudio: click 'Session' (top menu bar), then 'Set Working Directory' > 'To Source File Location'
     setwd("C:/Users/Phil/Desktop/Milner recovered data 100920/Cal So University DBA/SLU/AA 5300 (ML)/2 week/uploads-week-2/uploads-week-2")

     
ds.original <- read.csv(file = "test-data-set.txt", header = TRUE, sep = " ")
print(ds.original)
##    a  b  c
## 1 11 12 13
## 2 21 22 23
## 3 31 32 33
## 4 41 42 43
for (iternum in (1:3)){
    
    print(paste("Shuffle number:", iternum))                # print shuffle no.
    
    ds.shuffled <- ds.original[sample(nrow(ds.original)),]  # sample(nrow()) permutes the row indices
    
    print(ds.shuffled)                                     # print random rows
}
## [1] "Shuffle number: 1"
##    a  b  c
## 1 11 12 13
## 4 41 42 43
## 3 31 32 33
## 2 21 22 23
## [1] "Shuffle number: 2"
##    a  b  c
## 3 31 32 33
## 2 21 22 23
## 4 41 42 43
## 1 11 12 13
## [1] "Shuffle number: 3"
##    a  b  c
## 4 41 42 43
## 3 31 32 33
## 2 21 22 23
## 1 11 12 13
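# note: sample() above draws from the session's RNG state, so each run gives a
# different ordering; a minimal sketch of a reproducible shuffle (seed value arbitrary):
#    set.seed(42)
#    ds.reshuffled <- ds.original[sample(nrow(ds.original)), ]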
### Example with Boston data

######### code block one builds a vector with the size of each subset (fold)
library(MASS)
dataset.obj <- as.data.frame(Boston)

#summary(dataset.obj)

## change the variable chas to factor, since it's a dummy variable
dataset.obj$chas <- as.factor(dataset.obj$chas)

## verify
# summary(dataset.obj$chas)

## The overall k-fold-based CV approach
samplesize <- nrow(dataset.obj)
numfolds <- 10 # we're setting k = 10

quotient <- samplesize %/% numfolds # the %/% operator gives quotient
remainder <- samplesize %% numfolds # the %% operator  gives remainder

vct.sizes <- rep(quotient, numfolds) # vector subsets, size = quotient
if(remainder > 0){
    for(i in 1:remainder){
        vct.sizes[i] <- vct.sizes[i] + 1 # ++remainders to subsets
    }
}

print(paste("K:", 10, "n:", samplesize))
## [1] "K: 10 n: 506"
print(vct.sizes)
##  [1] 51 51 51 51 51 51 50 50 50 50
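######  aside (a sketch, not in the study guide): the same fold-size vector can be
######  built without the if/for block above
vct.sizes.alt <- rep(quotient, numfolds) + c(rep(1, remainder), rep(0, numfolds - remainder))
# identical(vct.sizes, vct.sizes.alt)   # should be TRUE: six 51s followed by four 50s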
######################  block 2:  split the data in half & calc RMSE for the predicted values

startval <- 1
endval <- nrow(dataset.obj)/2  # nrow() = number of rows; rows 1:endval (the first half) will be the holdout

model <- glm(crim ~ indus + dis + medv,
             data = dataset.obj[-(startval:endval), ])
pred.vals <- predict(model, newdata=dataset.obj[startval:endval, ],
                     type="response")

# calc RMSE (root mean squared error) on the held-out first half
rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
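
# (sketch) block 2 computes rmse but never displays it; to inspect the 50/50 holdout RMSE:
#    print(paste("50/50 holdout RMSE for crim ~ indus + dis + medv:", round(rmse, 6)))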

######## end block 2:  



#### block 3: 10-fold CV, model 1 (3 predictors), using the fold sizes in vct.sizes 

set.seed(112)
dataset.obj <- dataset.obj[sample(nrow(dataset.obj)), ] 


## create the vector to hold RMSE values
vct.rmses <- numeric(numfolds)

startval <- 1
for(kth in (1:numfolds)){
    endval <- vct.sizes[kth] + startval - 1 # fold k covers rows startval:endval
                                            # (the next fold will start at endval + 1)

    model <- glm(crim ~ indus + dis + medv,
                 data = dataset.obj[-(startval:endval), ]) # train on the rows outside startval:endval
    
    pred.vals <- predict(model,     # predict on rows startval:endval (the validation fold)
                         newdata = dataset.obj[startval:endval, ],
                         type="response")
    
    
    ## compute the RMSE
    rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
    
    
    ## store the current fold's RMSE
    vct.rmses[kth] <- rmse
    
    ## modify the start value to correspond to the next fold
    startval <- endval + 1
}

## Compute the overall RMSE
overall.rmse <- mean(vct.rmses)
print(paste("For the model crim ~ indus + dis + medv, the overall 10-fold CV RMSE is:",
            round(overall.rmse, 6)))
## [1] "For the model crim ~ indus + dis + medv, the overall 10-fold CV RMSE is: 6.965314"
########################################### end block 3, 1st 10-fold CV, mdl_1 
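
#  cross-check (a sketch, assuming the boot package, which block 5 loads anyway):
#  boot::cv.glm() automates k-fold CV; its delta[1] is the pooled CV estimate of the
#  MSE, so sqrt(delta[1]) should land near, but not exactly on, the mean of the
#  per-fold RMSEs computed above
#     library(boot)
#     cv.err <- cv.glm(dataset.obj, glm(crim ~ indus + dis + medv, data = dataset.obj), K = 10)
#     sqrt(cv.err$delta[1])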

#### block 4: 2nd 10-fold CV (model 2, 4 predictors) using vct.sizes 
set.seed(112)
dataset.obj <- dataset.obj[sample(nrow(dataset.obj)), ]

## create the vector to hold RMSE values
vct.rmses <- numeric(numfolds)

startval <- 1
for(kth in (1:numfolds)){
    endval <- vct.sizes[kth] + startval - 1

    model <- glm(crim ~ indus + dis + medv + age,
                 data = dataset.obj[-(startval:endval), ])
    pred.vals <- predict(model,
                         newdata = dataset.obj[startval:endval, ],
                         type="response")
    ## compute the RMSE
    rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
    ## store the current fold's RMSE
    vct.rmses[kth] <- rmse
    ## modify the start value to correspond to the next fold
    startval <- endval + 1
}

## Compute the overall RMSE
overall.rmse <- mean(vct.rmses)
print(paste("For the model crim ~ indus + dis + medv + age, the overall 10-fold CV RMSE is:",
            round(overall.rmse, 6)))
## [1] "For the model crim ~ indus + dis + medv + age, the overall 10-fold CV RMSE is: 7.242021"
########################################### end block 4, mdl_2, 4 predictors 
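
#### aside (a sketch, not part of the study guide): blocks 3 and 4 repeat the same loop,
#### so it could be wrapped in a hypothetical helper and called once per formula
kfold.rmse <- function(formula, data, fold.sizes) {
    resp <- all.vars(formula)[1]               # response variable name, taken from the formula
    rmses <- numeric(length(fold.sizes))
    startval <- 1
    for (kth in seq_along(fold.sizes)) {
        endval <- startval + fold.sizes[kth] - 1
        fit <- glm(formula, data = data[-(startval:endval), ])   # train outside the fold
        preds <- predict(fit, newdata = data[startval:endval, ], # predict on the fold
                         type = "response")
        rmses[kth] <- sqrt(mean((data[[resp]][startval:endval] - preds)^2))
        startval <- endval + 1
    }
    mean(rmses)                                # overall k-fold CV RMSE
}
# kfold.rmse(crim ~ indus + dis + medv,       dataset.obj, vct.sizes)
# kfold.rmse(crim ~ indus + dis + medv + age, dataset.obj, vct.sizes)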

Block 5: boot, resampling to estimate the variance of the model coefficients

getCoeffs <- function(dataset, indices, formula) {  # boot() supplies 'indices' on each resample
    d <- dataset[indices, ]                         # build the resampled data set
    fit <- glm(formula, data = d)                   # fit the supplied formula on the resample
    return(coef(fit))                               # the statistic: the fitted coefficients
}
library(boot)
set.seed(122)

results <- boot(                    # boot creates 1000 metrics & their stats 
                data      = Boston,         # data set
                statistic = getCoeffs,      # function to create metric
                R         = 1000,           # number of resamples
                formula   = crim ~ indus + dis + medv)  # extra argument passed through to getCoeffs

# for the above, compare the simpler statistic function in the ISL lab chunk below

print("Coefficients of the model")
## [1] "Coefficients of the model"
print(coef(model))  # note: 'model' is the last glm fit above (block 4), which includes age
##   (Intercept)         indus           dis          medv           age 
## 12.2860166210  0.1075313251 -0.9449532758 -0.2837658683  0.0006720045
print("Estimated coefficient values from bootstrap")
## [1] "Estimated coefficient values from bootstrap"
print(results)
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Boston, statistic = getCoeffs, R = 1000, formula = crim ~ 
##     indus + dis + medv)
## 
## 
## Bootstrap Statistics :
##       original        bias    std. error
## t1* 11.6773802  0.0285925294  2.27003396
## t2*  0.1314472  0.0001859712  0.05433813
## t3* -0.9631976 -0.0015441674  0.17250672
## t4* -0.2606146 -0.0008408676  0.04963074
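# aside (sketch): the 'std. error' column above is just the standard deviation of the
# 1000 bootstrap replicates, which boot() stores as the matrix results$t (1000 x 4)
#    apply(results$t, 2, sd)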
prednames <- c("indus", "dis", "medv")

# 4 coefficients total; loop over indices 2:4 (the predictors; index 1 is the intercept) for the CI calc
for(i in 2:4){
    print(paste("The confidence interval for", prednames[i-1], "is:"))
    print(boot.ci(results, conf=0.95, index=i, type="basic"))
}
## [1] "The confidence interval for indus is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
## 
## Intervals : 
## Level      Basic         
## 95%   ( 0.0291,  0.2490 )  
## Calculations and Intervals on Original Scale
## [1] "The confidence interval for dis is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
## 
## Intervals : 
## Level      Basic         
## 95%   (-1.2714, -0.5850 )  
## Calculations and Intervals on Original Scale
## [1] "The confidence interval for medv is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
## 
## Intervals : 
## Level      Basic         
## 95%   (-0.3542, -0.1589 )  
## Calculations and Intervals on Original Scale
## Phil adapted the next line from ISL
###  !!! ask Mudigonda if this is helpful !!!
plot(results)

The ISL text lab example of boot():

alpha is defined below as a function with inputs x and y.
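
The statistic that block 1 below computes (ISL Section 5.2) is

$$\hat{\alpha} \;=\; \frac{\hat{\sigma}_Y^{2} - \hat{\sigma}_{XY}}{\hat{\sigma}_X^{2} + \hat{\sigma}_Y^{2} - 2\,\hat{\sigma}_{XY}},$$

with the variances and covariance estimated from the sampled x and y values.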

## Bootstrap
## Minimum risk investment - Section 5.2

# Block 1: create a function to compute a statistic
library(ISLR)   # for the Portfolio data set used below
alpha=function(x,y){
  vx=var(x)
  vy=var(y)
  cxy=cov(x,y)
  (vy-cxy)/(vx+vy-2*cxy)
}
alpha(Portfolio$X,Portfolio$Y)  # run 'alpha' over the whole data set
## [1] 0.5758321
####### block 1 ends

## Block 2:  a new function that takes the data & an index vector
##           (it returns the estimate itself; boot() computes the SE from many such estimates)
alpha.fn=function(data, index){
  with(data[index,],alpha(X,Y)) # with() evaluates alpha(X, Y) on the indexed rows of data
}

alpha.fn(Portfolio,1:100)  # index uses all data, same answer as block 1
## [1] 0.5758321
##########  block 2 ends, same answer as block 1


# Block 3: Alternatively, estimate function in one step
alpha.fn=function(data, index){
  X=data$X[index]
  Y=data$Y[index]
  return((var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y)))
}

alpha.fn(Portfolio,1:100)
## [1] 0.5758321
##########  block 3 ends, same answer as block 1 & 2

# Block 4: one BOOTSTRAP resample: draw 100 (x, y) observations with replacement
#   from the full 'Portfolio' data (elements 1:100), then recompute alpha-hat
set.seed(1)
alpha.fn(Portfolio, sample(1:100, 100, replace=TRUE))
## [1] 0.7368375
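# what boot() automates (a minimal sketch, not part of the ISL lab): repeat the resample
# above many times and take the standard deviation of the resulting estimates
#    alpha.reps <- replicate(1000, alpha.fn(Portfolio, sample(1:100, 100, replace=TRUE)))
#    sd(alpha.reps)   # should land near the std. error reported by boot() below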
# Block 5: BOOTSTRAP with boot(): get 1000 estimates and stats on them

boot.out=boot(                     # boot creates 1000 metrics & stats on them
               data=Portfolio,     # data set
               statistic=alpha.fn, # function to create a metric
               R=1000              # number of repetitions
               )

boot.out
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
## 
## 
## Bootstrap Statistics :
##      original       bias    std. error
## t1* 0.5758321 -0.001695873  0.09366347
plot(boot.out)
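
# possible follow-up (a sketch, not in the ISL lab excerpt): turn the replicates into a
# 95% confidence interval, as in block 5 of the Boston example above
#    boot.ci(boot.out, conf = 0.95, type = "basic")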