For more details on using R Markdown see http://rmarkdown.rstudio.com.
# Phil presumes this is the code from "Week Two Study Guide.Html"
# my data file is in:
# C:\Users\Phil\Desktop\Milner recovered data 100920\Cal So Un.....
# or set working directory to the above
getwd() # shows your working directory
## [1] "C:/Users/Phil/Desktop/Milner recovered data 100920/Cal So University DBA/SLU/AA 5300 (ML)/2 week/uploads-week-2/uploads-week-2"
# setwd(dir) # sets working directory to the value of dir in double quotes w forward slashes
# OR !!! in RStudio, click 'Session' (top menu bar), in dropdown select 'to source file location'
setwd("C:/Users/Phil/Desktop/Milner recovered data 100920/Cal So University DBA/SLU/AA 5300 (ML)/2 week/uploads-week-2/uploads-week-2")
ds.original <- read.csv(file = "test-data-set.txt", header = TRUE, sep = " ")
print(ds.original)
## a b c
## 1 11 12 13
## 2 21 22 23
## 3 31 32 33
## 4 41 42 43
for (iternum in (1:3)){
print(paste("Shuffle number:", iternum)) # print shuffle no.
ds.shuffled <- ds.original[sample(nrow(ds.original)),] # sample() randomizes
print(ds.shuffled) # print random rows
}
## [1] "Shuffle number: 1"
## a b c
## 1 11 12 13
## 4 41 42 43
## 3 31 32 33
## 2 21 22 23
## [1] "Shuffle number: 2"
## a b c
## 3 31 32 33
## 2 21 22 23
## 4 41 42 43
## 1 11 12 13
## [1] "Shuffle number: 3"
## a b c
## 4 41 42 43
## 3 31 32 33
## 2 21 22 23
## 1 11 12 13
### Example with Boston data
######### code block one gives vector with size of each subset
library(MASS)
dataset.obj <- as.data.frame(Boston)
#summary(dataset.obj)
## change the variable chas to factor, since it's a dummy variable
dataset.obj$chas <- as.factor(dataset.obj$chas)
## verify
# summary(dataset.obj$chas)
## The overall k-fold-based CV approach
samplesize <- nrow(dataset.obj)
numfolds <- 10 # we're setting k = 10
quotient <- samplesize %/% numfolds # the %/% operator gives quotient
remainder <- samplesize %% numfolds # the %% operator gives remainder
vct.sizes <- rep(quotient, numfolds) # vector subsets, size = quotient
if(remainder > 0){
for(i in 1:remainder){
vct.sizes[i] <- vct.sizes[i] + 1 # ++remainders to subsets
}
}
print(paste("K:", 10, "n:", samplesize))
## [1] "K: 10 n: 506"
print(vct.sizes)
## [1] 51 51 51 51 51 51 50 50 50 50
###################### block 2: split data in half & calc rmse for predicted
startval <- 1
endval <- nrow(dataset.obj)/2 # nrow() = no rows in object
model <- glm(crim ~ indus + dis + medv,
data = dataset.obj[-(startval:endval), ])
pred.vals <- predict(model, newdata=dataset.obj[startval:endval, ],
type="response")
# calc Residual MSE
rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
######## end block 2:
#### block 3: 10-fold CV,MDL_1,3 vari's w above vector sample sizes, vct.sizes
set.seed(112)
dataset.obj <- dataset.obj[sample(nrow(dataset.obj)), ]
## create the vector to hold RMSE values
vct.rmses <- numeric(numfolds)
startval <- 1
for(kth in (1:numfolds)){
endval <- vct.sizes[kth] + startval - 1 #1st loop: vct.sizes.1 + 0
#2++ Loop: Vct.sizes.x +old_endval+1
model <- glm(crim ~ indus + dis + medv,
data = dataset.obj[-(startval:endval), ]) #data not in start:end
pred.vals <- predict(model, # predict w data in st:end,validate subset
newdata = dataset.obj[startval:endval, ],
type="response")
## compute the RMSE
rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
## store the current fold's RMSE
vct.rmses[kth] <- rmse
## modify the start value to correspond to the next fold
startval <- endval + 1
}
## Compute the overall RMSE
overall.rmse <- mean(vct.rmses)
print(paste("For the model crim ~ indus + dis + medv, the overall 10-fold CV RMSE is:",
round(overall.rmse, 6)))
## [1] "For the model crim ~ indus + dis + medv, the overall 10-fold CV RMSE is: 6.965314"
########################################### end block 3, 1st 10-fold CV, mdl_1
#### block 4: 2nd 10-fold CV (Model 2, 4predictors) w vct.sizes
set.seed(112)
dataset.obj <- dataset.obj[sample(nrow(dataset.obj)), ]
## create the vector to hold RMSE values
vct.rmses <- numeric(numfolds)
startval <- 1
for(kth in (1:numfolds)){
endval <- vct.sizes[kth] + startval - 1
model <- glm(crim ~ indus + dis + medv + age,
data = dataset.obj[-(startval:endval), ])
pred.vals <- predict(model,
newdata = dataset.obj[startval:endval, ],
type="response")
## compute the RMSE
rmse <- sqrt(mean((dataset.obj$crim[startval:endval] - pred.vals)^2))
## store the current fold's RMSE
vct.rmses[kth] <- rmse
## modify the start value to correspond to the next fold
startval <- endval + 1
}
## Compute the overall RMSE
overall.rmse <- mean(vct.rmses)
print(paste("For the model crim ~ indus + dis + medv + age, the overall 10-fold CV RMSE is:",
round(overall.rmse, 6)))
## [1] "For the model crim ~ indus + dis + medv + age, the overall 10-fold CV RMSE is: 7.242021"
########################################### end block 4, mdl_2, 4 predictors
block 5: boot, resample to get variance of coefficients
getCoeffs <- function(dataset, indices, formula) # inputs
{
d <- dataset[indices, ] # build dataset w indicies
fit <- glm(formula, data=d) # input formula on data =d
return(coef(fit)) # use coef() on formula/model
}
library(boot)
set.seed(122)
results <- boot( # boot creates 1000 metrics & their stats
data = Boston, # data set
statistic = getCoeffs, # function to create metric
R = 1000, # number of resamples
formula = crim ~ indus + dis + medv) # my metric needs formula
# for the above, see also block 5 or entire nxt chunk ,simpler metric function
print("Coefficients of the model")
## [1] "Coefficients of the model"
print(coef(model))
## (Intercept) indus dis medv age
## 12.2860166210 0.1075313251 -0.9449532758 -0.2837658683 0.0006720045
print("Estimated coefficient values from bootstrap")
## [1] "Estimated coefficient values from bootstrap"
print(results)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = getCoeffs, R = 1000, formula = crim ~
## indus + dis + medv)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 11.6773802 0.0285925294 2.27003396
## t2* 0.1314472 0.0001859712 0.05433813
## t3* -0.9631976 -0.0015441674 0.17250672
## t4* -0.2606146 -0.0008408676 0.04963074
prednames <- c("indus", "dis", "medv")
# 4coefficients,iterate over CI calc
for(i in 2:4){
print(paste("The confidence interval for", prednames[i-1], "is:"))
print(boot.ci(results, conf=0.95, index=i, type="basic"))
}
## [1] "The confidence interval for indus is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
##
## Intervals :
## Level Basic
## 95% ( 0.0291, 0.2490 )
## Calculations and Intervals on Original Scale
## [1] "The confidence interval for dis is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
##
## Intervals :
## Level Basic
## 95% (-1.2714, -0.5850 )
## Calculations and Intervals on Original Scale
## [1] "The confidence interval for medv is:"
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = results, conf = 0.95, type = "basic", index = i)
##
## Intervals :
## Level Basic
## 95% (-0.3542, -0.1589 )
## Calculations and Intervals on Original Scale
## phil stole from ISL next line
### !!! ask Mudigonda is this is helpful !!!
plot(results)
The ISL Text Lab example of Boot:
alpha = function( with inputs ‘x’ & ‘y’)
## Bootstrap
## Minimum risk investment - Section 5.2
# Block 1: create a function to compute a statistic
alpha=function(x,y){
vx=var(x)
vy=var(y)
cxy=cov(x,y)
(vy-cxy)/(vx+vy-2*cxy)
}
alpha(Portfolio$X,Portfolio$Y) # run 'alpha' over the whole data set
## [1] 0.5758321
####### block 1 ends
## Block 2: new F() takes data & index # SE or estimate?
alpha.fn=function(data, index){
with(data[index,],alpha(X,Y)) # with() applies data-index to 'alpha'
}
alpha.fn(Portfolio,1:100) # index uses all data, same answer as block 1
## [1] 0.5758321
########## block 2 ends, same answer as block 1
# Block 3: Alternatively, estimate function in one step
alpha.fn=function (data ,index){
X=data$X [index]
Y=data$Y [index]
return ((var(Y)-cov (X,Y))/(var(X)+var(Y) -2* cov(X,Y)))
}
alpha.fn(Portfolio,1:100)
## [1] 0.5758321
########## block 3 ends, same answer as block 1 & 2
# Block 4: BOOTSTRAP data set, 100 values x,y random sample from
# all 'portfolio' data (that is elements 1:100), then recomputing ˆα
set.seed(1)
alpha.fn (Portfolio,sample(1:100,100,replace=TRUE))
## [1] 0.7368375
# Block 5: BOOTSTRAP w Boot(), get 1000 metrics and stats on them
boot.out=boot( # boot creates 1000 metrics & stats on them
data=Portfolio, # data set
statistic=alpha.fn, # function to create a metric
R=1000 # number of repetitions
)
boot.out
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758321 -0.001695873 0.09366347
plot(boot.out)