Lecture 1

# Lecture 1: fit a classification tree to the iris data through caret's
# "rpart" method, then plot the tree and predict on the held-out rows.
# Lines starting with "## " are the console output recorded for one run.
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2

# Load the data and look at the column names and the class counts.
data("iris")
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50

# Split the rows: p = 0.7 sends 70% to training (105 of 150 in the recorded
# run) and the remaining rows to testing. No seed is set, so the exact rows
# differ between runs.
inTrain <- createDataPartition(
  y = iris$Species,
  p = 0.7,
  list = FALSE
)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
dim(training)
## [1] 105   5
dim(testing)
## [1] 45  5

# Scatter plot of two predictors in the training set, coloured by species.
qplot(
  Petal.Width, Sepal.Width,
  color = Species,
  data = training
)

# Fit the tree using every other column as a predictor of Species.
modFit <- train(
  Species ~ .,
  method = "rpart",
  data = training
)
## Loading required package: rpart
print(modFit$finalModel)
## n= 105 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.6 35  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.6 70 35 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Length< 4.85 35  2 versicolor (0.00000000 0.94285714 0.05714286) *
##     7) Petal.Length>=4.85 35  2 virginica (0.00000000 0.05714286 0.94285714) *

# Draw the fitted tree, then label its nodes.
plot(
  modFit$finalModel,
  uniform = TRUE,
  main = "Classification Tree"
)
text(
  modFit$finalModel,
  use.n = TRUE,
  all = TRUE,
  cex = 1
)

# Predicted species for the held-out rows.
predict(modFit, newdata = testing)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor virginica  versicolor
## [19] versicolor versicolor versicolor versicolor virginica  versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica  virginica  virginica  virginica  virginica  virginica 
## [37] virginica  virginica  versicolor virginica  virginica  virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica

Lecture 2 slides

# Lecture 2: bagged loess. Fit a loess curve of temperature on ozone to each
# bootstrap resample of the data and overlay the fitted curves on the raw
# points. Lines starting with "## " are recorded console output.
library(ElemStatLearn)
data(ozone, package = "ElemStatLearn")
ozone <- ozone[order(ozone$ozone), ]
head(ozone)
##     ozone radiation temperature wind
## 17      1         8          59  9.7
## 19      4        25          61  9.7
## 14      6        78          57 18.4
## 45      7        48          80 14.3
## 106     7        49          69 10.3
## 7       8        19          61 20.1

# One row of predictions per bootstrap replicate, one column per ozone value
# on the 1:155 prediction grid.
l1 <- matrix(NA, nrow = 10, ncol = 155)
for (i in seq_len(nrow(l1))) {
  # Draw row indices with replacement to form the bootstrap resample.
  ss <- sample(seq_len(nrow(ozone)), replace = TRUE)
  ozone0 <- ozone[ss, ]
  # Sort the resample itself. Indexing `ozone` with this ordering (as the
  # code previously did) just permutes the full data set and throws the
  # resample away, so every replicate would be fitted to the same rows.
  ozone0 <- ozone0[order(ozone0$ozone), ]
  loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
  # Grid values outside the resample's ozone range come back as NA; lines()
  # below simply leaves gaps there.
  l1[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}

# Raw data, then one grey curve per bootstrap replicate.
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
for (i in seq_len(nrow(l1))) {
  lines(1:155, l1[i, ], col = "grey", lwd = 2)
}

Question 1

Load the cell segmentation data from the AppliedPredictiveModeling package using the commands:

library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)

Question 2

If K is small in a K-fold cross validation, is the bias in the estimate of out-of-sample (test set) accuracy smaller or bigger? If K is small, is the variance in the estimate of out-of-sample (test set) accuracy smaller or bigger? Is K large or small in leave-one-out cross validation?

Question 3

# Question 3 setup: load the olive data, drop its first column, and build a
# one-row data frame holding the mean of each remaining column.
library(pgmm)
data(olive)
olive <- olive[, -1]
newdata <- as.data.frame(
  t(colMeans(olive))
)

Question 4

# Question 4 setup: load SAheart and split it into two halves.
# Lines starting with "## " are recorded console output.
library(ElemStatLearn)
## 
## Attaching package: 'ElemStatLearn'
## The following object is masked _by_ '.GlobalEnv':
## 
##     ozone
data(SAheart)

# Fixed seed so the same rows are drawn on every run.
set.seed(8484)

# Sample half of the row indices without replacement; those rows form the
# training half and the rest form the test half.
# NOTE: this index vector is named `train`; calls written as train(...) still
# resolve to a function, because R skips non-function objects when looking
# up a name in call position.
train <- sample(
  seq_len(nrow(SAheart)),
  size = nrow(SAheart) / 2,
  replace = FALSE
)
trainSA <- SAheart[train, ]
testSA <- SAheart[-train, ]
# Misclassification rate for 0/1 outcomes.
#
# values:     observed outcomes.
# prediction: numeric scores; a score strictly greater than 0.5 counts as a
#             predicted 1, anything else as a predicted 0.
# Returns the number of disagreements divided by length(values).
missClass <- function(values, prediction) {
  predicted_class <- (prediction > 0.5) * 1
  n_wrong <- sum(predicted_class != values)
  n_wrong / length(values)
}
# Question 4 model: fit a glm with a binomial family to the training half,
# then report the misclassification rate on each half.
# Lines starting with "## " are recorded console output.
set.seed(13234)
fitMode <- train(
  chd ~ age + alcohol + obesity + tobacco + typea + ldl,
  data = trainSA,
  method = "glm",
  family = "binomial"
)
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to
## do classification? If so, use a 2 level factor as your outcome column.

# Predictions on the test half, stored for later inspection.
result4 <- predict(fitMode, newdata = testSA)

# Training-half error, then test-half error.
missClass(
  trainSA$chd,
  predict(fitMode, newdata = trainSA)
)
## [1] 0.2727273
missClass(
  testSA$chd,
  predict(fitMode, newdata = testSA)
)
## [1] 0.3116883

Question 5

library(ElemStatLearn)
data(vowel.train)
data(vowel.test)
set.seed(33833)