# Question 1
# Load the cell segmentation data from the AppliedPredictiveModeling package using the commands:
# install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
data(segmentationOriginal)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
str(segmentationOriginal)
## 'data.frame': 2019 obs. of 119 variables:
## $ Cell : int 207827637 207932307 207932463 207932470 207932455 207827656 207827659 207827661 207932479 207932480 ...
## $ Case : Factor w/ 2 levels "Test","Train": 1 2 2 2 1 1 1 1 1 1 ...
## $ Class : Factor w/ 2 levels "PS","WS": 1 1 2 1 1 2 2 1 2 2 ...
## $ AngleCh1 : num 143.25 133.75 106.65 69.15 2.89 ...
## $ AngleStatusCh1 : int 1 0 0 0 2 2 1 1 2 1 ...
## $ AreaCh1 : int 185 819 431 298 285 172 177 251 495 384 ...
## $ AreaStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ AvgIntenCh1 : num 15.7 31.9 28 19.5 24.3 ...
## $ AvgIntenCh2 : num 3.95 205.88 115.32 101.29 111.42 ...
## $ AvgIntenCh3 : num 9.55 69.92 63.94 28.22 20.47 ...
## $ AvgIntenCh4 : num 2.21 164.15 106.7 31.03 40.58 ...
## $ AvgIntenStatusCh1 : int 0 0 0 0 0 1 1 0 0 0 ...
## $ AvgIntenStatusCh2 : int 2 0 0 0 0 1 1 2 0 0 ...
## $ AvgIntenStatusCh3 : int 2 0 0 0 0 1 1 0 0 0 ...
## $ AvgIntenStatusCh4 : int 2 0 0 2 0 1 0 2 0 0 ...
## $ ConvexHullAreaRatioCh1 : num 1.12 1.26 1.05 1.2 1.11 ...
## $ ConvexHullAreaRatioStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ConvexHullPerimRatioCh1 : num 0.92 0.797 0.935 0.866 0.957 ...
## $ ConvexHullPerimRatioStatusCh1: int 0 2 0 2 0 1 1 2 2 2 ...
## $ DiffIntenDensityCh1 : num 29.5 31.9 32.5 26.7 31.6 ...
## $ DiffIntenDensityCh3 : num 13.8 43.1 36 22.9 21.7 ...
## $ DiffIntenDensityCh4 : num 6.83 79.31 51.36 26.39 25.03 ...
## $ DiffIntenDensityStatusCh1 : int 2 0 0 2 0 1 1 2 2 2 ...
## $ DiffIntenDensityStatusCh3 : int 2 0 0 0 0 1 0 0 2 0 ...
## $ DiffIntenDensityStatusCh4 : int 2 0 0 2 2 1 1 2 0 0 ...
## $ EntropyIntenCh1 : num 4.97 6.09 5.88 5.42 5.66 ...
## $ EntropyIntenCh3 : num 4.37 6.64 6.68 5.44 5.29 ...
## $ EntropyIntenCh4 : num 2.72 7.88 7.14 5.78 5.24 ...
## $ EntropyIntenStatusCh1 : int 2 0 0 2 2 0 0 2 2 2 ...
## $ EntropyIntenStatusCh3 : int 0 1 1 0 0 1 1 0 0 0 ...
## $ EntropyIntenStatusCh4 : int 2 1 0 0 0 0 0 2 0 0 ...
## $ EqCircDiamCh1 : num 15.4 32.3 23.4 19.5 19.1 ...
## $ EqCircDiamStatusCh1 : int 0 1 0 0 0 2 2 0 0 0 ...
## $ EqEllipseLWRCh1 : num 3.06 1.56 1.38 3.39 2.74 ...
## $ EqEllipseLWRStatusCh1 : int 1 0 0 1 0 0 0 0 0 0 ...
## $ EqEllipseOblateVolCh1 : num 337 2233 802 725 608 ...
## $ EqEllipseOblateVolStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ EqEllipseProlateVolCh1 : num 110 1433 583 214 222 ...
## $ EqEllipseProlateVolStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ EqSphereAreaCh1 : num 742 3279 1727 1195 1140 ...
## $ EqSphereAreaStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ EqSphereVolCh1 : num 1901 17654 6751 3884 3621 ...
## $ EqSphereVolStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FiberAlign2Ch3 : num 0 0.488 0.301 0.22 0.491 ...
## $ FiberAlign2Ch4 : num 0 0.352 0.522 0.733 0.385 ...
## $ FiberAlign2StatusCh3 : int 2 0 0 0 0 0 0 2 0 1 ...
## $ FiberAlign2StatusCh4 : int 2 0 0 1 0 0 0 2 0 1 ...
## $ FiberLengthCh1 : num 27 64.3 21.1 43.1 34.7 ...
## $ FiberLengthStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FiberWidthCh1 : num 7.41 13.17 21.14 7.4 8.48 ...
## $ FiberWidthStatusCh1 : int 2 0 1 2 2 0 0 2 0 0 ...
## $ IntenCoocASMCh3 : num 0.01118 0.02805 0.00686 0.03096 0.02277 ...
## $ IntenCoocASMCh4 : num 0.05045 0.01259 0.00614 0.01103 0.07969 ...
## $ IntenCoocASMStatusCh3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ IntenCoocASMStatusCh4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ IntenCoocContrastCh3 : num 40.75 8.23 14.45 7.3 15.85 ...
## $ IntenCoocContrastCh4 : num 13.9 6.98 16.7 13.39 3.54 ...
## $ IntenCoocContrastStatusCh3 : int 1 0 0 0 0 0 0 1 0 1 ...
## $ IntenCoocContrastStatusCh4 : int 1 0 1 1 2 0 1 0 0 1 ...
## $ IntenCoocEntropyCh3 : num 7.2 6.82 7.58 6.31 6.78 ...
## $ IntenCoocEntropyCh4 : num 5.25 7.1 7.67 7.2 5.5 ...
## $ IntenCoocEntropyStatusCh3 : int 0 0 1 0 0 0 0 1 0 1 ...
## $ IntenCoocEntropyStatusCh4 : int 0 0 1 0 0 0 0 2 0 1 ...
## $ IntenCoocMaxCh3 : num 0.0774 0.1532 0.0284 0.1628 0.1274 ...
## $ IntenCoocMaxCh4 : num 0.172 0.0739 0.0232 0.0775 0.2785 ...
## $ IntenCoocMaxStatusCh3 : int 0 0 2 0 0 2 2 2 0 0 ...
## $ IntenCoocMaxStatusCh4 : int 0 0 2 0 0 2 2 1 0 0 ...
## $ KurtIntenCh1 : num -0.6567 -0.2488 -0.2935 0.6259 0.0421 ...
## $ KurtIntenCh3 : num -0.608 -0.331 1.051 0.128 0.952 ...
## $ KurtIntenCh4 : num 0.726 -0.265 0.151 -0.347 -0.195 ...
## $ KurtIntenStatusCh1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ KurtIntenStatusCh3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ KurtIntenStatusCh4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LengthCh1 : num 26.2 47.2 28.1 37.9 36 ...
## $ LengthStatusCh1 : int 0 1 0 0 0 2 2 0 0 0 ...
## $ MemberAvgAvgIntenStatusCh2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MemberAvgTotalIntenStatusCh2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NeighborAvgDistCh1 : num 370 174 158 206 205 ...
## $ NeighborAvgDistStatusCh1 : int 1 2 2 0 0 0 0 0 0 0 ...
## $ NeighborMinDistCh1 : num 99.1 30.1 34.9 33.1 27 ...
## $ NeighborMinDistStatusCh1 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ NeighborVarDistCh1 : num 128 81.4 90.4 116.9 111 ...
## $ NeighborVarDistStatusCh1 : int 0 2 2 0 0 0 0 0 2 2 ...
## $ PerimCh1 : num 68.8 154.9 84.6 101.1 86.5 ...
## $ PerimStatusCh1 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ ShapeBFRCh1 : num 0.665 0.54 0.724 0.589 0.6 ...
## $ ShapeBFRStatusCh1 : int 0 2 1 0 0 0 1 0 0 0 ...
## $ ShapeLWRCh1 : num 2.46 1.47 1.33 2.83 2.73 ...
## $ ShapeLWRStatusCh1 : int 0 0 0 1 1 0 0 0 0 0 ...
## $ ShapeP2ACh1 : num 1.88 2.26 1.27 2.55 2.02 ...
## $ ShapeP2AStatusCh1 : int 0 0 0 1 0 0 0 1 0 0 ...
## $ SkewIntenCh1 : num 0.455 0.399 0.472 0.882 0.517 ...
## $ SkewIntenCh3 : num 0.46 0.62 0.971 1 1.177 ...
## $ SkewIntenCh4 : num 1.233 0.527 0.325 0.604 0.926 ...
## $ SkewIntenStatusCh1 : int 0 0 0 1 0 2 2 0 0 0 ...
## $ SkewIntenStatusCh3 : int 0 0 0 0 0 2 2 2 0 0 ...
## $ SkewIntenStatusCh4 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ SpotFiberCountCh3 : int 1 4 2 4 1 1 0 2 1 1 ...
## $ SpotFiberCountCh4 : int 4 11 6 7 7 4 4 7 11 7 ...
## [list output truncated]
- Subset the data to a training set and testing set based on the Case variable in the data set.
- Set the seed to 125 and fit a CART model with the rpart method using all predictor variables and default caret settings.
- In the final model what would be the final model prediction for cases with the following variable values:
- a. TotalIntenCh2 = 23,000; FiberWidthCh1 = 10; PerimStatusCh1 = 2
- b. TotalIntenCh2 = 50,000; FiberWidthCh1 = 10; VarIntenCh4 = 100
- c. TotalIntenCh2 = 57,000; FiberWidthCh1 = 8; VarIntenCh4 = 100
- d. FiberWidthCh1 = 8; VarIntenCh4 = 100; PerimStatusCh1 = 2
# Install rpart library
# install.packages("rpart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)
# install.packages("rattle")
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.4.1 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Subset on the Case variable, as the question specifies, rather than
# taking a random split
train.q1 = subset(segmentationOriginal, Case == "Train")
test.q1 = subset(segmentationOriginal, Case == "Test")
set.seed(125)
# CART model fit with caret's rpart method and default settings
model.fit = train(Class ~ ., data=train.q1, method="rpart")
prp(model.fit$finalModel)

fancyRpartPlot(model.fit$finalModel)

# a. PS
# b. WS
# c. PS
# d. Not possible to predict
# Reading the fitted tree: the first split is on TotalIntenCh2 and the
# second on FiberWidthCh1. Cases a-c supply both variables and route to
# PS, WS, and PS respectively; case d omits TotalIntenCh2, so the first
# split cannot be evaluated and no prediction is possible.
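# To check these against the fitted tree itself, the split rules can be
# printed directly (a minimal sketch; model.fit$finalModel is the rpart
# object inside the caret fit above):
print(model.fit$finalModel)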
# Question 2
# If K is small in K-fold cross-validation, is the bias in the estimate of out-of-sample (test set) accuracy smaller or bigger? If K is small, is the variance in the estimate of out-of-sample (test set) accuracy smaller or bigger? Is K large or small in leave-one-out cross-validation?
# ANS The bias is larger and the variance is smaller. K-fold cross-validation splits the data into K sections and holds each out in turn, so with small K each model is trained on a smaller fraction of the data; the resulting accuracy estimate is more pessimistic (larger bias) but varies less. In leave-one-out cross-validation K is large: it equals the sample size n, so each model trains on n-1 observations. The smallest K-fold split is K=2, where the training and held-out halves are equal in size, so training sets are smaller than for any K > 2 and the bias is largest.
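# A minimal sketch of the tradeoff, reusing the caret interface and the
# Question 1 training set (illustrative only - exact numbers depend on the
# run): the 2-fold fit trains on half the data, so its resampled accuracy
# tends to be lower (more pessimistic), while its estimate varies less
# than the 10-fold fit's.
set.seed(1)
fit.k2 <- train(Class ~ ., data=train.q1, method="rpart",
                trControl=trainControl(method="cv", number=2))
fit.k10 <- train(Class ~ ., data=train.q1, method="rpart",
                 trControl=trainControl(method="cv", number=10))
fit.k2$results
fit.k10$results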
# Question 3
# Load the olive oil data using the commands:
# install.packages("pgmm")
library(pgmm)
data(olive)
str(olive)
## 'data.frame': 572 obs. of 10 variables:
## $ Region : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Area : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Palmitic : num 1075 1088 911 966 1051 ...
## $ Palmitoleic: num 75 73 54 57 67 49 66 61 60 55 ...
## $ Stearic : num 226 224 246 240 259 268 264 235 239 213 ...
## $ Oleic : num 7823 7709 8113 7952 7771 ...
## $ Linoleic : num 672 781 549 619 672 678 618 734 709 633 ...
## $ Linolenic : num 36 31 31 50 50 51 49 39 46 26 ...
## $ Arachidic : num 60 61 63 78 80 70 56 64 83 52 ...
## $ Eicosenoic : num 29 29 29 35 46 44 29 35 33 30 ...
olive = olive[,-1] # drop the Region column
model <- train(Area ~ ., data=olive, method="rpart")
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
model.b = rpart(Area ~ ., data=olive)
# (NOTE: If you have trouble installing the pgmm package, you can download the olive dataset here: olive_data.zip. After unzipping the archive, you can load the file using the load() function in R.)
# These data contain information on 572 different Italian olive oils from multiple regions in Italy. Fit a classification tree where Area is the outcome variable. Then predict the value of area for the following data frame using the tree command with all defaults.
newdata = as.data.frame(t(colMeans(olive)))
predict(model.b, newdata)
## 1
## 2.875
# What is the resulting prediction? 2.875
# Is the resulting prediction strange? Why or why not? ANS It is strange because Area is a region code and should be a qualitative (factor) variable. Since it was left numeric, rpart fit a regression tree, so the prediction is the mean value of Area in the leaf that newdata falls into - not a valid area code.
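# A sketch of the fix (assuming a classification tree is what was wanted):
# coerce Area to a factor before fitting, so rpart builds a classification
# tree and predict() returns a class label rather than a leaf mean.
olive.f = olive
olive.f$Area = as.factor(olive.f$Area)
model.c = rpart(Area ~ ., data=olive.f)
predict(model.c, newdata, type="class")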
# Question 4
# Load the South Africa Heart Disease Data and create training and test sets with the following code:
# install.packages("ElemStatLearn")
library(ElemStatLearn)
data(SAheart)
set.seed(8484)
train = sample(1:dim(SAheart)[1],size=dim(SAheart)[1]/2,replace=F)
trainSA = SAheart[train,]
testSA = SAheart[-train,]
# Then set the seed to 13234 and fit a logistic regression model (method="glm", be sure to specify family="binomial") with Coronary Heart Disease (chd) as the outcome and age at onset, current alcohol consumption, obesity levels, cumulative tobacco, type-A behavior, and low density lipoprotein cholesterol as predictors. Calculate the misclassification rate for your model using this function and a prediction on the "response" scale:
set.seed(13234)
modelSA <- train(chd ~ age + alcohol + obesity + tobacco + typea + ldl, data=trainSA, method="glm", family="binomial")
summary(modelSA)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1007 -0.8200 -0.4195 0.8952 2.1711
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.712364 1.433244 -1.892 0.05843 .
## age 0.065608 0.014701 4.463 8.09e-06 ***
## alcohol -0.003240 0.005777 -0.561 0.57493
## obesity -0.132407 0.047060 -2.814 0.00490 **
## tobacco 0.103744 0.039821 2.605 0.00918 **
## typea 0.028937 0.016351 1.770 0.07678 .
## ldl 0.165439 0.088511 1.869 0.06160 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 302.83 on 230 degrees of freedom
## Residual deviance: 237.71 on 224 degrees of freedom
## AIC: 251.71
##
## Number of Fisher Scoring iterations: 4
predict.modelSA.test = predict(modelSA, newdata=testSA)
table(predict.modelSA.test>0.5, testSA$chd) # test set prediction
##
## 0 1
## FALSE 117 34
## TRUE 38 42
predict.modelSA.train = predict(modelSA, newdata=trainSA)
table(predict.modelSA.train>0.5, trainSA$chd) # training set prediction
sum(diag(table(predict.modelSA.test>0.5, testSA$chd)))/nrow(testSA) # test set Accuracy = 0.6883117
## [1] 0.6883117
# Misclassification rate: fraction of observations where the thresholded
# prediction (probability > 0.5) disagrees with the true 0/1 label
missClass = function(values,prediction){sum(((prediction > 0.5)*1) != values)/length(values)}
missClass(testSA$chd,predict.modelSA.test) # (1 - Accuracy) = 0.3116883 for testing set
## [1] 0.3116883
missClass(trainSA$chd,predict.modelSA.train) # (1 - Accuracy) = 0.2727273 for training set
## [1] 0.2727273
1 - missClass(testSA$chd,predict.modelSA.test) - sum(diag(table(predict.modelSA.test>0.5, testSA$chd)))/nrow(testSA) # sanity check: accuracy and misclassification rate sum to 1
## [1] 0
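# As a cross-check, caret's confusionMatrix() reports the same accuracy
# along with sensitivity and specificity (a sketch; both arguments must be
# factors on the same levels, and chd is stored as numeric 0/1):
confusionMatrix(factor(as.numeric(predict.modelSA.test > 0.5)),
                factor(testSA$chd))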
# Question 5
# Load the vowel.train and vowel.test data sets:
# install.packages("ElemStatLearn")
library(ElemStatLearn)
data(vowel.train)
data(vowel.test)
str(vowel.train)
## 'data.frame': 528 obs. of 11 variables:
## $ y : int 1 2 3 4 5 6 7 8 9 10 ...
## $ x.1 : num -3.64 -3.33 -2.12 -2.29 -2.6 ...
## $ x.2 : num 0.418 0.496 0.894 1.809 1.938 ...
## $ x.3 : num -0.67 -0.694 -1.576 -1.498 -0.846 ...
## $ x.4 : num 1.779 1.365 0.147 1.012 1.062 ...
## $ x.5 : num -0.168 -0.265 -0.707 -1.053 -1.633 ...
## $ x.6 : num 1.627 1.933 1.559 1.06 0.764 ...
## $ x.7 : num -0.388 -0.363 -0.579 -0.567 0.394 0.217 0.322 -0.435 -0.512 -0.466 ...
## $ x.8 : num 0.529 0.51 0.676 0.235 -0.15 -0.246 0.45 0.992 0.928 0.702 ...
## $ x.9 : num -0.874 -0.621 -0.809 -0.091 0.277 0.238 0.377 0.575 -0.167 0.06 ...
## $ x.10: num -0.814 -0.488 -0.049 -0.795 -0.396 -0.365 -0.366 -0.301 -0.434 -0.836 ...
str(vowel.test)
## 'data.frame': 462 obs. of 11 variables:
## $ y : int 1 2 3 4 5 6 7 8 9 10 ...
## $ x.1 : num -1.15 -2.61 -2.5 -1.77 -2.67 ...
## $ x.2 : num -0.904 -0.092 0.632 1.769 3.155 ...
## $ x.3 : num -1.988 -0.54 -0.593 -1.142 -0.514 ...
## $ x.4 : num 0.739 0.484 0.304 -0.739 0.133 ...
## $ x.5 : num -0.06 0.389 0.496 -0.086 -0.964 ...
## $ x.6 : num 1.206 1.741 0.824 0.12 0.234 ...
## $ x.7 : num 0.864 0.198 -0.162 -0.23 -0.071 -0.496 -1.11 -0.322 -0.751 -0.484 ...
## $ x.8 : num 1.196 0.257 0.181 0.217 1.192 ...
## $ x.9 : num -0.3 -0.375 -0.363 -0.009 0.254 0.638 0.42 0.653 0.309 -0.246 ...
## $ x.10: num -0.467 -0.604 -0.764 -0.279 -0.471 ...
# Set the variable y to be a factor variable in both the training and test set. Then set the seed to 33833.
vowel.train$y = as.factor(vowel.train$y)
vowel.test$y = as.factor(vowel.test$y)
set.seed(33833)
# Fit a random forest predictor relating the factor variable y to the remaining variables. Read about variable importance in random forests here: http://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm#ooberr By default, the caret package uses the Gini importance. Calculate the variable importance using the varImp function in the caret package. What is the order of variable importance?
vowel.rfmodel <- train(y ~ ., data=vowel.train, method="rf")
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
varImp(vowel.rfmodel)
## rf variable importance
##
## Overall
## x.1 100.000
## x.2 99.895
## x.5 45.292
## x.6 30.265
## x.8 25.512
## x.4 10.568
## x.3 10.133
## x.9 9.524
## x.7 6.514
## x.10 0.000
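# For a visual check of this ranking, caret provides a plot method for
# varImp objects (a minimal sketch):
plot(varImp(vowel.rfmodel))
# Per this run, x.1 and x.2 dominate, followed by x.5, x.6, and x.8;
# x.10 contributes least.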