library(AppliedPredictiveModeling)
data(twoClassData)
str(predictors)
## 'data.frame': 208 obs. of 2 variables:
## $ PredictorA: num 0.158 0.655 0.706 0.199 0.395 ...
## $ PredictorB: num 0.1609 0.4918 0.6333 0.0881 0.4152 ...
str(classes)
## Factor w/ 2 levels "Class1","Class2": 2 2 2 2 2 2 2 2 2 2 ...
# random
random <- sample(classes)
# stratified random splits
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(1)
trainingRows <- createDataPartition(classes,
p = .80,
list = FALSE)
head(trainingRows)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
## [6,] 6
trainPredictors <- predictors[trainingRows,]
trainClasses <- classes[trainingRows]
testPredictors <- predictors[-trainingRows,]
testClasses <- classes[-trainingRows]
str(trainPredictors)
## 'data.frame': 167 obs. of 2 variables:
## $ PredictorA: num 0.158 0.655 0.706 0.199 0.395 ...
## $ PredictorB: num 0.1609 0.4918 0.6333 0.0881 0.4152 ...
str(testPredictors)
## 'data.frame': 41 obs. of 2 variables:
## $ PredictorA: num 0.0658 0.1056 0.2909 0.4129 0.0472 ...
## $ PredictorB: num 0.1786 0.0801 0.3021 0.2869 0.0414 ...
# maxDissim in caret sequentially sample data using maximum dissimilarity
Techniques:
Size difference between training set and resampling subsets get smaller -> bias decreases -> performance improved
k=10 is recommended from the perspective of computational efficiency.
cvSplits <- createFolds(trainClasses, k=10,
returnTrain = TRUE)
str(cvSplits)
## List of 10
## $ Fold01: int [1:151] 1 2 3 4 5 6 7 8 9 11 ...
## $ Fold02: int [1:150] 1 2 3 4 5 6 7 9 10 11 ...
## $ Fold03: int [1:151] 1 3 4 5 6 7 8 9 10 11 ...
## $ Fold04: int [1:151] 2 3 4 5 6 7 8 9 10 11 ...
## $ Fold05: int [1:150] 1 2 4 5 7 8 9 10 11 12 ...
## $ Fold06: int [1:150] 1 2 3 4 6 8 9 10 12 13 ...
## $ Fold07: int [1:150] 1 2 3 4 5 6 7 8 10 11 ...
## $ Fold08: int [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold09: int [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold10: int [1:150] 1 2 3 5 6 7 8 9 10 11 ...
The ith training set outcome:
\[ GCV=\frac{1}{n}\sum^n_{i=1}(\frac{y_i-y_i^{'}}{1-df/n})^2 \]
repeatedcv <- createMultiFolds(classes)
To get stable performance estimation, larger amount of repetitions (50~200) is suggested.
For large amount of repetitions, higher proportions are prefered to reduce uncertainty.
set.seed(1)
repeatedSplits <- createDataPartition(trainClasses, p=.80,
times = 3)
str(repeatedSplits)
## List of 3
## $ Resample1: int [1:135] 1 2 4 5 6 8 9 10 11 12 ...
## $ Resample2: int [1:135] 2 3 4 6 7 8 9 11 14 15 ...
## $ Resample3: int [1:135] 4 5 6 7 8 9 11 13 14 15 ...
fold1 <- cvSplits[[1]]
cvPredictors <- trainPredictors[fold1,]
cvClasses1 <- trainClasses[fold1]
nrow(trainPredictors)
## [1] 167
nrow(cvPredictors)
## [1] 151
Sampling the data with replacement and predict the out-of-bag samples. Prefer large sample size.
Uncertainty < k-fold cv. bias \(\approx\) 2-fold cv.
bootstrapping <- createResample(trainClasses)
trainPredictors <- as.matrix(trainPredictors)
knnFit <- knn3(x = trainPredictors, y=trainClasses,k=5)
knnFit
## 5-nearest neighbor classification model
## Training set class distribution:
##
## Class1 Class2
## 89 78
testPredictors <- predict(knnFit, newdata = testPredictors,
type = "class")
head(testPredictors)
## [1] Class2 Class2 Class1 Class1 Class2 Class2
## Levels: Class1 Class2
str(testPredictors)
## Factor w/ 2 levels "Class1","Class2": 2 2 1 1 2 2 2 2 2 2 ...
Visualization: Performance profile
data(GermanCredit)
inTrain <- createDataPartition(GermanCredit$Class, p=.80, list = F)
GermanCredit <- GermanCredit[,-nearZeroVar(GermanCredit)]
GermanCreditTrain <- GermanCredit[inTrain,]
GermanCreditTest <- GermanCredit[-inTrain,]
svmFit <- train(Class ~.,
data = GermanCreditTrain,
method = "svmRadial",
preProc = c("center","scale"),
tuneLength = 10,
trControl = trainControl(method = "repeatedcv",
repeats = 5))
## Loading required package: kernlab
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 800 samples
## 48 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (48), scaled (48)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Accuracy SD Kappa SD
## 0.25 0.70000 0.0000000 0.00000000 0.00000000
## 0.50 0.72975 0.1703871 0.02170800 0.07664483
## 1.00 0.75600 0.3287802 0.03679092 0.11227421
## 2.00 0.76025 0.3817824 0.04386392 0.11617371
## 4.00 0.75825 0.3902908 0.03949700 0.10139945
## 8.00 0.74225 0.3658296 0.03632426 0.09490392
## 16.00 0.72950 0.3441937 0.03760699 0.09308175
## 32.00 0.71725 0.3221384 0.03862183 0.09396451
## 64.00 0.71700 0.3220448 0.04038716 0.09989072
## 128.00 0.71600 0.3192356 0.03928896 0.09810420
##
## Tuning parameter 'sigma' was held constant at a value of 0.01213641
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01213641 and C = 2.
plot(svmFit, scales = list(x=list(log=2)))
predictedClasses <- predict(svmFit, GermanCreditTest)
str(predictedClasses)
## Factor w/ 2 levels "Bad","Good": 2 1 2 2 1 2 2 2 2 1 ...
logisticReg <- train(Class~.,
data = GermanCreditTrain,
method = "glm",
trControl = trainControl(method = "repeatedcv",
repeats = 5))
logisticReg
## Generalized Linear Model
##
## 800 samples
## 48 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.75175 0.3686202 0.04366866 0.1153117
##
##
resamp <- resamples(list(SVM = svmFit, Logistic = logisticReg))
summary(resamp)
##
## Call:
## summary.resamples(object = resamp)
##
## Models: SVM, Logistic
## Number of resamples: 50
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.6500 0.7375 0.7562 0.7602 0.7969 0.85 0
## Logistic 0.6625 0.7250 0.7500 0.7518 0.7875 0.85 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.12500 0.3139 0.3708 0.3818 0.4504 0.6429 0
## Logistic 0.09722 0.2907 0.3750 0.3686 0.4504 0.6250 0
modelDifferences <- diff(resamp)
summary(modelDifferences)
##
## Call:
## summary.diff.resamples(object = modelDifferences)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## SVM Logistic
## SVM 0.0085
## Logistic 0.3119
##
## Kappa
## SVM Logistic
## SVM 0.01316
## Logistic 0.5279