# From Machine Learning with R, Second Edition
# by Brett Lantz, Chapter 10: Evaluating Model Performance
# The holdout method
credit <- read.csv("~/machine_learning_withR/Machine Learning with R, Second Edition_Code/Chapter 10/credit.csv")
# Suppose we have a data frame named credit with 1000 rows of data;
# we can divide it into three partitions (50% train, 25% validation, 25% test).
# create a vector of randomly ordered row IDs from 1 to 1000
random_ids <- order(runif(1000))
credit_train <- credit[random_ids[1:500], ]
credit_validate <- credit[random_ids[501:750], ]
credit_test <- credit[random_ids[751:1000], ]
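# A minimal sketch of how the three partitions might be used (an illustrative
# addition, not from the original text; it assumes the C50 package and that
# the outcome column default is a factor): build candidate models on
# credit_train, compare them on credit_validate, and touch credit_test only
# once for the final performance estimate.
library(C50)
candidate <- C5.0(default ~ ., data = credit_train)
validate_pred <- predict(candidate, credit_validate)
mean(validate_pred == credit_validate$default) # validation accuracy, used for model selection
final_pred <- predict(candidate, credit_test)
mean(final_pred == credit_test$default)        # report this number only once, at the very end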
# stratified random sampling
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# select 75% of the rows for training, stratified by the default class
in_train <- createDataPartition(credit$default, p = 0.75, list = FALSE)
credit_train <- credit[in_train, ]
credit_test <- credit[-in_train, ]
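# A quick sanity check (an illustrative addition, not in the original script):
# stratified sampling should preserve the proportion of defaults in each split.
prop.table(table(credit$default))       # class balance in the full data
prop.table(table(credit_train$default)) # should be nearly identical
prop.table(table(credit_test$default))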
# Cross-validation
# The repeated holdout is the basis of a technique known as k-fold cross-validation
# (or k-fold CV), which has become the industry standard for estimating model
# performance. But rather than taking repeated random samples that could potentially
# use the same record more than once, k-fold CV randomly divides the data into k
# completely separate random partitions called folds.
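# For contrast, here is a minimal sketch of the repeated holdout itself (an
# illustrative addition, assuming the C50 package and a factor outcome named
# default): average the accuracy over several independent random 75/25 splits,
# in which the same record may land in the test set more than once.
library(C50)
set.seed(123)
holdout_acc <- replicate(10, {
  idx <- sample(nrow(credit), size = 0.75 * nrow(credit))
  m <- C5.0(default ~ ., data = credit[idx, ])
  p <- predict(m, credit[-idx, ])
  mean(p == credit$default[-idx])
})
mean(holdout_acc)
# k-fold CV, by contrast, guarantees each record is held out exactly once: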
folds <- createFolds(credit$default, k = 10)
str(folds)
## List of 10
## $ Fold01: int [1:100] 6 37 50 51 62 81 102 129 133 145 ...
## $ Fold02: int [1:100] 10 12 29 46 48 61 74 77 89 90 ...
## $ Fold03: int [1:100] 4 9 13 22 31 38 43 45 66 73 ...
## $ Fold04: int [1:100] 24 27 32 35 47 52 63 70 86 91 ...
## $ Fold05: int [1:100] 1 8 25 28 33 56 78 87 92 105 ...
## $ Fold06: int [1:100] 3 5 36 39 44 54 58 96 99 110 ...
## $ Fold07: int [1:100] 7 34 49 55 117 127 148 149 153 156 ...
## $ Fold08: int [1:100] 16 19 21 26 41 53 59 60 65 69 ...
## $ Fold09: int [1:100] 2 11 15 18 23 67 71 82 83 85 ...
## $ Fold10: int [1:100] 14 17 20 30 40 42 57 64 68 75 ...
# each fold serves once as the test set (10% of the data), with the
# remaining 90% used for training
credit01_test <- credit[folds$Fold01, ]
credit01_train <- credit[-folds$Fold01, ]
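# An illustrative check, not in the original: the ten folds should be mutually
# exclusive and should jointly cover every row exactly once.
length(unique(unlist(folds))) # should equal nrow(credit), i.e. 1000
sum(duplicated(unlist(folds))) # should be 0: no row appears in two folds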
# To demonstrate the process, we'll estimate the kappa statistic for a C5.0 decision tree
# model of the credit data using 10-fold CV. First, we need to load caret (for creating
# the folds), C50 (for the decision tree), and irr (for calculating kappa). The latter two
# packages were chosen for illustrative purposes; if you desire, you can use a different
# model or a different performance measure with the same series of steps.
library(C50)
library(irr)
## Loading required package: lpSolve
set.seed(123)
folds <- createFolds(credit$default, k = 10)
cv_results <- lapply(folds, function(x) {
  credit_test <- credit[x, ]    # the held-out fold (10%) is the test set
  credit_train <- credit[-x, ]  # the remaining 90% is the training set
  credit_model <- C5.0(default ~ ., data = credit_train)
  credit_pred <- predict(credit_model, credit_test)
  credit_actual <- credit_test$default
  kappa <- kappa2(data.frame(credit_actual, credit_pred))$value
  return(kappa)
})
str(cv_results)
## List of 10
## $ Fold01: num 0.283
## $ Fold02: num 0.108
## $ Fold03: num 0.326
## $ Fold04: num 0.162
## $ Fold05: num 0.243
## $ Fold06: num 0.257
## $ Fold07: num 0.0355
## $ Fold08: num 0.0761
## $ Fold09: num 0.241
## $ Fold10: num 0.253
# compute the average kappa statistic across all ten folds
mean(unlist(cv_results))
## [1] 0.1984929
# Unfortunately, this mean kappa statistic is fairly low; it falls in the
# "poor" range on the common interpretation scale, which suggests that the
# credit scoring model performs not much better than random chance.
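# As an alternative (an illustrative addition, not part of the original
# script), caret can automate the same 10-fold CV, handling fold creation,
# model fitting, and kappa aggregation via train() and trainControl(). The
# tuning grid below is a guess chosen to mimic a single default C5.0 tree.
library(caret)
set.seed(123)
ctrl <- trainControl(method = "cv", number = 10)
m <- train(default ~ ., data = credit, method = "C5.0",
           metric = "Kappa", trControl = ctrl,
           tuneGrid = data.frame(model = "tree", trials = 1, winnow = FALSE))
m$results # cross-validated accuracy and kappa, with their standard deviations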