Data Split Example

## Load the packages
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.1
## Loading required package: lattice
library(klaR)
## Warning: package 'klaR' was built under R version 4.2.2
## Loading required package: MASS
library(ggplot2)
library(lattice)
## Load Iris dataset
data(iris)

Define an 80%/20% train/test split of the dataset

## Fix the RNG seed so the random partition (and every result derived from it)
## can be reproduced from run to run
set.seed(7)
## Row indices of the 80% training portion, sampled within each Species level;
## list = FALSE returns the indices as a matrix rather than a list
trainIndex <- createDataPartition(iris$Species, p = 0.80, list = FALSE)
## Rows selected by the partition form the training set, the rest the test set
dataTrain <- iris[trainIndex, ]
dataTest <- iris[-trainIndex, ]

Train a Naive Bayes model

## Fit a Naive Bayes classifier: Species is the response, every other column
## of the training rows is a predictor
fit <- NaiveBayes(Species ~ ., data = dataTrain)

Make Predictions

## Predict on the held-out rows using every column except the response.
## Predictors are selected by name rather than by position (1:4) so the call
## does not depend on the column order of the data frame.
predictorCols <- setdiff(names(dataTest), "Species")
Predictions <- predict(fit, dataTest[, predictorCols])

Summarize results

## Cross-tabulate the predicted classes against the true species of the test
## rows and report the accompanying accuracy statistics
confusionMatrix(data = Predictions$class, reference = dataTest$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         0
##   virginica       0          0        10
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8843, 1)
##     No Information Rate : 0.3333     
##     P-Value [Acc > NIR] : 4.857e-15  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           1.0000
## Specificity                 1.0000            1.0000           1.0000
## Pos Pred Value              1.0000            1.0000           1.0000
## Neg Pred Value              1.0000            1.0000           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3333
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            1.0000           1.0000

Estimating Model Accuracy - Resampling methods

K-fold Cross Validation

library(caret)
data(iris)
## Fix the RNG seed so the fold assignment and the random forest are
## reproducible from run to run
set.seed(7)
## Define the training control: 10-fold cross-validation.
## NOTE: this variable shares its name with caret's trainControl() function.
## R still resolves the call on the right-hand side to the function, so this
## works, but take care when passing `trainControl` as an argument elsewhere.
trainControl <- trainControl(method = "cv", number = 10)
## Evaluate the model: random forest for Species on all other columns
fit2 <- train(
  Species ~ .,
  data = iris,
  method = "rf",
  trControl = trainControl
)
## Display the results
print(fit2)
## Random Forest 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa
##   2     0.96      0.94 
##   3     0.96      0.94 
##   4     0.96      0.94 
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
## Plot the fitted train object (presumably the resampled accuracy across the
## mtry values tried -- confirm against the rendered figure)
plot(fit2)

## Calculating Accuracy and Kappa metrics - using the Pima Indians dataset

library(caret)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.2.1
library(ggplot2)
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
## Prepare resampling method: 5-fold cross-validation
trainControl2 <- trainControl(method = "cv", number = 5)
set.seed(7)
## Fit a logistic regression (glm) for diabetes on all other columns, selecting
## on Accuracy. The 5-fold control defined above is passed here; the earlier
## version of this call passed `trainControl` (the 10-fold object from the
## k-fold section), which left trainControl2 unused.
## NOTE: the output recorded below was produced by that 10-fold run.
fit3 <- train(
  diabetes ~ .,
  data = PimaIndiansDiabetes,
  method = "glm",
  metric = "Accuracy",
  trControl = trainControl2
)
print(fit3)
## Generalized Linear Model 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 691, 691, 691, 691, 691, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7839884  0.5003949

Using RMSE and R-squared metrics for the longley dataset

## Load package
library(caret)
## Load data
data(longley)
str(longley)
## 'data.frame':    16 obs. of  7 variables:
##  $ GNP.deflator: num  83 88.5 88.2 89.5 96.2 ...
##  $ GNP         : num  234 259 258 285 329 ...
##  $ Unemployed  : num  236 232 368 335 210 ...
##  $ Armed.Forces: num  159 146 162 165 310 ...
##  $ Population  : num  108 109 110 111 112 ...
##  $ Year        : int  1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 ...
##  $ Employed    : num  60.3 61.1 60.2 61.2 63.2 ...
## Prepare resampling method: 5-fold cross-validation
trainControl3 <- trainControl(method = "cv", number = 5)
set.seed(7)
## Fit a linear regression for Employed on all other columns, selecting on
## RMSE. The 5-fold control defined above is passed here; the earlier version
## of this call passed `trainControl` (the 10-fold object from the k-fold
## section), which left trainControl3 unused. With only 16 rows, 10 folds
## leave 1-2 rows per hold-out set, which is the likely cause of the
## missing-values warning and the Rsquared of 1 in the recorded run.
## NOTE: the warning and output recorded below came from that 10-fold run.
fit4 <- train(
  Employed ~ .,
  data = longley,
  method = "lm",
  metric = "RMSE",
  trControl = trainControl3
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## Display the resampled performance summary (RMSE, Rsquared, MAE) for fit4
print(fit4)
## Linear Regression 
## 
## 16 samples
##  6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 14, 14, 14, 14, 14, 15, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.3357221  1         0.3140253
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE