Iris

library(datasets)
library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(skimr)

## 
## Attaching package: 'skimr'

## The following object is masked from 'package:stats':
## 
##     filter

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(knitr)

## 
## Attaching package: 'knitr'

## The following object is masked from 'package:skimr':
## 
##     kable

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(ggpubr)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

theme_set(theme_light())

df <- as_tibble(clean_names(datasets::iris))

Summary

df %>% 
group_by(species) %>% 
skim() %>%  
  skimr::kable()

## Skim summary statistics  
##  n obs: 150    
##  n variables: 5    
## 
## Variable type: numeric
## 
##   species        variable      missing    complete    n     mean     sd     p0     p25     p50     p75     p100      hist   
## ------------  --------------  ---------  ----------  ----  ------  ------  -----  ------  ------  ------  ------  ----------
##    setosa      petal_length       0          50       50    1.46    0.17     1     1.4     1.5     1.58    1.9     <U+2581><U+2581><U+2585><U+2587><U+2587><U+2585><U+2582><U+2581> 
##    setosa      petal_width        0          50       50    0.25    0.11    0.1    0.2     0.2     0.3     0.6     <U+2582><U+2587><U+2581><U+2582><U+2582><U+2581><U+2581><U+2581> 
##    setosa      sepal_length       0          50       50    5.01    0.35    4.3    4.8      5      5.2     5.8     <U+2582><U+2583><U+2585><U+2587><U+2587><U+2583><U+2581><U+2582> 
##    setosa      sepal_width        0          50       50    3.43    0.38    2.3    3.2     3.4     3.68    4.4     <U+2581><U+2581><U+2583><U+2585><U+2587><U+2583><U+2582><U+2581> 
##  versicolor    petal_length       0          50       50    4.26    0.47     3      4      4.35    4.6     5.1     <U+2581><U+2583><U+2582><U+2586><U+2586><U+2587><U+2587><U+2583> 
##  versicolor    petal_width        0          50       50    1.33    0.2      1     1.2     1.3     1.5     1.8     <U+2586><U+2583><U+2587><U+2585><U+2586><U+2582><U+2581><U+2581> 
##  versicolor    sepal_length       0          50       50    5.94    0.52    4.9    5.6     5.9     6.3      7      <U+2583><U+2582><U+2587><U+2587><U+2587><U+2583><U+2585><U+2582> 
##  versicolor    sepal_width        0          50       50    2.77    0.31     2     2.52    2.8      3      3.4     <U+2581><U+2582><U+2583><U+2585><U+2583><U+2587><U+2583><U+2581> 
##  virginica     petal_length       0          50       50    5.55    0.55    4.5    5.1     5.55    5.88    6.9     <U+2582><U+2587><U+2583><U+2587><U+2585><U+2582><U+2581><U+2582> 
##  virginica     petal_width        0          50       50    2.03    0.27    1.4    1.8      2      2.3     2.5     <U+2582><U+2581><U+2587><U+2583><U+2583><U+2586><U+2585><U+2583> 
##  virginica     sepal_length       0          50       50    6.59    0.64    4.9    6.23    6.5     6.9     7.9     <U+2581><U+2581><U+2583><U+2587><U+2585><U+2583><U+2582><U+2583> 
##  virginica     sepal_width        0          50       50    2.97    0.32    2.2    2.8      3      3.18    3.8     <U+2581><U+2583><U+2587><U+2587><U+2585><U+2583><U+2581><U+2582>

Visualise

df %>% 
  gather(key, value, -species) %>% 
  group_by(species, key) %>% 
  summarise(mean = mean(value), sd = sd(value)) %>%   ggplot(aes(key,mean)) + 
  geom_point(aes(colour = species)) +
  geom_errorbar(aes(x = key, ymax = mean + sd, ymin = mean - sd, colour = species)) +
  labs(title = "Iris data set dimensions", x = "Iris Feature", "cm", caption = "Error bars based on SD")

Data Split

#do any 80/20 split of the dataset
validation_index <- createDataPartition(df$species, p=0.80, list=F)

#create a training data set using the 80%
dataset <- df[validation_index,]

#and a validation data set based on the other 20%
validation <- df[-validation_index,]

Summary of data split

skim(dataset)

## Skim summary statistics
##  n obs: 120 
##  n variables: 5 
## 
## -- Variable type:factor ----------------------------------------------------------------------------------------------
##  variable missing complete   n n_unique                       top_counts
##   species       0      120 120        3 set: 40, ver: 40, vir: 40, NA: 0
##  ordered
##    FALSE
## 
## -- Variable type:numeric ---------------------------------------------------------------------------------------------
##      variable missing complete   n mean   sd  p0 p25  p50 p75 p100
##  petal_length       0      120 120 3.76 1.77 1   1.5 4.4  5.1  6.7
##   petal_width       0      120 120 1.2  0.77 0.1 0.3 1.35 1.8  2.5
##  sepal_length       0      120 120 5.86 0.81 4.4 5.1 5.8  6.4  7.9
##   sepal_width       0      120 120 3.08 0.44 2   2.8 3    3.4  4.4
##      hist
##  <U+2587><U+2581><U+2581><U+2581><U+2585><U+2585><U+2583><U+2582>
##  <U+2587><U+2581><U+2581><U+2585><U+2585><U+2583><U+2582><U+2582>
##  <U+2585><U+2587><U+2587><U+2587><U+2587><U+2586><U+2582><U+2581>
##  <U+2581><U+2582><U+2585><U+2587><U+2585><U+2582><U+2581><U+2581>

skim(validation)

## Skim summary statistics
##  n obs: 30 
##  n variables: 5 
## 
## -- Variable type:factor ----------------------------------------------------------------------------------------------
##  variable missing complete  n n_unique                       top_counts
##   species       0       30 30        3 set: 10, ver: 10, vir: 10, NA: 0
##  ordered
##    FALSE
## 
## -- Variable type:numeric ---------------------------------------------------------------------------------------------
##      variable missing complete  n mean   sd  p0  p25  p50  p75 p100
##  petal_length       0       30 30 3.75 1.78 1.1 1.7  4.15 4.97  6.9
##   petal_width       0       30 30 1.18 0.76 0.1 0.4  1.25 1.95  2.3
##  sepal_length       0       30 30 5.76 0.91 4.3 5.03 5.7  6.27  7.7
##   sepal_width       0       30 30 2.97 0.41 2.2 2.8  3    3.1   3.8
##      hist
##  <U+2587><U+2581><U+2581><U+2582><U+2587><U+2582><U+2583><U+2582>
##  <U+2587><U+2583><U+2581><U+2586><U+2583><U+2583><U+2585><U+2586>
##  <U+2583><U+2587><U+2582><U+2587><U+2586><U+2583><U+2581><U+2583>
##  <U+2582><U+2582><U+2585><U+2587><U+2582><U+2582><U+2581><U+2583>

10 fold cross validation. These values are plugged into our models below for metric and trControl arguements

#split our dataset into 10 parts, train in 9 and test on 1

control <- trainControl(method="cv", number=10)

#We are using the metric of “Accuracy” to evaluate models. This is a ratio of the number of correctly predicted instances in divided by the total number of instances in the dataset multiplied by 100 to give a percentage (e.g. 95% accurate). We will be using the metric variable when we run build and evaluate each model next.

metric <- "Accuracy"

KNN

#reset the random number seed before reach run to ensure that the evaluation of each algorithm is performed using exactly the same data splits

set.seed(7)
fit.knn <- train(species~., data=dataset, method="knn", metric=metric, trControl=control)

Linear model

set.seed(7)
fit.lda <- train(species~., data=dataset, method="lda", metric=metric, trControl=control)

Non-linear

set.seed(7)
fit.cart <- train(species~., data=dataset, method="rpart", metric=metric, trControl=control)

Summary of results

results <- resamples(list(lda=fit.lda, rpart=fit.cart, knn=fit.knn))
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: lda, rpart, knn 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## lda   0.9166667 0.9166667 1.0000000 0.9666667 1.0000000    1    0
## rpart 0.8333333 0.9166667 0.9166667 0.9083333 0.9166667    1    0
## knn   0.9166667 0.9375000 1.0000000 0.9750000 1.0000000    1    0
## 
## Kappa 
##        Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## lda   0.875 0.87500  1.000 0.9500   1.000    1    0
## rpart 0.750 0.87500  0.875 0.8625   0.875    1    0
## knn   0.875 0.90625  1.000 0.9625   1.000    1    0

Summary of fits

print(fit.lda)

## Linear Discriminant Analysis 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results:
## 
##   Accuracy   Kappa
##   0.9666667  0.95

print(fit.cart)

## CART 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   cp      Accuracy   Kappa 
##   0.0000  0.9083333  0.8625
##   0.4375  0.8250000  0.7375
##   0.5000  0.3333333  0.0000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.

print(fit.knn)

## k-Nearest Neighbors 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa 
##   5  0.9666667  0.9500
##   7  0.9666667  0.9500
##   9  0.9750000  0.9625
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Test the lda model on our validation data set

ldapredictions <- predict(fit.lda, validation)

confusionMatrix(ldapredictions, validation$species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         0
##   virginica       0          0        10
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8843, 1)
##     No Information Rate : 0.3333     
##     P-Value [Acc > NIR] : 4.857e-15  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           1.0000
## Specificity                 1.0000            1.0000           1.0000
## Pos Pred Value              1.0000            1.0000           1.0000
## Neg Pred Value              1.0000            1.0000           1.0000
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3333
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            1.0000           1.0000

And on nonlinar

cartpredictions <- predict(fit.cart, validation)

confusionMatrix(cartpredictions, validation$species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

And KNN

knnpredictions <- predict(fit.knn, validation)

confusionMatrix(knnpredictions, validation$species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

traintest <- dataset 

traintest <- traintest %>% 
mutate(guess = predict(fit.lda,traintest)) %>% 
  mutate(accuracy = ifelse(species == guess, "correct", "incorrect"))

Visualise the accuracies

p1 <- ggplot(traintest) +
  geom_point(aes(petal_length, petal_width, shape = accuracy, colour = species), size =3) +
  scale_shape_manual(values = c(1, 4)) +
  theme(legend.position="none")

p2 <- ggplot(traintest) +
  geom_point(aes(petal_length, sepal_width, shape = accuracy, colour = species), size =3) +
  scale_shape_manual(values = c(1, 4)) +
  theme(legend.position="none")

p3 <- ggplot(traintest) +
  geom_point(aes(petal_length, sepal_length, shape = accuracy, colour = species), size =3) +
  scale_shape_manual(values = c(1, 4)) +
  theme(legend.position="none")

p4 <- ggplot(traintest) +
  geom_point(aes(sepal_length, petal_width, shape = accuracy, colour = species), size =3) +
  scale_shape_manual(values = c(1, 4)) +
  theme(legend.position="none")

ggarrange(p1, p2, p3, p4, common.legend = T, legend="bottom")

If I go with the default settings (irgnoring metric and trcontrol), apparently random forrest is used, bootstrapped 25 times:

set.seed(7)
fit.default <- train(species~., data=dataset)

print(fit.default)

## Random Forest 
## 
## 120 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 120, 120, 120, 120, 120, 120, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9459571  0.9181640
##   3     0.9487254  0.9223111
##   4     0.9505344  0.9250561
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.

Test the default model on our validation data set

### compare our default fit to the validation data set. This creates a table of just species names that are guess for the species of iris in the 30 validation observations
defaultpredictions <- predict(fit.default, validation)

### summary of our guess versus the actual species of the validation observations
confusionMatrix(defaultpredictions, validation$species)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         10         1
##   virginica       0          0         9
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9667          
##                  95% CI : (0.8278, 0.9992)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : 2.963e-13       
##                                           
##                   Kappa : 0.95            
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9000
## Specificity                 1.0000            0.9500           1.0000
## Pos Pred Value              1.0000            0.9091           1.0000
## Neg Pred Value              1.0000            1.0000           0.9524
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3333           0.3000
## Detection Prevalence        0.3333            0.3667           0.3000
## Balanced Accuracy           1.0000            0.9750           0.9500

Check which observations were correct or not in the validation data:

cbind(validation, defaultpredictions) %>% 
  mutate(prediction = ifelse(species == defaultpredictions, "right", "wrong"))

##    sepal_length sepal_width petal_length petal_width    species
## 1           4.6         3.1          1.5         0.2     setosa
## 2           5.0         3.6          1.4         0.2     setosa
## 3           4.9         3.1          1.5         0.1     setosa
## 4           4.3         3.0          1.1         0.1     setosa
## 5           5.7         3.8          1.7         0.3     setosa
## 6           5.1         3.3          1.7         0.5     setosa
## 7           5.0         3.4          1.6         0.4     setosa
## 8           4.4         3.0          1.3         0.2     setosa
## 9           5.1         3.8          1.9         0.4     setosa
## 10          5.3         3.7          1.5         0.2     setosa
## 11          6.3         3.3          4.7         1.6 versicolor
## 12          5.8         2.7          4.1         1.0 versicolor
## 13          6.2         2.2          4.5         1.5 versicolor
## 14          6.1         2.8          4.0         1.3 versicolor
## 15          6.1         2.8          4.7         1.2 versicolor
## 16          5.5         2.4          3.8         1.1 versicolor
## 17          5.0         2.3          3.3         1.0 versicolor
## 18          5.7         3.0          4.2         1.2 versicolor
## 19          5.7         2.9          4.2         1.3 versicolor
## 20          5.7         2.8          4.1         1.3 versicolor
## 21          7.6         3.0          6.6         2.1  virginica
## 22          4.9         2.5          4.5         1.7  virginica
## 23          6.8         3.0          5.5         2.1  virginica
## 24          5.7         2.5          5.0         2.0  virginica
## 25          6.5         3.0          5.5         1.8  virginica
## 26          7.7         2.6          6.9         2.3  virginica
## 27          5.6         2.8          4.9         2.0  virginica
## 28          6.4         2.8          5.6         2.1  virginica
## 29          7.7         3.0          6.1         2.3  virginica
## 30          6.5         3.0          5.2         2.0  virginica
##    defaultpredictions prediction
## 1              setosa      right
## 2              setosa      right
## 3              setosa      right
## 4              setosa      right
## 5              setosa      right
## 6              setosa      right
## 7              setosa      right
## 8              setosa      right
## 9              setosa      right
## 10             setosa      right
## 11         versicolor      right
## 12         versicolor      right
## 13         versicolor      right
## 14         versicolor      right
## 15         versicolor      right
## 16         versicolor      right
## 17         versicolor      right
## 18         versicolor      right
## 19         versicolor      right
## 20         versicolor      right
## 21          virginica      right
## 22         versicolor      wrong
## 23          virginica      right
## 24          virginica      right
## 25          virginica      right
## 26          virginica      right
## 27          virginica      right
## 28          virginica      right
## 29          virginica      right
## 30          virginica      right

I just made up a new object called “result”. What would our model (in this case, random forrest from last attempt) predict. Versicolour apparently!

X sepal_length sepal_width petal_length petal_width species
1 1 8 3.5 3 0.2 madeup

result <- as_tibble(read.csv("result.csv"))


result$sepal_length <- as.numeric(result$sepal_length)
result$sepal_width <- as.numeric(result$sepal_width)
result$petal_length <- as.numeric(result$petal_length)
result$petal_width <- as.numeric(result$petal_width)

resultpredictions <- predict(fit.default,result)

cbind(result, resultpredictions)

##   X sepal_length sepal_width petal_length petal_width species
## 1 1            8         3.5            3         0.2  madeup
##   resultpredictions
## 1            setosa

Use all the other models to check. Here knn and cart predict versicolour, lda predicts setosa!

resultknn <- predict(fit.knn,result)
resultcart <- predict(fit.cart,result)
resultlda <- predict(fit.lda,result)


cbind(result, resultknn, resultcart, resultlda)

##   X sepal_length sepal_width petal_length petal_width species  resultknn
## 1 1            8         3.5            3         0.2  madeup versicolor
##   resultcart resultlda
## 1 versicolor    setosa