library(datasets)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
##
## Attaching package: 'knitr'
## The following object is masked from 'package:skimr':
##
## kable
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
theme_set(theme_light())
df <- as_tibble(clean_names(datasets::iris))
Summary
df %>%
group_by(species) %>%
skim() %>%
skimr::kable()
## Skim summary statistics
## n obs: 150
## n variables: 5
##
## Variable type: numeric
##
## species variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## ------------ -------------- --------- ---------- ---- ------ ------ ----- ------ ------ ------ ------ ----------
## setosa petal_length 0 50 50 1.46 0.17 1 1.4 1.5 1.58 1.9 <U+2581><U+2581><U+2585><U+2587><U+2587><U+2585><U+2582><U+2581>
## setosa petal_width 0 50 50 0.25 0.11 0.1 0.2 0.2 0.3 0.6 <U+2582><U+2587><U+2581><U+2582><U+2582><U+2581><U+2581><U+2581>
## setosa sepal_length 0 50 50 5.01 0.35 4.3 4.8 5 5.2 5.8 <U+2582><U+2583><U+2585><U+2587><U+2587><U+2583><U+2581><U+2582>
## setosa sepal_width 0 50 50 3.43 0.38 2.3 3.2 3.4 3.68 4.4 <U+2581><U+2581><U+2583><U+2585><U+2587><U+2583><U+2582><U+2581>
## versicolor petal_length 0 50 50 4.26 0.47 3 4 4.35 4.6 5.1 <U+2581><U+2583><U+2582><U+2586><U+2586><U+2587><U+2587><U+2583>
## versicolor petal_width 0 50 50 1.33 0.2 1 1.2 1.3 1.5 1.8 <U+2586><U+2583><U+2587><U+2585><U+2586><U+2582><U+2581><U+2581>
## versicolor sepal_length 0 50 50 5.94 0.52 4.9 5.6 5.9 6.3 7 <U+2583><U+2582><U+2587><U+2587><U+2587><U+2583><U+2585><U+2582>
## versicolor sepal_width 0 50 50 2.77 0.31 2 2.52 2.8 3 3.4 <U+2581><U+2582><U+2583><U+2585><U+2583><U+2587><U+2583><U+2581>
## virginica petal_length 0 50 50 5.55 0.55 4.5 5.1 5.55 5.88 6.9 <U+2582><U+2587><U+2583><U+2587><U+2585><U+2582><U+2581><U+2582>
## virginica petal_width 0 50 50 2.03 0.27 1.4 1.8 2 2.3 2.5 <U+2582><U+2581><U+2587><U+2583><U+2583><U+2586><U+2585><U+2583>
## virginica sepal_length 0 50 50 6.59 0.64 4.9 6.23 6.5 6.9 7.9 <U+2581><U+2581><U+2583><U+2587><U+2585><U+2583><U+2582><U+2583>
## virginica sepal_width 0 50 50 2.97 0.32 2.2 2.8 3 3.18 3.8 <U+2581><U+2583><U+2587><U+2587><U+2585><U+2583><U+2581><U+2582>
Visualise
df %>%
gather(key, value, -species) %>%
group_by(species, key) %>%
summarise(mean = mean(value), sd = sd(value)) %>% ggplot(aes(key,mean)) +
geom_point(aes(colour = species)) +
geom_errorbar(aes(x = key, ymax = mean + sd, ymin = mean - sd, colour = species)) +
labs(title = "Iris data set dimensions", x = "Iris Feature", "cm", caption = "Error bars based on SD")
Data Split
#do any 80/20 split of the dataset
validation_index <- createDataPartition(df$species, p=0.80, list=F)
#create a training data set using the 80%
dataset <- df[validation_index,]
#and a validation data set based on the other 20%
validation <- df[-validation_index,]
Summary of data split
skim(dataset)
## Skim summary statistics
## n obs: 120
## n variables: 5
##
## -- Variable type:factor ----------------------------------------------------------------------------------------------
## variable missing complete n n_unique top_counts
## species 0 120 120 3 set: 40, ver: 40, vir: 40, NA: 0
## ordered
## FALSE
##
## -- Variable type:numeric ---------------------------------------------------------------------------------------------
## variable missing complete n mean sd p0 p25 p50 p75 p100
## petal_length 0 120 120 3.76 1.77 1 1.5 4.4 5.1 6.7
## petal_width 0 120 120 1.2 0.77 0.1 0.3 1.35 1.8 2.5
## sepal_length 0 120 120 5.86 0.81 4.4 5.1 5.8 6.4 7.9
## sepal_width 0 120 120 3.08 0.44 2 2.8 3 3.4 4.4
## hist
## <U+2587><U+2581><U+2581><U+2581><U+2585><U+2585><U+2583><U+2582>
## <U+2587><U+2581><U+2581><U+2585><U+2585><U+2583><U+2582><U+2582>
## <U+2585><U+2587><U+2587><U+2587><U+2587><U+2586><U+2582><U+2581>
## <U+2581><U+2582><U+2585><U+2587><U+2585><U+2582><U+2581><U+2581>
skim(validation)
## Skim summary statistics
## n obs: 30
## n variables: 5
##
## -- Variable type:factor ----------------------------------------------------------------------------------------------
## variable missing complete n n_unique top_counts
## species 0 30 30 3 set: 10, ver: 10, vir: 10, NA: 0
## ordered
## FALSE
##
## -- Variable type:numeric ---------------------------------------------------------------------------------------------
## variable missing complete n mean sd p0 p25 p50 p75 p100
## petal_length 0 30 30 3.75 1.78 1.1 1.7 4.15 4.97 6.9
## petal_width 0 30 30 1.18 0.76 0.1 0.4 1.25 1.95 2.3
## sepal_length 0 30 30 5.76 0.91 4.3 5.03 5.7 6.27 7.7
## sepal_width 0 30 30 2.97 0.41 2.2 2.8 3 3.1 3.8
## hist
## <U+2587><U+2581><U+2581><U+2582><U+2587><U+2582><U+2583><U+2582>
## <U+2587><U+2583><U+2581><U+2586><U+2583><U+2583><U+2585><U+2586>
## <U+2583><U+2587><U+2582><U+2587><U+2586><U+2583><U+2581><U+2583>
## <U+2582><U+2582><U+2585><U+2587><U+2582><U+2582><U+2581><U+2583>
10 fold cross validation. These values are plugged into our models below for metric and trControl arguements
#split our dataset into 10 parts, train in 9 and test on 1
control <- trainControl(method="cv", number=10)
#We are using the metric of “Accuracy” to evaluate models. This is a ratio of the number of correctly predicted instances in divided by the total number of instances in the dataset multiplied by 100 to give a percentage (e.g. 95% accurate). We will be using the metric variable when we run build and evaluate each model next.
metric <- "Accuracy"
KNN
#reset the random number seed before reach run to ensure that the evaluation of each algorithm is performed using exactly the same data splits
set.seed(7)
fit.knn <- train(species~., data=dataset, method="knn", metric=metric, trControl=control)
Linear model
set.seed(7)
fit.lda <- train(species~., data=dataset, method="lda", metric=metric, trControl=control)
Non-linear
set.seed(7)
fit.cart <- train(species~., data=dataset, method="rpart", metric=metric, trControl=control)
Summary of results
results <- resamples(list(lda=fit.lda, rpart=fit.cart, knn=fit.knn))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rpart, knn
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.9166667 0.9166667 1.0000000 0.9666667 1.0000000 1 0
## rpart 0.8333333 0.9166667 0.9166667 0.9083333 0.9166667 1 0
## knn 0.9166667 0.9375000 1.0000000 0.9750000 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.875 0.87500 1.000 0.9500 1.000 1 0
## rpart 0.750 0.87500 0.875 0.8625 0.875 1 0
## knn 0.875 0.90625 1.000 0.9625 1.000 1 0
Summary of fits
print(fit.lda)
## Linear Discriminant Analysis
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9666667 0.95
print(fit.cart)
## CART
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0000 0.9083333 0.8625
## 0.4375 0.8250000 0.7375
## 0.5000 0.3333333 0.0000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
print(fit.knn)
## k-Nearest Neighbors
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9666667 0.9500
## 7 0.9666667 0.9500
## 9 0.9750000 0.9625
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
Test the lda model on our validation data set
ldapredictions <- predict(fit.lda, validation)
confusionMatrix(ldapredictions, validation$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 0
## virginica 0 0 10
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8843, 1)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 4.857e-15
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3333
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 1.0000 1.0000
And on nonlinar
cartpredictions <- predict(fit.cart, validation)
confusionMatrix(cartpredictions, validation$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
And KNN
knnpredictions <- predict(fit.knn, validation)
confusionMatrix(knnpredictions, validation$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
traintest <- dataset
traintest <- traintest %>%
mutate(guess = predict(fit.lda,traintest)) %>%
mutate(accuracy = ifelse(species == guess, "correct", "incorrect"))
Visualise the accuracies
p1 <- ggplot(traintest) +
geom_point(aes(petal_length, petal_width, shape = accuracy, colour = species), size =3) +
scale_shape_manual(values = c(1, 4)) +
theme(legend.position="none")
p2 <- ggplot(traintest) +
geom_point(aes(petal_length, sepal_width, shape = accuracy, colour = species), size =3) +
scale_shape_manual(values = c(1, 4)) +
theme(legend.position="none")
p3 <- ggplot(traintest) +
geom_point(aes(petal_length, sepal_length, shape = accuracy, colour = species), size =3) +
scale_shape_manual(values = c(1, 4)) +
theme(legend.position="none")
p4 <- ggplot(traintest) +
geom_point(aes(sepal_length, petal_width, shape = accuracy, colour = species), size =3) +
scale_shape_manual(values = c(1, 4)) +
theme(legend.position="none")
ggarrange(p1, p2, p3, p4, common.legend = T, legend="bottom")
If I go with the default settings (irgnoring metric and trcontrol), apparently random forrest is used, bootstrapped 25 times:
set.seed(7)
fit.default <- train(species~., data=dataset)
print(fit.default)
## Random Forest
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 120, 120, 120, 120, 120, 120, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9459571 0.9181640
## 3 0.9487254 0.9223111
## 4 0.9505344 0.9250561
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
Test the default model on our validation data set
### compare our default fit to the validation data set. This creates a table of just species names that are guess for the species of iris in the 30 validation observations
defaultpredictions <- predict(fit.default, validation)
### summary of our guess versus the actual species of the validation observations
confusionMatrix(defaultpredictions, validation$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
Check which observations were correct or not in the validation data:
cbind(validation, defaultpredictions) %>%
mutate(prediction = ifelse(species == defaultpredictions, "right", "wrong"))
## sepal_length sepal_width petal_length petal_width species
## 1 4.6 3.1 1.5 0.2 setosa
## 2 5.0 3.6 1.4 0.2 setosa
## 3 4.9 3.1 1.5 0.1 setosa
## 4 4.3 3.0 1.1 0.1 setosa
## 5 5.7 3.8 1.7 0.3 setosa
## 6 5.1 3.3 1.7 0.5 setosa
## 7 5.0 3.4 1.6 0.4 setosa
## 8 4.4 3.0 1.3 0.2 setosa
## 9 5.1 3.8 1.9 0.4 setosa
## 10 5.3 3.7 1.5 0.2 setosa
## 11 6.3 3.3 4.7 1.6 versicolor
## 12 5.8 2.7 4.1 1.0 versicolor
## 13 6.2 2.2 4.5 1.5 versicolor
## 14 6.1 2.8 4.0 1.3 versicolor
## 15 6.1 2.8 4.7 1.2 versicolor
## 16 5.5 2.4 3.8 1.1 versicolor
## 17 5.0 2.3 3.3 1.0 versicolor
## 18 5.7 3.0 4.2 1.2 versicolor
## 19 5.7 2.9 4.2 1.3 versicolor
## 20 5.7 2.8 4.1 1.3 versicolor
## 21 7.6 3.0 6.6 2.1 virginica
## 22 4.9 2.5 4.5 1.7 virginica
## 23 6.8 3.0 5.5 2.1 virginica
## 24 5.7 2.5 5.0 2.0 virginica
## 25 6.5 3.0 5.5 1.8 virginica
## 26 7.7 2.6 6.9 2.3 virginica
## 27 5.6 2.8 4.9 2.0 virginica
## 28 6.4 2.8 5.6 2.1 virginica
## 29 7.7 3.0 6.1 2.3 virginica
## 30 6.5 3.0 5.2 2.0 virginica
## defaultpredictions prediction
## 1 setosa right
## 2 setosa right
## 3 setosa right
## 4 setosa right
## 5 setosa right
## 6 setosa right
## 7 setosa right
## 8 setosa right
## 9 setosa right
## 10 setosa right
## 11 versicolor right
## 12 versicolor right
## 13 versicolor right
## 14 versicolor right
## 15 versicolor right
## 16 versicolor right
## 17 versicolor right
## 18 versicolor right
## 19 versicolor right
## 20 versicolor right
## 21 virginica right
## 22 versicolor wrong
## 23 virginica right
## 24 virginica right
## 25 virginica right
## 26 virginica right
## 27 virginica right
## 28 virginica right
## 29 virginica right
## 30 virginica right
I just made up a new object called “result”. What would our model (in this case, random forrest from last attempt) predict. Versicolour apparently!
X sepal_length sepal_width petal_length petal_width species
1 1 8 3.5 3 0.2 madeup
result <- as_tibble(read.csv("result.csv"))
result$sepal_length <- as.numeric(result$sepal_length)
result$sepal_width <- as.numeric(result$sepal_width)
result$petal_length <- as.numeric(result$petal_length)
result$petal_width <- as.numeric(result$petal_width)
resultpredictions <- predict(fit.default,result)
cbind(result, resultpredictions)
## X sepal_length sepal_width petal_length petal_width species
## 1 1 8 3.5 3 0.2 madeup
## resultpredictions
## 1 setosa
Use all the other models to check. Here knn and cart predict versicolour, lda predicts setosa!
resultknn <- predict(fit.knn,result)
resultcart <- predict(fit.cart,result)
resultlda <- predict(fit.lda,result)
cbind(result, resultknn, resultcart, resultlda)
## X sepal_length sepal_width petal_length petal_width species resultknn
## 1 1 8 3.5 3 0.2 madeup versicolor
## resultcart resultlda
## 1 versicolor setosa