Expérimentation de 4 modèles de classification sur le jeu de données iris
Comparaison et choix du modèle le plus efficace
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Load the built-in iris data and split it into a training set and a
# hold-out validation set.
data(iris)
dataset <- iris
# Fix the RNG seed so the random split below is reproducible across runs
# (same seed value as the one used before each model fit).
set.seed(7)
# createDataPartition() returns the row indices of a 70% sample drawn from
# Species; despite its name, validation_index holds the rows that are KEPT
# for training, not the validation rows.
validation_index <- createDataPartition(dataset$Species, p = 0.70, list = FALSE)
# The remaining 30% of the rows form the validation (hold-out) set.
validation <- dataset[-validation_index, ]
# The sampled 70% is used to train and cross-validate the models.
dataset <- dataset[validation_index, ]
# Dimensions of the training set (rows, columns).
dim(dataset)
## [1] 105 5
# Class (type) of each column of the training set.
sapply(dataset, class)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## "numeric" "numeric" "numeric" "numeric" "factor"
# Preview the first rows of the training set (head() shows 6 rows by default).
head(dataset)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2 4.9 3.0 1.4 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
# Levels of the class factor, i.e. the labels the models must predict.
levels(dataset$Species)
## [1] "setosa" "versicolor" "virginica"
# Class distribution: number of instances per class and their share in percent.
species_counts <- table(dataset$Species)
percentage <- 100 * prop.table(species_counts)
cbind(freq = species_counts, percentage = percentage)
## freq percentage
## setosa 35 33.33333
## versicolor 35 33.33333
## virginica 35 33.33333
# Statistical summary of every column of the training set.
summary(dataset)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.400 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.300 Median :1.300
## Mean :5.869 Mean :3.068 Mean :3.782 Mean :1.198
## 3rd Qu.:6.400 3rd Qu.:3.400 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :35
## versicolor:35
## virginica :35
##
##
##
# Separate the predictors (first four columns) from the class factor
# (fifth column).
x <- dataset[seq_len(4)]
y <- dataset[[5]]
# Scatterplot matrix of all attribute pairs, using caret's "ellipse" plot type.
featurePlot(x = x, y = y, plot = "ellipse")
# Estimate accuracy with 10-fold cross-validation.
control <- trainControl(method = "cv", number = 10)
metric <- "Accuracy"
# The seed is reset to the same value before every train() call so that each
# model is resampled with the same random fold assignment, which keeps the
# comparison made by resamples() below like-for-like.

# Neural network (caret method "nnet"). trace = FALSE is forwarded to the
# underlying fitting function to silence its per-iteration optimisation log,
# which is otherwise printed for every fold and tuning combination.
set.seed(7)
fit.nnet <- train(Species ~ ., data = dataset, method = "nnet",
                  metric = metric, trControl = control, trace = FALSE)
# CART (classification tree, caret method "rpart")
set.seed(7)
fit.cart <- train(Species ~ ., data = dataset, method = "rpart",
                  metric = metric, trControl = control)
# k-nearest neighbours
set.seed(7)
fit.knn <- train(Species ~ ., data = dataset, method = "knn",
                 metric = metric, trControl = control)
# Random forest
set.seed(7)
fit.rf <- train(Species ~ ., data = dataset, method = "rf",
                metric = metric, trControl = control)
# Collect the cross-validation results of the four models and summarise their
# accuracy and kappa distributions.
results <- resamples(list(nnet = fit.nnet, cart = fit.cart, knn = fit.knn,
                          rf = fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: nnet, cart, knn, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## nnet 0.9000000 0.9318182 1 0.9718182 1 1 0
## cart 0.8181818 0.9022727 1 0.9460606 1 1 0
## knn 0.9000000 0.9375000 1 0.9725758 1 1 0
## rf 0.9000000 0.9109848 1 0.9634848 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## nnet 0.8461538 0.8981481 1 0.9571111 1 1 0
## cart 0.7250000 0.8506648 1 0.9185351 1 1 0
## knn 0.8461538 0.9062500 1 0.9585351 1 1 0
## rf 0.8461538 0.8668981 1 0.9449549 1 1 0
# Graphical comparison of the models' resampling results.
dotplot(results)
Le modèle le plus précis est le kNN (exactitude moyenne de 0,9726), suivi de très près par le réseau de neurones (0,9718).
Affichons le résumé statistique de chaque modèle.
# Cross-validation results per tuning-parameter combination for the neural
# network, and the combination that was finally selected.
print(fit.nnet)
## Neural Network
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 96, 93, 95, 95, 93, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.7165152 0.5870150
## 1 1e-04 0.8312121 0.7520846
## 1 1e-01 0.9707071 0.9558290
## 3 0e+00 0.8360606 0.7624869
## 3 1e-04 0.9432828 0.9143642
## 3 1e-01 0.9718182 0.9571111
## 5 0e+00 0.9010606 0.8529372
## 5 1e-04 0.9607071 0.9404444
## 5 1e-01 0.9718182 0.9571111
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.1.
# Cross-validation results per complexity parameter (cp) for the CART model,
# and the value that was finally selected.
print(fit.cart)
## CART
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 96, 93, 95, 95, 93, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0000000 0.9460606 0.9185351
## 0.4571429 0.7115152 0.5769875
## 0.5000000 0.3951515 0.1285714
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Cross-validation results per number of neighbours (k) for the kNN model,
# and the value that was finally selected.
print(fit.knn)
## k-Nearest Neighbors
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 96, 93, 95, 95, 93, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9534848 0.9287457
## 7 0.9725758 0.9585351
## 9 0.9725758 0.9585351
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
# Cross-validation results per mtry value for the random forest, and the
# value that was finally selected.
print(fit.rf)
## Random Forest
##
## 105 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 94, 96, 93, 95, 95, 93, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9634848 0.9449549
## 3 0.9634848 0.9449549
## 4 0.9551515 0.9324549
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.