# Make sure caret is available, installing it from CRAN on first use, and
# then attach it. requireNamespace() only checks whether the package can be
# loaded, so library() below is the single place where caret is attached and
# it stops with an error if the installation did not succeed.
if (!requireNamespace("caret", quietly = TRUE)) {
  # Fetch over HTTPS rather than plain HTTP.
  install.packages("caret", repos = "https://cran.rstudio.com/")
}
library("caret")
# Load the built-in iris data and work on a copy named myfirstdataset.
data("iris")
myfirstdataset <- iris
# Assign the five column labels explicitly, then print them to confirm.
names(myfirstdataset) <- c(
  "Sepal.Length", "Sepal.Width", "Petal.length", "Petal.Width", "Species"
)
colnames(myfirstdataset)
## [1] "Sepal.Length" "Sepal.Width" "Petal.length" "Petal.Width"
## [5] "Species"
# Create a validation dataset
# Split the loaded dataset in two: 80% is used to train the models and 20%
# is held back as a validation dataset.
# Seed the RNG first: createDataPartition() picks rows at random, so without
# a seed every run yields a different split. The outputs recorded in this
# file come from an earlier, unseeded run and may differ slightly.
set.seed(7)
validation_index <- createDataPartition(
  myfirstdataset$Species,
  p = 0.80,
  list = FALSE
)
# validation_index holds the row indices of the 80% training partition
# (an index matrix rather than a list, because list = FALSE).
# validation keeps the remaining 20% of the rows for the final test.
validation <- myfirstdataset[-validation_index, ]
# myfirstdataset is narrowed down to the 80% training rows.
myfirstdataset <- myfirstdataset[validation_index, ]
# Dimensions of the training data (rows, columns)
c(nrow(myfirstdataset), ncol(myfirstdataset))
## [1] 120 5
# Class of each attribute
vapply(myfirstdataset, class, character(1))
## Sepal.Length Sepal.Width Petal.length Petal.Width Species
## "numeric" "numeric" "numeric" "numeric" "factor"
# Peek at the first six rows of the data
head(myfirstdataset, n = 6L)
## Sepal.Length Sepal.Width Petal.length Petal.Width Species
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
# Levels of the class variable
levels(myfirstdataset[["Species"]])
## [1] "setosa" "versicolor" "virginica"
# Number of instances (rows) that belong to each class, as a frequency and
# as a percentage of all rows
percentage <- 100 * prop.table(table(myfirstdataset[["Species"]]))
cbind(freq = table(myfirstdataset[["Species"]]), percentage = percentage)
## freq percentage
## setosa 40 33.33333
## versicolor 40 33.33333
## virginica 40 33.33333
# Summarize the attribute distributions
summary(myfirstdataset)
## Sepal.Length Sepal.Width Petal.length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.00 Min. :0.10
## 1st Qu.:5.100 1st Qu.:2.775 1st Qu.:1.50 1st Qu.:0.30
## Median :5.800 Median :3.000 Median :4.40 Median :1.30
## Mean :5.853 Mean :3.048 Mean :3.76 Mean :1.19
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.10 3rd Qu.:1.80
## Max. :7.900 Max. :4.400 Max. :6.90 Max. :2.50
## Species
## setosa :40
## versicolor:40
## virginica :40
##
##
##
# We can see that all of the numerical values have the same scale (centimeters) and similar ranges [0,8] centimeters
# We start with some univariate plots, that is, plots of each individual variable.
# Split input (the four numeric attributes) and output (the class column)
x <- myfirstdataset[, 1:4]
y <- myfirstdataset[, 5]
# The input variables are numeric, so draw one box-and-whisker plot per
# attribute, side by side.
# par() returns the previous settings; keep them so the 1x4 layout can be
# undone afterwards and does not squeeze the bar plot below into a quarter
# of the page.
old_par <- par(mfrow = c(1, 4))
for (i in seq_along(x)) {
  boxplot(x[, i], main = names(x)[i])
}
par(old_par)
# Bar plot of the Species class variable, giving a graphical view of the
# class distribution
plot(y)
### Multivariate plots
# Ellipse plot to show the interaction between the variables
featurePlot(x = x, y = y, plot = "ellipse")
# Box-and-whisker plots of the iris data by class value
featurePlot(x = x, y = y, plot = "box")
# Density plots of each attribute by class value; both axes use a "free"
# relation
scales <- list(x = list(relation = "free"))
scales$y <- scales$x
featurePlot(x = x, y = y, plot = "density", scales = scales)
# Estimate accuracy with 10-fold cross-validation
metric <- "Accuracy"
control <- trainControl(method = "cv", number = 10)
# a) Linear algorithms
# LDA; the seed is reset before every fit in this script
set.seed(7)
fit.lda <- train(
  Species ~ .,
  data = myfirstdataset,
  method = "lda",
  metric = metric,
  trControl = control
)
# b) Nonlinear algorithms
# CART
# The model is selected with `method` (singular). A misspelled `methods`
# does not match that parameter, so train() would quietly fit its default
# model instead of rpart. The cart rows recorded in the resampling summary
# further down are identical to the rf rows, which suggests they were
# produced that way and will change once rpart is actually fitted.
set.seed(7)
fit.cart <- train(
  Species ~ .,
  data = myfirstdataset,
  method = "rpart",
  metric = metric,
  trControl = control
)
# c) Advanced algorithms
# SVM with a radial kernel
set.seed(7)
fit.svm <- train(
  Species ~ .,
  data = myfirstdataset,
  method = "svmRadial",
  metric = metric,
  trControl = control
)
# k-nearest neighbours
set.seed(7)
fit.knn <- train(
  Species ~ .,
  data = myfirstdataset,
  method = "knn",
  metric = metric,
  trControl = control
)
# Random forest
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = myfirstdataset,
  method = "rf",
  metric = metric,
  trControl = control
)
# Collect the resampling results of all five models and summarize accuracy
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
# The extra parentheses are kept on purpose: the printed "Call:" line echoes
# the argument expression exactly as it is written here.
summary((results))
##
## Call:
## summary.resamples(object = (results))
##
## Models: lda, cart, knn, svm, rf
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.9167 1.0000 1.0000 0.9833 1 1 0
## cart 0.8333 0.9167 0.9167 0.9417 1 1 0
## knn 0.8333 0.9375 1.0000 0.9667 1 1 0
## svm 0.8333 0.9167 1.0000 0.9583 1 1 0
## rf 0.8333 0.9167 0.9167 0.9417 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.875 1.0000 1.000 0.9750 1 1 0
## cart 0.750 0.8750 0.875 0.9125 1 1 0
## knn 0.750 0.9062 1.000 0.9500 1 1 0
## svm 0.750 0.8750 1.000 0.9375 1 1 0
## rf 0.750 0.8750 0.875 0.9125 1 1 0
# The summary above shows the accuracy of each classifier, along with other metrics such as Kappa.
# Compare the accuracy of the models
dotplot(results)
# Summarize the result of the LDA model
print(fit.lda)
## Linear Discriminant Analysis
##
## 120 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9833333 0.975
##
##
# Estimate the skill of LDA on the held-out validation set: predict its
# classes with the fitted model and summarize the results in a confusion
# matrix against the true species.
predictions <- predict(fit.lda, newdata = validation)
confusionMatrix(data = predictions, reference = validation$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# We can see that the accuracy on the validation set is 96.67% (29 of 30 rows classified correctly), which falls within our expected margin of 97% +/- 4%. Therefore we conclude that LDA is the best-fitting model.