# Load the UCI iris measurements from the project directory.
# NOTE(review): setwd() with a hard-coded, user-specific path makes the script
# non-portable; it is kept so the working directory does not change.
setwd("~/Data_Science/R/Projects/IRIS")

# iris.data has no header row, so column names are assigned afterwards.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# leaves text columns as character by default, in which case summary() no
# longer reports per-class counts and randomForest() does not receive the
# factor response it needs to fit a classifier.
iris <- read.csv("iris.data", sep = ",", header = FALSE,
                 stringsAsFactors = TRUE)
colnames(iris) <- c("sepal_length", "sepal_width", "petal_length",
                    "petal_width", "class")
summary(iris)
## sepal_length sepal_width petal_length petal_width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.054 Mean :3.759 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## class
## Iris-setosa :50
## Iris-versicolor:50
## Iris-virginica :50
##
##
##
Let's explore the data.
# Scatter plot of the sepal measurements, one colour per iris class.
library(ggplot2)
ggplot(data = iris,
       mapping = aes(x = sepal_length, y = sepal_width, colour = class)) +
  geom_point()
There is no clear separation between Iris-versicolor and Iris-virginica based on sepal length and width. Iris-setosa, however, does appear to have a lower sepal length and a higher sepal width than the other two classes.
# Scatter plot of the petal measurements, one colour per iris class.
ggplot(data = iris,
       mapping = aes(x = petal_length, y = petal_width, colour = class)) +
  geom_point()
There is very clear separation among the three classes based on petal length and width. Iris-setosa has the smallest petal length and width. Iris-virginica has the largest petal length and width. Iris-versicolor’s petal length and width fall in between those of Iris-setosa and Iris-virginica; Iris-versicolor is much closer to Iris-virginica than to Iris-setosa.
# Let's look at the correlation matrix of the four numeric measurements
# (the `class` label column is excluded).
cor(iris[, names(iris) != "class"])
## sepal_length sepal_width petal_length petal_width
## sepal_length 1.0000000 -0.1093692 0.8717542 0.8179536
## sepal_width -0.1093692 1.0000000 -0.4205161 -0.3565441
## petal_length 0.8717542 -0.4205161 1.0000000 0.9627571
## petal_width 0.8179536 -0.3565441 0.9627571 1.0000000
The features in this data set appear highly correlated: sepal_length has a correlation of 0.87 with petal length and 0.82 with petal width. Let’s build a model with just sepal length and width and see how it performs on the test data.
# Hold out 30% of the rows for testing; the seed makes the split reproducible.
set.seed(1)
# seq_len() is safe for an empty data frame, and floor() makes the truncation
# of the fractional sample size explicit instead of leaving it to sample().
train <- sample(seq_len(nrow(iris)), size = floor(0.7 * nrow(iris)))
# `test` holds negative indices, so iris[test, ] drops the training rows.
test <- -train

# Linear discriminant analysis on the two sepal measurements only.
library(MASS)
lda.fit <- lda(class ~ sepal_length + sepal_width, data = iris[train, ])
# predict() for lda objects has no `type` argument (it was silently ignored),
# so it is omitted; the predicted labels are in the `class` component.
lda.pred <- predict(lda.fit, newdata = iris[test, ])
# Overall accuracy on the held-out rows.
mean(lda.pred$class == iris$class[test])
## [1] 0.8
# Column-normalised confusion matrix: each column (true class) sums to 1,
# so the diagonal is the share of each class that was predicted correctly.
prop.table(
  table(Prediction = lda.pred$class, truth = iris$class[test]),
  margin = 2
)
## truth
## Prediction Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 1.0000000 0.0000000 0.0000000
## Iris-versicolor 0.0000000 0.7647059 0.3846154
## Iris-virginica 0.0000000 0.2352941 0.6153846
This LDA model correctly classifies 100%, 76.47% and 61.54% of the Iris-setosa, Iris-versicolor and Iris-virginica test observations respectively. Overall accuracy is 80%.
# Quadratic discriminant analysis on the two sepal measurements only.
qda.fit <- qda(class ~ sepal_length + sepal_width, data = iris[train, ])
# predict() for qda objects has no `type` argument (it was silently ignored),
# so it is omitted; the predicted labels are in the `class` component.
qda.pred <- predict(qda.fit, newdata = iris[test, ])
# Overall accuracy on the held-out rows.
mean(qda.pred$class == iris$class[test])
## [1] 0.7777778
# Column-normalised confusion matrix: each column (true class) sums to 1,
# so the diagonal is the share of each class that was predicted correctly.
prop.table(
  table(Prediction = qda.pred$class, Truth = iris$class[test]),
  margin = 2
)
## Truth
## Prediction Iris-setosa Iris-versicolor Iris-virginica
## Iris-setosa 1.0000000 0.0000000 0.0000000
## Iris-versicolor 0.0000000 0.6470588 0.3076923
## Iris-virginica 0.0000000 0.3529412 0.6923077
This QDA model correctly classifies 100%, 64.71% and 69.23% of the Iris-setosa, Iris-versicolor and Iris-virginica test observations respectively. Overall accuracy is 77.78%.
# Let's try KNN as well
library(class)
# knn() breaks voting ties at random, so fix the seed for reproducibility.
set.seed(1)
x <- data.frame(iris$sepal_length, iris$sepal_width)
y <- iris$class

# Test-set accuracy for k = 1, ..., max_k. The result vector is preallocated
# rather than grown from c() inside the loop.
max_k <- 10
accuracy <- numeric(max_k)
for (i in seq_len(max_k)) {
  knn.fit <- knn(x[train, ], x[test, ], y[train], k = i)
  accuracy[i] <- mean(knn.fit == y[test])
}
plot(accuracy, lty = 2, type = "b")
KNN performs similarly to LDA, reaching 80% accuracy at k = 9.
# Let's try random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest on the two sepal predictors; the seed is fixed so the
# randomised fit is reproducible.
set.seed(1)
rf.fit <- randomForest(
  class ~ sepal_length + sepal_width,
  data = iris[train, ],
  ntree = 500
)
rf.pred <- predict(rf.fit, newdata = iris[test, ], type = "response")
# Overall accuracy on the held-out rows.
mean(rf.pred == iris$class[test])
## [1] 0.7333333
Random forest has the lowest accuracy, 73.33%, among all of the models considered.