# Load the iris measurements from "iris.data". The file has no header row, so
# the column names are assigned explicitly afterwards.
#
# The path is built with file.path() rather than by calling setwd(), so the
# script no longer changes the working directory of the R session.
data_dir <- "~/Data_Science/R/Projects/IRIS"
iris <- read.csv(
  file.path(data_dir, "iris.data"),
  sep = ",",
  header = FALSE,
  # Since R 4.0 strings are no longer converted to factors by default; request
  # it explicitly so that `class` is a factor on every R version.
  stringsAsFactors = TRUE
)
colnames(iris) <- c(
  "sepal_length", "sepal_width", "petal_length", "petal_width", "class"
)
summary(iris)
##   sepal_length    sepal_width     petal_length    petal_width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##              class   
##  Iris-setosa    :50  
##  Iris-versicolor:50  
##  Iris-virginica :50  
##                      
##                      
## 


Let's explore the data.

# Scatter plot of sepal width against sepal length, one colour per class.
library(ggplot2)
ggplot(
  data = iris,
  mapping = aes(x = sepal_length, y = sepal_width, colour = class)
) +
  geom_point()


There is no clear separation between Iris-versicolor and Iris-virginica based on sepal length and width. Iris-setosa does appear to have lower sepal length and higher sepal width.

# Scatter plot of petal width against petal length, one colour per class.
ggplot(
  data = iris,
  mapping = aes(x = petal_length, y = petal_width, colour = class)
) +
  geom_point()


There is very clear separation amongst the three classes based on petal length and width. Iris-setosa has the smallest petal length and width. Iris-virginica has the largest petal length and width. Iris-versicolor’s petal length and width are in between those of Iris-setosa and Iris-virginica, and are much closer to Iris-virginica than to Iris-setosa.

# Correlation matrix of the four numeric measurements; the categorical
# `class` column is left out because cor() needs numeric input.
cor(iris[, names(iris) != "class"])
##              sepal_length sepal_width petal_length petal_width
## sepal_length    1.0000000  -0.1093692    0.8717542   0.8179536
## sepal_width    -0.1093692   1.0000000   -0.4205161  -0.3565441
## petal_length    0.8717542  -0.4205161    1.0000000   0.9627571
## petal_width     0.8179536  -0.3565441    0.9627571   1.0000000


This data appears highly correlated. sepal_length has a correlation of 0.87 and 0.82 with petal length and petal width respectively. Let’s build a model with just sepal length and width and see how it performs on test data.

# Split the rows 70/30 into training and test sets. The seed makes the split
# reproducible; floor() makes the truncation of the sample size explicit.
set.seed(1)
train <- sample(seq_len(nrow(iris)), floor(nrow(iris) * 0.7))
test <- -train  # negative indices select every row that is not in `train`
library(MASS)

# Linear discriminant analysis using only the two sepal measurements.
lda.fit <- lda(class ~ sepal_length + sepal_width, data = iris[train, ])
# predict() for lda fits has no `type` argument (it was silently ignored), so
# none is passed; the predicted labels are in the `class` component.
lda.pred <- predict(lda.fit, newdata = iris[test, ])
# Overall accuracy on the held-out rows.
mean(lda.pred$class == iris$class[test])
## [1] 0.8
# Confusion table as column proportions: each true-class column sums to 1.
prop.table(table(Prediction = lda.pred$class, truth = iris$class[test]), 2)
##                  truth
## Prediction        Iris-setosa Iris-versicolor Iris-virginica
##   Iris-setosa       1.0000000       0.0000000      0.0000000
##   Iris-versicolor   0.0000000       0.7647059      0.3846154
##   Iris-virginica    0.0000000       0.2352941      0.6153846


This LDA model correctly classifies 100%, 76.47% and 61.54% of the test observations in the Iris-setosa, Iris-versicolor and Iris-virginica classes respectively. Overall accuracy is 80%.

# Quadratic discriminant analysis on the same two sepal predictors and the
# same training rows as the LDA fit.
qda.fit <- qda(class ~ sepal_length + sepal_width, data = iris[train, ])
# predict() for qda fits has no `type` argument (it was silently ignored), so
# none is passed; the predicted labels are in the `class` component.
qda.pred <- predict(qda.fit, newdata = iris[test, ])
# Overall accuracy on the held-out rows.
mean(qda.pred$class == iris$class[test])
## [1] 0.7777778
# Confusion table as column proportions: each true-class column sums to 1.
prop.table(table(Prediction = qda.pred$class, Truth = iris$class[test]), 2)
##                  Truth
## Prediction        Iris-setosa Iris-versicolor Iris-virginica
##   Iris-setosa       1.0000000       0.0000000      0.0000000
##   Iris-versicolor   0.0000000       0.6470588      0.3076923
##   Iris-virginica    0.0000000       0.3529412      0.6923077


This QDA model correctly classifies 100%, 64.71% and 69.23% of the test observations in the Iris-setosa, Iris-versicolor and Iris-virginica classes respectively. Overall accuracy is 77.78%.

# Let's try k-nearest neighbours as well, for k = 1, ..., 10, using the two
# sepal measurements as predictors.
library(class)
set.seed(1)  # knn() breaks ties at random, so fix the seed
x <- data.frame(iris$sepal_length, iris$sepal_width)
y <- iris$class

# Preallocate the result vector instead of growing it inside the loop.
accuracy <- numeric(10)
for (i in seq_len(10)) {
  knn.fit <- knn(x[train, ], x[test, ], y[train], k = i)
  # Share of held-out rows whose predicted class matches the recorded class.
  accuracy[i] <- mean(knn.fit == y[test])
}
# Position along the x axis is the number of neighbours, so label it as such.
plot(accuracy, lty = 2, type = "b", xlab = "k")


KNN performs similarly to LDA, with 80% accuracy at k = 9.

# Let's try a random forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest on the two sepal predictors, fitted on the training rows.
# The seed is reset first so the fit can be repeated.
set.seed(1)
rf.fit <- randomForest(
  class ~ sepal_length + sepal_width,
  data = iris[train, ],
  ntree = 500
)
rf.pred <- predict(rf.fit, newdata = iris[test, ], type = "response")
# Share of held-out rows whose predicted class matches the recorded class.
mean(rf.pred == iris$class[test])
## [1] 0.7333333


Random forest has the lowest accuracy (73.33%) amongst all of the models considered.