Classification: Support Vector Machine

Data Ingestion

# Load the built-in iris data set into the workspace and preview its first rows.
data("iris")
head(iris, n = 6L)

Data Exploration

# Pull out the three measurements used as the axes of the 3-D scatter plot.
sep.l <- iris[["Sepal.Length"]]
sep.w <- iris[["Sepal.Width"]]
pet.l <- iris[["Petal.Length"]]

# Interactive 3-D scatter of the three measurements, one color per species.
plot_ly(
  x = sep.l,
  y = sep.w,
  z = pet.l,
  color = iris$Species,
  type = "scatter3d",
  mode = "markers"
)

# Pairwise plot matrix over every column of iris, colored by species.
ggpairs(
  iris,
  mapping = ggplot2::aes(color = Species, alpha = 0.04)
)

The histograms of Petal.Length and Petal.Width clearly separate out the Setosa species with very high confidence.

However, the Versicolor and Virginica species overlap. If we look at the scatterplots of Sepal.Length vs Petal.Length and Petal.Width vs Petal.Length, we can distinctly see a separator that can be drawn between the groups of species.

It looks like we can use just Petal.Width and Petal.Length as parameters and come up with a good model. SVM seems to be a very good model for this type of data.

Create Support Vector Machine Model

First we randomly split into a training and testing set.

# Fix the RNG state so the same train/test split is drawn on every run.
# set.seed() has to be *called*: assigning to the name `set.seed` only creates
# a variable and leaves the generator unseeded. This affects only which rows
# are sampled, not the model itself. (Console output pasted elsewhere in this
# document came from an earlier run and may not match the seeded split.)
set.seed(10)

# Row positions of the full data set; seq_len() is also safe for zero rows.
index <- seq_len(nrow(iris))

# Randomly hold out a quarter of the rows (rounded down to a whole number of
# rows) as test data.
test.index <- sample(index, size = floor(length(index) / 4))

# Test set = the sampled rows; training set = every remaining row, in original
# order. setdiff() is used instead of a negative index because `iris[-x, ]`
# would return zero rows if `x` were empty.
train <- iris[setdiff(index, test.index), ]
test <- iris[test.index, ]

Linear Support Vector Machine Model

# Fit a linear-kernel SVM on the training set, predicting Species from all of
# the other columns.
svm.model.lin <- svm(
  Species ~ .,
  data = train,
  kernel = "linear"
)

Prediction Training Data Set

# Confusion matrix on the training set: rows are the linear model's predicted
# classes, columns are the true species.
table(
  Prediction = predict(svm.model.lin, newdata = train),
  Truth = train$Species
)
##             Truth
## Prediction   setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         41         0
##   virginica       0          0        37
# Draw the linear model's classification regions in the Petal.Length /
# Petal.Width plane. The two sepal measurements are not on the axes, so they
# are held at constant values through `slice`.
plot(
  svm.model.lin,
  data = iris,
  formula = Petal.Width ~ Petal.Length,
  slice = list(Sepal.Width = 3, Sepal.Length = 4)
)

Radial Support Vector Machine Model

# Fit an SVM with a radial-basis kernel on the training set, predicting Species
# from all of the other columns. The kernel name is spelled out in full rather
# than abbreviated, so the call does not rely on partial matching of the
# string. cost and gamma are hand-picked values.
svm.model.rad <- svm(Species ~ .,
                     data = train,
                     kernel = "radial",
                     cost = 15,
                     gamma = 0.15)


# Draw the radial model's classification regions in the Petal.Length /
# Petal.Width plane. The two sepal measurements are not on the axes, so they
# are held at constant values through `slice`.
plot(
  svm.model.rad,
  data = iris,
  formula = Petal.Width ~ Petal.Length,
  slice = list(Sepal.Width = 3, Sepal.Length = 4)
)

# Confusion matrix on the training set: rows are the radial model's predicted
# classes, columns are the true species.
table(
  Prediction = predict(svm.model.rad, newdata = train),
  Truth = train$Species
)
##             Truth
## Prediction   setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         40         0
##   virginica       0          1        37

Performance with Test Data

Linear Kernel

# Confusion matrix on the held-out test set for the linear-kernel model:
# rows are predicted classes, columns are the true species.
table(
  Prediction = predict(svm.model.lin, newdata = test),
  Truth = test$Species
)
##             Truth
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0          7         1
##   virginica       0          2        12

Radial Kernel

# Confusion matrix on the held-out test set for the radial-kernel model:
# rows are predicted classes, columns are the true species.
table(
  Prediction = predict(svm.model.rad, newdata = test),
  Truth = test$Species
)
##             Truth
## Prediction   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0          7         3
##   virginica       0          2        10

Clustering

Iris plot colored according to species

# Scatter of the two petal measurements, with points colored by true species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point()

# Seed the RNG so the random k-means starts are the same on every run.
set.seed(20)

# Cluster on the two petal measurements only (columns 3 and 4 of iris).
petal.dims <- iris[, c("Petal.Length", "Petal.Width")]

# Three clusters; 20 random starts, of which kmeans() keeps the best.
irisCluster <- kmeans(x = petal.dims, centers = 3, nstart = 20)

# Print the fitted clustering summary.
irisCluster
## K-means clustering with 3 clusters of sizes 52, 48, 50
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.269231    1.342308
## 2     5.595833    2.037500
## 3     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167  2.02200
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Iris plot colored according to discovered clusters 1,2,3

# Store the cluster assignments as a factor so they are treated as discrete
# groups rather than as a numeric scale when used for coloring.
irisCluster[["cluster"]] <- as.factor(irisCluster[["cluster"]])

# Same petal scatter as before, now colored by the discovered cluster.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = irisCluster$cluster)
) +
  geom_point()

Performance

# Cross-tabulate cluster assignments (rows) against true species (columns).
table(
  irisCluster[["cluster"]],
  iris[["Species"]]
)
##    
##     setosa versicolor virginica
##   1      0         48         4
##   2      0          2        46
##   3     50          0         0
# Translate each k-means cluster id into a species name.
# Cluster ids are arbitrary labels whose numbering can change between runs or
# R versions, so the id -> species mapping is derived from the data instead of
# being hard-coded: each cluster is named after the species that contributes
# most of its points.
cluster.counts <- table(irisCluster$cluster, iris$Species)
cluster.species <- colnames(cluster.counts)[apply(cluster.counts, 1L, which.max)]
names(cluster.species) <- rownames(cluster.counts)

# Look up the species name for every observation's cluster id. The factor
# levels are pinned to those of iris$Species so the two factors can be compared
# element-wise with == / != even if some species is never the majority.
cluster.name <- factor(
  unname(cluster.species[as.character(irisCluster$cluster)]),
  levels = levels(iris$Species)
)


# Rows whose cluster-derived species name disagrees with the true species.
mismatched <- iris[iris$Species != cluster.name, ]

# Species-colored scatter of the petal measurements, with a large translucent
# red marker laid over every point that was placed in the wrong group.
ggplot() +
  geom_point(data = iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point(
    data = mismatched,
    aes(Petal.Length, Petal.Width),
    color = "red",
    alpha = 0.2,
    size = 4
  )

# List the mismatched observations.
mismatched