Topics for today!

  1. Introduction to SVM
  2. SVM vs kNN - More on cross validation

Data for today

setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments")
data = iris # Built-in R dataset: 150 observations of three species of iris flower (50 each).
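
A quick look at the data confirms its shape: 150 rows, with 50 flowers of each species.

# Quick look at the data: 150 rows, 5 columns, 50 flowers per species.
dim(data)
table(data$Species)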

Last time we considered the following question: Suppose you come across an iris flower in the wild and you'd like to try to identify its species. You take measurements of its petals and sepals, and you have a labeled dataset of iris flowers and their species for reference. In that notebook we used the k nearest neighbors (kNN) algorithm to leverage the data and classify the new flower's species.

In this notebook we will continue with that question using an SVM, and compare it to kNN.

Topic 1: Support Vector Machine (SVM)

In SVM we try to learn a decision boundary that separates the space in which the data lies into a region for class 1 and a region for class 2. Among all boundaries that separate the classes, the SVM picks the one with the largest margin, i.e. the greatest distance to the nearest training points (the support vectors).
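
To make the decision rule concrete, here is a tiny sketch of how a linear classifier labels a point by the sign of w . x + b; the weight vector, intercept, and new point below are made up for illustration, not learned from the iris data.

# Toy sketch of a linear decision rule: classify by the sign of w . x + b.
# The weights and the new point are hypothetical, purely for illustration.
w = c(1.2, -0.8)      # hypothetical weight vector
b = -0.5              # hypothetical intercept
x_new = c(1.0, 0.3)   # hypothetical new observation with two features
decision_value = sum(w * x_new) + b
ifelse(decision_value > 0, "class 1", "class 2")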

# install.packages("e1071)
library(e1071)

# Let's try just binary classification.
new_iris = iris[iris$Species %in% c("setosa", "virginica"),]
new_iris$Species = as.factor(as.character(new_iris$Species)) # drops the now-unused versicolor level from the factor
new_iris = new_iris[sample(nrow(new_iris)),] # Shuffles the rows

training_set = new_iris[1:50,]
testing_set = new_iris[51:100,]
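
A quick sanity check that the shuffle left both species represented in each half of the split:

# Both species should appear in the training and testing sets after shuffling.
table(training_set$Species)
table(testing_set$Species)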

# Let's build the SVM model
model_svm = svm(Species ~ Petal.Length + Petal.Width, data = training_set, kernel = "linear")
predicted_labels_svm = predict(model_svm, testing_set)

plot(model_svm, data = training_set, Petal.Length ~ Petal.Width)

plot(model_svm, data = testing_set, Petal.Length ~ Petal.Width)

conf_mat_svm = table(predicted_labels_svm, testing_set$Species) 
# Note here the model does a perfect job!
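
Beyond eyeballing the confusion matrix, we can report a single accuracy number and peek at what the linear model learned. The weight recovery below is a sketch based on the components the fitted e1071 object exposes (coefs, SV, rho); since svm() scales inputs by default, w and b live in the scaled feature space.

# Test-set accuracy: correct predictions divided by the number of test rows.
acc_svm = sum(diag(conf_mat_svm)) / nrow(testing_set)
acc_svm

# Recover the linear decision boundary w . x + b = 0 from the fitted model.
# Note: svm() scales inputs by default, so w_svm and b_svm are in the scaled space.
w_svm = t(model_svm$coefs) %*% model_svm$SV
b_svm = -model_svm$rho
w_svm
b_svm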

Topic 2: SVM vs kNN

library(class)
predicted_labels_knn = knn(training_set[, c("Petal.Length", "Petal.Width")],
                           testing_set[, c("Petal.Length", "Petal.Width")],
                           training_set$Species, k = 5)

conf_mat_knn = table(predicted_labels_knn, testing_set$Species) 
# Also does a perfect job on this task (it's a pretty easy one).
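
For a direct comparison on the same held-out rows (acc_svm was computed in the accuracy check above):

# kNN test-set accuracy, computed the same way as for the SVM.
acc_knn_test = sum(diag(conf_mat_knn)) / nrow(testing_set)
c(SVM = acc_svm, kNN = acc_knn_test)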

Recall that last time we used cross validation to choose k.

library(class)
# Function from last time that performs cross validation
crossval_knn = function(data, labels, folds, k_neighbors){
  # SYNTAX
  ## data is the data frame of training data
  ## labels is the vector of class labels
  ## folds is the number of folds of cross validation to do
  ## k_neighbors is the input parameter for the knn
  results = rep(0, folds)
  for(i in 1:folds){
    ind = seq(i,length(data[,1]), folds)
    training_set = data[-ind, ]
    testing_set = data[ind,]
    training_labels = labels[!(1:length(data[,1]) %in% ind)]
    testing_labels = labels[ind]
    predicted_labels = knn(training_set, testing_set, training_labels, k_neighbors)
    results[i] = sum(diag(table(predicted_labels, testing_labels)))/length(testing_labels)
  }
  return(results)
}

# Let's check the accuracy for k = 1 through 15, with 5 folds, for the problem of predicting species from Sepal.Length and Sepal.Width
acc_knn = rep(0, 15)
for(k in 1:15){
  acc_knn[k] = mean(crossval_knn(new_iris[,c("Sepal.Length", "Sepal.Width")],new_iris$Species, 5, k))
}
print(acc_knn)
##  [1] 0.98 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99
print(which(acc_knn == max(acc_knn)))
##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15
# Note that for this problem nearly every choice of k achieves almost all of the attainable accuracy! If we go back to the full dataset, i.e.

new_iris = iris[sample(150),] # No longer removing versicolor
acc_knn = rep(0, 15)
for(k in 1:15){
  acc_knn[k] = mean(crossval_knn(new_iris[,c("Sepal.Length", "Sepal.Width")],new_iris$Species, 5, k))
}
print(acc_knn)
##  [1] 0.7000000 0.7133333 0.7266667 0.7666667 0.7800000 0.7866667 0.7866667
##  [8] 0.7666667 0.7533333 0.7533333 0.7466667 0.7533333 0.7733333 0.7466667
## [15] 0.7466667
print(which(acc_knn == max(acc_knn)))
## [1] 6 7
# The results become less trivial: accuracy now peaks around 79%, at k = 6 and 7.
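
Just as we cross-validated k for kNN, we can cross-validate the SVM's cost parameter. Below is a sketch using e1071's tune() helper, which runs 10-fold cross validation by default; the grid of cost values here is an arbitrary choice for illustration.

# Cross-validate the SVM cost parameter on the same 3-class Sepal problem.
tuned_svm = tune(svm, Species ~ Sepal.Length + Sepal.Width, data = new_iris,
                 kernel = "linear", ranges = list(cost = 2^(-2:4)))
summary(tuned_svm)
tuned_svm$best.parameters # the cost value with the lowest cross-validation error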