setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments")
data = iris # Built-in R dataset: 150 observations of iris flowers from three species.
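A quick look at the data confirms its shape and class balance (a minimal sketch using base R functions):
str(iris) # 150 observations: four numeric measurements plus the Species factor
table(iris$Species) # 50 flowers of each of the three species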
Last time we considered the following question: suppose you come across an iris flower in the wild and you’d like to identify its species. You take measurements of its petals and sepals, and you have a labeled data set of iris flowers and their species for reference. There we used the k-nearest neighbors (kNN) algorithm to leverage the labeled data and classify the new observation.
In this notebook we will continue with that question using a support vector machine (SVM), and compare it to kNN.
With an SVM we try to learn a decision boundary that separates the space in which the data lie into a region for class 1 and a region for class 2.
# install.packages("e1071)
library(e1071)
# Let's try just binary classification.
new_iris = iris[iris$Species %in% c("setosa", "virginica"),]
new_iris$Species = as.factor(as.character(new_iris$Species)) # drop the unused versicolor factor level
new_iris = new_iris[sample(nrow(new_iris)),] # Shuffles the rows
training_set = new_iris[1:50,]
testing_set = new_iris[51:100,]
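The shuffle above matters: the rows of iris are ordered by species, so without it the training set would contain only one class. A quick sanity check (sketch):
table(training_set$Species) # both species should appear in roughly equal numbers
table(testing_set$Species)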
# Let's build the SVM model
model_svm = svm(Species ~ Petal.Length + Petal.Width, data = training_set, kernel = "linear")
predicted_labels_svm = predict(model_svm, testing_set)
plot(model_svm, data = training_set, Petal.Length ~ Petal.Width)
plot(model_svm, data = testing_set, Petal.Length ~ Petal.Width)
conf_mat_svm = table(predicted_labels_svm, testing_set$Species)
# Note here the model does a perfect job!
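We can verify this directly from the confusion matrix; off-diagonal entries are misclassifications (a minimal sketch):
print(conf_mat_svm) # off-diagonal counts are errors
sum(diag(conf_mat_svm)) / nrow(testing_set) # accuracy; equals 1 for a perfect classifier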
library(class)
predicted_labels_knn = knn(training_set[,c("Petal.Length", "Petal.Width")], testing_set[,c("Petal.Length", "Petal.Width")], training_set$Species, k = 5)
conf_mat_knn = table(predicted_labels_knn, testing_set$Species)
# kNN also does a perfect job on this task (it's pretty easy)
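The same check works for kNN, and we can also measure how often the two models agree on the test set (sketch; as.character guards against factor-level mismatches):
sum(diag(conf_mat_knn)) / nrow(testing_set) # kNN accuracy on the same split
mean(as.character(predicted_labels_svm) == as.character(predicted_labels_knn)) # SVM/kNN agreement rate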
# Function from last time that performs cross validation
crossval_knn = function(data, labels, folds, k_neighbors){
  # SYNTAX
  ## data is the dataframe of training data
  ## labels is the vector of class labels
  ## folds is the number of cross-validation folds to do
  ## k_neighbors is the parameter k passed to knn
  results = rep(0, folds)
  for(i in 1:folds){
    ind = seq(i, nrow(data), folds) # every folds-th row, starting from row i
    training_set = data[-ind, ]
    testing_set = data[ind, ]
    training_labels = labels[-ind]
    testing_labels = labels[ind]
    predicted_labels = knn(training_set, testing_set, training_labels, k_neighbors)
    # fraction of the held-out fold classified correctly
    results[i] = sum(diag(table(predicted_labels, testing_labels))) / length(testing_labels)
  }
  return(results)
}
# Let's check the accuracy of the first 15 values of k, with 5 folds, for the problem of predicting species using Sepal.Length and Sepal.Width
acc_knn = rep(0, 15)
for(k in 1:15){
  acc_knn[k] = mean(crossval_knn(new_iris[,c("Sepal.Length", "Sepal.Width")], new_iris$Species, 5, k))
}
print(acc_knn)
## [1] 0.98 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99
print(which(acc_knn == max(acc_knn)))
## [1] 2 3 4 5 6 7 8 9 10 11 12 13 14 15
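A quick plot makes the flat accuracy profile easy to see (a sketch using base graphics):
plot(1:15, acc_knn, type = "b", xlab = "k (number of neighbors)", ylab = "mean 5-fold CV accuracy")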
# Note that for this problem most choices of k achieve essentially the same near-perfect accuracy! Let's go back to the full dataset, i.e.
new_iris = iris[sample(nrow(iris)),] # No longer removing versicolor: all three species
acc_knn = rep(0, 15)
for(k in 1:15){
  acc_knn[k] = mean(crossval_knn(new_iris[,c("Sepal.Length", "Sepal.Width")], new_iris$Species, 5, k))
}
print(acc_knn)
## [1] 0.7000000 0.7133333 0.7266667 0.7666667 0.7800000 0.7866667 0.7866667
## [8] 0.7666667 0.7533333 0.7533333 0.7466667 0.7533333 0.7733333 0.7466667
## [15] 0.7466667
print(which(acc_knn == max(acc_knn)))
## [1] 6 7
# With all three species the results become less trivial.
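To close the loop on the SVM-vs-kNN comparison, e1071's svm() can itself report k-fold cross-validation accuracy via its cross argument; it handles the three-class problem with one-vs-one voting. A minimal sketch on the same two features:
model_svm_full = svm(Species ~ Sepal.Length + Sepal.Width, data = new_iris, kernel = "linear", cross = 5)
summary(model_svm_full) # prints per-fold and total cross-validation accuracy
model_svm_full$tot.accuracy # overall 5-fold CV accuracy (reported in percent), comparable to acc_knn above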