# Point the session at the lecture folder.
setwd("~/Desktop/R Materials/mih140/Lecture 21 - Introduction to Machine Learning")
# iris is built into R: 150 observations covering three species of flower.
data <- iris
Motivating Question: Suppose you come across an iris flower in the wild and you’d like to identify its species. You take measurements of its petals and sepals, and you have a labeled data set of iris flowers and their species for reference. In this notebook we will use the k nearest neighbors (kNN) algorithm to leverage the labeled data to classify the new flower, i.e. to predict its species label.
In supervised learning we want to predict an observation’s class label by leveraging labeled data in a training set. We saw an example of this last class when we examined Logistic Regression with a Threshold. Let’s randomly shuffle the iris data set and split it into a training set and a testing set.
# Put the rows of iris into a random order.
shuffled_iris <- iris[sample(nrow(iris)), ]
# The first 100 shuffled rows form the training set ...
training_set <- shuffled_iris[seq_len(100), ]
# ... and every remaining row goes to the testing set.
testing_set <- shuffled_iris[-seq_len(100), ]
kNN is a classification algorithm. From a labeled training set, it can be used to classify/predict the class labels (in this case species) of new observations of flowers. It works by examining, for each new observation in the testing set, the k closest points in the training set and assigning it the majority label among them.
In R, kNN can be run by calling knn() from the package “class”.
# knn() is provided by the "class" package.
# If it is not installed yet, run: install.packages("class")
library(class)
# Closeness between flowers is measured on Sepal.Length and Petal.Length,
# i.e. columns 1 and 3 of the data.
# Call shape: knn(train, test, cl, k), where cl holds the training-set labels.
# k = 5 is an arbitrary first choice.
predicted_labels <- knn(
  train = training_set[, c(1, 3)],
  test = testing_set[, c(1, 3)],
  cl = training_set$Species,
  k = 5
)
# predicted_labels now holds one predicted class label per testing row (50).
# Cross-tabulate predictions against the true species to get a confusion
# matrix; specificity and sensitivity can be read off it.
confusion_matrix <- table(predicted_labels, testing_set$Species)
# Prediction accuracy: correct predictions (the diagonal) over all testing rows.
print(sum(diag(confusion_matrix)) / nrow(testing_set))
## [1] 0.96
So k = 5 seems to work pretty well! But perhaps there’s a better choice? Let’s try to find the best choice of k for this training and testing set.
# Try every k from 1 to 20 and record the testing-set accuracy of each.
acc_by_k <- numeric(20) # slot i will hold the prediction accuracy for k = i
for (i in seq_len(20)) {
  predicted_labels <- knn(
    train = training_set[, c(1, 3)],
    test = testing_set[, c(1, 3)],
    cl = training_set$Species,
    k = i # the neighbor count being evaluated
  )
  confusion_matrix <- table(predicted_labels, testing_set$Species)
  acc_by_k[i] <- sum(diag(confusion_matrix)) / nrow(testing_set)
}
plot(
  seq_len(20), acc_by_k,
  main = "Accuracy of different values of k",
  xlab = "k",
  ylab = "Prediction Accuracy"
)
# which.max(acc_by_k) gives the index of the k with the maximum prediction accuracy.
print(which.max(acc_by_k)) # This is the best choice of k
## [1] 5
Motivating Question: How should we pick this parameter k?
We could just try different parameters on the training set and pick the best. Problem: Will this parameter generalize, i.e. will it do well on the testing set? We want to guard against overfitting the random variation in the training set.
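Cross validation is the idea that we split the training data into several pieces (folds) and rotate which of those pieces is held out for evaluation while the remaining pieces are used for training. For each candidate parameter we average the accuracy over all folds, and then pick the parameter with the best average to obtain a more robust choice.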
library(class)
# Cross-validated accuracy of kNN.
#
# Arguments:
#   data        data frame (or matrix) of training features, one row per
#               observation
#   labels      vector of class labels, one per row of data
#   folds       number of folds to use (between 2 and nrow(data))
#   k_neighbors number of neighbors, passed to knn() as k
#
# Returns a numeric vector of length folds; element i is the fraction of the
# rows held out in fold i that were classified correctly.
crossval_knn <- function(data, labels, folds, k_neighbors) {
  n_obs <- nrow(data)
  stopifnot(
    !is.null(n_obs),
    length(labels) == n_obs,
    folds >= 2,
    folds <= n_obs
  )

  results <- numeric(folds)
  for (i in seq_len(folds)) {
    # Fold i holds out rows i, i + folds, i + 2 * folds, ...
    held_out <- seq(i, n_obs, by = folds)

    # drop = FALSE keeps a single feature column two-dimensional; without it
    # the subset collapses to a plain vector and knn() sees mismatched shapes.
    train_features <- data[-held_out, , drop = FALSE]
    test_features <- data[held_out, , drop = FALSE]
    train_labels <- labels[-held_out]
    test_labels <- labels[held_out]

    predicted_labels <- knn(train_features, test_features, train_labels,
                            k = k_neighbors)

    # Compare label by label as character. The diagonal of
    # table(predicted, actual) is only the count of correct predictions when
    # both sides share the same level set, which fails e.g. for character
    # labels when a class is missing from the held-out fold.
    is_correct <- as.character(predicted_labels) == as.character(test_labels)
    results[i] <- sum(is_correct) / length(test_labels)
  }
  results
}
# Estimate the accuracy of k = 1, ..., 15 with 5-fold cross validation,
# predicting Species from columns 1 and 3 (Sepal.Length and Petal.Length)
# of the training set.
acc_knn <- numeric(15)
for (k in seq_along(acc_knn)) {
  acc_knn[k] <- mean(
    crossval_knn(
      data = training_set[, c(1, 3)],
      labels = training_set$Species,
      folds = 5,
      k_neighbors = k
    )
  )
}
print(acc_knn)
## [1] 0.88 0.89 0.94 0.95 0.94 0.93 0.96 0.95 0.96 0.96 0.96 0.95 0.93 0.95 0.94
# Every k that attains the maximum cross-validated accuracy.
print(which(acc_knn == max(acc_knn)))
## [1] 7 9 10 11
plot(
  seq_len(15), acc_knn,
  main = "Accuracy of different values of k",
  xlab = "k",
  ylab = "Prediction Accuracy"
)
# which.max(acc_knn) gives the index of the k with the maximum prediction accuracy.
print(which.max(acc_knn)) # This is the best choice of k as chosen by cross validation
## [1] 7