Question 3.1 Using the same data set (credit_card_data.txt or credit_card_data-headers.txt) as in Question 2.2, use the ksvm or kknn function to find a good classifier: (a) using cross-validation (do this for the k-nearest-neighbors model; SVM is optional); and (b) splitting the data into training, validation, and test data sets (pick either KNN or SVM; the other is optional).
Using Cross-Validation via LOOCV (Leave-One-Out Cross-Validation)
```r
# Installing and calling packages
# install.packages("kknn")
library(kknn)

creditdata <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                         stringsAsFactors = FALSE, header = FALSE)
```
Optional check to make sure the data is read correctly
```r
head(creditdata)
```
This method uses n-fold cross-validation, where n is the number of data points, because that is how train.kknn does cross-validation: each observation is held out once, which is why it is also called "leave-one-out" cross-validation (LOOCV).
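The same LOOCV protocol can also be run explicitly with the kknn package's cv.kknn helper by setting the number of folds, kcv, equal to the number of rows. The sketch below is only illustrative; in particular, the assumption that cv.kknn returns the (actual, predicted) pairs as a two-column matrix in its first list element should be checked against the package documentation.

```r
library(kknn)

# Sketch: explicit LOOCV by using as many folds as there are rows.
# Extra arguments (kernel, scale, k, ...) are passed through to kknn.
loocv <- cv.kknn(V11 ~ ., creditdata, kcv = nrow(creditdata),
                 kernel = "optimal", scale = TRUE)

# Assumption: loocv[[1]] holds actual values in column 1 and
# LOOCV-predicted values in column 2.
preds <- as.integer(loocv[[1]][, 2] + 0.5)  # round to 0/1
mean(preds == creditdata$V11)               # LOOCV accuracy estimate
```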
Set the random number generator seed so that the results are reproducible, set the maximum number of neighbors to consider (kmax) to 100, and create the training and testing data sets with an 80%/20% split.
```r
set.seed(1)
k <- 100  # maximum k to consider

cc_train_size <- 0.8
cc_test_size <- 0.2

cc_train_rows <- floor(nrow(creditdata) * cc_train_size)
cc_test_rows <- nrow(creditdata) - cc_train_rows

cc_train_idx <- sample(1:nrow(creditdata), cc_train_rows)
cc_test_idx <- setdiff(1:nrow(creditdata), cc_train_idx)

train_data <- creditdata[cc_train_idx, ]
test_data <- creditdata[cc_test_idx, ]

knn_model <- train.kknn(V11 ~ ., data = train_data, kmax = k, scale = TRUE)
summary(knn_model)
```
Summary results:

- Type of response variable: continuous
- Minimal mean absolute error: 0.1955315
- Minimal mean squared error: 0.1085533
- Best kernel: optimal
- Best k: 41
Next, check the accuracy of the model's predictions. Training the model gave a best k of 41.
```r
# initialize prediction vector and accuracy
training_prediction <- rep(0, nrow(train_data))
training_accuracy <- 0

# leave-one-out predictions over the training data, using the best k (41)
for (i in 1:nrow(train_data)) {
  model <- kknn(V11 ~ ., train_data[-i, ], train_data[i, ],
                k = 41, kernel = "optimal", scale = TRUE)
  training_prediction[i] <- as.integer(fitted(model) + 0.5)  # round to 0 or 1
}

# fraction of correct predictions on the training data
training_accuracy <- sum(training_prediction == train_data[, 11]) / nrow(train_data)
```
Checking testing accuracy with the same leave-one-out procedure on the test set, again using the best k (41).

```r
testing_prediction <- rep(0, nrow(test_data))  # predictions: start with a vector of all zeros
testing_accuracy <- 0

for (i in 1:nrow(test_data)) {
  model <- kknn(V11 ~ ., test_data[-i, ], test_data[i, ],
                k = 41, kernel = "optimal", scale = TRUE)  # use scaled data
  testing_prediction[i] <- as.integer(fitted(model) + 0.5)  # round off to 0 or 1
}

# calculate fraction of correct predictions
testing_accuracy <- sum(testing_prediction == test_data[, 11]) / nrow(test_data)

training_accuracy
testing_accuracy
```
training_accuracy: 0.8413002
testing_accuracy: 0.8244275
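As a cross-check on part (a), the same leave-one-out protocol can be expressed through caret (used below for confusionMatrix). This is only a sketch, assuming caret's built-in "kknn" method and its default tuning grid; with a factor response, caret reports LOOCV accuracy per candidate parameter setting. It is slow on 654 rows but simple to read.

```r
library(caret)
library(kknn)

# Sketch (assumption: caret's "kknn" method wraps train.kknn):
# LOOCV accuracy per candidate setting, with a factor response so
# caret treats this as classification.
ctrl <- trainControl(method = "LOOCV")
fit <- train(as.factor(V11) ~ ., data = creditdata,
             method = "kknn", trControl = ctrl)
fit$results  # accuracy for each setting tried
```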
##————Splitting the data into training, validation, and test data sets————##

This code demonstrates how to split the data into training, validation, and test sets, and then use k-nearest-neighbors (KNN) and support vector machine (SVM) models to classify the data.
```r
# install.packages("kernlab")
# install.packages("caret")

# Load the required libraries
library(kernlab)
library(kknn)
library(caret)

data <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                   stringsAsFactors = FALSE, header = FALSE)

set.seed(123)  # ensures reproducibility of the random sampling

train_size <- 0.6
val_size <- 0.2
test_size <- 0.2

train_rows <- floor(nrow(data) * train_size)
val_rows <- floor(nrow(data) * val_size)
test_rows <- nrow(data) - train_rows - val_rows

train_idx <- sample(1:nrow(data), train_rows)
val_idx <- sample(setdiff(1:nrow(data), train_idx), val_rows)
test_idx <- setdiff(1:nrow(data), c(train_idx, val_idx))

train_data <- data[train_idx, ]
validation_data <- data[val_idx, ]
test_data <- data[test_idx, ]
```
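The same split can alternatively be produced with caret's createDataPartition, which stratifies by the response so each subset keeps a similar class balance. This is a sketch, assuming the same 60/20/20 proportions (the held-out 40% is halved into validation and test); the suffixed names are illustrative and avoid clobbering the variables above.

```r
library(caret)
set.seed(123)

# Sketch: stratified 60/20/20 split via createDataPartition
train_idx2 <- createDataPartition(data$V11, p = 0.6, list = FALSE)
train_data2 <- data[train_idx2, ]
holdout <- data[-train_idx2, ]

val_idx2 <- createDataPartition(holdout$V11, p = 0.5, list = FALSE)
validation_data2 <- holdout[val_idx2, ]
test_data2 <- holdout[-val_idx2, ]
```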
Evaluating the KNN model: iterate over different values of k (the number of nearest neighbors), calculate the accuracy of each model on the validation set, and select the value of k that gives the highest accuracy.
```r
kmax <- 100
accuracy <- rep(0, kmax)

for (i in 1:kmax) {
  model <- kknn(V11 ~ ., train_data, validation_data, k = i, scale = TRUE)
  result <- as.integer(fitted(model) + 0.5)
  accuracy[i] <- sum(result == validation_data$V11) / nrow(validation_data)
}

print(paste("Best validation accuracy for KNN:", max(accuracy)))
print(paste("Use KKNN with k =", which.max(accuracy)))

# refit at the best k so the confusion matrix reflects the chosen model
model <- kknn(V11 ~ ., train_data, validation_data, k = which.max(accuracy), scale = TRUE)
result <- as.integer(fitted(model) + 0.5)
confusionMatrix(as.factor(result), as.factor(validation_data$V11))
```
#Confusion Matrix and Statistics
#
#          Reference
#Prediction  0  1
#         0 66  6
#         1 11 47
#
#Accuracy : 0.8692
#95% CI : (0.7989, 0.9219)
#No Information Rate : 0.5923
#P-Value [Acc > NIR] : 4.881e-12
#Kappa : 0.7332
#McNemar's Test P-Value : 0.332
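As a quick sanity check, the reported accuracy can be recomputed directly from the confusion-matrix counts:

```r
# accuracy = correct predictions / all predictions
(66 + 47) / (66 + 6 + 11 + 47)  # 113/130 ≈ 0.8692
```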
Testing the model on the test data set, using the best k found on the validation set.

```r
set.seed(123)
test_model <- kknn(V11 ~ ., train_data, test_data, k = which.max(accuracy), scale = TRUE)
res <- as.integer(fitted(test_model) + 0.5)  # round off to 0 or 1
test_accuracy <- sum(res == test_data$V11) / nrow(test_data)

print(paste("Performance on test data is", test_accuracy))
print(paste("Use KKNN with k =", which.max(accuracy)))

confusionMatrix(as.factor(res), as.factor(test_data$V11))
```
```r
# Train an SVM on the training data and evaluate on the validation set;
# the response is converted to a factor so C-svc treats this as classification
svm_model <- ksvm(as.factor(V11) ~ ., data = train_data, C = 1,
                  type = "C-svc", kernel = "rbfdot")
val_predictions <- predict(svm_model, validation_data)
val_accuracy <- mean(validation_data$V11 == val_predictions)
print(paste("Validation accuracy for SVM:", val_accuracy))

# KNN accuracy on the test set, reusing the train.kknn model from part (a);
# fitted values are continuous, so round them to 0/1 before comparing
test_predictions <- as.integer(predict(knn_model, newdata = test_data) + 0.5)
test_accuracy <- mean(test_data$V11 == test_predictions)
print(paste("Test accuracy for KNN:", test_accuracy))

# SVM accuracy on the test set
test_predictions <- predict(svm_model, newdata = test_data)
test_accuracy <- mean(test_data$V11 == test_predictions)
print(paste("Test accuracy for SVM:", test_accuracy))
```