Question 3.1 Using the same data set (credit_card_data.txt or credit_card_data-headers.txt) as in Question 2.2, use the ksvm or kknn function to find a good classifier: (a) using cross-validation (do this for the k-nearest-neighbors model; SVM is optional); and (b) splitting the data into training, validation, and test data sets (pick either KNN or SVM; the other is optional).


Using Cross-Validation via LOOCV (Leave-One-Out Cross-Validation)

``` r
# Installing and calling packages
# install.packages("kknn")
library(kknn)

# Reading the data
creditdata <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                         stringsAsFactors = FALSE, header = FALSE)
```


Optional check to make sure the data is read correctly


``` r
head(creditdata)
```

Part (a): Cross-Validation using LOOCV

This method uses n-fold cross-validation, where n is the number of data points, because that is how train.kknn performs cross-validation: this special case is also called "leave-one-out" cross-validation (LOOCV).
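For intuition, here is a minimal hand-rolled sketch of what leave-one-out means: each point is predicted by a model fit to every other point. This assumes the creditdata frame read in above; the choice of k = 10 is arbitrary, purely for illustration (train.kknn repeats this procedure internally for every k up to kmax and reports the best one).

``` r
# Minimal LOOCV sketch: predict each row from a model trained on all other rows
# (k = 10 is an illustrative value, not a tuned one)
loo_preds <- rep(0, nrow(creditdata))
for (i in 1:nrow(creditdata)) {
  fit <- kknn(V11 ~ ., creditdata[-i, ], creditdata[i, ], k = 10, scale = TRUE)
  loo_preds[i] <- as.integer(fitted(fit) + 0.5)  # round off to 0 or 1
}
mean(loo_preds == creditdata$V11)  # LOOCV accuracy at k = 10
```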

We set the random number generator seed so that our results are reproducible, set the maximum number of neighbors for train.kknn (kmax) to 100, and create the training and test data sets with an 80% training / 20% test split.

``` r
set.seed(1)  # setting the seed so results are reproducible
k <- 100     # maximum number of neighbors for train.kknn

# Split the data into training and test sets
cc_train_size <- 0.8
cc_test_size <- 0.2

# Calculate the number of rows for each set
cc_train_rows <- floor(nrow(creditdata) * cc_train_size)
cc_test_rows <- nrow(creditdata) - cc_train_rows

# Randomly select the rows for each set
cc_train_idx <- sample(1:nrow(creditdata), cc_train_rows)
cc_test_idx <- setdiff(1:nrow(creditdata), cc_train_idx)

# Create the data sets
train_data <- creditdata[cc_train_idx, ]
test_data <- creditdata[cc_test_idx, ]

# Train the KNN model on the training data via leave-one-out cross-validation
knn_model <- train.kknn(V11 ~ ., data = train_data, kmax = k, scale = TRUE)
summary(knn_model)
```


Summary results: the type of response variable is continuous, the minimal mean absolute error is 0.1955315, the minimal mean squared error is 0.1085533, the best kernel is "optimal", and the best k is 41.

Next we check the accuracy of the model's predictions, using the best k value of 41 found by train.kknn.


``` r
# Checking training accuracy
training_prediction <- rep(0, nrow(train_data))  # predictions: start with a vector of all zeros
training_accuracy <- 0  # initializing variable

# calculating prediction values, leaving each point out in turn
for (i in 1:nrow(train_data)) {
  model <- kknn(V11 ~ ., train_data[-i, ], train_data[i, ], k = 41,
                kernel = "optimal", scale = TRUE)  # k = 41 from train.kknn above
  training_prediction[i] <- as.integer(fitted(model) + 0.5)  # round off to 0 or 1
}

training_accuracy <- sum(training_prediction == train_data[, 11]) / nrow(train_data)

# Checking testing accuracy
testing_prediction <- rep(0, nrow(test_data))  # predictions: start with a vector of all zeros
testing_accuracy <- 0  # initialize variable

for (i in 1:nrow(test_data)) {
  model <- kknn(V11 ~ ., test_data[-i, ], test_data[i, ], k = 41,
                kernel = "optimal", scale = TRUE)  # use scaled data
  testing_prediction[i] <- as.integer(fitted(model) + 0.5)  # round off to 0 or 1
}

# calculate fraction of correct predictions
testing_accuracy <- sum(testing_prediction == test_data[, 11]) / nrow(test_data)

training_accuracy
testing_accuracy
```

training_accuracy: 0.8413002

testing_accuracy: 0.8244275

The test accuracy is only slightly below the training accuracy, so the k chosen by cross-validation does not appear to be overfitting.

Part (b): Splitting the Data into Training, Validation, and Test Sets

The code below demonstrates how to split the data into training, validation, and test sets, and then use the k-nearest-neighbors (KNN) and support vector machine (SVM) models to classify the data.

``` r
# install.packages("kernlab")
# install.packages("caret")

# Load the required libraries
library(kernlab)
library(kknn)
library(caret)

# Read the data
data <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                   stringsAsFactors = FALSE, header = FALSE)

# ensures reproducibility of the random sampling
set.seed(123)

# Split the data into training, validation, and test sets (60/20/20)
train_size <- 0.6
val_size <- 0.2
test_size <- 0.2

# Calculate the number of rows for each set
train_rows <- floor(nrow(data) * train_size)
val_rows <- floor(nrow(data) * val_size)
test_rows <- nrow(data) - train_rows - val_rows

# Randomly select the rows for each set
train_idx <- sample(1:nrow(data), train_rows)
val_idx <- sample(setdiff(1:nrow(data), train_idx), val_rows)
test_idx <- setdiff(1:nrow(data), c(train_idx, val_idx))

# Create the data sets
train_data <- data[train_idx, ]
validation_data <- data[val_idx, ]
test_data <- data[test_idx, ]
```

Evaluating the KNN model: the code below evaluates the KNN model using the validation set. It iterates over different values of k (the number of nearest neighbors), calculates the accuracy of the KNN model on the validation set for each value, and then selects the k that gives the highest accuracy.

``` r
kmax <- 100
accuracy <- rep(0, kmax)

# Accuracy on the validation set for each candidate k
for (i in 1:kmax) {
  model <- kknn(V11 ~ ., train_data, validation_data, k = i, scale = TRUE)
  result <- as.integer(fitted(model) + 0.5)
  accuracy[i] <- sum(result == validation_data$V11) / nrow(validation_data)
}

best_k <- which.max(accuracy)
print(paste("Validation Accuracy for KNN:", max(accuracy)))
print(paste("Use KKNN with k =", best_k))

# Refit at the best k so the confusion matrix reflects that model
model <- kknn(V11 ~ ., train_data, validation_data, k = best_k, scale = TRUE)
result <- as.integer(fitted(model) + 0.5)
confusionMatrix(as.factor(result), as.factor(validation_data$V11))
```

```
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  0  1
##          0 66  6
##          1 11 47
##
##                Accuracy : 0.8692
##                  95% CI : (0.7989, 0.9219)
##     No Information Rate : 0.5923
##     P-Value [Acc > NIR] : 4.881e-12
##
##                   Kappa : 0.7332
##
##  Mcnemar's Test P-Value : 0.332
##
##             Sensitivity : 0.8571
##             Specificity : 0.8868
##          Pos Pred Value : 0.9167
##          Neg Pred Value : 0.8103
##              Prevalence : 0.5923
##          Detection Rate : 0.5077
##    Detection Prevalence : 0.5538
##       Balanced Accuracy : 0.8720
##
##        'Positive' Class : 0
```
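As a sanity check, the headline statistics above can be reproduced directly from the four cell counts of the confusion matrix. With 0 as the positive class, the counts are TP = 66, FP = 6, FN = 11, TN = 47:

``` r
# Hand-checking caret's statistics from the confusion matrix cell counts
TP <- 66; FP <- 6; FN <- 11; TN <- 47
TP / (TP + FN)                   # Sensitivity:    66/77   = 0.8571
TN / (TN + FP)                   # Specificity:    47/53   = 0.8868
TP / (TP + FP)                   # Pos Pred Value: 66/72   = 0.9167
TN / (TN + FN)                   # Neg Pred Value: 47/58   = 0.8103
(TP + TN) / (TP + FP + FN + TN)  # Accuracy:       113/130 = 0.8692
```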

``` r
# Testing the model on the test data set, using the best k from validation
set.seed(123)
test_model <- kknn(V11 ~ ., train_data, test_data, k = best_k, scale = TRUE)

res <- as.integer(fitted(test_model) + 0.5)  # round off to 0 or 1

test_accuracy <- sum(res == test_data$V11) / nrow(test_data)

print(paste("Performance on test data is", test_accuracy))
print(paste("Used KKNN with k =", best_k))

confusionMatrix(as.factor(res), as.factor(test_data$V11))
```

Alternatively, train the SVM model on the training data and evaluate on the validation data

``` r
# Train the SVM on the training data (V11 coerced to a factor so that
# ksvm fits a classifier) and evaluate on the validation data
svm_model <- ksvm(as.factor(V11) ~ ., train_data, C = 1, type = "C-svc", kernel = "rbfdot")
val_predictions <- predict(svm_model, validation_data)
val_accuracy <- mean(validation_data$V11 == val_predictions)

print(paste("Validation accuracy for SVM:", val_accuracy))
```
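The C = 1 above is a single fixed choice. Since we have a validation set, the SVM's cost parameter could be tuned the same way k was tuned for KNN; a minimal sketch is below, where the grid of C values is arbitrary and chosen purely for illustration:

``` r
# Sketch: tune the SVM cost parameter C on the validation set
# (the grid of C values is illustrative, not from the original analysis)
c_values <- c(0.01, 0.1, 1, 10, 100)
c_accuracy <- rep(0, length(c_values))
for (j in seq_along(c_values)) {
  m <- ksvm(as.factor(V11) ~ ., train_data, type = "C-svc",
            kernel = "rbfdot", C = c_values[j])
  c_accuracy[j] <- mean(predict(m, validation_data) == validation_data$V11)
}
print(paste("Best C on validation:", c_values[which.max(c_accuracy)]))
```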

Evaluate the final KNN model (the train.kknn fit from part (a)) on the test data

``` r
test_predictions <- as.integer(predict(knn_model, newdata = test_data) + 0.5)  # round off to 0 or 1
test_accuracy <- mean(test_data$V11 == test_predictions)

print(paste("Test accuracy for KNN:", test_accuracy))
```

Alternatively, evaluate the final SVM model on the test data

``` r
test_predictions <- predict(svm_model, newdata = test_data)
test_accuracy <- mean(test_data$V11 == test_predictions)

print(paste("Test accuracy for SVM:", test_accuracy))
```