Question 3.1 Using the same data set (credit_card_data.txt or credit_card_data-headers.txt) as in Question 2.2, use the ksvm or kknn function to find a good classifier: (a) using cross-validation (do this for the k-nearest-neighbors model; SVM is optional); and (b) splitting the data into training, validation, and test data sets (pick either KNN or SVM; the other is optional).
Using Cross-Validation via LOOCV (Leave-One-Out Cross-Validation)
```r
# Installing and calling packages
# install.packages("kknn")
library(kknn)

creditdata <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                         stringsAsFactors = FALSE, header = FALSE)
```
Optional check to make sure the data is read correctly
```r
head(creditdata)
```
This method uses n-fold cross-validation, where n is the number of data points, because that is how train.kknn does cross-validation: each observation is held out once, which is why it is also called "leave-one-out" cross-validation (LOOCV).
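The same LOOCV protocol can also be run explicitly with the kknn package's cv.kknn helper by setting the number of folds, kcv, equal to the number of rows. The sketch below is only illustrative; in particular, the assumption that cv.kknn returns the (actual, predicted) pairs as a two-column matrix in its first list element should be checked against the package documentation.

```r
library(kknn)

# Sketch: explicit LOOCV by using as many folds as there are rows.
# Extra arguments (kernel, scale, k, ...) are passed through to kknn.
loocv <- cv.kknn(V11 ~ ., creditdata, kcv = nrow(creditdata),
                 kernel = "optimal", scale = TRUE)

# Assumption: loocv[[1]] holds actual values in column 1 and
# LOOCV-predicted values in column 2.
preds <- as.integer(loocv[[1]][, 2] + 0.5)  # round to 0/1
mean(preds == creditdata$V11)               # LOOCV accuracy estimate
```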
Set the random number generator seed so that the results are reproducible, set the maximum number of neighbors to consider (kmax) to 100, and create the training and testing data sets with an 80%/20% split.
```r
set.seed(1)
k <- 100  # maximum k to consider

cc_train_size <- 0.8
cc_test_size <- 0.2

cc_train_rows <- floor(nrow(creditdata) * cc_train_size)
cc_test_rows <- nrow(creditdata) - cc_train_rows

cc_train_idx <- sample(1:nrow(creditdata), cc_train_rows)
cc_test_idx <- setdiff(1:nrow(creditdata), cc_train_idx)

train_data <- creditdata[cc_train_idx, ]
test_data <- creditdata[cc_test_idx, ]

knn_model <- train.kknn(V11 ~ ., data = train_data, kmax = k, scale = TRUE)
summary(knn_model)
```
Summary results:

- Type of response variable: continuous
- Minimal mean absolute error: 0.1955315
- Minimal mean squared error: 0.1085533
- Best kernel: optimal
- Best k: 41
Next, check the accuracy of the model's predictions. Training the model gave a best k of 41.
```r
# initialize prediction vector and accuracy
training_prediction <- rep(0, nrow(train_data))
training_accuracy <- 0

# leave-one-out predictions over the training data, using the best k (41)
for (i in 1:nrow(train_data)) {
  model <- kknn(V11 ~ ., train_data[-i, ], train_data[i, ],
                k = 41, kernel = "optimal", scale = TRUE)
  training_prediction[i] <- as.integer(fitted(model) + 0.5)  # round to 0 or 1
}

# fraction of correct predictions on the training data
training_accuracy <- sum(training_prediction == train_data[, 11]) / nrow(train_data)
```
Checking testing accuracy with the same leave-one-out procedure on the test set, again using the best k (41).

```r
testing_prediction <- rep(0, nrow(test_data))  # predictions: start with a vector of all zeros
testing_accuracy <- 0

for (i in 1:nrow(test_data)) {
  model <- kknn(V11 ~ ., test_data[-i, ], test_data[i, ],
                k = 41, kernel = "optimal", scale = TRUE)  # use scaled data
  testing_prediction[i] <- as.integer(fitted(model) + 0.5)  # round off to 0 or 1
}

# calculate fraction of correct predictions
testing_accuracy <- sum(testing_prediction == test_data[, 11]) / nrow(test_data)

training_accuracy
testing_accuracy
```
training_accuracy: 0.8413002
testing_accuracy: 0.8244275
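As a cross-check on part (a), the same leave-one-out protocol can be expressed through caret (used below for confusionMatrix). This is only a sketch, assuming caret's built-in "kknn" method and its default tuning grid; with a factor response, caret reports LOOCV accuracy per candidate parameter setting. It is slow on 654 rows but simple to read.

```r
library(caret)
library(kknn)

# Sketch (assumption: caret's "kknn" method wraps train.kknn):
# LOOCV accuracy per candidate setting, with a factor response so
# caret treats this as classification.
ctrl <- trainControl(method = "LOOCV")
fit <- train(as.factor(V11) ~ ., data = creditdata,
             method = "kknn", trControl = ctrl)
fit$results  # accuracy for each setting tried
```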
##————Splitting the data into training, validation, and test data sets————##

This code demonstrates how to split the data into training, validation, and test sets, and then use k-nearest-neighbors (KNN) and support vector machine (SVM) models to classify the data.
```r
# install.packages("kernlab")
# install.packages("caret")

# Load the required libraries
library(kernlab)
library(kknn)
library(caret)

data <- read.table("C:/Users/MKRISHNAN/OneDrive - Cox Automotive/Documents/GATech/Week1_Homework/hw1/data 2.2/credit_card_data.txt",
                   stringsAsFactors = FALSE, header = FALSE)

set.seed(123)  # ensures reproducibility of the random sampling

train_size <- 0.6
val_size <- 0.2
test_size <- 0.2

train_rows <- floor(nrow(data) * train_size)
val_rows <- floor(nrow(data) * val_size)
test_rows <- nrow(data) - train_rows - val_rows

train_idx <- sample(1:nrow(data), train_rows)
val_idx <- sample(setdiff(1:nrow(data), train_idx), val_rows)
test_idx <- setdiff(1:nrow(data), c(train_idx, val_idx))

train_data <- data[train_idx, ]
validation_data <- data[val_idx, ]
test_data <- data[test_idx, ]
```
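The same split can alternatively be produced with caret's createDataPartition, which stratifies by the response so each subset keeps a similar class balance. This is a sketch, assuming the same 60/20/20 proportions (the held-out 40% is halved into validation and test); the suffixed names are illustrative and avoid clobbering the variables above.

```r
library(caret)
set.seed(123)

# Sketch: stratified 60/20/20 split via createDataPartition
train_idx2 <- createDataPartition(data$V11, p = 0.6, list = FALSE)
train_data2 <- data[train_idx2, ]
holdout <- data[-train_idx2, ]

val_idx2 <- createDataPartition(holdout$V11, p = 0.5, list = FALSE)
validation_data2 <- holdout[val_idx2, ]
test_data2 <- holdout[-val_idx2, ]
```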
Evaluating the KNN model: iterate over different values of k (the number of nearest neighbors), calculate the accuracy of each model on the validation set, and select the value of k that gives the highest accuracy.
```r
kmax <- 100
accuracy <- rep(0, kmax)

for (i in 1:kmax) {
  model <- kknn(V11 ~ ., train_data, validation_data, k = i, scale = TRUE)
  result <- as.integer(fitted(model) + 0.5)
  accuracy[i] <- sum(result == validation_data$V11) / nrow(validation_data)
}

print(paste("Best validation accuracy for KNN:", max(accuracy)))
print(paste("Use KKNN with k =", which.max(accuracy)))

# refit at the best k so the confusion matrix reflects the chosen model
model <- kknn(V11 ~ ., train_data, validation_data, k = which.max(accuracy), scale = TRUE)
result <- as.integer(fitted(model) + 0.5)
confusionMatrix(as.factor(result), as.factor(validation_data$V11))
```
#Confusion Matrix and Statistics
#
#          Reference
#Prediction  0  1
#         0 66  6
#         1 11 47
#
#Accuracy : 0.8692
#95% CI : (0.7989, 0.9219)
#No Information Rate : 0.5923
#P-Value [Acc > NIR] : 4.881e-12
#Kappa : 0.7332
#McNemar's Test P-Value : 0.332
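As a quick sanity check, the reported accuracy can be recomputed directly from the confusion-matrix counts:

```r
# accuracy = correct predictions / all predictions
(66 + 47) / (66 + 6 + 11 + 47)  # 113/130 ≈ 0.8692
```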
Testing the model on the test data set, using the best k found on the validation set.

```r
set.seed(123)
test_model <- kknn(V11 ~ ., train_data, test_data, k = which.max(accuracy), scale = TRUE)
res <- as.integer(fitted(test_model) + 0.5)  # round off to 0 or 1
test_accuracy <- sum(res == test_data$V11) / nrow(test_data)

print(paste("Performance on test data is", test_accuracy))
print(paste("Use KKNN with k =", which.max(accuracy)))

confusionMatrix(as.factor(res), as.factor(test_data$V11))
```
```r
# Train an SVM on the training data and evaluate on the validation set;
# the response is converted to a factor so C-svc treats this as classification
svm_model <- ksvm(as.factor(V11) ~ ., data = train_data, C = 1,
                  type = "C-svc", kernel = "rbfdot")
val_predictions <- predict(svm_model, validation_data)
val_accuracy <- mean(validation_data$V11 == val_predictions)
print(paste("Validation accuracy for SVM:", val_accuracy))

# KNN accuracy on the test set, reusing the train.kknn model from part (a);
# fitted values are continuous, so round them to 0/1 before comparing
test_predictions <- as.integer(predict(knn_model, newdata = test_data) + 0.5)
test_accuracy <- mean(test_data$V11 == test_predictions)
print(paste("Test accuracy for KNN:", test_accuracy))

# SVM accuracy on the test set
test_predictions <- predict(svm_model, newdata = test_data)
test_accuracy <- mean(test_data$V11 == test_predictions)
print(paste("Test accuracy for SVM:", test_accuracy))
```