# HW2_Fall20_Q3.1-b-.R

# Question 3.1 

#Using the same data set (credit_card_data.txt or credit_card_data-headers.txt) as in Question 2.2, use the ksvm or kknn function to find a good classifier:
#(a)    using cross-validation (do this for the k-nearest-neighbors model; SVM is optional); and
#(b)    splitting the data into training, validation, and test data sets (pick either KNN or SVM; the other is optional).

#Solution 3.1(b):

#Load the libraries that provide the ksvm (kernlab) and kknn (kknn) functions
library(kernlab)
library(kknn)

# loading data from text file of creditcard provided in the HW2

# Read the credit card data set through an interactive file picker.
# The file is read without a header row, so the columns are named V1..V11.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the first data row into a header.
creditcard_data <- read.delim(file.choose(), header = FALSE)

# The rest of the script indexes predictors as columns 1:10 and the response
# as column 11 (V11), so fail early if the file was not parsed into 11 columns.
stopifnot(ncol(creditcard_data) == 11)

#Checking to make sure that data has been read correctly

head(creditcard_data, 5)

##   V1    V2    V3   V4 V5 V6 V7 V8  V9 V10 V11
## 1  1 30.83 0.000 1.25  1  0  1  1 202   0   1
## 2  0 58.67 4.460 3.04  1  0  6  1  43 560   1
## 3  0 24.50 0.500 1.50  1  1  0  1 280 824   1
## 4  1 27.83 1.540 3.75  1  0  5  0 100   3   1
## 5  1 20.17 5.625 1.71  1  1  0  1 120   0   1

# Fix the random seed so that the random split below is reproducible
set.seed(1)

# Split the data into training, validation, and test sets as shown in the
# class video: 60% for training and 20% each for validation and test

# Draw the row indices of the training set: 60% of all rows, rounded down
n_total <- nrow(creditcard_data)
mask_trainingset <- sample(n_total, size = floor(n_total * 0.6))

# Training data set
creditcard_training <- creditcard_data[mask_trainingset, ]

# Every row that was not drawn for training is kept for validation and test
remaining_data <- creditcard_data[-mask_trainingset, ]

# Draw half of the remaining rows (rounded down) for the validation set
n_remaining <- nrow(remaining_data)
mask_validationset <- sample(n_remaining, size = floor(n_remaining / 2))

# Validation data set
creditcard_validation <- remaining_data[mask_validationset, ]

# Test data set: the remaining rows that were not drawn for validation
creditcard_test <- remaining_data[-mask_validationset, ]

# Now, validation and test set have 131 observations, which is 20% of total 654 observations


# We have to select the best of 9 SVM models and 20 KNN models

# Validation accuracy (%) of every candidate model:
# slots 1-9 hold the SVM models, slots 10-29 hold the KNN models
accu <- rep(0, 29)

# Candidate values of the SVM cost parameter C
amounts_c <- c(0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000)

# Fit one SVM per candidate C on the training set and score it on the
# validation set; the accuracy of the i-th C value is stored in accu[i]
for (i in seq_along(amounts_c)) {
  # C-classification with a simple linear kernel (vanilladot);
  # columns 1:10 are the predictors and column 11 is the response
  creditcardmodel_scaled <- ksvm(
    as.matrix(creditcard_training[, 1:10]),
    as.factor(creditcard_training[, 11]),
    type = "C-svc",
    kernel = "vanilladot",
    C = amounts_c[i],
    scaled = TRUE
  )

  # Predicted class for every validation row
  pred <- predict(creditcardmodel_scaled, creditcard_validation[, 1:10])

  # Percentage of validation rows whose prediction matches the actual class
  n_correct <- sum(pred == creditcard_validation$V11)
  accu[i] <- (n_correct / nrow(creditcard_validation)) * 100
}

##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters  
##  Setting default kernel parameters

# Accuracy percentage of models
accu[1:9]

## [1] 57.25191 57.25191 69.46565 87.78626 87.78626 87.78626 87.78626 87.78626
## [9] 87.78626

# Pick the SVM model with the highest validation accuracy.
# which.max returns the first position when several models tie, so the
# smallest C among the tied models is the one reported.
svm_accu <- accu[1:9]
best_svm <- which.max(svm_accu)

cat("Best SVM model number= ", best_svm, "\n")

## Best SVM model number=  4

cat("Best C value =", amounts_c[best_svm], "\n")

## Best C value = 0.01

cat("Best validation set accuracy (%)=", max(svm_accu), "\n")

## Best validation set accuracy (%)= 87.78626

# Refit the SVM on the training data with the C value that scored best on
# the validation set

best_c <- amounts_c[which.max(accu[1:9])]
creditcardmodel_scaled <- ksvm(
  as.matrix(creditcard_training[, 1:10]),
  as.factor(creditcard_training[, 11]),
  type = "C-svc",
  kernel = "vanilladot",
  C = best_c,
  scaled = TRUE
)

##  Setting default kernel parameters

# Report the fraction (not percentage) of test rows classified correctly
svm_test_pred <- predict(creditcardmodel_scaled, creditcard_test[, 1:10])
svm_test_hits <- sum(svm_test_pred == creditcard_test$V11)
cat("Acuracy on test dataset = ", svm_test_hits / nrow(creditcard_test), "\n")

## Acuracy on test dataset =  0.8625954

# We have to Train KNN models

# Score a k-nearest-neighbor model for every k from 1 to 20; the validation
# accuracy for a given k is stored in accu[k + 9] (slots 10-29)
for (k in seq_len(20)) {
  # Fit on the training set and predict the validation rows with k neighbors
  knn_model <- kknn(
    V11 ~ .,
    creditcard_training,
    creditcard_validation,
    k = k,
    scale = TRUE
  )

  # Adding 0.5 and truncating turns each fitted value into a 0/1 class
  prediction <- as.integer(fitted(knn_model) + 0.5)

  # Percentage of validation rows whose prediction matches the actual class
  knn_hits <- sum(prediction == creditcard_validation$V11)
  accu[k + 9] <- (knn_hits / nrow(creditcard_validation)) * 100
}

# Accuracy percentage of models
accu[10:29]

##  [1] 75.57252 75.57252 75.57252 75.57252 81.67939 82.44275 81.67939 81.67939
##  [9] 81.67939 83.96947 83.96947 83.96947 83.96947 83.96947 83.96947 83.96947
## [17] 83.96947 83.96947 83.96947 83.96947

# Report the best KNN model and its validation accuracy (%).
# The position within accu[10:29] is the value of k itself; which.max
# returns the first position when several values of k tie.
knn_accu <- accu[10:29]
cat("Best KNN model number=", which.max(knn_accu), "\n")

## Best KNN model number= 10

cat("Best validation set accuracy (%)= ", max(knn_accu), "\n")

## Best validation set accuracy (%)=  83.96947

# Check the best KNN model (the k with the highest validation accuracy)
# on the held-out test data

best_k <- which.max(accu[10:29])
knn_model <- kknn(
  V11 ~ .,
  creditcard_training,
  creditcard_test,
  k = best_k,
  scale = TRUE
)

# Adding 0.5 and truncating turns each fitted value into a 0/1 class
prediction <- as.integer(fitted(knn_model) + 0.5)

# Report the fraction (not percentage) of test rows classified correctly
knn_test_hits <- sum(prediction == creditcard_test$V11)
cat("Accuracy on test dataset = ", knn_test_hits / nrow(creditcard_test), "\n")

## Accuracy on test dataset =  0.8778626

# Overall best model evaluation on test data
#
# Choose between the SVM and KNN families by validation accuracy and report
# the chosen model's accuracy on the held-out test set. accu[1:9] holds the
# SVM validation accuracies and accu[10:29] the KNN ones, so an overall
# maximum at position 9 or lower means an SVM model won.

if (which.max(accu) <= 9) {
  # evaluating the ksvm method on the test dataset
  # (creditcardmodel_scaled is the SVM that was refit with the best C)

  cat("Use ksvm with C = ", amounts_c[which.max(accu[1:9])], "\n")
  cat("Accuracy on test dataset = ", sum(predict(creditcardmodel_scaled, creditcard_test[, 1:10]) == creditcard_test$V11) / nrow(creditcard_test), "\n")
} else {
  # evaluating the knn method on the test dataset

  cat("Use knn with k = ", which.max(accu[10:29]), "\n")
  # `prediction` holds the best KNN model's predictions for creditcard_test,
  # so it must be scored against the test labels. Scoring it against the
  # validation labels raises no warning (both sets have the same number of
  # rows) but reports a meaningless number.
  cat("Accuracy on test data= ", sum(prediction == creditcard_test$V11) / nrow(creditcard_test), "\n")
}

## Use ksvm with C =  0.01 
## Accuracy on test dataset =  0.8625954

# Final Answer
#Use ksvm with C =  0.01 
#Accuracy on test dataset =  0.8625954

# HW2_Fall20_Q3.1-b-.R
#
# Ashutosh Sharma
#
# 2020-09-01