Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_144')
library(kernlab)  
library(kknn) 

Question 2.1: Describe a situation or problem from your job, everyday life, current events, etc., for which a classification model would be appropriate. List some (up to 5) predictors that you might use.

I work a lot with companies that service home loans, and one of the biggest classification/modelling challenges we come across is estimating the probability of default. Some of the major predictors are property-value-to-income ratio, interest rate, net worth, credit score, and demographics (age, gender, education, etc.).

Question 2.2: The files credit_card_data.txt (without headers) and credit_card_data-headers.txt (with headers) contain a dataset with 654 data points, 6 continuous and 4 binary predictor variables. It has anonymized credit card applications with a binary response variable (last column) indicating if the application was positive or negative.

Part 1: Using the support vector machine function ksvm contained in the R package kernlab, find a good classifier for this data. Show the equation of your classifier, and how well it classifies the data points in the full data set. (Don’t worry about test/validation data yet; we’ll cover that topic soon.)

#load the data 
data <- read.table("C:\\Users\\vabhagat\\Downloads\\credit_card_data.txt", header=F, stringsAsFactors = F) 
head(data)
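Before modelling, a quick structural check (a supplementary sketch, not required by the question) can confirm the 654 rows and the binary response described above.

dim(data)        #expect 654 rows and 11 columns (10 predictors + response)
table(data[,11]) #distribution of the binary response in the last column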
#Define Model
data_model = ksvm(as.matrix(data[,1:10]),as.factor(data[,11]), C=50, scaled = T, kernel="vanilladot", type = "C-svc")
##  Setting default kernel parameters
#(NOTE: Tried several different values for C, but the training error did not change significantly; a comparison sketch follows the model summary below.)
data_model
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 50 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 189 
## 
## Objective Function Value : -8944.221 
## Training error : 0.136086
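To support the note above about C, here is a minimal sketch of how a few values of C could be compared on training accuracy; the candidate values below are illustrative, not the exact set that was tried.

c_values = c(0.01, 0.1, 1, 10, 100, 1000) #illustrative candidate costs
for (c_val in c_values){
  m = ksvm(as.matrix(data[,1:10]), as.factor(data[,11]), C=c_val, scaled = T, kernel="vanilladot", type = "C-svc")
  acc = sum(predict(m, data[,1:10]) == data[,11]) / nrow(data) * 100
  cat("C =", c_val, "-> training accuracy =", round(acc, 2), "%\n")
}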
#Find the hyperplane weights: w = colSums(coef_i * support_vector_i), computed on the scaled predictors
Weights = colSums(data_model@xmatrix[[1]] * data_model@coef[[1]])
Weights
##            V1            V2            V3            V4            V5 
## -0.0010523630 -0.0012025131 -0.0015382662  0.0028761998  1.0052764944 
##            V6            V7            V8            V9           V10 
## -0.0024958086  0.0001810245 -0.0006514829 -0.0013757143  0.1064002847
#calculate w0
w0 = -data_model@b
w0
## [1] 0.08147145
#The classifier's equation is then: -0.0010523630*v1 - 0.0012025131*v2 - 0.0015382662*v3 + 0.0028761998*v4 + 1.0052764944*v5 - 0.0024958086*v6 + 0.0001810245*v7 - 0.0006514829*v8 - 0.0013757143*v9 + 0.1064002847*v10 + 0.08147145 = 0 (since scaled = T, these coefficients apply to the scaled predictor values).

#model prediction
data_model_pred = predict(data_model, data[,1:10])
data_model_pred
##   [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [351] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [386] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [421] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [456] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## [561] 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [596] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [631] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
#Let's observe the percentage of the model's correct predictions.
sum(data_model_pred == data[,11]) / nrow(data) * 100
## [1] 86.39144
#The model's accuracy is 86.39%.
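As a supplementary check (not required by the question), a confusion table shows how the misclassifications split between the two classes.

table(predicted = data_model_pred, actual = data[,11])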

Part 2: You are welcome, but not required, to try other (nonlinear) kernels as well; we’re not covering them in this course, but they can sometimes be useful and might provide better predictions than vanilladot.

Skipping a full analysis for now as it’s optional; a brief sketch of trying a nonlinear kernel is included below for reference.
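A minimal sketch of how a nonlinear (RBF/Gaussian) kernel could be tried with the same setup as Part 1; C = 50 is carried over only for illustration, and the kernel width sigma is left to kernlab's automatic estimation.

rbf_model = ksvm(as.matrix(data[,1:10]), as.factor(data[,11]), C=50, scaled = T, kernel="rbfdot", type = "C-svc")
sum(predict(rbf_model, data[,1:10]) == data[,11]) / nrow(data) * 100 #training accuracy (%)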

Part 3: Using the k-nearest-neighbors classification function kknn contained in the R kknn package, suggest a good value of k, and show how well it classifies the data points in the full data set. Don’t forget to scale the data (scale=TRUE in kknn).

#Leave-one-out check: for a given k, predict each row using all other rows and return the accuracy
chk = function(Z){
  model_pred = rep(0, nrow(data))
  
  for (i in 1:nrow(data)){
    #train on every row except i, then predict row i (ensures it doesn't use i itself)
    knn_model = kknn(V11~V1+V2+V3+V4+V5+V6+V7+V8+V9+V10, data[-i,], data[i,], k=Z, scale = T)
    #fitted() returns a continuous value because V11 is numeric, so round it to 0 or 1
    model_pred[i] <- as.integer(fitted(knn_model) + 0.5)
  }
  
  acc = sum(model_pred == data[,11]) / nrow(data)
  return(acc)
}

tst_vec = rep(0,30) #vector to hold the leave-one-out accuracy for k = 1 to 30
for (Z in 1:30){
  tst_vec[Z] = chk(Z) 
}
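The same sweep can be written more compactly with sapply; this is just an equivalent alternative to the loop above.

tst_vec = sapply(1:30, chk) #accuracy for each k from 1 to 30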


knn_acc = as.matrix(tst_vec * 100) #accuracy percentage
knn_acc
##           [,1]
##  [1,] 81.49847
##  [2,] 81.49847
##  [3,] 81.49847
##  [4,] 81.49847
##  [5,] 85.16820
##  [6,] 84.55657
##  [7,] 84.70948
##  [8,] 84.86239
##  [9,] 84.70948
## [10,] 85.01529
## [11,] 85.16820
## [12,] 85.32110
## [13,] 85.16820
## [14,] 85.16820
## [15,] 85.32110
## [16,] 85.16820
## [17,] 85.16820
## [18,] 85.16820
## [19,] 85.01529
## [20,] 85.01529
## [21,] 84.86239
## [22,] 84.70948
## [23,] 84.40367
## [24,] 84.55657
## [25,] 84.55657
## [26,] 84.40367
## [27,] 84.09786
## [28,] 83.79205
## [29,] 83.94495
## [30,] 84.09786
knn_val <- 1:30 #k values tested
plot(knn_val,knn_acc) #Accuracy for every value of k
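From the accuracies above, the best value of k can also be picked out programmatically; the table shows the highest leave-one-out accuracy, about 85.32%, at k = 12 (tied with k = 15), so k = 12 is a good suggested value.

which.max(tst_vec) #k with the highest accuracy
max(tst_vec) * 100 #best accuracy (%)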