Sys.setenv(JAVA_HOME='C:\\Program Files\\Java\\jre1.8.0_144')
library(kernlab)
library(kknn)
I work a lot with companies that service home loans. One of the biggest classification/modelling challenges we come across is estimating the probability of default. Some of the major predictors are the property-value-to-income ratio, interest rates, net worth, credit scores and demographics (age, gender, education, etc.).
# Load the data. The file has no header row, so read.table() names the
# columns V1..V11; columns 1-10 are used as predictors and column 11 as the
# class label below.
data <- read.table(
  "C:\\Users\\vabhagat\\Downloads\\credit_card_data.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
head(data)
# Define the model: a C-classification SVM with a linear ("vanilladot")
# kernel and cost C = 50. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
data_model <- ksvm(
  as.matrix(data[, 1:10]),
  as.factor(data[, 11]),
  C = 50,
  scaled = TRUE,
  kernel = "vanilladot",
  type = "C-svc"
)
## Setting default kernel parameters
# (NOTE: tried several different values for C, but the training error did
# not change significantly.)
data_model
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 50
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 189
##
## Objective Function Value : -8944.221
## Training error : 0.136086
# Find the weights of the linear classifier: multiply each support vector
# (rows of xmatrix) by its coefficient and sum over the support vectors,
# giving one weight per predictor column.
Weights <- local({
  support_vectors <- data_model@xmatrix[[1]]
  sv_coefficients <- data_model@coef[[1]]
  colSums(support_vectors * sv_coefficients)
})
Weights
## V1 V2 V3 V4 V5
## -0.0010523630 -0.0012025131 -0.0015382662 0.0028761998 1.0052764944
## V6 V7 V8 V9 V10
## -0.0024958086 0.0001810245 -0.0006514829 -0.0013757143 0.1064002847
# Calculate the intercept w0 as the negated `b` slot of the fitted model.
w0 <- -(data_model@b)
w0
## [1] 0.08147145
#The classifier's equation (in terms of the scaled predictors, since the model was fit with scaled = T) is then: -0.0010523630v1 -0.0012025131v2 -0.0015382662v3 + 0.0028761998v4 + 1.0052764944v5 -0.0024958086v6 + 0.0001810245v7 -0.0006514829v8 -0.0013757143v9 + 0.1064002847v10 + 0.08147145 = 0
# Model prediction: classify the same ten predictor columns that the model
# was trained on (so this measures training-set performance).
data_model_pred <- predict(data_model, data[, seq_len(10)])
data_model_pred
## [1] 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [351] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [386] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [421] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [456] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## [561] 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## [596] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [631] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
# Percentage of rows whose predicted class equals the label in column 11.
local({
  n_correct <- sum(data_model_pred == data[, 11])
  n_correct / nrow(data) * 100
})
## [1] 86.39144
#The model's accuracy is 86.39%.
Skipping for now as it’s optional.
#' Leave-one-out accuracy of a k-nearest-neighbours model.
#'
#' For each row i of the global data frame `data`, fits kknn() on every other
#' row, predicts row i, and rounds the fitted value to an integer class.
#'
#' @param Z Number of neighbours, passed to kknn() as `k`.
#' @return Fraction (0 to 1) of rows whose rounded prediction equals column 11
#'   of `data`.
chk <- function(Z) {
  n_obs <- nrow(data)
  model_pred <- rep(0, n_obs)
  # seq_len() yields an empty sequence for zero rows, where 1:n_obs would
  # iterate over c(1, 0).
  for (i in seq_len(n_obs)) {
    # Hold out row i so the point is never used as its own neighbour.
    knn_model <- kknn(
      V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
      data[-i, ],
      data[i, ],
      k = Z,
      scale = TRUE
    )
    # Adding 0.5 before truncating rounds the fitted value to the nearest
    # integer (halves round up).
    model_pred[i] <- as.integer(fitted(knn_model) + 0.5)
  }
  sum(model_pred == data[, 11]) / n_obs
}
# Accuracy for each neighbour count k = 1..30; slot Z holds chk(Z).
tst_vec <- numeric(30)
for (Z in seq_len(30)) {
  tst_vec[Z] <- chk(Z)
}
# Convert the fractions to percentages, stored as a 30 x 1 matrix.
knn_acc <- matrix(100 * tst_vec, ncol = 1)
knn_acc
## [,1]
## [1,] 81.49847
## [2,] 81.49847
## [3,] 81.49847
## [4,] 81.49847
## [5,] 85.16820
## [6,] 84.55657
## [7,] 84.70948
## [8,] 84.86239
## [9,] 84.70948
## [10,] 85.01529
## [11,] 85.16820
## [12,] 85.32110
## [13,] 85.16820
## [14,] 85.16820
## [15,] 85.32110
## [16,] 85.16820
## [17,] 85.16820
## [18,] 85.16820
## [19,] 85.01529
## [20,] 85.01529
## [21,] 84.86239
## [22,] 84.70948
## [23,] 84.40367
## [24,] 84.55657
## [25,] 84.55657
## [26,] 84.40367
## [27,] 84.09786
## [28,] 83.79205
## [29,] 83.94495
## [30,] 84.09786
# Neighbour counts 1..30 (kept as doubles) for the x axis.
knn_val <- as.numeric(1:30)
# Accuracy (%) for every value of k.
plot(knn_val, knn_acc)