install.packages("knitr", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/lenovo/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'knitr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lenovo\AppData\Local\Temp\RtmpkJEMMv\downloaded_packages
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
# Ask knitr to tidy the echoed source code, with a width cutoff of 100.
opts_chunk$set(tidy = TRUE, tidy.opts = list(width.cutoff = 100))
Q 2.1 Describe a situation or problem from your job, everyday life, current events, etc., for which a classification model would be appropriate. List some (up to 5) predictors that you might use.
I work as a consultant in Data Regulation for banks. One way classification can be useful is in Anti-Money Laundering, specifically Transaction Monitoring. Classification can be used to separate transactions that show suspicious activity from those that do not. Some examples of predictor variables that can be used are:
Amount being transacted.
Average amount per transaction for a specific account
Average number of transactions per month
The originating location of the transaction
Time since last transaction
Transaction type e.g wire transfer
Q 2.2.1. Using the support vector machine function ksvm contained in the R package kernlab, find a good classifier for this data. Show the equation of your classifier, and how well it classifies the data points in the full data set. (Don’t worry about test/validation data yet; we’ll cover that topic soon.)
{r echo=FALSE , results=FALSE ,message=FALSE, warning=FALSE}
install.packages("kernlab", repos = "http://cran.us.r-project.org")
install.packages("kknn", repos="http://cran.us.r-project.org")
library(kernlab)
library("kknn")
Read data
# Load the credit card data set. The file has no header row, so read.table
# names the columns V1, V2, ... automatically.
data <- read.table(
  "C:/Users/lenovo/Documents/omsa/ISYE 6501 - Introduction to Analytics Modelling/Homeworks/hw1/data 2.2/credit_card_data.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Show the first few rows as a sanity check
head(data)
One of the challenges of this homework is to find a value of C that works well. We will therefore try different values of C and see which one works the best.
# Candidate values of the cost parameter C, spanning several orders of magnitude
C_values <- c(0.00001, 0.001, 100, 1000, 10000000)
# Fit one linear-kernel SVM per candidate and report its accuracy on the full
# data set. The loop variable is called `cost` rather than `c` so that it does
# not mask base::c().
for (cost in C_values) {
  model <- ksvm(as.matrix(data[, 1:10]), as.factor(data[, 11]),
    C = cost, scaled = TRUE, kernel = "vanilladot", type = "C-svc"
  )
  # Predict the class of every row the model was fitted on
  pred <- predict(model, data[, 1:10])
  # Fraction of rows whose predicted class equals the label in column 11
  accuracy <- sum(pred == data[, 11]) / nrow(data)
  print(paste0("When C is ", cost, " the model accuracy is ", accuracy))
}
## Setting default kernel parameters
## [1] "When C is 1e-05 the model accuracy is 0.547400611620795"
## Setting default kernel parameters
## [1] "When C is 0.001 the model accuracy is 0.837920489296636"
## Setting default kernel parameters
## [1] "When C is 100 the model accuracy is 0.863914373088685"
## Setting default kernel parameters
## [1] "When C is 1000 the model accuracy is 0.862385321100917"
## Setting default kernel parameters
## [1] "When C is 1e+07 the model accuracy is 0.545871559633027"
As seen above, among the values of C that were tried, the model has the highest accuracy (about 0.864) when C is 100; I therefore settled on C = 100.
So my model definition is as follows
# Refit the linear-kernel SVM with the chosen cost, C = 100
best_model <- ksvm(
  as.matrix(data[, 1:10]),
  as.factor(data[, 11]),
  type = "C-svc",
  kernel = "vanilladot",
  C = 100,
  scaled = TRUE
)
## Setting default kernel parameters
# Predictions of the chosen model for every row of the full data set
pred_best <- predict(best_model, data[, seq_len(10)])
# Share of rows whose predicted class equals the label in column 11
accuracy_best <- sum(pred_best == data[[11]]) / nrow(data)
# Print the fitted model summary
best_model
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 100
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 189
##
## Objective Function Value : -17887.92
## Training error : 0.136086
# Print the full-data accuracy of the chosen model
accuracy_best
## [1] 0.8639144
# Coefficients a1..am of the linear classifier: every row of the model's
# xmatrix is weighted by the matching entry of coef, and the weighted rows
# are summed column by column.
a <- colSums(best_model@coef[[1]] * best_model@xmatrix[[1]])
a
## V1 V2 V3 V4 V5
## -0.0010065348 -0.0011729048 -0.0016261967 0.0030064203 1.0049405641
## V6 V7 V8 V9 V10
## -0.0028259432 0.0002600295 -0.0005349551 -0.0012283758 0.1063633995
# Intercept a0: the negated offset `b` stored on the fitted model
a0 <- -(best_model@b)
a0
## [1] 0.08158492
# Column names of the data set, used as variable labels in the equation
x <- colnames(data)
# Build "a1V1+a2V2+...+a10V10+" in one vectorised paste0() call, then append
# the intercept a0 and print the resulting equation string.
print(paste0(paste0(a[seq_len(10)], x[seq_len(10)], "+", collapse = ""), a0))
## [1] "-0.00100653481057611V1+-0.00117290480611665V2+-0.00162619672236963V3+0.0030064202649194V4+1.00494056410556V5+-0.00282594323043472V6+0.000260029507016313V7+-0.000534955143494997V8+-0.00122837582291523V9+0.106363399527188V10+0.081584921659538"
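The classifier equation is therefore -0.00100653481057611V1-0.00117290480611665V2-0.00162619672236963V3+0.0030064202649194V4+1.00494056410556V5-0.00282594323043472V6+0.000260029507016313V7-0.000534955143494997V8-0.00122837582291523V9+0.106363399527188V10+0.081584921659538 = 0. Note that the model was fitted with scaled = TRUE, so these coefficients apply to the scaled versions of V1–V10 rather than to the raw values.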
Q 2.2.2 try other (nonlinear) kernels as well; we’re not covering them in this course, but they can sometimes be useful and might provide better predictions than vanilladot.
The idea here is that we create a list containing other possible kernels and create a ksvm model for each kernel, then evaluate how performance varies. I will use a C value of 100 based on the previous question.
# Names of the kernlab kernels to compare
kernel_example <- c(
  "vanilladot", "rbfdot", "laplacedot", "polydot",
  "tanhdot", "besseldot", "anovadot", "splinedot"
)
# Fit one SVM per kernel (cost fixed at C = 100) and report its accuracy on
# the full data set.
for (k in kernel_example) {
  model <- ksvm(as.matrix(data[, 1:10]), as.factor(data[, 11]),
    C = 100, scaled = TRUE, kernel = k, type = "C-svc"
  )
  # Predict the class of every row the model was fitted on
  pred <- predict(model, data[, 1:10])
  # Fraction of rows whose predicted class equals the label in column 11,
  # for the current kernel
  accuracy <- sum(pred == data[, 11]) / nrow(data)
  print(paste0("When we use kernel ", k, " the model accuracy is ", accuracy))
}
## Setting default kernel parameters
## [1] "When we use kernel vanilladot the model accuracy is 0.863914373088685"
## [1] "When we use kernel rbfdot the model accuracy is 0.951070336391437"
## [1] "When we use kernel laplacedot the model accuracy is 1"
## Setting default kernel parameters
## [1] "When we use kernel polydot the model accuracy is 0.863914373088685"
## Setting default kernel parameters
## [1] "When we use kernel tanhdot the model accuracy is 0.7217125382263"
## Setting default kernel parameters
## [1] "When we use kernel besseldot the model accuracy is 0.925076452599388"
## Setting default kernel parameters
## [1] "When we use kernel anovadot the model accuracy is 0.906727828746177"
## Setting default kernel parameters
## [1] "When we use kernel splinedot the model accuracy is 0.978593272171254"
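The laplacedot kernel yields the best accuracy (100% on the full data set). Note, however, that this accuracy is measured on the same data used to fit the model, so a perfect score may reflect overfitting rather than a genuinely better classifier.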
Q 2.2.3 Using the k-nearest-neighbors classification function kknn contained in the R kknn package, suggest a good value of k, and show how well it classifies the data points in the full data set. Don’t forget to scale the data (scale=TRUE in kknn).
##Using scaled data
# Leave-one-out accuracy of a k-nearest-neighbours classifier on scaled data.
#
# For each row of the global data frame `data`, a kknn model is built from all
# the other rows and used to predict the held-out row; the fitted value is then
# thresholded at 0.5 to give a 0/1 prediction.
#
# Args:
#   X: number of neighbours, passed to kknn() as `k`.
#
# Returns:
#   The fraction of rows whose thresholded prediction equals column 11 of
#   `data`.
check_accuracy_kknn <- function(X) {
  n_rows <- nrow(data)
  predicted <- rep(0, n_rows) # predictions: start with a vector of all zeros
  # seq_len() (rather than 1:n_rows) yields no iterations when `data` is empty
  for (i in seq_len(n_rows)) {
    # data[-i, ] drops row i, so a point is never its own neighbour
    model <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
      data[-i, ], data[i, ],
      k = X, scale = TRUE
    )
    # adding 0.5 and truncating rounds the fitted value to 0 or 1
    predicted[i] <- as.integer(fitted(model) + 0.5)
  }
  # fraction of correct predictions
  sum(predicted == data[, 11]) / n_rows
}
# Evaluate the scaled kknn classifier for every k from 1 to 30
acc <- numeric(30) # one accuracy slot per candidate k
for (X in seq_len(30)) {
  acc[X] <- check_accuracy_kknn(X) # accuracy with X neighbours
}
# Show the accuracies; position in the vector corresponds to k
acc
## [1] 0.8149847 0.8149847 0.8149847 0.8149847 0.8516820 0.8455657 0.8470948
## [8] 0.8486239 0.8470948 0.8501529 0.8516820 0.8532110 0.8516820 0.8516820
## [15] 0.8532110 0.8516820 0.8516820 0.8516820 0.8501529 0.8501529 0.8486239
## [22] 0.8470948 0.8440367 0.8455657 0.8455657 0.8440367 0.8409786 0.8379205
## [29] 0.8394495 0.8409786
# Highest accuracy reached over the 30 values of k tested
max(acc)
## [1] 0.853211
The best value of k in this case is 12 (tied with k = 15), as the model yields its highest accuracy, about 0.8532, there.
Try using unscaled data by setting scale=FALSE
# Leave-one-out accuracy of a k-nearest-neighbours classifier on unscaled data.
#
# For each row of the global data frame `data`, a kknn model is built from all
# the other rows (with scale = FALSE) and used to predict the held-out row; the
# fitted value is then thresholded at 0.5 to give a 0/1 prediction.
#
# Args:
#   X: number of neighbours, passed to kknn() as `k`.
#
# Returns:
#   The fraction of rows whose thresholded prediction equals column 11 of
#   `data`.
check_accuracy_unscaled <- function(X) {
  n_rows <- nrow(data)
  predicted <- rep(0, n_rows)
  # seq_len() (rather than 1:n_rows) yields no iterations when `data` is empty
  for (i in seq_len(n_rows)) {
    # model creation using unscaled data, leaving row i out of the training set
    model <- kknn(V11 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10,
      data[-i, ], data[i, ],
      k = X, scale = FALSE
    )
    # adding 0.5 and truncating rounds the fitted value to 0 or 1
    predicted[i] <- as.integer(fitted(model) + 0.5)
  }
  # fraction of correct predictions
  sum(predicted == data[, 11]) / n_rows
}
# Evaluate the unscaled kknn classifier for every k from 1 to 30
acc_unscaled <- numeric(30) # one accuracy slot per candidate k
for (X in seq_len(30)) {
  acc_unscaled[X] <- check_accuracy_unscaled(X) # accuracy with X neighbours
}
# Show the accuracies; position in the vector corresponds to k
acc_unscaled
## [1] 0.6636086 0.6636086 0.6636086 0.6636086 0.6911315 0.6957187 0.6926606
## [8] 0.6926606 0.6865443 0.6773700 0.6804281 0.6834862 0.6865443 0.6880734
## [15] 0.6880734 0.6926606 0.6926606 0.6926606 0.6926606 0.6911315 0.6926606
## [22] 0.6926606 0.6926606 0.6957187 0.6926606 0.6880734 0.6911315 0.6926606
## [29] 0.6941896 0.6941896
# Highest accuracy reached over the 30 values of k tested, without scaling
max(acc_unscaled)
## [1] 0.6957187
As seen from the output above, the model performs much worse with unscaled data: the best accuracy drops from about 0.853 with scaling to about 0.696 without it.