library("dplyr")
library("tidyr")
library("ggplot2")
library("ROCR")
library("rpart")
library("rpart.plot")
library("caret")
library("randomForest")
library("tidyverse")
library("tm")
library("SnowballC")
library("softImpute")
library("glmnet")
library("Hmisc")
library("dummies")
library('tinytex')
library('GGally')
library('gplots')
library('FNN')
library("dplyr")
library("tidyr")
library("caTools")
library("ggpubr")
library("reshape")
# ---- Session setup and data import ----
# Clear the global environment, set the working directory and load the data.
rm(list = ls())
setwd("/Users/kayhanbabakan/OneDrive/MIT/Data Mining/Data_export")
bank <- read.csv(
  "/Users/kayhanbabakan/OneDrive/MIT/Data Mining/Data_export/UniversalBank.csv"
)

# ---- Feature preparation ----
# Education is converted to a factor before dummy.data.frame() is applied
# (the resulting columns are referred to below as Education1..Education3).
bank$Education <- as.factor(bank$Education)
bank_dummy <- dummy.data.frame(
  select(bank, -c(ZIP.Code, ID))
)
bank_dummy$Personal.Loan <- as.factor(bank_dummy$Personal.Loan)
# NOTE(review): as.integer() truncates CCAvg towards zero - confirm intended.
bank_dummy$CCAvg <- as.integer(bank_dummy$CCAvg)

# ---- 60% training / 40% validation partition ----
set.seed(1)
train.index <- sample(rownames(bank_dummy), 0.6 * nrow(bank_dummy)) ## need to look at hints
test.index <- setdiff(rownames(bank_dummy), train.index)
train.df <- bank_dummy[train.index, ]
valid.df <- bank_dummy[test.index, ]

# ---- New customer to classify ----
new.df <- data.frame(
  Age = 40L,
  Experience = 10L,
  Income = 84L,
  Family = 2L,
  CCAvg = 2L,
  Education1 = 0L,
  Education2 = 1L,
  Education3 = 0L,
  Mortgage = 0L,
  Securities.Account = 0L,
  CD.Account = 0L,
  Online = 1L,
  CreditCard = 1L
)

# ---- Standardization ----
# Centering/scaling parameters are estimated on the training predictors only
# and then applied to every data set. Column 10 is left out of the scaling
# (presumably the Personal.Loan outcome - verify against the CSV layout).
norm.values <- preProcess(train.df[, -10], method = c("center", "scale"))
train.df[, -10] <- predict(norm.values, train.df[, -10])
valid.df[, -10] <- predict(norm.values, valid.df[, -10])
new.df <- predict(norm.values, new.df)
# Classify the standardized new customer from its 5 nearest training
# neighbours; column 10 of train.df supplies the class labels.
knn.1 <- knn(
  train = train.df[, -10],
  test = new.df,
  cl = train.df[, 10],
  k = 5,
  prob = TRUE
)
# Keep the attributes of the result and show the predicted level(s).
knn.attributes <- attributes(knn.1)
knn.attributes["levels"]
$levels
[1] "0"
# Show the "prob" attribute stored on the knn() result.
knn.attributes["prob"]
$prob
[1] 1
All 5 nearest neighbors are classified as 0, so the customer is also classified as 0.
# Validation accuracy for each candidate k from 1 to 14: one row per k,
# with the accuracy column filled in by the loop below.
accuracy.df <- data.frame(
  k = seq(from = 1, to = 14, by = 1),
  accuracy = numeric(14)
)
for (i in seq_len(nrow(accuracy.df))) {
  # Fit on the training predictors and score the validation predictors.
  knn.2 <- knn(
    train = train.df[, -10],
    test = valid.df[, -10],
    cl = train.df[, 10],
    k = i,
    prob = TRUE
  )
  accuracy.df$accuracy[i] <-
    confusionMatrix(knn.2, valid.df[, 10])$overall[["Accuracy"]]
}
accuracy.df
The best choice of k, balancing validation accuracy against the risk of overfitting, is k = 3.
# Re-run k-NN on the validation data with the selected k = 3 and report the
# confusion matrix against the validation labels (column 10).
knn.3 <- knn(
  train = train.df[, -10],
  test = valid.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
confusionMatrix(data = knn.3, reference = valid.df[, 10])
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1787 64
1 8 141
Accuracy : 0.964
95% CI : (0.9549, 0.9717)
No Information Rate : 0.8975
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7774
Mcnemar's Test P-Value : 9.063e-11
Sensitivity : 0.9955
Specificity : 0.6878
Pos Pred Value : 0.9654
Neg Pred Value : 0.9463
Prevalence : 0.8975
Detection Rate : 0.8935
Detection Prevalence : 0.9255
Balanced Accuracy : 0.8417
'Positive' Class : 0
The confusion matrix for the validation data using k = 3 is shown above.
# Classify the new customer with the chosen k = 3.
#
# The training predictors were centered and scaled with `norm.values`, so the
# customer record must go through the same transformation before distances
# are computed; passing it on its raw scale makes the neighbour search
# meaningless. The dummy columns are named Education1..Education3 (as in
# `new.df`) so that predict() can match them to the fitted preprocessing.
customer.df <- data.frame(
  Age = 40,
  Experience = 10,
  Income = 84,
  Family = 2,
  CCAvg = 2,
  Education1 = 0,
  Education2 = 1,
  Education3 = 0,
  Mortgage = 0,
  Securities.Account = 0,
  CD.Account = 0,
  Online = 1,
  CreditCard = 1
)
# Apply the training-set centering/scaling to the customer record.
customer.df <- predict(norm.values, customer.df)
# knn() matches predictors by position, not by name: force the same column
# order as the training predictors (errors loudly if a column is missing).
customer.df <- customer.df[, colnames(train.df)[-10], drop = FALSE]
knn.4 <- knn(
  train = train.df[, -10],
  test = customer.df,
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
knn.4
[1] 1
attr(,"prob")
[1] 1
attr(,"nn.index")
[,1] [,2] [,3]
[1,] 2721 2146 939
attr(,"nn.dist")
[,1] [,2] [,3]
[1,] 90.51969 90.53808 90.56426
Levels: 1
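In the output shown above, the customer is classified as 1 with 100% probability (all 3 nearest neighbors are class 1). Note, however, that the neighbor distances of about 90 are far larger than expected for standardized predictors, which indicates the customer record was compared on its raw scale; the classification should be re-checked with the record standardized in the same way as the training data.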
# ---- Rebuild the modelling data from the raw bank table ----
bank_dummy <- dummy.data.frame(
  select(bank, -c(ZIP.Code, ID))
)
bank_dummy$Personal.Loan <- as.factor(bank_dummy$Personal.Loan)
bank_dummy$CCAvg <- as.integer(bank_dummy$CCAvg)

# ---- 50% training / 30% validation / 20% test partition ----
# Training rows are drawn first; validation rows are drawn from what is left
# (the seed is reset before each draw); the remainder forms the test set.
set.seed(1)
train.index <- sample(rownames(bank_dummy), 0.5 * nrow(bank_dummy)) ## need to look at hints
set.seed(1)
valid.index <- sample(
  setdiff(rownames(bank_dummy), train.index),
  0.3 * nrow(bank_dummy)
)
test.index <- setdiff(rownames(bank_dummy), union(train.index, valid.index))
train.df <- bank_dummy[train.index, ]
valid.df <- bank_dummy[valid.index, ]
test.df <- bank_dummy[test.index, ]

# ---- Standardization ----
# Parameters are estimated on the training predictors only (column 10 is
# excluded) and then applied to all three partitions.
norm.values <- preProcess(train.df[, -10], method = c("center", "scale"))
train.df[, -10] <- predict(norm.values, train.df[, -10])
valid.df[, -10] <- predict(norm.values, valid.df[, -10])
test.df[, -10] <- predict(norm.values, test.df[, -10])

# ---- k-NN (k = 3) scored on each partition ----
# Every fit uses the training predictors and labels; only the scored set differs.
testknn <- knn(
  train = train.df[, -10],
  test = test.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
validknn <- knn(
  train = train.df[, -10],
  test = valid.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
trainknn <- knn(
  train = train.df[, -10],
  test = train.df[, -10],
  cl = train.df[, 10],
  k = 3,
  prob = TRUE
)
# Confusion matrix for the test partition (predictions vs. column-10 labels).
confusionMatrix(data = testknn, reference = test.df[, 10])
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 890 34
1 2 74
Accuracy : 0.964
95% CI : (0.9505, 0.9747)
No Information Rate : 0.892
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7852
Mcnemar's Test P-Value : 2.383e-07
Sensitivity : 0.9978
Specificity : 0.6852
Pos Pred Value : 0.9632
Neg Pred Value : 0.9737
Prevalence : 0.8920
Detection Rate : 0.8900
Detection Prevalence : 0.9240
Balanced Accuracy : 0.8415
'Positive' Class : 0
# Confusion matrix for the validation partition (predictions vs. column-10 labels).
confusionMatrix(data = validknn, reference = valid.df[, 10])
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1355 45
1 5 95
Accuracy : 0.9667
95% CI : (0.9563, 0.9752)
No Information Rate : 0.9067
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7741
Mcnemar's Test P-Value : 3.479e-08
Sensitivity : 0.9963
Specificity : 0.6786
Pos Pred Value : 0.9679
Neg Pred Value : 0.9500
Prevalence : 0.9067
Detection Rate : 0.9033
Detection Prevalence : 0.9333
Balanced Accuracy : 0.8374
'Positive' Class : 0
# Confusion matrix for the training partition (predictions vs. column-10 labels).
confusionMatrix(data = trainknn, reference = train.df[, 10])
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2264 51
1 4 181
Accuracy : 0.978
95% CI : (0.9715, 0.9834)
No Information Rate : 0.9072
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.8563
Mcnemar's Test P-Value : 5.552e-10
Sensitivity : 0.9982
Specificity : 0.7802
Pos Pred Value : 0.9780
Neg Pred Value : 0.9784
Prevalence : 0.9072
Detection Rate : 0.9056
Detection Prevalence : 0.9260
Balanced Accuracy : 0.8892
'Positive' Class : 0
Test Accuracy : 0.964 Valid Accuracy: 0.9667 Train Accuracy: 0.978
Because the model is fit on the training data, it makes intuitive sense that the classifications are most accurate on the training set and least accurate on the test set.