# Project: KNN

##Question 1a.

#Question 1a.
# Load the Universal Bank customer records
UB <- read.csv("UniversalBank.csv")

# Coerce CCAvg to integer and Personal.Loan to a factor
UB <- transform(
  UB,
  CCAvg = as.integer(CCAvg),
  Personal.Loan = as.factor(Personal.Loan)
)

# One 0/1 indicator column per Education level (1, 2, 3)
UB[paste0("Education_", 1:3)] <- lapply(
  1:3,
  function(lvl) as.numeric(UB$Education == lvl)
)

# Drop ID, Zip Code and the original Education column (positions 1, 5, 8)
UB1 <- UB[-c(1, 5, 8)]

# Fix the RNG state so the random partitions below are reproducible
# (set.seed() interprets its argument as an integer)
set.seed(3.14)
#' Randomly split a data frame into training, validation and test partitions
#'
#' The rows of `df` are shuffled once. The first `round(n * prop[1])` shuffled
#' rows become the training set, the next `round(n * prop[2])` the validation
#' set, and every remaining row the test set. `prop[3]` is never read: the
#' test partition is simply what is left over, so a two-element `prop` that
#' sums to 1 yields an empty test set.
#'
#' @param df Data frame to split.
#' @param prop Numeric vector whose first two elements are the training and
#'   validation proportions (non-negative, summing to at most 1).
#' @return A named list with data frames `train`, `valid` and `test`.
partitionData <- function(df, prop = c(0.60, 0.20, 0.20)) {
  stopifnot(
    is.numeric(prop), length(prop) >= 2,
    !anyNA(prop[1:2]), all(prop[1:2] >= 0),
    sum(prop[1:2]) <= 1 + 1e-8
  )

  n <- nrow(df)
  # Same permutation as sample(1:n, n), but also well-defined for n == 0
  idx <- sample.int(n)

  # Cap the sizes so rounding can never request more rows than exist
  n1 <- min(round(n * prop[1]), n)
  n2 <- min(round(n * prop[2]), n - n1)

  # seq_len() yields an empty index for a size of 0, whereas 1:0 is c(1, 0)
  train.idx <- idx[seq_len(n1)]
  valid.idx <- idx[n1 + seq_len(n2)]
  # Logical mask rather than negative indexing: idx[-integer(0)] would be empty
  test.idx <- idx[seq_len(n) > n1 + n2]

  list(
    train = df[train.idx, ],
    valid = df[valid.idx, ],
    test = df[test.idx, ]
  )
}
# 60/40 split: only training and validation partitions are used here
partition <- partitionData(UB1, prop = c(0.60, 0.40))
Customer.train <- partition[["train"]]
Customer.valid <- partition[["valid"]]

# Predictor values of the new customer to classify
CustomerPred <- data.frame(
  Age = 40,
  Experience = 10,
  Income = 84,
  Family = 2,
  CCAvg = 2,
  Mortgage = 0,
  Securities.Account = 0,
  CD.Account = 0,
  Online = 1,
  CreditCard = 1,
  Education_1 = 0,
  Education_2 = 1,
  Education_3 = 0
)

# Normalization of the data: fit the range (min-max) scaling on the training
# partition only, then apply that same fitted transformation to the
# validation partition and to the new customer record.
# The functions are namespace-qualified (caret::, FNN::) like the rest of this
# script, so the code does not depend on library() calls made elsewhere.
norm.value <- caret::preProcess(Customer.train, method = "range", cutoff = 0.5)
Customer.norm.train <- predict(norm.value, Customer.train)
Customer.norm.valid <- predict(norm.value, Customer.valid)
CustomerPred.norm <- predict(norm.value, CustomerPred)

# Classify the new customer with its single nearest neighbour (k = 1).
# Column 7 is used as the class label and excluded from the predictors.
knn.pred <- FNN::knn(train = Customer.norm.train[, -7], test = CustomerPred.norm,
                     cl = Customer.norm.train[, 7], k = 1, prob = TRUE)
knn.pred

## [1] 0
## attr(,"prob")
## [1] 1
## attr(,"nn.index")
##      [,1]
## [1,] 1091
## attr(,"nn.dist")
##           [,1]
## [1,] 0.1351852
## Levels: 0

"From the output we conclude that the above customer is classified 
 as belonging to the loan not accepted group"

## [1] "From the output we conclude that the above customer is classified \n as belonging to the loan not accepted group"

##Question 1b.

# Validation accuracy of k-NN for k = 1..14: one row per candidate k.
accuracy.df <- data.frame(k = seq(1, 14, 1), accuracy = rep(0, 14))

# The loop bound comes from the table itself so the two cannot drift apart.
for (i in seq_len(nrow(accuracy.df))) {
  # Fit on the normalized training predictors, predict the validation rows.
  knn.pred2 <- FNN::knn(train = Customer.norm.train[, -7],
                        test = Customer.norm.valid[, -7],
                        cl = Customer.norm.train[, 7], k = i, prob = TRUE)
  # Explicit levels keep the confusion matrix 2x2 even if one class is
  # never predicted; pick the accuracy by name rather than by position.
  accuracy.df[i, "accuracy"] <- caret::confusionMatrix(
    factor(knn.pred2, levels = c(0, 1)),
    factor(Customer.norm.valid[, 7], levels = c(0, 1))
  )$overall["Accuracy"]
}
accuracy.df

plot(accuracy.df, type = "l", xlab = "k")

"The choice of k that balances between overfitting and ignoring the predictor
information would be k = 3. This value is chosen because, after testing various
k levels, it minimizes the validation error. According to the validation
accuracy log for different k, the best k is 3, where the validation accuracy
is 95.95% and the validation error is 4.05%."

## [1] "The choice of k that balances between overfitting and ignoring the predictor\ninformation would be k = 3. This value is chosen because, after testing various\nk levels, it minimizes the validation error. According to the validation\naccuracy log for different k, the best k is 3, where the validation accuracy\nis 95.95% and the validation error is 4.05%."

##Question 1c.

# Classify the validation partition with the chosen k = 3
knn.pred3 <- FNN::knn(
  train = Customer.norm.train[-7],
  test = Customer.norm.valid[-7],
  cl = Customer.norm.train[[7]],
  k = 3,
  prob = TRUE
)

"Showing the Confusion Matrix"

## [1] "Showing the Confusion Matrix"

# Predicted classes against the actual validation labels (column 7)
caret::confusionMatrix(data = knn.pred3, reference = Customer.norm.valid[[7]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1795   71
##          1   10  124
##                                           
##                Accuracy : 0.9595          
##                  95% CI : (0.9499, 0.9677)
##     No Information Rate : 0.9025          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7326          
##                                           
##  Mcnemar's Test P-Value : 2.617e-11       
##                                           
##             Sensitivity : 0.9945          
##             Specificity : 0.6359          
##          Pos Pred Value : 0.9620          
##          Neg Pred Value : 0.9254          
##              Prevalence : 0.9025          
##          Detection Rate : 0.8975          
##    Detection Prevalence : 0.9330          
##       Balanced Accuracy : 0.8152          
##                                           
##        'Positive' Class : 0               
##

#Question 1d

# New customer to classify with the chosen k = 3.
customer.df <- data.frame(Age = 40, Experience = 10, Income = 84, Family = 2,
                          CCAvg = 2, Education_1 = 0, Education_2 = 1,
                          Education_3 = 0, Mortgage = 0, Securities.Account = 0,
                          CD.Account = 0, Online = 1, CreditCard = 1)

# The training predictors were range-scaled with norm.value, so the new record
# must go through the same transformation before distances are computed.
# FNN::knn compares columns by position, so the record is also reordered to
# the column order of the training predictors.
customer.norm <- predict(norm.value, customer.df)[, names(Customer.norm.train)[-7]]

knn.pred4 <- FNN::knn(train = Customer.norm.train[, -7], test = customer.norm,
                      cl = Customer.norm.train[, 7], k = 3, prob = TRUE)
knn.pred4

# NOTE: the output previously recorded here (class 1, neighbour distances of
# about 92) came from passing the raw, un-normalized record with a different
# column order to knn, so it was not a valid classification. Re-run the script
# to regenerate the output for the corrected call.

# Report the result from the prediction itself instead of hard-coding it.
sprintf("customer is classified as a %s with %.0f%% probability",
        as.character(knn.pred4), 100 * attr(knn.pred4, "prob"))

##Question 1e

"Partition of the data"

## [1] "Partition of the data"

# 50/30/20 split into training, validation and test partitions
partition1 <- partitionData(UB1, c(0.50, 0.30, 0.20))
Customer.train1 <- partition1$train
Customer.valid1 <- partition1$valid
Customer.test <- partition1$test

"Normalization of the data"

## [1] "Normalization of the data"

# Fit the range scaling on the training partition only and reuse it for the
# other two partitions. preProcess is namespace-qualified like the other
# caret:: / FNN:: calls in this script.
norm.value1 <- caret::preProcess(Customer.train1, method = "range", cutoff = 0.5)
Customer.norm.train1 <- predict(norm.value1, Customer.train1)
Customer.norm.valid1 <- predict(norm.value1, Customer.valid1)
Customertest.norm <- predict(norm.value1, Customer.test)

# k = 3 predictions for each partition, always trained on the training
# partition (column 7 is the class label and is excluded from the predictors)
testknn <- FNN::knn(train = Customer.norm.train1[, -7],
                    test = Customertest.norm[, -7],
                    cl = Customer.norm.train1[, 7], k = 3, prob = TRUE)
validknn <- FNN::knn(train = Customer.norm.train1[, -7],
                     test = Customer.norm.valid1[, -7],
                     cl = Customer.norm.train1[, 7], k = 3, prob = TRUE)
trainknn <- FNN::knn(train = Customer.norm.train1[, -7],
                     test = Customer.norm.train1[, -7],
                     cl = Customer.norm.train1[, 7], k = 3, prob = TRUE)
"Confusion Matrix for Test"

## [1] "Confusion Matrix for Test"

# Test-partition predictions against the actual labels (column 7)
caret::confusionMatrix(data = testknn, reference = Customertest.norm[[7]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 911  26
##          1   4  59
##                                           
##                Accuracy : 0.97            
##                  95% CI : (0.9574, 0.9797)
##     No Information Rate : 0.915           
##     P-Value [Acc > NIR] : 1.044e-12       
##                                           
##                   Kappa : 0.7815          
##                                           
##  Mcnemar's Test P-Value : 0.000126        
##                                           
##             Sensitivity : 0.9956          
##             Specificity : 0.6941          
##          Pos Pred Value : 0.9723          
##          Neg Pred Value : 0.9365          
##              Prevalence : 0.9150          
##          Detection Rate : 0.9110          
##    Detection Prevalence : 0.9370          
##       Balanced Accuracy : 0.8449          
##                                           
##        'Positive' Class : 0               
##

"Confusion Matrix for Validation"

## [1] "Confusion Matrix for Validation"

# Validation-partition predictions against the actual labels (column 7)
caret::confusionMatrix(data = validknn, reference = Customer.norm.valid1[[7]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1343   57
##          1   10   90
##                                           
##                Accuracy : 0.9553          
##                  95% CI : (0.9436, 0.9652)
##     No Information Rate : 0.902           
##     P-Value [Acc > NIR] : 1.122e-14       
##                                           
##                   Kappa : 0.7054          
##                                           
##  Mcnemar's Test P-Value : 1.912e-08       
##                                           
##             Sensitivity : 0.9926          
##             Specificity : 0.6122          
##          Pos Pred Value : 0.9593          
##          Neg Pred Value : 0.9000          
##              Prevalence : 0.9020          
##          Detection Rate : 0.8953          
##    Detection Prevalence : 0.9333          
##       Balanced Accuracy : 0.8024          
##                                           
##        'Positive' Class : 0               
##

"Confusion Matrix for Training"

## [1] "Confusion Matrix for Training"

# Training-partition predictions against the actual labels (column 7)
caret::confusionMatrix(data = trainknn, reference = Customer.norm.train1[[7]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2245   58
##          1    7  190
##                                          
##                Accuracy : 0.974          
##                  95% CI : (0.967, 0.9799)
##     No Information Rate : 0.9008         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8399         
##                                          
##  Mcnemar's Test P-Value : 5.584e-10      
##                                          
##             Sensitivity : 0.9969         
##             Specificity : 0.7661         
##          Pos Pred Value : 0.9748         
##          Neg Pred Value : 0.9645         
##              Prevalence : 0.9008         
##          Detection Rate : 0.8980         
##    Detection Prevalence : 0.9212         
##       Balanced Accuracy : 0.8815         
##                                          
##        'Positive' Class : 0              
##

"Accuracy for test = 0.97, Accuracy for validation = 0.9553,
Accuracy for training = 0.974"

## [1] "Accuracy for test = 0.97, Accuracy for validation = 0.9553,\nAccuracy for training = 0.974"

"As the model is being fit on the training data, it makes intuitive
sense that the classifications are most accurate on the training data set (0.974)
and less accurate on the validation (0.9553) and test (0.97) data sets,
which the model has not seen"

## [1] "As the model is being fit on the training data, it makes intuitive\nsense that the classifications are most accurate on the training data set (0.974)\nand less accurate on the validation (0.9553) and test (0.97) data sets,\nwhich the model has not seen"

# Project: KNN

# Author: PAUL BAFFOE

# Date: 3/16/2022