DATA 622 HW3

library(knitr)
library(rmdformats)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.5     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret) # For createDataPartition, featureplot, classification report

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(corrplot) # For correlation matrix

## corrplot 0.90 loaded

library(mice) # Multivariate Imputation By Chained Equations

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(parallel)
library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:purrr':
## 
##     compact

library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

## Global options


# Cap console printing at 100 entries so long objects are truncated in the
# rendered report. max.print is a numeric option, so pass a number rather
# than the string "100".
options(max.print = 100L)
# knitr package option `width`, set to 31 for this document.
opts_knit$set(width = 31)

K-Nearest Neighbors

# Load the loan-approval data. Empty fields, single-space fields and the
# default "NA" marker are all read as missing in one pass, instead of
# overriding the default marker and patching "" cells afterwards.
loan_raw <- read.csv(
  'https://raw.githubusercontent.com/metis-macys-66898/data622_fa2021/main/hw3/data/Loan_approval.csv',
  header = TRUE,
  na.strings = c("", " ", "NA")
)
# Convert every character column to a factor; other columns are left as read.
loan_raw[] <- lapply(
  loan_raw,
  function(column) if (is.character(column)) factor(column) else column
)
# Keep the raw import untouched and work on a copy.
loan <- loan_raw

Processing

Besides creating a data.frame called loan_knn, I explicitly recoded the Y/N values into 1/0’s.

# Inspect column types and the first few values of the working copy.
str(loan)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married          : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Dependents       : Factor w/ 4 levels "0","1","2","3+": 1 2 1 1 1 3 1 4 3 2 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...

# KNN-specific copy of the data with a numeric outcome.
# Loan_Status is a factor with levels "N"/"Y" (integer codes 1/2), so
# subtracting 1 maps N -> 0 and Y -> 1.
loan_knn <- loan
loan_knn[["Loan_Status"]] <- as.integer(loan_knn[["Loan_Status"]]) - 1
str(loan_knn)

## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender           : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married          : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Dependents       : Factor w/ 4 levels "0","1","2","3+": 1 2 1 1 1 3 1 4 3 2 ...
##  $ Education        : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed    : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status      : num  1 0 1 1 1 1 1 0 1 0 ...

I also used colSums to count the missing values in each column of the loan_knn data.frame.

# Number of missing values in each column of loan_knn.
vapply(loan_knn, function(column) sum(is.na(column)), numeric(1))

##           Loan_ID            Gender           Married        Dependents 
##                 0                13                 3                15 
##         Education     Self_Employed   ApplicantIncome CoapplicantIncome 
##                 0                32                 0                 0 
##        LoanAmount  Loan_Amount_Term    Credit_History     Property_Area 
##                22                14                50                 0 
##       Loan_Status 
##                 0

After that, I plotted a histogram to better illustrate where the missing values come from. It is essentially another way of looking at the distribution of missing values.

# Plot the share of missing values per variable (left panel) and the
# missingness patterns across variables (right panel), most-missing first.
# Axis labels come from the data set itself: the previous names(data)
# referred to the `data` function (no object called `data` is defined in the
# visible script), which has no names and therefore yields NULL.
aggr_plot <- aggr(
  loan_knn,
  col = c('navyblue', 'red'),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(loan_knn),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##           Variable       Count
##     Credit_History 0.081433225
##      Self_Employed 0.052117264
##         LoanAmount 0.035830619
##         Dependents 0.024429967
##   Loan_Amount_Term 0.022801303
##             Gender 0.021172638
##            Married 0.004885993
##            Loan_ID 0.000000000
##          Education 0.000000000
##    ApplicantIncome 0.000000000
##  CoapplicantIncome 0.000000000
##      Property_Area 0.000000000
##        Loan_Status 0.000000000

There is a total of 7 columns that require imputation. They can be triaged into 4 different categories.

Below we show which category each of the 7 columns falls under, followed by the method we eventually used from the mice package. The mice package implements methods for dealing with missing data; its name is short for Multivariate Imputation by Chained Equations.

Categorical Variables with more than 2 levels: Dependents(4) - polyreg (Polytomous logistic regression)

Categorical Variables with 2 levels: Gender (2), Married(3), Credit_History(11), Self_Employed (6) - logreg (Logistic regression)

Discrete variable (ordered) with more than 2 levels: Loan_Amount_Term (10) - ~~polr~~ (Proportional odds model) -> norm (Bayesian linear regression)

Continuous Variables: LoanAmount (9) - norm (Bayesian linear regression)

Additional Data (Processing) Manipulation Steps

So, first off, I need to convert Credit_History to factors so that the mice model that I’m going to use can detect that column as a categorical variable.

I also combined ApplicantIncome and CoapplicantIncome into a new variable, TotalIncome, and dropped the two input columns. Loan_ID is only an identifier and does not help with the prediction, so it is dropped as well.

# Build the frame that is handed to mice for imputation.
loan_knn_pre_imp <- loan_knn
# Credit_History is read as 0/1 integers; store it as a factor so it is
# handled as a categorical variable during imputation.
loan_knn_pre_imp$Credit_History <- as.factor(loan_knn_pre_imp$Credit_History)

# Add TotalIncome and drop its two inputs plus the Loan_ID identifier.
# The dplyr verbs are namespaced explicitly because plyr is attached after
# dplyr in this document, so a bare mutate() resolves to plyr::mutate().
loan_knn_pre_imp <- loan_knn_pre_imp %>%
  dplyr::mutate(TotalIncome = ApplicantIncome + CoapplicantIncome) %>%
  dplyr::select(
    -dplyr::all_of(c("Loan_ID", "ApplicantIncome", "CoapplicantIncome"))
  )

# Recode the Dependents level "3+" to "3".
loan_knn_pre_imp$Dependents <- plyr::revalue(
  loan_knn_pre_imp$Dependents,
  c("3+" = "3")
)

str(loan_knn_pre_imp)

## 'data.frame':    614 obs. of  11 variables:
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married         : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Dependents      : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 3 1 4 3 2 ...
##  $ Education       : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed   : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
##  $ LoanAmount      : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term: int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Property_Area   : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status     : num  1 0 1 1 1 1 1 0 1 0 ...
##  $ TotalIncome     : num  5849 6091 3000 4941 6000 ...

I’ve set up a method vector (meth) that tells mice which imputation method to use for each column, and passed it to mice together with the default predictorMatrix.

Set seed = 501. Retrieved the results.

# loan_knn_imp1 <- mice (loan_knn_pre_imp, method = c("",
#                                                     "logreg",
#                                                     "logreg",
#                                                     "polyreg",
#                                                     "",
#                                                     "logreg",
#                                                     "",
#                                                     "",
#                                                     "cart",
#                                                     "polr",
#                                                     "logreg",
#                                                     "",
#                                                     ""
#                                                    ), seed=501, nnet.MaxNWts = 30100
# )
# 
# loan_knn_imp1 <- mice (loan_knn_pre_imp, seed=501, nnet.MaxNWts = 30100)
# 
# loan_knn_imp2 <- mice (loan_knn_pre_imp, method = "rf", seed=501, nnet.MaxNWts = 30100)


# loan_knn_imp2 <- parlmice(data = loan_knn_pre_imp, method = c(  "", 
#                                                                 "logreg", 
#                                                                 "logreg", 
#                                                                 "polyreg", 
#                                                                 "", 
#                                                                 "", 
#                                                                 "", 
#                                                                 "", 
#                                                                 "cart",
#                                                                 "polr",
#                                                                 "logreg",
#                                                                 "",
#                                                                 ""
#                                                               ), cluster.seed = 501, nnet.MaxNWts = 30100, n.core = 2, n.imp.core = 150)



# Run mice with zero iterations to obtain its default method vector and
# predictor matrix for this data set.
init <- mice(loan_knn_pre_imp, maxit = 0)
predM <- init$predictorMatrix
meth <- init$method

# Per-column imputation methods. Earlier attempts with polr for
# Loan_Amount_Term and cart for LoanAmount were replaced by norm.
meth["Dependents"] <- "polyreg"
meth[c("Gender", "Married", "Self_Employed", "Credit_History")] <- "logreg"
meth[c("LoanAmount", "Loan_Amount_Term")] <- "norm"
# Columns with no missing values get an empty method, i.e. no imputation.
meth[c("Education", "Property_Area", "Loan_Status", "TotalIncome")] <- ""

# Imputation run with a fixed seed for reproducibility.
loan_knn_imp1 <- mice(
  loan_knn_pre_imp,
  method = meth,
  predictorMatrix = predM,
  seed = 501
)

## 
##  iter imp variable
##   1   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   1   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   2   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   3   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   4   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   1  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   2  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   3  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   4  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History
##   5   5  Gender  Married  Dependents  Self_Employed  LoanAmount  Loan_Amount_Term  Credit_History

After some manual examinations of the different imputed results, I’ve decided to go with imputed column #3.

# Manual examination of the imputed values.
# Credit_History: show rows 96-118 of the data before imputation, for
# comparison with the imputed candidates printed next.
loan_knn[96:118,]

##      Loan_ID Gender Married Dependents    Education Self_Employed
## 96  LP001326   Male      No          0     Graduate          <NA>
## 97  LP001327 Female     Yes          0     Graduate            No
## 98  LP001333   Male     Yes          0     Graduate            No
## 99  LP001334   Male     Yes          0 Not Graduate            No
## 100 LP001343   Male     Yes          0     Graduate            No
## 101 LP001345   Male     Yes          2 Not Graduate            No
## 102 LP001349   Male      No          0     Graduate            No
##     ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 96             6782                 0         NA              360
## 97             2484              2302        137              360
## 98             1977               997         50              360
## 99             4188                 0        115              180
## 100            1759              3541        131              360
## 101            4288              3263        133              180
## 102            4843              3806        151              360
##     Credit_History Property_Area Loan_Status
## 96              NA         Urban           0
## 97               1     Semiurban           1
## 98               1     Semiurban           1
## 99               1     Semiurban           1
## 100              1     Semiurban           1
## 101              1         Urban           1
## 102              1     Semiurban           1
##  [ reached 'max' / getOption("max.print") -- omitted 16 rows ]

# Imputed Credit_History values: one row per observation that was missing,
# one column per imputed data set (1-5).
loan_knn_imp1$imp$Credit_History

##     1 2 3 4 5
## 17  1 1 1 1 1
## 25  0 0 1 0 0
## 31  1 1 0 0 0
## 43  1 1 1 1 1
## 80  1 1 1 1 1
## 84  0 1 0 0 1
## 87  1 1 1 1 1
## 96  0 0 0 1 1
## 118 1 1 1 1 1
## 126 1 1 1 1 1
## 130 1 0 1 0 1
## 131 1 1 1 1 1
## 157 1 1 1 1 1
## 182 1 1 0 1 1
## 188 1 1 1 1 1
## 199 1 0 1 1 1
## 220 1 1 1 1 1
## 237 1 1 1 1 0
## 238 1 1 1 1 1
## 260 0 0 0 0 0
##  [ reached 'max' / getOption("max.print") -- omitted 30 rows ]

# Married: show rows 430-436 of the data before imputation (row 436 has a
# missing Married value).
loan_knn[430:436,]

##      Loan_ID Gender Married Dependents    Education Self_Employed
## 430 LP002370   Male      No          0 Not Graduate            No
## 431 LP002377 Female      No          1     Graduate           Yes
## 432 LP002379   Male      No          0     Graduate            No
## 433 LP002386   Male      No          0     Graduate          <NA>
## 434 LP002387   Male     Yes          0     Graduate            No
## 435 LP002390   Male      No          0     Graduate            No
## 436 LP002393 Female    <NA>       <NA>     Graduate            No
##     ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 430            2717                 0         60              180
## 431            8624                 0        150              360
## 432            6500                 0        105              360
## 433           12876                 0        405              360
## 434            2425              2340        143              360
## 435            3750                 0        100              360
## 436           10047                 0         NA              240
##     Credit_History Property_Area Loan_Status
## 430              1         Urban           1
## 431              1     Semiurban           1
## 432              0         Rural           0
## 433              1     Semiurban           1
## 434              1     Semiurban           1
## 435              1         Urban           1
## 436              1     Semiurban           1

# Imputed Married values for the rows that were missing, by imputed data set.
loan_knn_imp1$imp$Married

##       1   2   3   4   5
## 105 Yes Yes Yes  No Yes
## 229  No Yes Yes Yes  No
## 436  No Yes  No  No Yes

# Dependents: show rows 227-229 of the data before imputation (rows 227 and
# 229 have missing Dependents values).
loan_knn[227:229,]

##      Loan_ID Gender Married Dependents    Education Self_Employed
## 227 LP001754   Male     Yes       <NA> Not Graduate           Yes
## 228 LP001758   Male     Yes          2     Graduate            No
## 229 LP001760   Male    <NA>       <NA>     Graduate            No
##     ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## 227            4735                 0        138              360
## 228            6250              1695        210              360
## 229            4758                 0        158              480
##     Credit_History Property_Area Loan_Status
## 227              1         Urban           0
## 228              1     Semiurban           1
## 229              1     Semiurban           1

# Imputed Dependents values for the rows that were missing, by imputed data set.
loan_knn_imp1$imp$Dependents

##     1 2 3 4 5
## 103 3 0 1 2 0
## 105 0 0 2 0 3
## 121 2 2 2 0 2
## 227 2 3 3 2 2
## 229 0 0 0 1 0
## 294 1 1 0 0 0
## 302 2 1 0 0 0
## 333 0 2 3 0 0
## 336 1 1 2 0 2
## 347 2 0 3 3 1
## 356 0 0 0 0 1
## 436 0 1 0 0 1
## 518 3 1 1 3 2
## 572 2 1 0 0 0
## 598 0 0 0 0 0

# Extract the third imputed data set as a completed data frame
# (when `action` is omitted it defaults to 1).
loan_knn2 <- complete(loan_knn_imp1, action = 3)

Modeling with KNN

First off, set seed = 688.

Create training/test partitions by calling createDataPartition. p is set to .8 to mean 80/20 split for train/test set.

Checking the structure of the train set (knn_train)

# Fix the random seed before any resampling takes place.
set.seed(688)
# Turn the 0/1 outcome back into a two-level factor for classification.
loan_knn2[["Loan_Status"]] <- factor(loan_knn2[["Loan_Status"]])
str(loan_knn2)

## 'data.frame':    614 obs. of  11 variables:
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married         : Factor w/ 2 levels "No","Yes": 1 2 2 2 1 2 2 2 2 2 ...
##  $ Dependents      : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 3 1 4 3 2 ...
##  $ Education       : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
##  $ Self_Employed   : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 2 1 1 1 1 ...
##  $ LoanAmount      : num  114 128 66 120 141 ...
##  $ Loan_Amount_Term: num  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
##  $ Property_Area   : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
##  $ Loan_Status     : Factor w/ 2 levels "0","1": 2 1 2 2 2 2 2 1 2 1 ...
##  $ TotalIncome     : num  5849 6091 3000 4941 6000 ...

# Data partitioning: draw 80% of the rows (sampled on Loan_Status) for
# training; the remaining rows form the test set.
trainIndex <- createDataPartition(
  y = loan_knn2$Loan_Status,
  p = 0.8,
  times = 1,
  list = FALSE
)
knn_test <- loan_knn2[-trainIndex, ]
knn_train <- loan_knn2[trainIndex, ]



# Inspect column types and sample values of the training partition.
str(knn_train)

## 'data.frame':    492 obs. of  11 variables:
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Married         : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ Dependents      : Factor w/ 4 levels "0","1","2","3": 1 2 1 1 1 4 3 3 3 3 ...
##  $ Education       : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 2 1 1 1 1 1 ...
##  $ Self_Employed   : Factor w/ 2 levels "No","Yes": 1 1 2 1 1 1 1 1 1 1 ...
##  $ LoanAmount      : num  114 128 66 120 95 ...
##  $ Loan_Amount_Term: num  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History  : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
##  $ Property_Area   : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 2 3 3 3 3 ...
##  $ Loan_Status     : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 2 2 2 2 ...
##  $ TotalIncome     : num  5849 6091 3000 4941 3849 ...

Checking the structure of the test set (knn_test)

# Inspect column types and sample values of the test partition.
str(knn_test)

## 'data.frame':    122 obs. of  11 variables:
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 1 2 ...
##  $ Married         : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 1 2 2 2 ...
##  $ Dependents      : Factor w/ 4 levels "0","1","2","3": 1 3 2 3 3 2 4 3 2 1 ...
##  $ Education       : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 2 1 1 1 1 1 ...
##  $ Self_Employed   : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 2 1 ...
##  $ LoanAmount      : num  141 267 349 112 110 106 320 134 286 96 ...
##  $ Loan_Amount_Term: num  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History  : Factor w/ 2 levels "0","1": 2 2 2 1 2 2 2 2 1 2 ...
##  $ Property_Area   : Factor w/ 3 levels "Rural","Semiurban",..: 3 3 2 1 3 1 1 3 3 2 ...
##  $ Loan_Status     : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 1 1 1 2 ...
##  $ TotalIncome     : num  6000 9613 23809 5282 5266 ...

Cross Validation

Perform repeated 10-fold cross-validation with 11 repeats, meaning the number of complete sets of folds to compute is 11. For this classification problem, we assigned our fitted model to knn.fit. The resampling settings are passed to train via trControl.

# Resampling scheme: 10-fold cross-validation (the default fold count, made
# explicit here) repeated 11 times.
trControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 11
)
# Fit KNN on the training partition, centering and scaling the predictors
# and evaluating k = 1..10 under the resampling scheme above.
knn.fit <- train(
  Loan_Status ~ .,
  data = knn_train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = trControl,
  tuneGrid = expand.grid(k = seq_len(10))
)

Since our target variable is a two-level factor, Accuracy is used by default as the performance metric, so the optimal K is the one with the highest cross-validated Accuracy. K = 9 was selected, i.e., the number of neighbors is 9.

# getOption("max.print")
# Print the resampling summary: Accuracy and Kappa for each candidate k and
# the k chosen for the final model.
knn.fit

## k-Nearest Neighbors 
## 
## 492 samples
##  10 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (13), scaled (13) 
## Resampling: Cross-Validated (10 fold, repeated 11 times) 
## Summary of sample sizes: 442, 444, 443, 443, 442, 444, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.7245167  0.3505620
##    2  0.7169147  0.3233247
##    3  0.7841964  0.4397484
##    4  0.7743537  0.4188539
##    5  0.8019750  0.4716819
##    6  0.7976880  0.4556506
##    7  0.8026869  0.4614455
##    8  0.7991506  0.4521741
##    9  0.8033958  0.4561901
##   10  0.7975032  0.4380019
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

On the held-out test set, accuracy is 77.1% while balanced accuracy is only 67.48%.

# Score the held-out test partition with the tuned model.
knn_pred <- predict(knn.fit, newdata = knn_test)
# Cross-tabulate predictions against the true labels. No `positive` class is
# given, so the first factor level ("0") is reported as the positive class.
confusionMatrix(data = knn_pred, reference = knn_test$Loan_Status)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 16  6
##          1 22 78
##                                           
##                Accuracy : 0.7705          
##                  95% CI : (0.6857, 0.8418)
##     No Information Rate : 0.6885          
##     P-Value [Acc > NIR] : 0.029156        
##                                           
##                   Kappa : 0.3952          
##                                           
##  Mcnemar's Test P-Value : 0.004586        
##                                           
##             Sensitivity : 0.4211          
##             Specificity : 0.9286          
##          Pos Pred Value : 0.7273          
##          Neg Pred Value : 0.7800          
##              Prevalence : 0.3115          
##          Detection Rate : 0.1311          
##    Detection Prevalence : 0.1803          
##       Balanced Accuracy : 0.6748          
##                                           
##        'Positive' Class : 0               
##