STEP ONE:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Read the credit data CSV from GitHub; text columns are converted to factors.
loan <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/credit_data_label.csv",
  stringsAsFactors = TRUE
)
# Show each column's name, type, and first few values.
str(object = loan)
## 'data.frame':    1000 obs. of  21 variables:
##  $ Default                          : int  0 1 0 0 1 0 0 0 0 1 ...
##  $ Account.Balance                  : Factor w/ 4 levels "< 0 DM",">= 200 DM / salary assignments for at least 1 year",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ Duration.of.Credit..month        : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ Payment.Status.of.Previous.Credit: Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
##  $ Purpose                          : Factor w/ 10 levels "(vacation - does not exist?)",..: 5 5 1 8 3 1 8 4 5 3 ...
##  $ Credit.Amount                    : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ Value.Savings.Stocks             : Factor w/ 5 levels ".. >= 1000 DM ",..: 5 2 2 2 2 5 4 2 1 2 ...
##  $ Length.of.current.employment     : Factor w/ 5 levels ".. >= 7 years",..: 1 3 4 4 3 3 1 3 4 5 ...
##  $ Instalment.per.cent              : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ Sex.Marital.Status               : Factor w/ 4 levels "female : divorced/separated/married",..: 4 1 4 4 4 4 4 4 2 3 ...
##  $ Guarantors                       : Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ Duration.in.Current.address      : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ Most.valuable.available.asset    : Factor w/ 4 levels "if not A121 : building society savings agreement/ life insurance",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ Age.years                        : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ Current.Credits                  : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Type.of.apartment                : Factor w/ 3 levels "for free","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ No.of.Credits.at.this.Bank       : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ Occupation                       : Factor w/ 4 levels "management/ self-employed/ highly qualified employee/ officer",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ No.of.dependents                 : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone                        : Factor w/ 2 levels "none","yes, registered under the customers name ": 2 1 1 1 1 2 1 2 1 1 ...
##  $ Foreign.Worker                   : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...

Response Variable: ‘Default’. In the current dataset it is an integer, but should be converted into a factor given that its outcomes are either 1 (person defaulted) or 0 (person did not default). At first glance, account balance, payment status of previous credit, occupation, credit amount, and length of current employment all seem like they could be the most useful in building a model to predict default.

STEP TWO:

# Box plots of credit amount, one box per value of Default (told apart by
# fill colour). Default is an integer column, so it is wrapped in factor() to
# get a discrete fill scale.
ggplot(data = loan) +
  geom_boxplot(mapping = aes(x = Credit.Amount, fill = factor(Default)))

This plot supported my previous assumptions: people with higher credit amounts, who owe more money to the bank, are more likely to default and fail to pay the bank back.

STEP THREE:

# Keep six of the integer-valued columns as predictors, plus the response
# (Default) as the last column.
loanNum <- select(
  loan,
  Duration.of.Credit..month,
  Credit.Amount,
  Duration.in.Current.address,
  Age.years,
  No.of.Credits.at.this.Bank,
  No.of.dependents,
  Default
)

# Min-max rescale a numeric vector onto [0, 1].
#
# Args:
#   x:     numeric vector.
#   na.rm: if TRUE, NAs are ignored when finding the range and stay NA in the
#          result. The default (FALSE) means a single NA in x makes every
#          element of the result NA.
#
# Returns:
#   A numeric vector the same length as x. A constant vector maps to all
#   zeros rather than NaN (0 / 0), so a zero-variance column cannot inject
#   NaN into later distance calculations.
normalize <- function(x, na.rm = FALSE) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Zero range: every non-missing value equals the minimum, so x - min is 0
  # for those entries and NA for missing ones.
  if (!is.na(span) && span == 0) {
    return(as.numeric(x - rng[1]))
  }
  (x - rng[1]) / span
}

# Drop any row with a missing value so every column passed to normalize() is
# complete.
loanNum <- na.omit(loanNum)

# Rescale every predictor with normalize(); Default is the response and is
# appended unscaled. Predictors are chosen by name (everything except
# Default) rather than by position, so this step keeps working if columns are
# added or reordered upstream.
# NOTE(review): the scaling range is taken over all rows, before the
# train/test split performed later in the script.
featureCols <- setdiff(names(loanNum), "Default")
loanNorm <- as.data.frame(lapply(loanNum[featureCols], normalize)) %>%
  cbind(Default = loanNum$Default)

head(loanNorm)
##   Duration.of.Credit..month Credit.Amount Duration.in.Current.address
## 1                0.02941176    0.05056674                   1.0000000
## 2                0.64705882    0.31368989                   0.3333333
## 3                0.11764706    0.10157368                   0.6666667
## 4                0.55882353    0.41994057                   1.0000000
## 5                0.29411765    0.25420931                   1.0000000
## 6                0.47058824    0.48448333                   1.0000000
##    Age.years No.of.Credits.at.this.Bank No.of.dependents Default
## 1 0.85714286                  0.3333333                0       0
## 2 0.05357143                  0.0000000                0       1
## 3 0.53571429                  0.0000000                1       0
## 4 0.46428571                  0.0000000                1       0
## 5 0.60714286                  0.3333333                1       1
## 6 0.28571429                  0.0000000                1       0

STEP FOUR:

library(caret)
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
# Seed the RNG so the random partition below is reproducible.
set.seed(500)
# Row indices for the training set (p = 0.7 of the rows).
# NOTE(review): Default is an integer here, so createDataPartition presumably
# treats it as a numeric outcome and bins it by quantiles instead of
# stratifying by class; passing factor(loanNorm$Default) would request a
# class-stratified split. Confirm against the caret documentation before
# changing, since a different sample alters every result further down.
caretSamp <- createDataPartition(
  y = loanNorm$Default,
  p = 0.7,
  list = FALSE
)

# Rows selected above form the training set; the remaining rows are the test set.
trainSplit <- loanNorm[caretSamp, ]
testSplit <- loanNorm[-caretSamp, ]

# Default rate in each split; the two values should be close to each other.
mean(trainSplit$Default)
## [1] 0.3014286
mean(testSplit$Default)
## [1] 0.2966667

STEP FIVE:

library(class)
## Warning: package 'class' was built under R version 4.2.3
# Separate the predictors from the response for each split.
trainFeatures <- select(trainSplit, -Default)
testFeatures <- select(testSplit, -Default)
trainDefault <- trainSplit[["Default"]]
testDefault <- testSplit[["Default"]]

# Seed before calling knn() so the run is reproducible, then classify every
# test row from its 3 nearest training rows.
set.seed(1)
knn.pred <- knn(
  train = trainFeatures,
  test = testFeatures,
  cl = trainDefault,
  k = 3
)

knn.pred
##   [1] 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0
##  [75] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0
## [112] 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0
## [149] 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
## [186] 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0
## [260] 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0
## [297] 1 1 0 0
## Levels: 0 1

STEP SIX:

# Confusion matrix: rows are the predicted labels, columns the actual labels.
cm <- table(knn.pred, testDefault)
cm
##         testDefault
## knn.pred   0   1
##        0 167  63
##        1  44  26
# Correct rate
cr <- mean(knn.pred == testDefault)
cr
## [1] 0.6433333
# Error rate
er <- 1 - cr
er
## [1] 0.3566667
# The counts below are read from cm by label (row = predicted,
# column = actual) instead of being typed in, so they stay correct if the
# split or k changes.
# False positives (applicants predicted to default (1) but do not (0))
cm["1", "0"]
## [1] 44
# False negatives (applicants predicted to not default (0) but do (1))
cm["0", "1"]
## [1] 63
# Sensitivity (Percentage of overall positives predicted correctly)
cm["1", "1"] / sum(cm[, "1"])
## [1] 0.2921348
# Specificity (Percentage of overall negatives predicted correctly)
cm["0", "0"] / sum(cm[, "0"])
## [1] 0.7914692

STEP SEVEN:

set.seed(123)

# Test-set error rate for k = 1..30. The result vector is allocated up front
# instead of being grown from c() inside the loop.
kGrid <- seq_len(30)
error <- numeric(length(kGrid))
for (i in kGrid) {
  knnLoan <- knn(
    train = trainFeatures,
    test = testFeatures,
    cl = trainDefault,
    k = i
  )
  error[i] <- 1 - mean(knnLoan == testDefault)
}

# k is a column of the plotted data frame (rather than an external 1:30
# vector inside aes()), so the x axis is labelled "k".
ggplot(data = data.frame(k = kGrid, error = error), aes(x = k, y = error)) +
  geom_line(color = "Blue")

# k=19 minimizes error! (which.min(error) can be used to confirm this.)
# NOTE(review): k is being chosen on the same test set used to report
# accuracy below.

set.seed(1)
knn.pred2 <- knn(
  train = trainFeatures,
  test = testFeatures,
  cl = trainDefault,
  k = 19
)
# Overwrites the k = 3 correct rate stored in cr earlier.
cr <- mean(knn.pred2 == testDefault)
cr
## [1] 0.7
# The correct rate increased by 0.0567 to 0.7

STEP EIGHT: Many of the features of the tutorial dataset are coded as integers despite being categorical variables. Thus, it doesn’t make sense to normalize these data, because there’s no way to measure closeness between different categorical outcomes. For example, it appears as though there are three different types of occupation in the dataset. It’s unclear what the different types are, but there’s no way of knowing how similar type 1 is to type 3 compared with how similar type 2 is to type 3. Thus, it makes no sense to put occupation on the same scale as age, for example.