R Markdown

Q3. Rewrite core components of the previous analysis

DAM Assignment3 Group(TeamLassoandRidge)


1. Install packages into R

# Install packages into R

#install.packages("caret")
#install.packages("tidyverse")
#install.packages("plyr")
#install.packages("rmarkdown")
#install.packages("dummies")
#install.packages("glmnet")
#install.packages("e1071")
#install.packages("rpart")
#install.packages("tree")
#install.packages("randomForest")
#install.packages("ROCR")
#install.packages("gbm")
#install.packages("AUC")
#install.packages("rpart.plot")
#install.packages("mlbench")
#install.packages("pROC")
#install.packages("parallel")
#install.packages("doParallel")
#install.packages("knitr")
#install.packages("here")
#install.packages("roxygen2")
#install.packages("testthat")
#install.packages("gridExtra")

2. Load libraries into R

# Load libraries into R

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## Registered S3 method overwritten by 'rvest':
##   method            from
##   read_xml.response xml2
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1     v purrr   0.3.2
## v tidyr   0.8.3     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-18
library(e1071)
library(rpart)
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(gbm)
## Loaded gbm 2.1.5
library(AUC)
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
## 
## Attaching package: 'AUC'
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:caret':
## 
##     sensitivity, specificity
library(rpart.plot)
library(mlbench)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:AUC':
## 
##     auc, roc
## The following object is masked from 'package:glmnet':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(parallel)
library(rmarkdown)  # R Markdown is a type of reproducible notebook using literate programming, as it is executed in chunks with tracebacks of the last step.
library(here)
## here() starts at C:/Users/Wendy/Desktop/DSP/assignment 3/project
## 
## Attaching package: 'here'
## The following object is masked from 'package:plyr':
## 
##     here
library(knitr)
library(roxygen2)   # include documentation for writing functions
library(testthat)   # include unit testing
## 
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
## 
##     matches
## The following object is masked from 'package:purrr':
## 
##     is_null
# Do not use: rm(list=ls()) # R code may break when someone tries to run code on another computer

4. Read csv file into R

# Read the training csv file; here() resolves the path from the project root
# NOTE(review): the object name `c` is also the name of base R's c() function.
# Calls such as c(1, 2) still work, but a more descriptive name would be clearer.

c <- here("./data/credit_card_data_training.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
##   ID = col_double(),
##   LIMIT_BAL = col_double(),
##   SEX = col_double(),
##   EDUCATION = col_double(),
##   MARRIAGE = col_double(),
##   AGE = col_double(),
##   PAY_PC1 = col_double(),
##   PAY_PC2 = col_double(),
##   PAY_PC3 = col_double(),
##   AMT_PC1 = col_double(),
##   AMT_PC2 = col_double(),
##   AMT_PC3 = col_double(),
##   AMT_PC4 = col_double(),
##   AMT_PC5 = col_double(),
##   AMT_PC6 = col_double(),
##   AMT_PC7 = col_double(),
##   default = col_double()
## )

5.Explore the dimensions of the data

# View the dimensions of the training data (number of rows, number of columns)
dim(c)
## [1] 21001    17

6.View the data types

# View the class of the object returned by read_csv()
class(c)            # a tibble (spec_tbl_df / tbl_df) that also inherits from data.frame
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

7. View the structure

# Structure of training data the dplyr way: data types and a preview of the 17 columns, 21001 observations (rows)
glimpse(c)                 # SEX, EDUCATION, MARRIAGE and default are read in as doubles, not factors
## Observations: 21,001
## Variables: 17
## $ ID        <dbl> 1, 2, 3, 7, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 2...
## $ LIMIT_BAL <dbl> 400000, 200000, 50000, 80000, 260000, 20000, 60000, ...
## $ SEX       <dbl> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2...
## $ EDUCATION <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 4, 2, 2, 2, 2, 1, 1...
## $ MARRIAGE  <dbl> 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2...
## $ AGE       <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, ...
## $ PAY_PC1   <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -...
## $ PAY_PC2   <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.3492...
## $ PAY_PC3   <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522...
## $ AMT_PC1   <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1....
## $ AMT_PC2   <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -...
## $ AMT_PC3   <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -...
## $ AMT_PC4   <dbl> -3.461439078, -0.166878371, -0.011751497, 0.09175045...
## $ AMT_PC5   <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751...
## $ AMT_PC6   <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627...
## $ AMT_PC7   <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170...
## $ default   <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...

8. View the column details

# Column names of the training data
names(c)
##  [1] "ID"        "LIMIT_BAL" "SEX"       "EDUCATION" "MARRIAGE" 
##  [6] "AGE"       "PAY_PC1"   "PAY_PC2"   "PAY_PC3"   "AMT_PC1"  
## [11] "AMT_PC2"   "AMT_PC3"   "AMT_PC4"   "AMT_PC5"   "AMT_PC6"  
## [16] "AMT_PC7"   "default"
# colnames() lists the same 17 variable names as names() does for this data frame
colnames(c)   
##  [1] "ID"        "LIMIT_BAL" "SEX"       "EDUCATION" "MARRIAGE" 
##  [6] "AGE"       "PAY_PC1"   "PAY_PC2"   "PAY_PC3"   "AMT_PC1"  
## [11] "AMT_PC2"   "AMT_PC3"   "AMT_PC4"   "AMT_PC5"   "AMT_PC6"  
## [16] "AMT_PC7"   "default"

9. Descriptive Statistics

# Summary of the distribution of each column (min, quartiles, median, mean, max);
# every column is still numeric at this point, including the coded categorical ones
summary(c)
##        ID          LIMIT_BAL            SEX          EDUCATION    
##  Min.   :    1   Min.   :  10000   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 7468   1st Qu.:  50000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :14938   Median : 140000   Median :2.000   Median :2.000  
##  Mean   :14973   Mean   : 167502   Mean   :1.606   Mean   :1.856  
##  3rd Qu.:22519   3rd Qu.: 240000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :29995   Max.   :1000000   Max.   :2.000   Max.   :6.000  
##     MARRIAGE          AGE           PAY_PC1              PAY_PC2         
##  Min.   :0.000   Min.   :21.00   Min.   :-13.302028   Min.   :-4.422427  
##  1st Qu.:1.000   1st Qu.:28.00   1st Qu.: -0.393308   1st Qu.:-0.227765  
##  Median :2.000   Median :34.00   Median : -0.393308   Median : 0.175555  
##  Mean   :1.555   Mean   :35.47   Mean   : -0.009395   Mean   :-0.000277  
##  3rd Qu.:2.000   3rd Qu.:41.00   3rd Qu.:  1.360047   3rd Qu.: 0.361123  
##  Max.   :3.000   Max.   :79.00   Max.   :  3.813348   Max.   : 5.441026  
##     PAY_PC3             AMT_PC1            AMT_PC2         
##  Min.   :-3.864638   Min.   :-3.41080   Min.   :-4.717690  
##  1st Qu.:-0.283941   1st Qu.:-1.51032   1st Qu.:-0.431402  
##  Median : 0.004886   Median :-0.85848   Median :-0.208967  
##  Mean   : 0.000949   Mean   : 0.01059   Mean   :-0.005951  
##  3rd Qu.: 0.077070   3rd Qu.: 0.52003   3rd Qu.: 0.085625  
##  Max.   : 3.364030   Max.   :37.49240   Max.   :28.783658  
##     AMT_PC3              AMT_PC4              AMT_PC5          
##  Min.   :-10.389523   Min.   :-19.171464   Min.   :-24.108569  
##  1st Qu.: -0.135722   1st Qu.: -0.068251   1st Qu.: -0.082314  
##  Median : -0.070445   Median :  0.018161   Median : -0.032000  
##  Mean   :  0.003369   Mean   :  0.002936   Mean   : -0.000712  
##  3rd Qu.:  0.001416   3rd Qu.:  0.081621   3rd Qu.:  0.025086  
##  Max.   : 21.984829   Max.   : 21.823749   Max.   : 17.430967  
##     AMT_PC6             AMT_PC7             default      
##  Min.   :-38.88504   Min.   :-25.90403   Min.   :0.0000  
##  1st Qu.: -0.04252   1st Qu.: -0.09209   1st Qu.:0.0000  
##  Median : -0.00234   Median : -0.04045   Median :0.0000  
##  Mean   : -0.00008   Mean   : -0.00171   Mean   :0.2459  
##  3rd Qu.:  0.06772   3rd Qu.:  0.03061   3rd Qu.:0.0000  
##  Max.   : 14.72234   Max.   : 22.92727   Max.   :1.0000

9. Inspect the first 3 and last 3 rows of the dataset

# First 3 rows (observations) of the training data
head(c,3)
## # A tibble: 3 x 17
##      ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_PC1 PAY_PC2  PAY_PC3
##   <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl>   <dbl>   <dbl>    <dbl>
## 1     1    400000     1         1        2    33   1.45   -1.27  -0.363  
## 2     2    200000     1         2        1    35  -1.80    0.768 -0.265  
## 3     3     50000     1         2        2    25  -0.393   0.176  0.00489
## # ... with 8 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## #   AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## #   default <dbl>
# Last 3 rows (observations) of the training data
tail(c,3)
## # A tibble: 3 x 17
##      ID LIMIT_BAL   SEX EDUCATION MARRIAGE   AGE PAY_PC1 PAY_PC2 PAY_PC3
##   <dbl>     <dbl> <dbl>     <dbl>    <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 29992     80000     2         3        1    39   2.09  -0.0763  -0.311
## 2 29993    100000     1         2        1    41   0.984  0.994    0.374
## 3 29995    500000     1         1        2    37   0.646  0.0368  -1.04 
## # ... with 8 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## #   AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## #   default <dbl>

10. Pre-Processing the Training Data

# Pre-processing (data cleaning) ahead of modelling

# The coded columns EDUCATION, MARRIAGE, default and SEX are read in as numbers;
# turn each of them into a categorical (factor) variable, one column at a time
for (factor_col in c("EDUCATION", "MARRIAGE", "default", "SEX")) {
  c[[factor_col]] <- factor(c[[factor_col]])
}

11. Convert categorical variables into factors

# Make sure the categorical predictors are stored as factors so that their
# levels are represented (as.factor() returns an existing factor unchanged)
for (categorical_col in c("SEX", "EDUCATION", "MARRIAGE")) {
  c[[categorical_col]] <- as.factor(c[[categorical_col]])
}
#c$default <-  as.factor(c$default)

12.View the structure of the transformed variables

# Check that SEX, EDUCATION, MARRIAGE and default are now shown as <fct> (factor) columns
glimpse(c)  
## Observations: 21,001
## Variables: 17
## $ ID        <dbl> 1, 2, 3, 7, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 2...
## $ LIMIT_BAL <dbl> 400000, 200000, 50000, 80000, 260000, 20000, 60000, ...
## $ SEX       <fct> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2...
## $ EDUCATION <fct> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 4, 2, 2, 2, 2, 1, 1...
## $ MARRIAGE  <fct> 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2...
## $ AGE       <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, ...
## $ PAY_PC1   <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -...
## $ PAY_PC2   <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.3492...
## $ PAY_PC3   <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522...
## $ AMT_PC1   <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1....
## $ AMT_PC2   <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -...
## $ AMT_PC3   <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -...
## $ AMT_PC4   <dbl> -3.461439078, -0.166878371, -0.011751497, 0.09175045...
## $ AMT_PC5   <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751...
## $ AMT_PC6   <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627...
## $ AMT_PC7   <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170...
## $ default   <fct> 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...

13.Create dummy variables for the categorical variables with more than 2 levels

# Create dummy (0/1 indicator) variables for the categorical variables.
#
# The previous calls, e.g. dummy(c$EDUCATION), produced columns labelled with
# the path of the .Rmd file plus the level ("...rewritecode.Rmd0") instead of
# the variable name, and each call raised the warning
# "non-list contrasts argument ignored".
# model.matrix() with the intercept removed (`- 1`) builds one indicator column
# per factor level and names it <variable><level>, e.g. EDUCATION0 ... EDUCATION6,
# without that warning.
# Note: the result is a numeric (double) 0/1 matrix rather than an integer one,
# and rows with a missing value would be dropped (the data has none).
library(dummies)
EDUCATION <- model.matrix(~ EDUCATION - 1, data = c)
head(EDUCATION)   # check that the values have been converted to dummy variables
MARRIAGE <- model.matrix(~ MARRIAGE - 1, data = c)
SEX <- model.matrix(~ SEX - 1, data = c)
default <- model.matrix(~ default - 1, data = c)

16.Check for any missing values in c1

# TRUE if any column of c1 contains a missing value (NA), FALSE otherwise
anyNA(c1)
## [1] FALSE

18.Visualize the data c1

# Plot data

library (gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
# Histogram showing the distribution of the numeric AGE column
hist(
  c1[["AGE"]],
  main = "Histogram of Age",
  xlab = "Age"
)

19.Summarise Age

# Min, quartiles, median, mean and max of the AGE column
summary(c1[["AGE"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   28.00   34.00   35.47   41.00   79.00

20. View Marriage variable

# Count of observations in each MARRIAGE category
plot(
  x = c1[["MARRIAGE"]],
  main = "Distribution of Marriage",
  xlab = "Marriage",
  ylab = "count"
)


21. View Gender variable

# Count of observations in each SEX category
plot(
  x = c1[["SEX"]],
  main = "Distribution of Gender",
  xlab = "Gender",
  ylab = "count"
)


22. View Education variable

# Count of observations in each EDUCATION category
plot(
  x = c1[["EDUCATION"]],
  main = "Distribution of Education",
  xlab = "Education",
  ylab = "Count"
)


22. Scatterplot of numeric variables

library(ggplot2)
# Scatterplot matrix for a subset of the columns (relationships look non-linear)
pair_columns <- c("SEX", "MARRIAGE", "AGE", "EDUCATION")
pairs(
  c1[, pair_columns],
  main = "credit training data"
)


23. Histograms of numeric variables (Principal Components of repayment status from April to September, 2005)

# Histogram of PAY_PC1
# The column is given to aes() by its bare name so that ggplot2 looks it up in
# the `data` argument; aes(x = c1$PAY_PC1) bypassed `data` and labelled the
# x axis "c1$PAY_PC1".
p1 <- ggplot(c1, aes(x = PAY_PC1)) +
  geom_histogram() +
  ggtitle(" Histogram of PAY_PC1")

p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


24. Histogram of PAY_PC2

# Histogram of PAY_PC2; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "PAY_PC2"
p2 <- ggplot(c1, aes(x = PAY_PC2)) +
  geom_histogram() +
  ggtitle(" Histogram of PAY_PC2")

p2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

26. Histogram of AMT_PC1

# Histogram of AMT_PC1; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC1"
p4 <- ggplot(c1, aes(x = AMT_PC1)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC1")
p4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

27. Other Histograms

# Histogram of AMT_PC2; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC2"
p5 <- ggplot(c1, aes(x = AMT_PC2)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC2")

p5
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of AMT_PC3; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC3"
p6 <- ggplot(c1, aes(x = AMT_PC3)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC3")

p6
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of AMT_PC5; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC5"
p7 <- ggplot(c1, aes(x = AMT_PC5)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC5")

p7
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of AMT_PC6; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC6"
p8 <- ggplot(c1, aes(x = AMT_PC6)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC6")

p8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of AMT_PC7; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC7"
p9 <- ggplot(c1, aes(x = AMT_PC7)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC7")

p9
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of LIMIT_BAL; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "LIMIT_BAL"
p10 <- ggplot(c1, aes(x = LIMIT_BAL)) +
  geom_histogram() +
  ggtitle(" Histogram of Limit Balance")

p10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

28. Histogram of AMT_PC4

# Histogram of AMT_PC4; the column is referenced by bare name inside aes() so
# that it is taken from `data` (c1) and the x axis is labelled "AMT_PC4"
p11 <- ggplot(c1, aes(x = AMT_PC4)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC4")

p11
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Min, quartiles, median, mean and max of the AMT_PC4 column
summary(c1[["AMT_PC4"]])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -19.171464  -0.068251   0.018161   0.002936   0.081621  21.823749

30. Data Splitting and Partitioning - Training and Testing split of dataset

# Multi-core processing
library(roxygen2)
library(parallel)
library(doParallel)
library(caret)
library(glmnet)

# Create the list of training rows for a reproducible 70/30 split of c1

# Setting a random seed ensures we get the same rows each time
set.seed(42)
# Row indices for the training set, sampled on the outcome `default`.
# list = FALSE (spelled out, because T/F are ordinary variables that can be
# reassigned) returns the indices in a form usable for row subsetting below.
train <- createDataPartition(y = c1$default, p = 0.7, list = FALSE)


# Partition c1 data into two sets: selected rows for training, the rest for testing
training <- c1[train, ]
testing  <- c1[-train, ]

32. Glimpse Testing dataset

# Preview the hold-out rows that createDataPartition() did not select for training
glimpse(testing)
## Observations: 6,300
## Variables: 17
## $ ID        <dbl> 10, 13, 14, 21, 22, 24, 26, 27, 34, 38, 39, 41, 57, ...
## $ LIMIT_BAL <dbl> 20000, 360000, 80000, 170000, 20000, 80000, 280000, ...
## $ SEX       <fct> 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1...
## $ EDUCATION <fct> 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1...
## $ MARRIAGE  <fct> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 3...
## $ AGE       <dbl> 36, 30, 30, 35, 50, 29, 26, 49, 40, 25, 30, 30, 24, ...
## $ PAY_PC1   <dbl> -2.4073889, 1.3401272, -6.0527516, -0.3933076, -9.70...
## $ PAY_PC2   <dbl> -1.83901815, 0.45707589, 1.44151821, 0.17555500, -3....
## $ PAY_PC3   <dbl> -0.415630041, -0.298341076, 1.320680779, 0.004885522...
## $ AMT_PC1   <dbl> -1.3135059, -0.9696446, 0.5172295, 2.2663186, -1.035...
## $ AMT_PC2   <dbl> -0.2005159, 0.9623496, -0.4638258, -0.5892800, -0.42...
## $ AMT_PC3   <dbl> -0.11698565, 0.45622277, -0.26057444, -0.05093772, -...
## $ AMT_PC4   <dbl> 1.471378e-03, -1.038874e+00, 1.104836e-01, 1.012649e...
## $ AMT_PC5   <dbl> -0.018313482, 0.115321882, 0.022706173, 0.009804274,...
## $ AMT_PC6   <dbl> -0.021475238, 0.278215504, -0.166555180, -0.18931989...
## $ AMT_PC7   <dbl> 0.072760274, -0.334222722, -0.062878249, -0.05035034...
## $ default   <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...

33. Modify the Target variable to a factor

# Recode the target variable "default" from 1/0 to a yes/no factor for caret
# (1 -> "yes", anything else -> "no"). "yes" is listed first so that it is the
# first factor level in both the training and the testing set.
training$default <- factor(ifelse(training$default == 1, "yes", "no"),
                           levels = c("yes", "no"))
testing$default <- factor(ifelse(testing$default == 1, "yes", "no"),
                          levels = c("yes", "no"))

# Resampling settings for caret: 5-fold cross-validation, with class
# probabilities computed so that twoClassSummary can report its metrics,
# and parallel execution allowed.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ctrl <- trainControl(method = "cv",
                     number = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     allowParallel = TRUE)

35. Build Model 2 - Improve on the first model

# Second model: logistic regression on the training split without EDUCATION.
# NOTE(review): ID is entered as a predictor here; it looks like a row
# identifier -- confirm that this is intended.
Model_2 <- glm(
  default ~ ID + LIMIT_BAL + SEX + AGE + MARRIAGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_2)
## 
## Call:
## glm(formula = default ~ ID + LIMIT_BAL + SEX + AGE + MARRIAGE + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4860   0.1899   0.5705   0.7528   3.0450  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.676e+00  7.129e-01   3.753 0.000174 ***
## ID           1.591e-07  2.376e-06   0.067 0.946613    
## LIMIT_BAL    1.869e-06  2.154e-07   8.677  < 2e-16 ***
## SEX2         5.282e-03  4.239e-02   0.125 0.900841    
## AGE         -1.428e-02  2.463e-03  -5.797 6.76e-09 ***
## MARRIAGE1   -1.252e+00  7.050e-01  -1.776 0.075698 .  
## MARRIAGE2   -1.147e+00  7.050e-01  -1.627 0.103822    
## MARRIAGE3   -9.639e-01  7.280e-01  -1.324 0.185493    
## PAY_PC1      2.935e-01  1.168e-02  25.128  < 2e-16 ***
## PAY_PC2      4.105e-01  2.388e-02  17.191  < 2e-16 ***
## PAY_PC3     -2.916e-01  3.037e-02  -9.600  < 2e-16 ***
## AMT_PC1      8.506e-02  1.220e-02   6.970 3.18e-12 ***
## AMT_PC2      1.753e-01  3.480e-02   5.037 4.73e-07 ***
## AMT_PC3     -5.074e-02  3.602e-02  -1.409 0.158899    
## AMT_PC4     -4.429e-02  3.421e-02  -1.295 0.195476    
## AMT_PC5     -3.422e-02  3.811e-02  -0.898 0.369171    
## AMT_PC6      8.529e-02  3.648e-02   2.338 0.019382 *  
## AMT_PC7      9.995e-03  4.945e-02   0.202 0.839813    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14527  on 14683  degrees of freedom
## AIC: 14563
## 
## Number of Fisher Scoring iterations: 5
plot(Model_2)   # Plot regression diagnostics

# AIC = 14563 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")

36. Build Model 3 - Improve on the second model

# Third model: logistic regression on the training split without MARRIAGE
# (EDUCATION is included, ID is not).
Model_3 <- glm(
  default ~ LIMIT_BAL + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_3)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + SEX + AGE + EDUCATION + PAY_PC1 + 
##     PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + 
##     AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4985   0.1762   0.5584   0.7423   2.9529  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.225e+01  1.172e+02   0.105    0.917    
## LIMIT_BAL    1.503e-06  2.201e-07   6.826 8.75e-12 ***
## SEX2         3.856e-03  4.234e-02   0.091    0.927    
## AGE         -1.040e-02  2.290e-03  -4.539 5.64e-06 ***
## EDUCATION1  -1.072e+01  1.172e+02  -0.091    0.927    
## EDUCATION2  -1.079e+01  1.172e+02  -0.092    0.927    
## EDUCATION3  -1.128e+01  1.172e+02  -0.096    0.923    
## EDUCATION4  -9.866e+00  1.172e+02  -0.084    0.933    
## EDUCATION5  -9.464e+00  1.172e+02  -0.081    0.936    
## EDUCATION6  -1.025e+01  1.172e+02  -0.087    0.930    
## PAY_PC1      2.891e-01  1.173e-02  24.642  < 2e-16 ***
## PAY_PC2      4.124e-01  2.399e-02  17.195  < 2e-16 ***
## PAY_PC3     -2.976e-01  3.054e-02  -9.746  < 2e-16 ***
## AMT_PC1      8.373e-02  1.223e-02   6.849 7.46e-12 ***
## AMT_PC2      1.783e-01  3.488e-02   5.112 3.19e-07 ***
## AMT_PC3     -5.322e-02  3.594e-02  -1.481    0.139    
## AMT_PC4     -4.672e-02  3.415e-02  -1.368    0.171    
## AMT_PC5     -2.992e-02  3.793e-02  -0.789    0.430    
## AMT_PC6      8.926e-02  3.631e-02   2.459    0.014 *  
## AMT_PC7      1.601e-02  4.919e-02   0.326    0.745    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14416  on 14681  degrees of freedom
## AIC: 14456
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14456 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")

36. Build Model 4 - Improve on the third model

# Fourth model: logistic regression on the training split without SEX.
Model_4 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_4)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + AGE + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", 
##     data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5108   0.1775   0.5585   0.7411   2.9577  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.375e+01  1.174e+02   0.117 0.906775    
## LIMIT_BAL    1.556e-06  2.213e-07   7.033 2.02e-12 ***
## MARRIAGE1   -1.606e+00  7.097e-01  -2.263 0.023648 *  
## MARRIAGE2   -1.518e+00  7.097e-01  -2.139 0.032474 *  
## MARRIAGE3   -1.298e+00  7.327e-01  -1.771 0.076565 .  
## AGE         -8.741e-03  2.527e-03  -3.460 0.000541 ***
## EDUCATION1  -1.073e+01  1.174e+02  -0.091 0.927149    
## EDUCATION2  -1.079e+01  1.174e+02  -0.092 0.926747    
## EDUCATION3  -1.129e+01  1.174e+02  -0.096 0.923408    
## EDUCATION4  -9.873e+00  1.174e+02  -0.084 0.932974    
## EDUCATION5  -9.461e+00  1.174e+02  -0.081 0.935764    
## EDUCATION6  -1.025e+01  1.174e+02  -0.087 0.930403    
## PAY_PC1      2.893e-01  1.172e-02  24.678  < 2e-16 ***
## PAY_PC2      4.127e-01  2.400e-02  17.194  < 2e-16 ***
## PAY_PC3     -2.976e-01  3.055e-02  -9.740  < 2e-16 ***
## AMT_PC1      8.373e-02  1.224e-02   6.841 7.86e-12 ***
## AMT_PC2      1.777e-01  3.488e-02   5.096 3.47e-07 ***
## AMT_PC3     -5.370e-02  3.597e-02  -1.493 0.135461    
## AMT_PC4     -4.616e-02  3.417e-02  -1.351 0.176722    
## AMT_PC5     -2.935e-02  3.794e-02  -0.774 0.439146    
## AMT_PC6      8.966e-02  3.630e-02   2.470 0.013516 *  
## AMT_PC7      1.543e-02  4.917e-02   0.314 0.753628    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14404  on 14679  degrees of freedom
## AIC: 14448
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14448 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")

37. Build Model 5 - Improve on the fourth model

# Fifth model: logistic regression on the training split without SEX and
# EDUCATION.
Model_5 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + AGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_5)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + AGE + PAY_PC1 + 
##     PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + 
##     AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4846   0.1898   0.5704   0.7530   3.0463  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.683e+00  7.109e-01   3.773 0.000161 ***
## LIMIT_BAL    1.870e-06  2.153e-07   8.689  < 2e-16 ***
## MARRIAGE1   -1.252e+00  7.048e-01  -1.776 0.075727 .  
## MARRIAGE2   -1.147e+00  7.048e-01  -1.627 0.103676    
## MARRIAGE3   -9.635e-01  7.278e-01  -1.324 0.185561    
## AGE         -1.432e-02  2.440e-03  -5.868 4.40e-09 ***
## PAY_PC1      2.935e-01  1.167e-02  25.158  < 2e-16 ***
## PAY_PC2      4.106e-01  2.388e-02  17.195  < 2e-16 ***
## PAY_PC3     -2.916e-01  3.037e-02  -9.599  < 2e-16 ***
## AMT_PC1      8.503e-02  1.220e-02   6.969 3.19e-12 ***
## AMT_PC2      1.753e-01  3.480e-02   5.036 4.75e-07 ***
## AMT_PC3     -5.069e-02  3.602e-02  -1.407 0.159315    
## AMT_PC4     -4.425e-02  3.421e-02  -1.293 0.195873    
## AMT_PC5     -3.421e-02  3.812e-02  -0.897 0.369502    
## AMT_PC6      8.530e-02  3.648e-02   2.338 0.019382 *  
## AMT_PC7      1.001e-02  4.945e-02   0.202 0.839583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14527  on 14685  degrees of freedom
## AIC: 14559
## 
## Number of Fisher Scoring iterations: 5
# AIC = 14559 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")

38. Build Model 6 - Improve on the fifth model

# Sixth model: logistic regression on the training split without AMT_PC3
# (SEX and EDUCATION are included again).
Model_6 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_6)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC4 + 
##     AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4252   0.1879   0.5585   0.7410   2.9599  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.374e+01  1.174e+02   0.117 0.906895    
## LIMIT_BAL    1.545e-06  2.214e-07   6.980 2.95e-12 ***
## MARRIAGE1   -1.605e+00  7.101e-01  -2.261 0.023767 *  
## MARRIAGE2   -1.516e+00  7.101e-01  -2.135 0.032736 *  
## MARRIAGE3   -1.297e+00  7.331e-01  -1.769 0.076838 .  
## SEX2         1.122e-02  4.259e-02   0.263 0.792195    
## AGE         -8.650e-03  2.549e-03  -3.394 0.000689 ***
## EDUCATION1  -1.073e+01  1.174e+02  -0.091 0.927190    
## EDUCATION2  -1.079e+01  1.174e+02  -0.092 0.926784    
## EDUCATION3  -1.128e+01  1.174e+02  -0.096 0.923454    
## EDUCATION4  -9.869e+00  1.174e+02  -0.084 0.933031    
## EDUCATION5  -9.458e+00  1.174e+02  -0.081 0.935813    
## EDUCATION6  -1.025e+01  1.174e+02  -0.087 0.930451    
## PAY_PC1      2.896e-01  1.173e-02  24.689  < 2e-16 ***
## PAY_PC2      4.162e-01  2.386e-02  17.447  < 2e-16 ***
## PAY_PC3     -2.964e-01  3.053e-02  -9.710  < 2e-16 ***
## AMT_PC1      8.158e-02  1.206e-02   6.765 1.33e-11 ***
## AMT_PC2      1.641e-01  3.270e-02   5.019 5.19e-07 ***
## AMT_PC4     -4.059e-02  3.426e-02  -1.184 0.236224    
## AMT_PC5     -2.056e-02  3.792e-02  -0.542 0.587671    
## AMT_PC6      8.194e-02  3.478e-02   2.356 0.018483 *  
## AMT_PC7      1.610e-02  4.598e-02   0.350 0.726268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14406  on 14679  degrees of freedom
## AIC: 14450
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14450 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")

39. Build Model 7 - Improve on the sixth model

# Seventh model: logistic regression on the training split without AMT_PC4.
Model_7 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_7)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3926   0.1821   0.5593   0.7408   2.9553  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.374e+01  1.174e+02   0.117 0.906825    
## LIMIT_BAL    1.558e-06  2.215e-07   7.035 2.00e-12 ***
## MARRIAGE1   -1.606e+00  7.100e-01  -2.263 0.023665 *  
## MARRIAGE2   -1.516e+00  7.100e-01  -2.136 0.032710 *  
## MARRIAGE3   -1.297e+00  7.330e-01  -1.770 0.076706 .  
## SEX2         1.152e-02  4.260e-02   0.271 0.786735    
## AGE         -8.665e-03  2.548e-03  -3.401 0.000673 ***
## EDUCATION1  -1.073e+01  1.174e+02  -0.091 0.927126    
## EDUCATION2  -1.079e+01  1.174e+02  -0.092 0.926725    
## EDUCATION3  -1.129e+01  1.174e+02  -0.096 0.923388    
## EDUCATION4  -9.875e+00  1.174e+02  -0.084 0.932944    
## EDUCATION5  -9.460e+00  1.174e+02  -0.081 0.935756    
## EDUCATION6  -1.025e+01  1.174e+02  -0.087 0.930426    
## PAY_PC1      2.895e-01  1.174e-02  24.664  < 2e-16 ***
## PAY_PC2      4.126e-01  2.400e-02  17.195  < 2e-16 ***
## PAY_PC3     -2.996e-01  3.053e-02  -9.813  < 2e-16 ***
## AMT_PC1      8.311e-02  1.220e-02   6.811 9.67e-12 ***
## AMT_PC2      1.725e-01  3.421e-02   5.041 4.63e-07 ***
## AMT_PC3     -4.749e-02  3.523e-02  -1.348 0.177701    
## AMT_PC5     -2.059e-02  3.684e-02  -0.559 0.576230    
## AMT_PC6      8.804e-02  3.727e-02   2.362 0.018183 *  
## AMT_PC7      1.606e-02  4.736e-02   0.339 0.734468    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14405  on 14679  degrees of freedom
## AIC: 14449
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14449 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")

40. Build other models and compare AIC results

# Eighth model: logistic regression on the training split without PAY_PC1.
# The term order (AMT_PC5 directly after EDUCATION) is kept so that the
# coefficient table is printed in the same order.
Model_8 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    AMT_PC5 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_8)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION + 
##     AMT_PC5 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7771   0.1444   0.6004   0.7890   1.7788  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.334e+01  1.191e+02   0.112  0.91082    
## LIMIT_BAL    3.599e-06  2.140e-07  16.819  < 2e-16 ***
## MARRIAGE1   -1.373e+00  6.499e-01  -2.113  0.03462 *  
## MARRIAGE2   -1.277e+00  6.499e-01  -1.965  0.04944 *  
## MARRIAGE3   -1.048e+00  6.737e-01  -1.556  0.11981    
## SEX2         6.445e-02  4.134e-02   1.559  0.11899    
## AGE         -6.624e-03  2.470e-03  -2.682  0.00732 ** 
## EDUCATION1  -1.101e+01  1.191e+02  -0.092  0.92635    
## EDUCATION2  -1.114e+01  1.191e+02  -0.094  0.92549    
## EDUCATION3  -1.162e+01  1.191e+02  -0.098  0.92228    
## EDUCATION4  -9.992e+00  1.191e+02  -0.084  0.93314    
## EDUCATION5  -9.632e+00  1.191e+02  -0.081  0.93554    
## EDUCATION6  -1.034e+01  1.191e+02  -0.087  0.93081    
## AMT_PC5     -5.369e-02  3.946e-02  -1.361  0.17362    
## PAY_PC2      3.649e-01  2.349e-02  15.534  < 2e-16 ***
## PAY_PC3     -2.677e-01  3.055e-02  -8.762  < 2e-16 ***
## AMT_PC1     -8.076e-03  1.142e-02  -0.707  0.47944    
## AMT_PC2      2.981e-01  3.684e-02   8.092 5.89e-16 ***
## AMT_PC3     -8.131e-02  3.782e-02  -2.150  0.03155 *  
## AMT_PC4     -6.192e-02  3.505e-02  -1.766  0.07734 .  
## AMT_PC6      1.120e-01  3.714e-02   3.015  0.00257 ** 
## AMT_PC7      4.770e-02  5.400e-02   0.883  0.37703    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 15061  on 14679  degrees of freedom
## AIC: 15105
## 
## Number of Fisher Scoring iterations: 11
# AIC = 15105 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC3", "AMT_PC2", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")

# Ninth model: logistic regression on the training split without AMT_PC7.
# NOTE(review): this model was originally labelled "remove AMT_PC5", but the
# formula keeps AMT_PC5 (as its last term) and leaves out AMT_PC7 instead --
# confirm which predictor was meant to be dropped.
Model_9 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + EDUCATION + AGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC5,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_9)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + EDUCATION + 
##     AGE + PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4679   0.1760   0.5588   0.7411   2.9552  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.374e+01  1.174e+02   0.117  0.90683    
## LIMIT_BAL    1.553e-06  2.214e-07   7.012 2.34e-12 ***
## MARRIAGE1   -1.608e+00  7.100e-01  -2.264  0.02356 *  
## MARRIAGE2   -1.518e+00  7.100e-01  -2.138  0.03252 *  
## MARRIAGE3   -1.299e+00  7.329e-01  -1.772  0.07632 .  
## SEX2         1.202e-02  4.260e-02   0.282  0.77784    
## EDUCATION1  -1.073e+01  1.174e+02  -0.091  0.92715    
## EDUCATION2  -1.079e+01  1.174e+02  -0.092  0.92675    
## EDUCATION3  -1.129e+01  1.174e+02  -0.096  0.92341    
## EDUCATION4  -9.871e+00  1.174e+02  -0.084  0.93298    
## EDUCATION5  -9.461e+00  1.174e+02  -0.081  0.93576    
## EDUCATION6  -1.025e+01  1.174e+02  -0.087  0.93041    
## AGE         -8.649e-03  2.549e-03  -3.394  0.00069 ***
## PAY_PC1      2.893e-01  1.173e-02  24.654  < 2e-16 ***
## PAY_PC2      4.126e-01  2.401e-02  17.187  < 2e-16 ***
## PAY_PC3     -2.983e-01  3.049e-02  -9.785  < 2e-16 ***
## AMT_PC1      8.397e-02  1.224e-02   6.862 6.81e-12 ***
## AMT_PC2      1.791e-01  3.475e-02   5.154 2.55e-07 ***
## AMT_PC3     -5.400e-02  3.590e-02  -1.504  0.13260    
## AMT_PC4     -4.650e-02  3.420e-02  -1.360  0.17392    
## AMT_PC6      9.097e-02  3.618e-02   2.515  0.01192 *  
## AMT_PC5     -3.233e-02  3.703e-02  -0.873  0.38259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14404  on 14679  degrees of freedom
## AIC: 14448
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14448 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")

# Tenth model: logistic regression on the training split without LIMIT_BAL.
# NOTE(review): AMT_PC5 appears twice in the formula (the repeated term adds
# nothing to the fit) and AMT_PC7 is absent as well -- confirm whether the
# trailing AMT_PC5 was meant to be AMT_PC7.
Model_10 <- glm(
  default ~ AMT_PC5 + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC5,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_10)
## 
## Call:
## glm(formula = default ~ AMT_PC5 + MARRIAGE + SEX + AGE + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6469   0.1624   0.5688   0.7306   3.0133  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  13.951692 117.826705   0.118  0.90574    
## AMT_PC5      -0.030944   0.037867  -0.817  0.41383    
## MARRIAGE1    -1.610387   0.716234  -2.248  0.02455 *  
## MARRIAGE2    -1.548956   0.716258  -2.163  0.03057 *  
## MARRIAGE3    -1.382453   0.739007  -1.871  0.06139 .  
## SEX2          0.024767   0.042476   0.583  0.55983    
## AGE          -0.006925   0.002544  -2.722  0.00649 ** 
## EDUCATION1  -10.693516 117.824492  -0.091  0.92768    
## EDUCATION2  -10.824782 117.824491  -0.092  0.92680    
## EDUCATION3  -11.349599 117.824495  -0.096  0.92326    
## EDUCATION4   -9.830571 117.825660  -0.083  0.93351    
## EDUCATION5   -9.517344 117.824986  -0.081  0.93562    
## EDUCATION6  -10.375467 117.825788  -0.088  0.92983    
## PAY_PC1       0.318343   0.011082  28.727  < 2e-16 ***
## PAY_PC2       0.417341   0.024020  17.375  < 2e-16 ***
## PAY_PC3      -0.299428   0.030379  -9.856  < 2e-16 ***
## AMT_PC1       0.124136   0.010916  11.372  < 2e-16 ***
## AMT_PC2       0.217815   0.035020   6.220 4.98e-10 ***
## AMT_PC3      -0.047997   0.036694  -1.308  0.19086    
## AMT_PC4      -0.050882   0.034825  -1.461  0.14399    
## AMT_PC6       0.093999   0.037085   2.535  0.01126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14455  on 14680  degrees of freedom
## AIC: 14497
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14497 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "MARRIAGE1", "MARRIAGE2")

# Eleventh model: logistic regression on the training split without AGE.
# NOTE(review): LIMIT_BAL and AMT_PC7 are absent from this formula too, and
# AMT_PC5 appears twice (the repeated term adds nothing to the fit) --
# confirm the intended predictor set.
Model_11 <- glm(
  default ~ AMT_PC5 + MARRIAGE + SEX + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC5,
  family = "binomial",
  data = training
)

# Print the coefficient table, deviances and AIC
summary(Model_11)
## 
## Call:
## glm(formula = default ~ AMT_PC5 + MARRIAGE + SEX + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6443   0.1631   0.5674   0.7327   2.9584  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  13.68418  118.21571   0.116   0.9078    
## AMT_PC5      -0.03086    0.03781  -0.816   0.4144    
## MARRIAGE1    -1.65017    0.71854  -2.297   0.0216 *  
## MARRIAGE2    -1.53231    0.71867  -2.132   0.0330 *  
## MARRIAGE3    -1.44360    0.74096  -1.948   0.0514 .  
## SEX2          0.03932    0.04212   0.933   0.3506    
## EDUCATION1  -10.66708  118.21353  -0.090   0.9281    
## EDUCATION2  -10.79545  118.21353  -0.091   0.9272    
## EDUCATION3  -11.35418  118.21354  -0.096   0.9235    
## EDUCATION4   -9.78222  118.21470  -0.083   0.9341    
## EDUCATION5   -9.49354  118.21402  -0.080   0.9360    
## EDUCATION6  -10.40621  118.21482  -0.088   0.9299    
## PAY_PC1       0.31626    0.01105  28.631  < 2e-16 ***
## PAY_PC2       0.41720    0.02400  17.382  < 2e-16 ***
## PAY_PC3      -0.29943    0.03038  -9.856  < 2e-16 ***
## AMT_PC1       0.12219    0.01089  11.218  < 2e-16 ***
## AMT_PC2       0.21804    0.03501   6.228 4.74e-10 ***
## AMT_PC3      -0.04829    0.03667  -1.317   0.1878    
## AMT_PC4      -0.05131    0.03476  -1.476   0.1399    
## AMT_PC6       0.09384    0.03704   2.533   0.0113 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14462  on 14681  degrees of freedom
## AIC: 14502
## 
## Number of Fisher Scoring iterations: 11
# AIC = 14502 (see summary above), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "MARRIAGE1", "MARRIAGE2")

# Let's stick with Model_7 (AMT_PC4 removed): its AIC of 14449 is within one unit of the lowest AIC reported above (14448, for Model_4 and Model_9)

41. Coefficients Model 7 - Inspect the regression coefficients

# Let's stick with Model_7 (AMT_PC4 removed; AIC 14449 in its summary above)

# Access coefficients of the fitted Model_7
coef(Model_7)  #  The estimates printed below for AMT_PC1, AMT_PC2, AMT_PC6, PAY_PC1, PAY_PC2 and LIMIT_BAL are positive; coefficient signs alone do not establish multicollinearity, which is checked with the correlations in the next steps
##   (Intercept)     LIMIT_BAL     MARRIAGE1     MARRIAGE2     MARRIAGE3 
##  1.373597e+01  1.557942e-06 -1.606421e+00 -1.516401e+00 -1.297488e+00 
##          SEX2           AGE    EDUCATION1    EDUCATION2    EDUCATION3 
##  1.152448e-02 -8.664856e-03 -1.073343e+01 -1.079275e+01 -1.128569e+01 
##    EDUCATION4    EDUCATION5    EDUCATION6       PAY_PC1       PAY_PC2 
## -9.874501e+00 -9.459511e+00 -1.024625e+01  2.894638e-01  4.126059e-01 
##       PAY_PC3       AMT_PC1       AMT_PC2       AMT_PC3       AMT_PC5 
## -2.996103e-01  8.310676e-02  1.724569e-01 -4.749015e-02 -2.058903e-02 
##       AMT_PC6       AMT_PC7 
##  8.803751e-02  1.606383e-02

42. Check the presence of multicollinearity among the numeric variables

# Collect the numeric columns of c1 that are screened for multicollinearity
# (LIMIT_BAL and AMT_PC4 are not part of this selection)
multi <- c1[, c(
  "PAY_PC1", "AGE", "PAY_PC3", "PAY_PC2", "AMT_PC6",
  "AMT_PC3", "AMT_PC5", "AMT_PC7", "AMT_PC1", "AMT_PC2"
)]

43. Glimpse structure of ‘multi’

# Preview the columns selected for the multicollinearity check
glimpse(multi)
## Observations: 21,001
## Variables: 10
## $ PAY_PC1 <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -0....
## $ AGE     <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, 27...
## $ PAY_PC3 <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522, ...
## $ PAY_PC2 <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.349252...
## $ AMT_PC6 <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627, ...
## $ AMT_PC3 <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -0....
## $ AMT_PC5 <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751, ...
## $ AMT_PC7 <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170, ...
## $ AMT_PC1 <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1.71...
## $ AMT_PC2 <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -0....

44. Generate Pair Plot

# Pair plot: scatterplot matrix of every pair of columns in multi
pairs(multi)


45. Correlation Matrix

# Correlation matrix, computed with a few techniques: measures the strength of the relationship between each pair of numeric variables
# (near 0 = weak relationship, near 1 = strong positive relationship, near -1 = strong negative relationship)
cor(multi)
##              PAY_PC1          AGE       PAY_PC3       PAY_PC2      AMT_PC6
## PAY_PC1  1.000000000  0.059283535 -0.0013301821  0.0059243526 -0.006505793
## AGE      0.059283535  1.000000000  0.0040853707 -0.0028862890  0.002564926
## PAY_PC3 -0.001330182  0.004085371  1.0000000000 -0.0006602639 -0.030816042
## PAY_PC2  0.005924353 -0.002886289 -0.0006602639  1.0000000000 -0.006054684
## AMT_PC6 -0.006505793  0.002564926 -0.0308160421 -0.0060546844  1.000000000
## AMT_PC3  0.037109079  0.004522907 -0.0328408230 -0.0541293031  0.014001444
## AMT_PC5  0.010311492  0.002824440  0.0065640818 -0.0100275635  0.044710483
## AMT_PC7  0.015145512 -0.001968249 -0.0597155232  0.0002993426  0.025348705
## AMT_PC1 -0.252161806  0.061652070  0.0176827839  0.0413118350 -0.015989877
## AMT_PC2  0.243521986  0.014731858 -0.0070392614  0.1205176373 -0.055491388
##              AMT_PC3      AMT_PC5       AMT_PC7      AMT_PC1      AMT_PC2
## PAY_PC1  0.037109079  0.010311492  0.0151455119 -0.252161806  0.243521986
## AGE      0.004522907  0.002824440 -0.0019682489  0.061652070  0.014731858
## PAY_PC3 -0.032840823  0.006564082 -0.0597155232  0.017682784 -0.007039261
## PAY_PC2 -0.054129303 -0.010027564  0.0002993426  0.041311835  0.120517637
## AMT_PC6  0.014001444  0.044710483  0.0253487052 -0.015989877 -0.055491388
## AMT_PC3  1.000000000 -0.042850333  0.0258501353  0.028970051  0.141786762
## AMT_PC5 -0.042850333  1.000000000 -0.0528917823  0.012664498  0.067606503
## AMT_PC7  0.025850135 -0.052891782  1.0000000000  0.008783861  0.056185635
## AMT_PC1  0.028970051  0.012664498  0.0087838608  1.000000000 -0.044748851
## AMT_PC2  0.141786762  0.067606503  0.0561856349 -0.044748851  1.000000000
# Pearson correlation; "pearson" is cor()'s default method, so this repeats the matrix printed above
cor(multi, method = "pearson")
##              PAY_PC1          AGE       PAY_PC3       PAY_PC2      AMT_PC6
## PAY_PC1  1.000000000  0.059283535 -0.0013301821  0.0059243526 -0.006505793
## AGE      0.059283535  1.000000000  0.0040853707 -0.0028862890  0.002564926
## PAY_PC3 -0.001330182  0.004085371  1.0000000000 -0.0006602639 -0.030816042
## PAY_PC2  0.005924353 -0.002886289 -0.0006602639  1.0000000000 -0.006054684
## AMT_PC6 -0.006505793  0.002564926 -0.0308160421 -0.0060546844  1.000000000
## AMT_PC3  0.037109079  0.004522907 -0.0328408230 -0.0541293031  0.014001444
## AMT_PC5  0.010311492  0.002824440  0.0065640818 -0.0100275635  0.044710483
## AMT_PC7  0.015145512 -0.001968249 -0.0597155232  0.0002993426  0.025348705
## AMT_PC1 -0.252161806  0.061652070  0.0176827839  0.0413118350 -0.015989877
## AMT_PC2  0.243521986  0.014731858 -0.0070392614  0.1205176373 -0.055491388
##              AMT_PC3      AMT_PC5       AMT_PC7      AMT_PC1      AMT_PC2
## PAY_PC1  0.037109079  0.010311492  0.0151455119 -0.252161806  0.243521986
## AGE      0.004522907  0.002824440 -0.0019682489  0.061652070  0.014731858
## PAY_PC3 -0.032840823  0.006564082 -0.0597155232  0.017682784 -0.007039261
## PAY_PC2 -0.054129303 -0.010027564  0.0002993426  0.041311835  0.120517637
## AMT_PC6  0.014001444  0.044710483  0.0253487052 -0.015989877 -0.055491388
## AMT_PC3  1.000000000 -0.042850333  0.0258501353  0.028970051  0.141786762
## AMT_PC5 -0.042850333  1.000000000 -0.0528917823  0.012664498  0.067606503
## AMT_PC7  0.025850135 -0.052891782  1.0000000000  0.008783861  0.056185635
## AMT_PC1  0.028970051  0.012664498  0.0087838608  1.000000000 -0.044748851
## AMT_PC2  0.141786762  0.067606503  0.0561856349 -0.044748851  1.000000000
# Spearman rank correlation: based on ranks, so it reflects monotonic rather than strictly linear association
cor(multi, method = "spearman")
##              PAY_PC1          AGE     PAY_PC3      PAY_PC2      AMT_PC6
## PAY_PC1  1.000000000  0.087211938 -0.13277136  0.134995638  0.002073015
## AGE      0.087211938  1.000000000 -0.01316903  0.024629257  0.012307570
## PAY_PC3 -0.132771359 -0.013169030  1.00000000 -0.097668754 -0.085168355
## PAY_PC2  0.134995638  0.024629257 -0.09766875  1.000000000  0.035274819
## AMT_PC6  0.002073015  0.012307570 -0.08516835  0.035274819  1.000000000
## AMT_PC3  0.048972803  0.015561667 -0.03898459 -0.083087619 -0.149867410
## AMT_PC5  0.037181569  0.005468656  0.05099078 -0.027121908 -0.041056891
## AMT_PC7  0.048847388 -0.001202342 -0.19783491 -0.031113969  0.105356551
## AMT_PC1 -0.499905544  0.019019580  0.10023535 -0.003800802  0.081385027
## AMT_PC2  0.499613066  0.002250097 -0.10844990  0.207400725 -0.012594602
##             AMT_PC3      AMT_PC5      AMT_PC7      AMT_PC1      AMT_PC2
## PAY_PC1  0.04897280  0.037181569  0.048847388 -0.499905544  0.499613066
## AGE      0.01556167  0.005468656 -0.001202342  0.019019580  0.002250097
## PAY_PC3 -0.03898459  0.050990778 -0.197834913  0.100235351 -0.108449897
## PAY_PC2 -0.08308762 -0.027121908 -0.031113969 -0.003800802  0.207400725
## AMT_PC6 -0.14986741 -0.041056891  0.105356551  0.081385027 -0.012594602
## AMT_PC3  1.00000000  0.165300058 -0.097920284 -0.019170573  0.103695505
## AMT_PC5  0.16530006  1.000000000 -0.197423322 -0.018757471  0.044378508
## AMT_PC7 -0.09792028 -0.197423322  1.000000000 -0.043584598  0.007449202
## AMT_PC1 -0.01917057 -0.018757471 -0.043584598  1.000000000 -0.403230353
## AMT_PC2  0.10369550  0.044378508  0.007449202 -0.403230353  1.000000000

46. Calculating Probabilities and Predictions on Model 7

# Calculating probabilities and predictions - Model 7

# Refit the selected logistic regression (all predictors except AMT_PC4) on
# the training split before the prediction steps. The call is the same as the
# earlier Model_7 fit.
Model_7 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)

# Obtain the regression coefficients, deviances and AIC
summary(Model_7)
## 
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION + 
##     PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + 
##     AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3926   0.1821   0.5593   0.7408   2.9553  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.374e+01  1.174e+02   0.117 0.906825    
## LIMIT_BAL    1.558e-06  2.215e-07   7.035 2.00e-12 ***
## MARRIAGE1   -1.606e+00  7.100e-01  -2.263 0.023665 *  
## MARRIAGE2   -1.516e+00  7.100e-01  -2.136 0.032710 *  
## MARRIAGE3   -1.297e+00  7.330e-01  -1.770 0.076706 .  
## SEX2         1.152e-02  4.260e-02   0.271 0.786735    
## AGE         -8.665e-03  2.548e-03  -3.401 0.000673 ***
## EDUCATION1  -1.073e+01  1.174e+02  -0.091 0.927126    
## EDUCATION2  -1.079e+01  1.174e+02  -0.092 0.926725    
## EDUCATION3  -1.129e+01  1.174e+02  -0.096 0.923388    
## EDUCATION4  -9.875e+00  1.174e+02  -0.084 0.932944    
## EDUCATION5  -9.460e+00  1.174e+02  -0.081 0.935756    
## EDUCATION6  -1.025e+01  1.174e+02  -0.087 0.930426    
## PAY_PC1      2.895e-01  1.174e-02  24.664  < 2e-16 ***
## PAY_PC2      4.126e-01  2.400e-02  17.195  < 2e-16 ***
## PAY_PC3     -2.996e-01  3.053e-02  -9.813  < 2e-16 ***
## AMT_PC1      8.311e-02  1.220e-02   6.811 9.67e-12 ***
## AMT_PC2      1.725e-01  3.421e-02   5.041 4.63e-07 ***
## AMT_PC3     -4.749e-02  3.523e-02  -1.348 0.177701    
## AMT_PC5     -2.059e-02  3.684e-02  -0.559 0.576230    
## AMT_PC6      8.804e-02  3.727e-02   2.362 0.018183 *  
## AMT_PC7      1.606e-02  4.736e-02   0.339 0.734468    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16400  on 14700  degrees of freedom
## Residual deviance: 14405  on 14679  degrees of freedom
## AIC: 14449
## 
## Number of Fisher Scoring iterations: 11

47. Include the probabilities from the training-data model in the testing dataset

# Score every row of `testing` with Model_7 and store the result alongside the
# data. type = "response" asks for predictions on the probability scale
# instead of the link (log-odds) scale.
testing[["probability"]] <- predict(
  Model_7,
  newdata = testing,
  type = "response"
)

48. Create Class Prediction

# Assume that the optimum probability threshold is 0.5
# Create the class prediction - our target is the "1" class
#
# Rows at or above the threshold are labelled "yes", all others "no".
# The previous version initialised every row to "yes" and then overwrote the
# thresholded rows with "yes" again, so the threshold had no effect and no row
# was ever labelled "no".
#
# NOTE(review): glm() with a factor response models the probability of the
# non-first factor level; confirm that level of `default` is the class that
# should be labelled "yes" here.
# NOTE(review): the hard-coded metric counts further down this document
# (1549 / 4751, with zero TN and FN) correspond to the old all-"yes"
# labelling and need recomputing after this change.

testing$prediction <- ifelse(testing$probability >= 0.5, "yes", "no")

49. Inspect probability and prediction

# Have a look at the data: print the first six rows of `testing`, which now
# also carries the model output columns.

# probability and prediction
head(testing) 
## # A tibble: 6 x 19
##      ID LIMIT_BAL SEX   EDUCATION MARRIAGE   AGE PAY_PC1 PAY_PC2  PAY_PC3
##   <dbl>     <dbl> <fct> <fct>     <fct>    <dbl>   <dbl>   <dbl>    <dbl>
## 1    10     20000 1     2         1           36  -2.41   -1.84  -0.416  
## 2    13    360000 1     1         2           30   1.34    0.457 -0.298  
## 3    14     80000 2     2         2           30  -6.05    1.44   1.32   
## 4    21    170000 2     2         2           35  -0.393   0.176  0.00489
## 5    22     20000 2     2         1           50  -9.70   -3.82   0.310  
## 6    24     80000 2     1         2           29   1.08   -0.906  0.914  
## # ... with 10 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## #   AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## #   default <fct>, probability <dbl>, prediction <chr>

50. Export the Predictions to a csv file

# Write the scored `testing` rows to disk.
# `testing` already contains the `probability` and `prediction` columns, so it
# is exported as-is. Passing testing$probability as a second argument to
# data.frame() (as before) only appended a duplicate column named
# `testing.probability` to the CSV.
trainingdata_predictions_logistic <- as.data.frame(testing)

# Row names are still written (write.csv default), as in the original export.
write.csv(trainingdata_predictions_logistic,
          file = "trainingdata_predictions_logistic.csv")

52. Generate a Confusion Matrix

# Evaluation Model_7 logistic on training data


# Create a confusion matrix (along with other measures) using the 
# function 'confusionMatrix' from the caret package
library(e1071)


# Generate a Confusion Matrix - 2 class example
# confusionMatrix(data = testing$prediction,testing$default, mode = "everything")

## 2 class example
# NOTE: this is illustrative data built from hard-coded counts (344 cases);
# it is not derived from `testing` or from the Model_7 predictions.

lvs <- c("yes", "no")

# Reference labels: 86 "yes" cases followed by 258 "no" cases.
# Levels are ordered "no", "yes", so "no" is the first level.
truth <- factor(
  c(rep("yes", 86), rep("no", 258)),
  levels = c("no", "yes")
)

# Predicted labels, in the same case order as `truth`:
#   for the 86 true "yes" cases: 54 predicted "yes", 32 predicted "no"
#   for the 258 true "no" cases: 27 predicted "yes", 231 predicted "no"
pred <- factor(
  rep(c("yes", "no", "yes", "no"), times = c(54, 32, 27, 231)),
  levels = c("no", "yes")
)

# Cross-tabulation with predictions in rows and reference labels in columns
xtab <- table(pred, truth)

# Confusion-matrix statistics computed from the pre-built table
# (rows = predictions, columns = reference labels).
confusionMatrix(xtab)
## Confusion Matrix and Statistics
## 
##      truth
## pred   no yes
##   no  231  32
##   yes  27  54
##                                           
##                Accuracy : 0.8285          
##                  95% CI : (0.7844, 0.8668)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.0003097       
##                                           
##                   Kappa : 0.5336          
##                                           
##  Mcnemar's Test P-Value : 0.6025370       
##                                           
##             Sensitivity : 0.8953          
##             Specificity : 0.6279          
##          Pos Pred Value : 0.8783          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.7500          
##          Detection Rate : 0.6715          
##    Detection Prevalence : 0.7645          
##       Balanced Accuracy : 0.7616          
##                                           
##        'Positive' Class : no              
## 
# Same statistics, this time passing the predicted and reference factors
# directly instead of the cross-tabulated table.
confusionMatrix(pred, truth)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  231  32
##        yes  27  54
##                                           
##                Accuracy : 0.8285          
##                  95% CI : (0.7844, 0.8668)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.0003097       
##                                           
##                   Kappa : 0.5336          
##                                           
##  Mcnemar's Test P-Value : 0.6025370       
##                                           
##             Sensitivity : 0.8953          
##             Specificity : 0.6279          
##          Pos Pred Value : 0.8783          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.7500          
##          Detection Rate : 0.6715          
##    Detection Prevalence : 0.7645          
##       Balanced Accuracy : 0.7616          
##                                           
##        'Positive' Class : no              
## 
# Repeat with an explicitly supplied prevalence of 0.25; in the printed output
# only the Pos/Neg Pred Value and Prevalence lines differ from the calls above.
confusionMatrix(xtab, prevalence = 0.25)
## Confusion Matrix and Statistics
## 
##      truth
## pred   no yes
##   no  231  32
##   yes  27  54
##                                           
##                Accuracy : 0.8285          
##                  95% CI : (0.7844, 0.8668)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.0003097       
##                                           
##                   Kappa : 0.5336          
##                                           
##  Mcnemar's Test P-Value : 0.6025370       
##                                           
##             Sensitivity : 0.8953          
##             Specificity : 0.6279          
##          Pos Pred Value : 0.4451          
##          Neg Pred Value : 0.9474          
##              Prevalence : 0.2500          
##          Detection Rate : 0.6715          
##    Detection Prevalence : 0.7645          
##       Balanced Accuracy : 0.7616          
##                                           
##        'Positive' Class : no              
## 

53. Generate an Accuracy Score

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Counts are hard-coded: TP = 1549, TN = 0, FP = 4751, FN = 0
acc <- with(
  list(tp = 1549, tn = 0, fp = 4751, fn = 0),
  (tp + tn) / (tp + tn + fp + fn)
)
acc  # 0.245873, i.e. an accuracy of about 24.59%
## [1] 0.245873

54. Training error

# Error rate = 1 - accuracy (about 75.41% for an accuracy of 24.59%).
# Derived from `acc` rather than from a hand-copied, rounded literal so the
# two values cannot drift apart when the accuracy is recomputed.
# NOTE(review): if the counts behind `acc` come from predictions on `testing`,
# this is a hold-out error rather than a training error - confirm the naming.
training_error <- 1 - acc
training_error
## [1] 0.754127

55. Precision

# Precision = TP / (TP + FP)
# Counts are hard-coded: TP = 1549, FP = 4751
prec <- with(
  list(tp = 1549, fp = 4751),
  tp / (tp + fp)
)
prec  # 0.245873, i.e. a precision of about 24.59%
## [1] 0.245873

56. Recall

# Recall = TP / (TP + FN)
# Counts are hard-coded: TP = 1549, FN = 0
rec <- with(
  list(tp = 1549, fn = 0),
  tp / (tp + fn)
)
rec  # 1, i.e. a recall of 100%
## [1] 1

57. F1 Score

# F1 = 2 * (precision * recall) / (precision + recall)
# Computed from `prec` and `rec` instead of re-typed, rounded literals so the
# score stays consistent with the precision and recall calculated above
# (about 0.3947, i.e. 39.47%, for the current counts).
F1 <- 2 * (prec * rec) / (prec + rec)
F1
## [1] 0.3946999

# Five-number summary plus mean for the `speed` and `dist` columns of `cars`.
# NOTE(review): `cars` is not used anywhere else in this section; this looks
# like leftover R Markdown template content - confirm whether it can go.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.