# Install packages into R
#install.packages("caret")
#install.packages("tidyverse")
#install.packages("plyr")
#install.packages("rmarkdown")
#install.packages("dummies")
#install.packages("glmnet")
#install.packages("e1071")
#install.packages("rpart")
#install.packages("tree")
#install.packages("randomForest")
#install.packages("ROCR")
#install.packages("gbm")
#install.packages("AUC")
#install.packages("rpart.plot")
#install.packages("mlbench")
#install.packages("pROC")
#install.packages("parallel")
#install.packages("doParallel")
#install.packages("knitr")
#install.packages("here")
#install.packages("roxygen2")
#install.packages("testthat")
#install.packages("gridExtra")
# Load libraries into R
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## Registered S3 method overwritten by 'rvest':
## method from
## read_xml.response xml2
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-18
library(e1071)
library(rpart)
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(gbm)
## Loaded gbm 2.1.5
library(AUC)
## AUC 0.3.0
## Type AUCNews() to see the change log and ?AUC to get an overview.
##
## Attaching package: 'AUC'
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:caret':
##
## sensitivity, specificity
library(rpart.plot)
library(mlbench)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:AUC':
##
## auc, roc
## The following object is masked from 'package:glmnet':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(parallel)
library(rmarkdown) # R Mardown is a type of reproducible notebook using literate programming as it is executed in chunks with tracebacks of the last step.
library(here)
## here() starts at C:/Users/Wendy/Desktop/DSP/assignment 3/project
##
## Attaching package: 'here'
## The following object is masked from 'package:plyr':
##
## here
library(knitr)
library(roxygen2) # include documentation for writing functions
library(testthat) # include unit testing
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:purrr':
##
## is_null
# Do not use rm(list = ls()): R code may still break when someone tries to run it on another computer
# Load the training data; here() builds the path from the project root
c <- here("./data/credit_card_data_training.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
## ID = col_double(),
## LIMIT_BAL = col_double(),
## SEX = col_double(),
## EDUCATION = col_double(),
## MARRIAGE = col_double(),
## AGE = col_double(),
## PAY_PC1 = col_double(),
## PAY_PC2 = col_double(),
## PAY_PC3 = col_double(),
## AMT_PC1 = col_double(),
## AMT_PC2 = col_double(),
## AMT_PC3 = col_double(),
## AMT_PC4 = col_double(),
## AMT_PC5 = col_double(),
## AMT_PC6 = col_double(),
## AMT_PC7 = col_double(),
## default = col_double()
## )
# Dimensions of the training data (rows, columns)
dim(c)
## [1] 21001 17
# Class of the object returned by read_csv()
class(c) # a tibble, which also inherits from data.frame
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# Structure of the training data the dplyr way: column types plus a preview of the 17 columns and 21001 rows
glimpse(c) # SEX, EDUCATION, MARRIAGE and default are read as doubles, not factors
## Observations: 21,001
## Variables: 17
## $ ID <dbl> 1, 2, 3, 7, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 2...
## $ LIMIT_BAL <dbl> 400000, 200000, 50000, 80000, 260000, 20000, 60000, ...
## $ SEX <dbl> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2...
## $ EDUCATION <dbl> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 4, 2, 2, 2, 2, 1, 1...
## $ MARRIAGE <dbl> 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2...
## $ AGE <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, ...
## $ PAY_PC1 <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.3492...
## $ PAY_PC3 <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522...
## $ AMT_PC1 <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1....
## $ AMT_PC2 <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -...
## $ AMT_PC3 <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -...
## $ AMT_PC4 <dbl> -3.461439078, -0.166878371, -0.011751497, 0.09175045...
## $ AMT_PC5 <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751...
## $ AMT_PC6 <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627...
## $ AMT_PC7 <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170...
## $ default <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...
# Column names
names(c)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE"
## [6] "AGE" "PAY_PC1" "PAY_PC2" "PAY_PC3" "AMT_PC1"
## [11] "AMT_PC2" "AMT_PC3" "AMT_PC4" "AMT_PC5" "AMT_PC6"
## [16] "AMT_PC7" "default"
# Column names again via colnames(); the result is the same as names(c) above
colnames(c)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE"
## [6] "AGE" "PAY_PC1" "PAY_PC2" "PAY_PC3" "AMT_PC1"
## [11] "AMT_PC2" "AMT_PC3" "AMT_PC4" "AMT_PC5" "AMT_PC6"
## [16] "AMT_PC7" "default"
# Summary statistics (min, quartiles, mean, max) of each column
summary(c)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 10000 Min. :1.000 Min. :0.000
## 1st Qu.: 7468 1st Qu.: 50000 1st Qu.:1.000 1st Qu.:1.000
## Median :14938 Median : 140000 Median :2.000 Median :2.000
## Mean :14973 Mean : 167502 Mean :1.606 Mean :1.856
## 3rd Qu.:22519 3rd Qu.: 240000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :29995 Max. :1000000 Max. :2.000 Max. :6.000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Min. :0.000 Min. :21.00 Min. :-13.302028 Min. :-4.422427
## 1st Qu.:1.000 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.227765
## Median :2.000 Median :34.00 Median : -0.393308 Median : 0.175555
## Mean :1.555 Mean :35.47 Mean : -0.009395 Mean :-0.000277
## 3rd Qu.:2.000 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.361123
## Max. :3.000 Max. :79.00 Max. : 3.813348 Max. : 5.441026
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.717690
## 1st Qu.:-0.283941 1st Qu.:-1.51032 1st Qu.:-0.431402
## Median : 0.004886 Median :-0.85848 Median :-0.208967
## Mean : 0.000949 Mean : 0.01059 Mean :-0.005951
## 3rd Qu.: 0.077070 3rd Qu.: 0.52003 3rd Qu.: 0.085625
## Max. : 3.364030 Max. :37.49240 Max. :28.783658
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-10.389523 Min. :-19.171464 Min. :-24.108569
## 1st Qu.: -0.135722 1st Qu.: -0.068251 1st Qu.: -0.082314
## Median : -0.070445 Median : 0.018161 Median : -0.032000
## Mean : 0.003369 Mean : 0.002936 Mean : -0.000712
## 3rd Qu.: 0.001416 3rd Qu.: 0.081621 3rd Qu.: 0.025086
## Max. : 21.984829 Max. : 21.823749 Max. : 17.430967
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-25.90403 Min. :0.0000
## 1st Qu.: -0.04252 1st Qu.: -0.09209 1st Qu.:0.0000
## Median : -0.00234 Median : -0.04045 Median :0.0000
## Mean : -0.00008 Mean : -0.00171 Mean :0.2459
## 3rd Qu.: 0.06772 3rd Qu.: 0.03061 3rd Qu.:0.0000
## Max. : 14.72234 Max. : 22.92727 Max. :1.0000
# First 3 rows (observations), not variables
head(c,3)
## # A tibble: 3 x 17
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 400000 1 1 2 33 1.45 -1.27 -0.363
## 2 2 200000 1 2 1 35 -1.80 0.768 -0.265
## 3 3 50000 1 2 2 25 -0.393 0.176 0.00489
## # ... with 8 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## # AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## # default <dbl>
# Last 3 rows (observations), not variables
tail(c,3)
## # A tibble: 3 x 17
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 29992 80000 2 3 1 39 2.09 -0.0763 -0.311
## 2 29993 100000 1 2 1 41 0.984 0.994 0.374
## 3 29995 500000 1 1 2 37 0.646 0.0368 -1.04
## # ... with 8 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## # AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## # default <dbl>
# Pre-processing (data cleaning) before modelling
# SEX, EDUCATION, MARRIAGE and default hold category codes, so each one is
# stored as a factor whose levels are the distinct codes found in the column.
# One factor() call per column is enough: running as.factor() on a column
# that is already a factor returns it unchanged.
c[["EDUCATION"]] <- factor(c[["EDUCATION"]])
c[["MARRIAGE"]] <- factor(c[["MARRIAGE"]])
c[["default"]] <- factor(c[["default"]])
c[["SEX"]] <- factor(c[["SEX"]])
# Confirm that the four columns are now reported as <fct>
glimpse(c)
## Observations: 21,001
## Variables: 17
## $ ID <dbl> 1, 2, 3, 7, 9, 10, 11, 12, 13, 14, 16, 18, 19, 20, 2...
## $ LIMIT_BAL <dbl> 400000, 200000, 50000, 80000, 260000, 20000, 60000, ...
## $ SEX <fct> 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2...
## $ EDUCATION <fct> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 4, 2, 2, 2, 2, 1, 1...
## $ MARRIAGE <fct> 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2...
## $ AGE <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, ...
## $ PAY_PC1 <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.3492...
## $ PAY_PC3 <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522...
## $ AMT_PC1 <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1....
## $ AMT_PC2 <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -...
## $ AMT_PC3 <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -...
## $ AMT_PC4 <dbl> -3.461439078, -0.166878371, -0.011751497, 0.09175045...
## $ AMT_PC5 <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751...
## $ AMT_PC6 <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627...
## $ AMT_PC7 <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170...
## $ default <fct> 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0...
# Create indicator (dummy) matrices for the categorical variables.
# The previous calls, dummy(c$EDUCATION) etc., were recorded as producing
# columns named after the .Rmd file path instead of the variable, and each call
# raised "non-list contrasts argument ignored". model.matrix() with the
# intercept removed (`- 1`) gives one 0/1 column per factor level, named
# <variable><level> (for example EDUCATION0 ... EDUCATION6).
# SEX and default only have two levels, so they get two columns each.
library(dummies) # already attached at the top of the script; kept for any later use
EDUCATION <- model.matrix(~ EDUCATION - 1, data = c)
head(EDUCATION) # check that the values have been converted to dummy variables
MARRIAGE <- model.matrix(~ MARRIAGE - 1, data = c)
SEX <- model.matrix(~ SEX - 1, data = c)
default <- model.matrix(~ default - 1, data = c)
# c1 is the data frame used by every step from here on, but it is never
# assigned in this script. Take it to be the cleaned data c (17 columns with
# the categorical ones stored as factors), which matches the recorded output.
c1 <- c
# Are there any missing values?
any(is.na(c1))
## [1] FALSE
# Plot data
library (gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# Distribution of the numeric variable AGE
hist(c1[["AGE"]], xlab = "Age", main = "Histogram of Age")
# Minimum, quartiles, mean and maximum of AGE
summary(c1[["AGE"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 28.00 34.00 35.47 41.00 79.00
# Counts per level for each categorical variable (plot() on a factor draws a bar chart)
plot(c1[["MARRIAGE"]], xlab = "Marriage", ylab = "count",
     main = "Distribution of Marriage")
plot(c1[["SEX"]], xlab = "Gender", ylab = "count",
     main = "Distribution of Gender")
plot(c1[["EDUCATION"]], xlab = "Education", ylab = "Count",
     main = "Distribution of Education")
library(ggplot2)
# Scatterplot matrix of four selected columns (originally noted as non-linear);
# pairs() plots factor columns by their integer codes
pairs(
  c1[c("SEX", "MARRIAGE", "AGE", "EDUCATION")],
  main = "credit training data"
)
# Histograms (not density plots) of the numeric predictors.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data frame given to ggplot(), so the x-axis is labelled with the column
# name (rather than "c1$...") and each plot keeps working if the data argument
# is swapped for another data frame.
# NOTE(review): there is no p3 / PAY_PC3 histogram in this sequence.
p1 <- ggplot(c1, aes(x = PAY_PC1)) +
  geom_histogram() +
  ggtitle(" Histogram of PAY_PC1")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p2 <- ggplot(c1, aes(x = PAY_PC2)) +
  geom_histogram() +
  ggtitle(" Histogram of PAY_PC2")
p2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p4 <- ggplot(c1, aes(x = AMT_PC1)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC1")
p4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p5 <- ggplot(c1, aes(x = AMT_PC2)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC2")
p5
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p6 <- ggplot(c1, aes(x = AMT_PC3)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC3")
p6
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p7 <- ggplot(c1, aes(x = AMT_PC5)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC5")
p7
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p8 <- ggplot(c1, aes(x = AMT_PC6)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC6")
p8
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p9 <- ggplot(c1, aes(x = AMT_PC7)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC7")
p9
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p10 <- ggplot(c1, aes(x = LIMIT_BAL)) +
  geom_histogram() +
  ggtitle(" Histogram of Limit Balance")
p10
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p11 <- ggplot(c1, aes(x = AMT_PC4)) +
  geom_histogram() +
  ggtitle(" Histogram of AMT_PC4")
p11
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Minimum, quartiles, mean and maximum of AMT_PC4
summary(c1$AMT_PC4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -19.171464 -0.068251 0.018161 0.002936 0.081621 21.823749
# Multi-core processing
library(roxygen2)
library(parallel)
library(doParallel)
library(caret)
library(glmnet)
# Create a reproducible 70/30 split of c1.
# Fixing the random seed makes createDataPartition() pick the same rows on every run.
set.seed(42)
# list = FALSE (spelled out; the shorthand F can be reassigned) returns the
# selected row indices as a matrix instead of a list. The sampling is done
# within the classes of the outcome passed as y.
train <- createDataPartition(y = c1$default, p = 0.7, list = FALSE)
# Rows listed in `train` form the training set; all remaining rows form the test set
training <- c1[train, ]
testing <- c1[-train, ]
glimpse(testing)
## Observations: 6,300
## Variables: 17
## $ ID <dbl> 10, 13, 14, 21, 22, 24, 26, 27, 34, 38, 39, 41, 57, ...
## $ LIMIT_BAL <dbl> 20000, 360000, 80000, 170000, 20000, 80000, 280000, ...
## $ SEX <fct> 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1...
## $ EDUCATION <fct> 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1...
## $ MARRIAGE <fct> 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 3...
## $ AGE <dbl> 36, 30, 30, 35, 50, 29, 26, 49, 40, 25, 30, 30, 24, ...
## $ PAY_PC1 <dbl> -2.4073889, 1.3401272, -6.0527516, -0.3933076, -9.70...
## $ PAY_PC2 <dbl> -1.83901815, 0.45707589, 1.44151821, 0.17555500, -3....
## $ PAY_PC3 <dbl> -0.415630041, -0.298341076, 1.320680779, 0.004885522...
## $ AMT_PC1 <dbl> -1.3135059, -0.9696446, 0.5172295, 2.2663186, -1.035...
## $ AMT_PC2 <dbl> -0.2005159, 0.9623496, -0.4638258, -0.5892800, -0.42...
## $ AMT_PC3 <dbl> -0.11698565, 0.45622277, -0.26057444, -0.05093772, -...
## $ AMT_PC4 <dbl> 1.471378e-03, -1.038874e+00, 1.104836e-01, 1.012649e...
## $ AMT_PC5 <dbl> -0.018313482, 0.115321882, 0.022706173, 0.009804274,...
## $ AMT_PC6 <dbl> -0.021475238, 0.278215504, -0.166555180, -0.18931989...
## $ AMT_PC7 <dbl> 0.072760274, -0.334222722, -0.062878249, -0.05035034...
## $ default <fct> 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
# Recode the target variable `default` (not gender) as a yes/no factor for caret:
# 1 becomes "yes", anything else becomes "no", with "yes" as the first level.
# NOTE(review): caret's class-probability output needs level names that are
# valid R names, and twoClassSummary treats the first level as the event.
# NOTE(review): glm(family = "binomial") treats the first factor level as
# "failure", so the glm fits below model the probability of "no" - confirm
# this is the intended direction when reading the coefficients.
training$default <- factor(ifelse(training$default == 1, "yes", "no"),
                           levels = c("yes", "no"))
testing$default <- factor(ifelse(testing$default == 1, "yes", "no"),
                          levels = c("yes", "no"))
# 5-fold cross-validation that keeps class probabilities and uses the
# two-class summary function; parallel execution is allowed
ctrl <- trainControl(method = "cv",
                     number = 5,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     allowParallel = TRUE)
# Improve the logistic model_1
# Second model: logistic regression without EDUCATION.
# NOTE(review): ID is a predictor here but is left out of Model_3 to Model_9;
# confirm it is meant to be in the formula.
Model_2 <- glm(
  default ~ ID + LIMIT_BAL + SEX + AGE + MARRIAGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_2)
##
## Call:
## glm(formula = default ~ ID + LIMIT_BAL + SEX + AGE + MARRIAGE +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial",
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4860 0.1899 0.5705 0.7528 3.0450
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.676e+00 7.129e-01 3.753 0.000174 ***
## ID 1.591e-07 2.376e-06 0.067 0.946613
## LIMIT_BAL 1.869e-06 2.154e-07 8.677 < 2e-16 ***
## SEX2 5.282e-03 4.239e-02 0.125 0.900841
## AGE -1.428e-02 2.463e-03 -5.797 6.76e-09 ***
## MARRIAGE1 -1.252e+00 7.050e-01 -1.776 0.075698 .
## MARRIAGE2 -1.147e+00 7.050e-01 -1.627 0.103822
## MARRIAGE3 -9.639e-01 7.280e-01 -1.324 0.185493
## PAY_PC1 2.935e-01 1.168e-02 25.128 < 2e-16 ***
## PAY_PC2 4.105e-01 2.388e-02 17.191 < 2e-16 ***
## PAY_PC3 -2.916e-01 3.037e-02 -9.600 < 2e-16 ***
## AMT_PC1 8.506e-02 1.220e-02 6.970 3.18e-12 ***
## AMT_PC2 1.753e-01 3.480e-02 5.037 4.73e-07 ***
## AMT_PC3 -5.074e-02 3.602e-02 -1.409 0.158899
## AMT_PC4 -4.429e-02 3.421e-02 -1.295 0.195476
## AMT_PC5 -3.422e-02 3.811e-02 -0.898 0.369171
## AMT_PC6 8.529e-02 3.648e-02 2.338 0.019382 *
## AMT_PC7 9.995e-03 4.945e-02 0.202 0.839813
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14527 on 14683 degrees of freedom
## AIC: 14563
##
## Number of Fisher Scoring iterations: 5
plot(Model_2) # Plot regression diagnostics
# AIC ~ 14563 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")
# Third model: logistic regression without MARRIAGE (ID is not included either)
Model_3 <- glm(
  default ~ LIMIT_BAL + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_3)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + SEX + AGE + EDUCATION + PAY_PC1 +
## PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 +
## AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4985 0.1762 0.5584 0.7423 2.9529
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.225e+01 1.172e+02 0.105 0.917
## LIMIT_BAL 1.503e-06 2.201e-07 6.826 8.75e-12 ***
## SEX2 3.856e-03 4.234e-02 0.091 0.927
## AGE -1.040e-02 2.290e-03 -4.539 5.64e-06 ***
## EDUCATION1 -1.072e+01 1.172e+02 -0.091 0.927
## EDUCATION2 -1.079e+01 1.172e+02 -0.092 0.927
## EDUCATION3 -1.128e+01 1.172e+02 -0.096 0.923
## EDUCATION4 -9.866e+00 1.172e+02 -0.084 0.933
## EDUCATION5 -9.464e+00 1.172e+02 -0.081 0.936
## EDUCATION6 -1.025e+01 1.172e+02 -0.087 0.930
## PAY_PC1 2.891e-01 1.173e-02 24.642 < 2e-16 ***
## PAY_PC2 4.124e-01 2.399e-02 17.195 < 2e-16 ***
## PAY_PC3 -2.976e-01 3.054e-02 -9.746 < 2e-16 ***
## AMT_PC1 8.373e-02 1.223e-02 6.849 7.46e-12 ***
## AMT_PC2 1.783e-01 3.488e-02 5.112 3.19e-07 ***
## AMT_PC3 -5.322e-02 3.594e-02 -1.481 0.139
## AMT_PC4 -4.672e-02 3.415e-02 -1.368 0.171
## AMT_PC5 -2.992e-02 3.793e-02 -0.789 0.430
## AMT_PC6 8.926e-02 3.631e-02 2.459 0.014 *
## AMT_PC7 1.601e-02 4.919e-02 0.326 0.745
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14416 on 14681 degrees of freedom
## AIC: 14456
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14456 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")
# Fourth model: logistic regression without SEX
Model_4 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_4)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + AGE + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial",
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5108 0.1775 0.5585 0.7411 2.9577
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.375e+01 1.174e+02 0.117 0.906775
## LIMIT_BAL 1.556e-06 2.213e-07 7.033 2.02e-12 ***
## MARRIAGE1 -1.606e+00 7.097e-01 -2.263 0.023648 *
## MARRIAGE2 -1.518e+00 7.097e-01 -2.139 0.032474 *
## MARRIAGE3 -1.298e+00 7.327e-01 -1.771 0.076565 .
## AGE -8.741e-03 2.527e-03 -3.460 0.000541 ***
## EDUCATION1 -1.073e+01 1.174e+02 -0.091 0.927149
## EDUCATION2 -1.079e+01 1.174e+02 -0.092 0.926747
## EDUCATION3 -1.129e+01 1.174e+02 -0.096 0.923408
## EDUCATION4 -9.873e+00 1.174e+02 -0.084 0.932974
## EDUCATION5 -9.461e+00 1.174e+02 -0.081 0.935764
## EDUCATION6 -1.025e+01 1.174e+02 -0.087 0.930403
## PAY_PC1 2.893e-01 1.172e-02 24.678 < 2e-16 ***
## PAY_PC2 4.127e-01 2.400e-02 17.194 < 2e-16 ***
## PAY_PC3 -2.976e-01 3.055e-02 -9.740 < 2e-16 ***
## AMT_PC1 8.373e-02 1.224e-02 6.841 7.86e-12 ***
## AMT_PC2 1.777e-01 3.488e-02 5.096 3.47e-07 ***
## AMT_PC3 -5.370e-02 3.597e-02 -1.493 0.135461
## AMT_PC4 -4.616e-02 3.417e-02 -1.351 0.176722
## AMT_PC5 -2.935e-02 3.794e-02 -0.774 0.439146
## AMT_PC6 8.966e-02 3.630e-02 2.470 0.013516 *
## AMT_PC7 1.543e-02 4.917e-02 0.314 0.753628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14404 on 14679 degrees of freedom
## AIC: 14448
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14448 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")
# Fifth model: logistic regression without SEX and EDUCATION
Model_5 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + AGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_5)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + AGE + PAY_PC1 +
## PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 +
## AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4846 0.1898 0.5704 0.7530 3.0463
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.683e+00 7.109e-01 3.773 0.000161 ***
## LIMIT_BAL 1.870e-06 2.153e-07 8.689 < 2e-16 ***
## MARRIAGE1 -1.252e+00 7.048e-01 -1.776 0.075727 .
## MARRIAGE2 -1.147e+00 7.048e-01 -1.627 0.103676
## MARRIAGE3 -9.635e-01 7.278e-01 -1.324 0.185561
## AGE -1.432e-02 2.440e-03 -5.868 4.40e-09 ***
## PAY_PC1 2.935e-01 1.167e-02 25.158 < 2e-16 ***
## PAY_PC2 4.106e-01 2.388e-02 17.195 < 2e-16 ***
## PAY_PC3 -2.916e-01 3.037e-02 -9.599 < 2e-16 ***
## AMT_PC1 8.503e-02 1.220e-02 6.969 3.19e-12 ***
## AMT_PC2 1.753e-01 3.480e-02 5.036 4.75e-07 ***
## AMT_PC3 -5.069e-02 3.602e-02 -1.407 0.159315
## AMT_PC4 -4.425e-02 3.421e-02 -1.293 0.195873
## AMT_PC5 -3.421e-02 3.812e-02 -0.897 0.369502
## AMT_PC6 8.530e-02 3.648e-02 2.338 0.019382 *
## AMT_PC7 1.001e-02 4.945e-02 0.202 0.839583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14527 on 14685 degrees of freedom
## AIC: 14559
##
## Number of Fisher Scoring iterations: 5
# AIC ~ 14559 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")
# Sixth model: logistic regression without AMT_PC3
Model_6 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_6)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC4 +
## AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4252 0.1879 0.5585 0.7410 2.9599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.374e+01 1.174e+02 0.117 0.906895
## LIMIT_BAL 1.545e-06 2.214e-07 6.980 2.95e-12 ***
## MARRIAGE1 -1.605e+00 7.101e-01 -2.261 0.023767 *
## MARRIAGE2 -1.516e+00 7.101e-01 -2.135 0.032736 *
## MARRIAGE3 -1.297e+00 7.331e-01 -1.769 0.076838 .
## SEX2 1.122e-02 4.259e-02 0.263 0.792195
## AGE -8.650e-03 2.549e-03 -3.394 0.000689 ***
## EDUCATION1 -1.073e+01 1.174e+02 -0.091 0.927190
## EDUCATION2 -1.079e+01 1.174e+02 -0.092 0.926784
## EDUCATION3 -1.128e+01 1.174e+02 -0.096 0.923454
## EDUCATION4 -9.869e+00 1.174e+02 -0.084 0.933031
## EDUCATION5 -9.458e+00 1.174e+02 -0.081 0.935813
## EDUCATION6 -1.025e+01 1.174e+02 -0.087 0.930451
## PAY_PC1 2.896e-01 1.173e-02 24.689 < 2e-16 ***
## PAY_PC2 4.162e-01 2.386e-02 17.447 < 2e-16 ***
## PAY_PC3 -2.964e-01 3.053e-02 -9.710 < 2e-16 ***
## AMT_PC1 8.158e-02 1.206e-02 6.765 1.33e-11 ***
## AMT_PC2 1.641e-01 3.270e-02 5.019 5.19e-07 ***
## AMT_PC4 -4.059e-02 3.426e-02 -1.184 0.236224
## AMT_PC5 -2.056e-02 3.792e-02 -0.542 0.587671
## AMT_PC6 8.194e-02 3.478e-02 2.356 0.018483 *
## AMT_PC7 1.610e-02 4.598e-02 0.350 0.726268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14406 on 14679 degrees of freedom
## AIC: 14450
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14450 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")
# Seventh model: logistic regression without AMT_PC4
Model_7 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_7)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3926 0.1821 0.5593 0.7408 2.9553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.374e+01 1.174e+02 0.117 0.906825
## LIMIT_BAL 1.558e-06 2.215e-07 7.035 2.00e-12 ***
## MARRIAGE1 -1.606e+00 7.100e-01 -2.263 0.023665 *
## MARRIAGE2 -1.516e+00 7.100e-01 -2.136 0.032710 *
## MARRIAGE3 -1.297e+00 7.330e-01 -1.770 0.076706 .
## SEX2 1.152e-02 4.260e-02 0.271 0.786735
## AGE -8.665e-03 2.548e-03 -3.401 0.000673 ***
## EDUCATION1 -1.073e+01 1.174e+02 -0.091 0.927126
## EDUCATION2 -1.079e+01 1.174e+02 -0.092 0.926725
## EDUCATION3 -1.129e+01 1.174e+02 -0.096 0.923388
## EDUCATION4 -9.875e+00 1.174e+02 -0.084 0.932944
## EDUCATION5 -9.460e+00 1.174e+02 -0.081 0.935756
## EDUCATION6 -1.025e+01 1.174e+02 -0.087 0.930426
## PAY_PC1 2.895e-01 1.174e-02 24.664 < 2e-16 ***
## PAY_PC2 4.126e-01 2.400e-02 17.195 < 2e-16 ***
## PAY_PC3 -2.996e-01 3.053e-02 -9.813 < 2e-16 ***
## AMT_PC1 8.311e-02 1.220e-02 6.811 9.67e-12 ***
## AMT_PC2 1.725e-01 3.421e-02 5.041 4.63e-07 ***
## AMT_PC3 -4.749e-02 3.523e-02 -1.348 0.177701
## AMT_PC5 -2.059e-02 3.684e-02 -0.559 0.576230
## AMT_PC6 8.804e-02 3.727e-02 2.362 0.018183 *
## AMT_PC7 1.606e-02 4.736e-02 0.339 0.734468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14405 on 14679 degrees of freedom
## AIC: 14449
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14449 (per the recorded summary), p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL", "MARRIAGE1", "MARRIAGE2")
# Eighth model: logistic regression without PAY_PC1
# (AMT_PC5 sits right after EDUCATION in the formula, which sets the coefficient order)
Model_8 <- glm(
  default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    AMT_PC5 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
summary(Model_8)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
## AMT_PC5 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7771 0.1444 0.6004 0.7890 1.7788
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.334e+01 1.191e+02 0.112 0.91082
## LIMIT_BAL 3.599e-06 2.140e-07 16.819 < 2e-16 ***
## MARRIAGE1 -1.373e+00 6.499e-01 -2.113 0.03462 *
## MARRIAGE2 -1.277e+00 6.499e-01 -1.965 0.04944 *
## MARRIAGE3 -1.048e+00 6.737e-01 -1.556 0.11981
## SEX2 6.445e-02 4.134e-02 1.559 0.11899
## AGE -6.624e-03 2.470e-03 -2.682 0.00732 **
## EDUCATION1 -1.101e+01 1.191e+02 -0.092 0.92635
## EDUCATION2 -1.114e+01 1.191e+02 -0.094 0.92549
## EDUCATION3 -1.162e+01 1.191e+02 -0.098 0.92228
## EDUCATION4 -9.992e+00 1.191e+02 -0.084 0.93314
## EDUCATION5 -9.632e+00 1.191e+02 -0.081 0.93554
## EDUCATION6 -1.034e+01 1.191e+02 -0.087 0.93081
## AMT_PC5 -5.369e-02 3.946e-02 -1.361 0.17362
## PAY_PC2 3.649e-01 2.349e-02 15.534 < 2e-16 ***
## PAY_PC3 -2.677e-01 3.055e-02 -8.762 < 2e-16 ***
## AMT_PC1 -8.076e-03 1.142e-02 -0.707 0.47944
## AMT_PC2 2.981e-01 3.684e-02 8.092 5.89e-16 ***
## AMT_PC3 -8.131e-02 3.782e-02 -2.150 0.03155 *
## AMT_PC4 -6.192e-02 3.505e-02 -1.766 0.07734 .
## AMT_PC6 1.120e-01 3.714e-02 3.015 0.00257 **
## AMT_PC7 4.770e-02 5.400e-02 0.883 0.37703
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 15061 on 14679 degrees of freedom
## AIC: 15105
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 15172, p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")
# Ninth model - remove AMT_PC5.
# Fix: the formula used to keep AMT_PC5 and leave out AMT_PC7 instead, so the
# fit did not correspond to the experiment named here. It now contains every
# other predictor used by the sibling models and omits AMT_PC5 only.
# NOTE(review): the summary output recorded below this chunk was produced by
# the old formula and must be regenerated (re-knit) before it is relied on.
Model_9 <- glm(
  formula = default ~
    LIMIT_BAL + MARRIAGE + SEX + EDUCATION + AGE +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC6 + AMT_PC7,
  data = training,
  family = "binomial"
)
# Print the coefficient table, deviances and AIC for this fit.
summary(Model_9)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + EDUCATION +
## AGE + PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4679 0.1760 0.5588 0.7411 2.9552
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.374e+01 1.174e+02 0.117 0.90683
## LIMIT_BAL 1.553e-06 2.214e-07 7.012 2.34e-12 ***
## MARRIAGE1 -1.608e+00 7.100e-01 -2.264 0.02356 *
## MARRIAGE2 -1.518e+00 7.100e-01 -2.138 0.03252 *
## MARRIAGE3 -1.299e+00 7.329e-01 -1.772 0.07632 .
## SEX2 1.202e-02 4.260e-02 0.282 0.77784
## EDUCATION1 -1.073e+01 1.174e+02 -0.091 0.92715
## EDUCATION2 -1.079e+01 1.174e+02 -0.092 0.92675
## EDUCATION3 -1.129e+01 1.174e+02 -0.096 0.92341
## EDUCATION4 -9.871e+00 1.174e+02 -0.084 0.93298
## EDUCATION5 -9.461e+00 1.174e+02 -0.081 0.93576
## EDUCATION6 -1.025e+01 1.174e+02 -0.087 0.93041
## AGE -8.649e-03 2.549e-03 -3.394 0.00069 ***
## PAY_PC1 2.893e-01 1.173e-02 24.654 < 2e-16 ***
## PAY_PC2 4.126e-01 2.401e-02 17.187 < 2e-16 ***
## PAY_PC3 -2.983e-01 3.049e-02 -9.785 < 2e-16 ***
## AMT_PC1 8.397e-02 1.224e-02 6.862 6.81e-12 ***
## AMT_PC2 1.791e-01 3.475e-02 5.154 2.55e-07 ***
## AMT_PC3 -5.400e-02 3.590e-02 -1.504 0.13260
## AMT_PC4 -4.650e-02 3.420e-02 -1.360 0.17392
## AMT_PC6 9.097e-02 3.618e-02 2.515 0.01192 *
## AMT_PC5 -3.233e-02 3.703e-02 -0.873 0.38259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14404 on 14679 degrees of freedom
## AIC: 14448
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14480, p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE", "LIMIT_BAL")
# Tenth model - remove LIMIT_BAL.
# Fix: the formula used to list AMT_PC5 twice (the duplicate term is ignored by
# R) and to leave out AMT_PC7 as well as LIMIT_BAL, so two predictors were
# dropped rather than one and the AIC was not comparable with the other
# single-removal fits. Only LIMIT_BAL is omitted now.
# NOTE(review): the summary output recorded below this chunk was produced by
# the old formula and must be regenerated (re-knit) before it is relied on.
Model_10 <- glm(
  formula = default ~
    MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  data = training,
  family = "binomial"
)
# Print the coefficient table, deviances and AIC for this fit.
summary(Model_10)
##
## Call:
## glm(formula = default ~ AMT_PC5 + MARRIAGE + SEX + AGE + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6469 0.1624 0.5688 0.7306 3.0133
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.951692 117.826705 0.118 0.90574
## AMT_PC5 -0.030944 0.037867 -0.817 0.41383
## MARRIAGE1 -1.610387 0.716234 -2.248 0.02455 *
## MARRIAGE2 -1.548956 0.716258 -2.163 0.03057 *
## MARRIAGE3 -1.382453 0.739007 -1.871 0.06139 .
## SEX2 0.024767 0.042476 0.583 0.55983
## AGE -0.006925 0.002544 -2.722 0.00649 **
## EDUCATION1 -10.693516 117.824492 -0.091 0.92768
## EDUCATION2 -10.824782 117.824491 -0.092 0.92680
## EDUCATION3 -11.349599 117.824495 -0.096 0.92326
## EDUCATION4 -9.830571 117.825660 -0.083 0.93351
## EDUCATION5 -9.517344 117.824986 -0.081 0.93562
## EDUCATION6 -10.375467 117.825788 -0.088 0.92983
## PAY_PC1 0.318343 0.011082 28.727 < 2e-16 ***
## PAY_PC2 0.417341 0.024020 17.375 < 2e-16 ***
## PAY_PC3 -0.299428 0.030379 -9.856 < 2e-16 ***
## AMT_PC1 0.124136 0.010916 11.372 < 2e-16 ***
## AMT_PC2 0.217815 0.035020 6.220 4.98e-10 ***
## AMT_PC3 -0.047997 0.036694 -1.308 0.19086
## AMT_PC4 -0.050882 0.034825 -1.461 0.14399
## AMT_PC6 0.093999 0.037085 2.535 0.01126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14455 on 14680 degrees of freedom
## AIC: 14497
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14525, p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "AGE")
# Eleventh model - remove AGE.
# Fix: the formula used to omit LIMIT_BAL and AMT_PC7 in addition to AGE, and
# listed AMT_PC5 twice (the duplicate term is ignored by R), so the fit did not
# isolate the effect of dropping AGE. Only AGE is omitted now.
# NOTE(review): the summary output recorded below this chunk was produced by
# the old formula and must be regenerated (re-knit) before it is relied on.
Model_11 <- glm(
  formula = default ~
    LIMIT_BAL + MARRIAGE + SEX + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  data = training,
  family = "binomial"
)
# Print the coefficient table, deviances and AIC for this fit.
summary(Model_11)
##
## Call:
## glm(formula = default ~ AMT_PC5 + MARRIAGE + SEX + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC4 + AMT_PC6 + AMT_PC5, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.6443 0.1631 0.5674 0.7327 2.9584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.68418 118.21571 0.116 0.9078
## AMT_PC5 -0.03086 0.03781 -0.816 0.4144
## MARRIAGE1 -1.65017 0.71854 -2.297 0.0216 *
## MARRIAGE2 -1.53231 0.71867 -2.132 0.0330 *
## MARRIAGE3 -1.44360 0.74096 -1.948 0.0514 .
## SEX2 0.03932 0.04212 0.933 0.3506
## EDUCATION1 -10.66708 118.21353 -0.090 0.9281
## EDUCATION2 -10.79545 118.21353 -0.091 0.9272
## EDUCATION3 -11.35418 118.21354 -0.096 0.9235
## EDUCATION4 -9.78222 118.21470 -0.083 0.9341
## EDUCATION5 -9.49354 118.21402 -0.080 0.9360
## EDUCATION6 -10.40621 118.21482 -0.088 0.9299
## PAY_PC1 0.31626 0.01105 28.631 < 2e-16 ***
## PAY_PC2 0.41720 0.02400 17.382 < 2e-16 ***
## PAY_PC3 -0.29943 0.03038 -9.856 < 2e-16 ***
## AMT_PC1 0.12219 0.01089 11.218 < 2e-16 ***
## AMT_PC2 0.21804 0.03501 6.228 4.74e-10 ***
## AMT_PC3 -0.04829 0.03667 -1.317 0.1878
## AMT_PC4 -0.05131 0.03476 -1.476 0.1399
## AMT_PC6 0.09384 0.03704 2.533 0.0113 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14462 on 14681 degrees of freedom
## AIC: 14502
##
## Number of Fisher Scoring iterations: 11
# AIC ~ 14529, p-values < 0.05 ("AMT_PC6", "AMT_PC1", "AMT_PC2", "PAY_PC1", "PAY_PC2", "PAY_PC3", "LIMIT_BAL")
# Let's stick with Model_7 (AMT_PC4 removed) because it produced the lowest AIC of 14480
# Print the estimated coefficients of the fitted Model_7.
# NOTE(review): the remark that used to trail this call attributed a degree of
# multicollinearity to "negative coefficients" in AMT_PC1, AMT_PC2, AMT_PC6,
# PAY_PC1, PAY_PC2 and LIMIT_BAL. Those estimates are all positive in the
# output printed below, and the sign of a coefficient is not by itself evidence
# of multicollinearity - TODO confirm what was meant. The pair plot and
# correlation matrices further down are the actual check.
coef(Model_7)
## (Intercept) LIMIT_BAL MARRIAGE1 MARRIAGE2 MARRIAGE3
## 1.373597e+01 1.557942e-06 -1.606421e+00 -1.516401e+00 -1.297488e+00
## SEX2 AGE EDUCATION1 EDUCATION2 EDUCATION3
## 1.152448e-02 -8.664856e-03 -1.073343e+01 -1.079275e+01 -1.128569e+01
## EDUCATION4 EDUCATION5 EDUCATION6 PAY_PC1 PAY_PC2
## -9.874501e+00 -9.459511e+00 -1.024625e+01 2.894638e-01 4.126059e-01
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC5
## -2.996103e-01 8.310676e-02 1.724569e-01 -4.749015e-02 -2.058903e-02
## AMT_PC6 AMT_PC7
## 8.803751e-02 1.606383e-02
# Multicollinearity check: take the numeric predictors of interest from `c1`,
# in the order listed, and inspect the resulting columns.
multi <- c1[
  ,
  c(
    "PAY_PC1", "AGE", "PAY_PC3", "PAY_PC2",
    "AMT_PC6", "AMT_PC3", "AMT_PC5", "AMT_PC7",
    "AMT_PC1", "AMT_PC2"
  )
]
glimpse(multi)
## Observations: 21,001
## Variables: 10
## $ PAY_PC1 <dbl> 1.44929361, -1.80136756, -0.39330764, -0.39330764, -0....
## $ AGE <dbl> 33, 35, 25, 28, 33, 36, 44, 42, 30, 30, 35, 23, 43, 27...
## $ PAY_PC3 <dbl> -0.362878237, -0.264508711, 0.004885522, 0.004885522, ...
## $ PAY_PC2 <dbl> -1.2723509, 0.7679557, 0.1755550, 0.1755550, -0.349252...
## $ AMT_PC6 <dbl> -0.696090967, 0.159330316, 0.046065937, -0.044970627, ...
## $ AMT_PC3 <dbl> 6.31417677, -0.57493602, -0.10406594, -0.04354691, -0....
## $ AMT_PC5 <dbl> 0.228894439, 0.256636062, -0.013829394, -0.024157751, ...
## $ AMT_PC7 <dbl> 0.903658132, 0.488583491, -0.017876349, -0.080402170, ...
## $ AMT_PC1 <dbl> 2.26658679, 4.12474891, -0.74816346, 1.13057727, -1.71...
## $ AMT_PC2 <dbl> 2.64796345, -1.12245175, -0.37413651, -0.64517001, -0....
# Scatterplot matrix of every pair of the selected predictors.
pairs(x = multi)
# Correlation matrix, computed a few ways. A coefficient measures the strength
# of the relationship between two numeric variables: 0 = weak, 1 = strong
# positive relationship, -1 = strong negative relationship.
cor(x = multi)
## PAY_PC1 AGE PAY_PC3 PAY_PC2 AMT_PC6
## PAY_PC1 1.000000000 0.059283535 -0.0013301821 0.0059243526 -0.006505793
## AGE 0.059283535 1.000000000 0.0040853707 -0.0028862890 0.002564926
## PAY_PC3 -0.001330182 0.004085371 1.0000000000 -0.0006602639 -0.030816042
## PAY_PC2 0.005924353 -0.002886289 -0.0006602639 1.0000000000 -0.006054684
## AMT_PC6 -0.006505793 0.002564926 -0.0308160421 -0.0060546844 1.000000000
## AMT_PC3 0.037109079 0.004522907 -0.0328408230 -0.0541293031 0.014001444
## AMT_PC5 0.010311492 0.002824440 0.0065640818 -0.0100275635 0.044710483
## AMT_PC7 0.015145512 -0.001968249 -0.0597155232 0.0002993426 0.025348705
## AMT_PC1 -0.252161806 0.061652070 0.0176827839 0.0413118350 -0.015989877
## AMT_PC2 0.243521986 0.014731858 -0.0070392614 0.1205176373 -0.055491388
## AMT_PC3 AMT_PC5 AMT_PC7 AMT_PC1 AMT_PC2
## PAY_PC1 0.037109079 0.010311492 0.0151455119 -0.252161806 0.243521986
## AGE 0.004522907 0.002824440 -0.0019682489 0.061652070 0.014731858
## PAY_PC3 -0.032840823 0.006564082 -0.0597155232 0.017682784 -0.007039261
## PAY_PC2 -0.054129303 -0.010027564 0.0002993426 0.041311835 0.120517637
## AMT_PC6 0.014001444 0.044710483 0.0253487052 -0.015989877 -0.055491388
## AMT_PC3 1.000000000 -0.042850333 0.0258501353 0.028970051 0.141786762
## AMT_PC5 -0.042850333 1.000000000 -0.0528917823 0.012664498 0.067606503
## AMT_PC7 0.025850135 -0.052891782 1.0000000000 0.008783861 0.056185635
## AMT_PC1 0.028970051 0.012664498 0.0087838608 1.000000000 -0.044748851
## AMT_PC2 0.141786762 0.067606503 0.0561856349 -0.044748851 1.000000000
# Pearson correlation with the method spelled out; the matrix printed below is
# identical to the one printed by the plain cor(multi) call above.
cor(x = multi, method = "pearson")
## PAY_PC1 AGE PAY_PC3 PAY_PC2 AMT_PC6
## PAY_PC1 1.000000000 0.059283535 -0.0013301821 0.0059243526 -0.006505793
## AGE 0.059283535 1.000000000 0.0040853707 -0.0028862890 0.002564926
## PAY_PC3 -0.001330182 0.004085371 1.0000000000 -0.0006602639 -0.030816042
## PAY_PC2 0.005924353 -0.002886289 -0.0006602639 1.0000000000 -0.006054684
## AMT_PC6 -0.006505793 0.002564926 -0.0308160421 -0.0060546844 1.000000000
## AMT_PC3 0.037109079 0.004522907 -0.0328408230 -0.0541293031 0.014001444
## AMT_PC5 0.010311492 0.002824440 0.0065640818 -0.0100275635 0.044710483
## AMT_PC7 0.015145512 -0.001968249 -0.0597155232 0.0002993426 0.025348705
## AMT_PC1 -0.252161806 0.061652070 0.0176827839 0.0413118350 -0.015989877
## AMT_PC2 0.243521986 0.014731858 -0.0070392614 0.1205176373 -0.055491388
## AMT_PC3 AMT_PC5 AMT_PC7 AMT_PC1 AMT_PC2
## PAY_PC1 0.037109079 0.010311492 0.0151455119 -0.252161806 0.243521986
## AGE 0.004522907 0.002824440 -0.0019682489 0.061652070 0.014731858
## PAY_PC3 -0.032840823 0.006564082 -0.0597155232 0.017682784 -0.007039261
## PAY_PC2 -0.054129303 -0.010027564 0.0002993426 0.041311835 0.120517637
## AMT_PC6 0.014001444 0.044710483 0.0253487052 -0.015989877 -0.055491388
## AMT_PC3 1.000000000 -0.042850333 0.0258501353 0.028970051 0.141786762
## AMT_PC5 -0.042850333 1.000000000 -0.0528917823 0.012664498 0.067606503
## AMT_PC7 0.025850135 -0.052891782 1.0000000000 0.008783861 0.056185635
## AMT_PC1 0.028970051 0.012664498 0.0087838608 1.000000000 -0.044748851
## AMT_PC2 0.141786762 0.067606503 0.0561856349 -0.044748851 1.000000000
# Spearman correlation of the same columns, for comparison with Pearson.
cor(x = multi, method = "spearman")
## PAY_PC1 AGE PAY_PC3 PAY_PC2 AMT_PC6
## PAY_PC1 1.000000000 0.087211938 -0.13277136 0.134995638 0.002073015
## AGE 0.087211938 1.000000000 -0.01316903 0.024629257 0.012307570
## PAY_PC3 -0.132771359 -0.013169030 1.00000000 -0.097668754 -0.085168355
## PAY_PC2 0.134995638 0.024629257 -0.09766875 1.000000000 0.035274819
## AMT_PC6 0.002073015 0.012307570 -0.08516835 0.035274819 1.000000000
## AMT_PC3 0.048972803 0.015561667 -0.03898459 -0.083087619 -0.149867410
## AMT_PC5 0.037181569 0.005468656 0.05099078 -0.027121908 -0.041056891
## AMT_PC7 0.048847388 -0.001202342 -0.19783491 -0.031113969 0.105356551
## AMT_PC1 -0.499905544 0.019019580 0.10023535 -0.003800802 0.081385027
## AMT_PC2 0.499613066 0.002250097 -0.10844990 0.207400725 -0.012594602
## AMT_PC3 AMT_PC5 AMT_PC7 AMT_PC1 AMT_PC2
## PAY_PC1 0.04897280 0.037181569 0.048847388 -0.499905544 0.499613066
## AGE 0.01556167 0.005468656 -0.001202342 0.019019580 0.002250097
## PAY_PC3 -0.03898459 0.050990778 -0.197834913 0.100235351 -0.108449897
## PAY_PC2 -0.08308762 -0.027121908 -0.031113969 -0.003800802 0.207400725
## AMT_PC6 -0.14986741 -0.041056891 0.105356551 0.081385027 -0.012594602
## AMT_PC3 1.00000000 0.165300058 -0.097920284 -0.019170573 0.103695505
## AMT_PC5 0.16530006 1.000000000 -0.197423322 -0.018757471 0.044378508
## AMT_PC7 -0.09792028 -0.197423322 1.000000000 -0.043584598 0.007449202
## AMT_PC1 -0.01917057 -0.018757471 -0.043584598 1.000000000 -0.403230353
## AMT_PC2 0.10369550 0.044378508 0.007449202 -0.403230353 1.000000000
# Probabilities and predictions - Model 7.
# Refit the chosen logistic regression (the variant without AMT_PC4, picked
# for its AIC) so the predictions below are made from it.
Model_7 <- glm(
  formula = default ~
    LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
    PAY_PC1 + PAY_PC2 + PAY_PC3 +
    AMT_PC1 + AMT_PC2 + AMT_PC3 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  family = "binomial",
  data = training
)
# Regression coefficients, deviances and AIC of the refit.
summary(Model_7)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + MARRIAGE + SEX + AGE + EDUCATION +
## PAY_PC1 + PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
## AMT_PC5 + AMT_PC6 + AMT_PC7, family = "binomial", data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3926 0.1821 0.5593 0.7408 2.9553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.374e+01 1.174e+02 0.117 0.906825
## LIMIT_BAL 1.558e-06 2.215e-07 7.035 2.00e-12 ***
## MARRIAGE1 -1.606e+00 7.100e-01 -2.263 0.023665 *
## MARRIAGE2 -1.516e+00 7.100e-01 -2.136 0.032710 *
## MARRIAGE3 -1.297e+00 7.330e-01 -1.770 0.076706 .
## SEX2 1.152e-02 4.260e-02 0.271 0.786735
## AGE -8.665e-03 2.548e-03 -3.401 0.000673 ***
## EDUCATION1 -1.073e+01 1.174e+02 -0.091 0.927126
## EDUCATION2 -1.079e+01 1.174e+02 -0.092 0.926725
## EDUCATION3 -1.129e+01 1.174e+02 -0.096 0.923388
## EDUCATION4 -9.875e+00 1.174e+02 -0.084 0.932944
## EDUCATION5 -9.460e+00 1.174e+02 -0.081 0.935756
## EDUCATION6 -1.025e+01 1.174e+02 -0.087 0.930426
## PAY_PC1 2.895e-01 1.174e-02 24.664 < 2e-16 ***
## PAY_PC2 4.126e-01 2.400e-02 17.195 < 2e-16 ***
## PAY_PC3 -2.996e-01 3.053e-02 -9.813 < 2e-16 ***
## AMT_PC1 8.311e-02 1.220e-02 6.811 9.67e-12 ***
## AMT_PC2 1.725e-01 3.421e-02 5.041 4.63e-07 ***
## AMT_PC3 -4.749e-02 3.523e-02 -1.348 0.177701
## AMT_PC5 -2.059e-02 3.684e-02 -0.559 0.576230
## AMT_PC6 8.804e-02 3.727e-02 2.362 0.018183 *
## AMT_PC7 1.606e-02 4.736e-02 0.339 0.734468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16400 on 14700 degrees of freedom
## Residual deviance: 14405 on 14679 degrees of freedom
## AIC: 14449
##
## Number of Fisher Scoring iterations: 11
# Score the `testing` data with Model_7 and store the result in a new
# `probability` column. type = "response" asks predict() for values on the
# response (probability) scale.
testing$probability <- predict(
  object = Model_7,
  newdata = testing,
  type = "response"
)
# Assume that the optimum probability threshold is 0.5.
# Create the class prediction: rows whose predicted probability reaches the
# threshold are labelled "yes", all other rows "no".
# Fix: the column used to be initialised to "yes" and then overwritten with
# "yes" at or above the threshold, so every row was predicted "yes" no matter
# what its probability was.
# NOTE(review): this presumes "yes" is the outcome whose probability Model_7
# estimates (the second level of `default`) - TODO confirm the factor coding.
# NOTE(review): the hand-entered confusion counts later in this script
# (TN = FN = 0) describe the old all-"yes" output and need refreshing.
testing$prediction <- ifelse(testing$probability >= 0.5, "yes", "no")
# Have a look at the data, which now carries the predicted `probability` and
# the derived `prediction` columns.
head(testing)
## # A tibble: 6 x 19
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 10 20000 1 2 1 36 -2.41 -1.84 -0.416
## 2 13 360000 1 1 2 30 1.34 0.457 -0.298
## 3 14 80000 2 2 2 30 -6.05 1.44 1.32
## 4 21 170000 2 2 2 35 -0.393 0.176 0.00489
## 5 22 20000 2 2 1 50 -9.70 -3.82 0.310
## 6 24 80000 2 1 2 29 1.08 -0.906 0.914
## # ... with 10 more variables: AMT_PC1 <dbl>, AMT_PC2 <dbl>, AMT_PC3 <dbl>,
## # AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>,
## # default <fct>, probability <dbl>, prediction <chr>
# Persist the scored rows as a CSV file.
# NOTE(review): `testing` already carries a `probability` column, so binding
# testing$probability again stores the same values a second time - confirm
# whether the extra column is wanted.
trainingdata_predictions_logistic <- data.frame(
  testing,
  testing$probability
)
write.csv(
  x = trainingdata_predictions_logistic,
  file = "trainingdata_predictions_logistic.csv"
)
# Evaluation Model_7 logistic on training data
# Create a confusion matrix (along with other measures) using the
# function 'confusionMatrix' from the caret package
library(e1071)
# Confusion matrix - two-class worked example.
# NOTE(review): the counts below are literal illustrative numbers, not values
# derived from `testing`; the call on the real predictions is left disabled:
# confusionMatrix(data = testing$prediction,testing$default, mode = "everything")
lvs <- c("yes", "no")
# Reference labels: 86 "yes" followed by 258 "no"; factor levels ordered
# "no", "yes".
truth <- factor(
  c(rep("yes", 86), rep("no", 258)),
  levels = c("no", "yes")
)
# Predicted labels, aligned with `truth`: the first 86 entries split into
# 54 "yes" / 32 "no", the remaining 258 into 27 "yes" / 231 "no".
pred <- factor(
  c(rep("yes", 54), rep("no", 32), rep("yes", 27), rep("no", 231)),
  levels = c("no", "yes")
)
# Cross-tabulate predictions (rows) against the reference labels (columns).
xtab <- table(pred, truth)
# Confusion-matrix statistics computed from the cross-tabulated example.
confusionMatrix(data = xtab)
## Confusion Matrix and Statistics
##
## truth
## pred no yes
## no 231 32
## yes 27 54
##
## Accuracy : 0.8285
## 95% CI : (0.7844, 0.8668)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.0003097
##
## Kappa : 0.5336
##
## Mcnemar's Test P-Value : 0.6025370
##
## Sensitivity : 0.8953
## Specificity : 0.6279
## Pos Pred Value : 0.8783
## Neg Pred Value : 0.6667
## Prevalence : 0.7500
## Detection Rate : 0.6715
## Detection Prevalence : 0.7645
## Balanced Accuracy : 0.7616
##
## 'Positive' Class : no
##
# Same statistics, this time from the two factors instead of the table.
confusionMatrix(data = pred, reference = truth)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 231 32
## yes 27 54
##
## Accuracy : 0.8285
## 95% CI : (0.7844, 0.8668)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.0003097
##
## Kappa : 0.5336
##
## Mcnemar's Test P-Value : 0.6025370
##
## Sensitivity : 0.8953
## Specificity : 0.6279
## Pos Pred Value : 0.8783
## Neg Pred Value : 0.6667
## Prevalence : 0.7500
## Detection Rate : 0.6715
## Detection Prevalence : 0.7645
## Balanced Accuracy : 0.7616
##
## 'Positive' Class : no
##
# Same table with the prevalence supplied as 0.25; in the output below the
# predictive values and the reported prevalence change accordingly.
confusionMatrix(data = xtab, prevalence = 0.25)
## Confusion Matrix and Statistics
##
## truth
## pred no yes
## no 231 32
## yes 27 54
##
## Accuracy : 0.8285
## 95% CI : (0.7844, 0.8668)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.0003097
##
## Kappa : 0.5336
##
## Mcnemar's Test P-Value : 0.6025370
##
## Sensitivity : 0.8953
## Specificity : 0.6279
## Pos Pred Value : 0.4451
## Neg Pred Value : 0.9474
## Prevalence : 0.2500
## Detection Rate : 0.6715
## Detection Prevalence : 0.7645
## Balanced Accuracy : 0.7616
##
## 'Positive' Class : no
##
# Classification metrics from hand-entered confusion counts.
# TN and FN are both 0, i.e. these counts describe a run in which every row
# was predicted positive; update the four counts if the predictions change.
# Fix: the derived metrics are now computed from the counts and from each
# other instead of from retyped, rounded literals, which removes the rounding
# error that used to leak into F1.
tp <- 1549
tn <- 0
fp <- 4751
fn <- 0
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
acc <- (tp + tn) / (tp + tn + fp + fn) # accuracy of 24.59%
acc
## [1] 0.245873
# Error rate = 1 - accuracy = 75.41%
training_error <- 1 - acc
training_error
## [1] 0.754127
# Precision = TP / (TP + FP)
prec <- tp / (tp + fp) # precision is 24.59%
prec
## [1] 0.245873
# Recall = TP / (TP + FN)
rec <- tp / (tp + fn) # recall is 100%
rec
## [1] 1
# F1 = 2 * (precision * recall) / (precision + recall)
F1 <- 2 * (prec * rec) / (prec + rec) # F1 is 39.47%
F1
## [1] 0.3947
# Summary statistics for the `cars` data (columns speed and dist).
# NOTE(review): unrelated to the models above - this looks like leftover
# R Markdown template content; confirm whether it can be dropped.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
# You can also embed plots, for example:
# Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.