library(caret)
library(ROCR)
library(pROC)
library(Metrics)
library(dplyr)
library(randomForest)
library(ROSE)
# Load the raw credit training data. Empty cells, single spaces and the
# literal "NA" are all read as missing values.
# stringsAsFactors = TRUE is spelled out because its default became FALSE in
# R 4.0.0; without it SEX and default arrive as character vectors, whereas the
# classification and confusionMatrix() steps further down need `default` to be
# a factor.
credit_train_data <- read.csv(
  "AT2_credit_train_STUDENT.csv",
  na.strings = c("", " ", NA, "NA"),
  stringsAsFactors = TRUE
)
str(credit_train_data)
## 'data.frame': 23101 obs. of 17 variables:
## $ ID : int 1 2 3 4 6 7 8 9 10 11 ...
## $ LIMIT_BAL: num 20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
## $ SEX : Factor w/ 5 levels "1","2","cat",..: 2 2 2 2 1 1 2 2 1 2 ...
## $ EDUCATION: int 2 2 2 2 1 1 2 3 3 3 ...
## $ MARRIAGE : int 1 2 2 1 2 2 2 1 2 2 ...
## $ AGE : int 24 26 34 37 37 29 23 28 35 34 ...
## $ PAY_PC1 : num 0.477 -1.462 -0.393 -0.393 -0.393 ...
## $ PAY_PC2 : num -3.225 0.854 0.176 0.176 0.176 ...
## $ PAY_PC3 : num 0.14504 -0.36086 0.00489 0.00489 0.00489 ...
## $ AMT_PC1 : num -1.752 -1.663 -1.135 -0.397 -0.393 ...
## $ AMT_PC2 : num -0.224 -0.144 -0.177 -0.451 -0.5 ...
## $ AMT_PC3 : num -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
## $ AMT_PC4 : num 0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
## $ AMT_PC5 : num -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
## $ AMT_PC6 : num 0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
## $ AMT_PC7 : num -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
## $ default : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...
names(credit_train_data)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE"
## [6] "AGE" "PAY_PC1" "PAY_PC2" "PAY_PC3" "AMT_PC1"
## [11] "AMT_PC2" "AMT_PC3" "AMT_PC4" "AMT_PC5" "AMT_PC6"
## [16] "AMT_PC7" "default"
summary(credit_train_data)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : -99 1 : 9244 Min. :0.000
## 1st Qu.: 7489 1st Qu.: 50000 2 :13854 1st Qu.:1.000
## Median :14987 Median : 140000 cat : 1 Median :2.000
## Mean :14981 Mean : 167524 dog : 1 Mean :1.853
## 3rd Qu.:22452 3rd Qu.: 240000 dolphin: 1 3rd Qu.:2.000
## Max. :30000 Max. :1000000 Max. :6.000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Min. :0.000 Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:1.000 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median :2.000 Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean :1.553 Mean : 35.7 Mean : -0.001656 Mean :-0.00177
## 3rd Qu.:2.000 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36112
## Max. :3.000 Max. :141.0 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50827 1st Qu.:-0.42961
## Median : 0.004886 Median :-0.86433 Median :-0.20780
## Mean : 0.000652 Mean : 0.00461 Mean : 0.00137
## 3rd Qu.: 0.093942 3rd Qu.: 0.49766 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13710 1st Qu.: -0.068199 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00383 Mean : 0.004618 Mean : 0.00148
## 3rd Qu.: 0.00325 3rd Qu.: 0.083236 3rd Qu.: 0.02644
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09273 Y: 5583
## Median : -0.00216 Median : -0.04099
## Mean : -0.00202 Mean : -0.00409
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
head(credit_train_data)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 1 1 20000 2 2 1 24 0.4774630 -3.2245900
## 2 2 120000 2 2 2 26 -1.4616124 0.8538666
## 3 3 90000 2 2 2 34 -0.3933076 0.1755550
## 4 4 50000 2 2 1 37 -0.3933076 0.1755550
## 5 6 50000 1 1 2 37 -0.3933076 0.1755550
## 6 7 500000 1 1 2 29 -0.3933076 0.1755550
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5
## 1 0.145040802 -1.7522077 -0.2243476 -0.07784106 0.006957244 -0.04135696
## 2 -0.360863449 -1.6633432 -0.1438856 -0.05460040 -0.002851947 0.04388912
## 3 0.004885522 -1.1348380 -0.1766087 0.01595485 -0.129071306 0.09824528
## 4 0.004885522 -0.3971748 -0.4510978 -0.09978950 -0.035338969 -0.05530658
## 5 0.004885522 -0.3927966 -0.5002107 -0.10334556 -0.117902012 -0.05457605
## 6 0.004885522 15.7185478 -0.6627291 -1.62093162 0.486746050 -0.47437084
## AMT_PC6 AMT_PC7 default
## 1 0.0008865935 -0.05626505 Y
## 2 -0.0261897987 -0.09997756 Y
## 3 -0.0223825102 -0.06898686 N
## 4 0.0504654915 -0.02820475 N
## 5 0.1121369503 0.01863707 N
## 6 -0.2846417994 0.70158577 N
unique(credit_train_data$SEX)
## [1] 2 1 dolphin cat dog
## Levels: 1 2 cat dog dolphin
as.data.frame(table(credit_train_data$SEX))
## Var1 Freq
## 1 1 9244
## 2 2 13854
## 3 cat 1
## 4 dog 1
## 5 dolphin 1
unique(credit_train_data$MARRIAGE)
## [1] 1 2 3 0
as.data.frame(table(credit_train_data$MARRIAGE))
## Var1 Freq
## 1 0 38
## 2 1 10510
## 3 2 12304
## 4 3 249
unique(credit_train_data$EDUCATION)
## [1] 2 1 3 5 4 6 0
as.data.frame(table(credit_train_data$EDUCATION))
## Var1 Freq
## 1 0 11
## 2 1 8192
## 3 2 10724
## 4 3 3821
## 5 4 88
## 6 5 229
## 7 6 36
# Keep the raw import untouched; the recoding steps below are applied to this
# copy.
new_credit_train_data <- credit_train_data
str(credit_train_data)
## 'data.frame': 23101 obs. of 17 variables:
## $ ID : int 1 2 3 4 6 7 8 9 10 11 ...
## $ LIMIT_BAL: num 20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
## $ SEX : Factor w/ 5 levels "1","2","cat",..: 2 2 2 2 1 1 2 2 1 2 ...
## $ EDUCATION: int 2 2 2 2 1 1 2 3 3 3 ...
## $ MARRIAGE : int 1 2 2 1 2 2 2 1 2 2 ...
## $ AGE : int 24 26 34 37 37 29 23 28 35 34 ...
## $ PAY_PC1 : num 0.477 -1.462 -0.393 -0.393 -0.393 ...
## $ PAY_PC2 : num -3.225 0.854 0.176 0.176 0.176 ...
## $ PAY_PC3 : num 0.14504 -0.36086 0.00489 0.00489 0.00489 ...
## $ AMT_PC1 : num -1.752 -1.663 -1.135 -0.397 -0.393 ...
## $ AMT_PC2 : num -0.224 -0.144 -0.177 -0.451 -0.5 ...
## $ AMT_PC3 : num -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
## $ AMT_PC4 : num 0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
## $ AMT_PC5 : num -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
## $ AMT_PC6 : num 0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
## $ AMT_PC7 : num -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
## $ default : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...
unique(new_credit_train_data$SEX)
## [1] 2 1 dolphin cat dog
## Levels: 1 2 cat dog dolphin
as.data.frame(table(new_credit_train_data$SEX))
## Var1 Freq
## 1 1 9244
## 2 2 13854
## 3 cat 1
## 4 dog 1
## 5 dolphin 1
# Recode SEX: codes 1 and 2 become "male" and "female"; the three stray
# entries ("dolphin", "cat", "dog") are collapsed into one "other" level.
new_credit_train_data[["SEX"]] <- factor(
  x = new_credit_train_data[["SEX"]],
  levels = c(1, 2, "dolphin", "cat", "dog"),
  labels = c("male", "female", "other", "other", "other")
)
unique(new_credit_train_data$SEX)
## [1] female male other
## Levels: male female other
as.data.frame(table(new_credit_train_data$SEX))
## Var1 Freq
## 1 male 9244
## 2 female 13854
## 3 other 3
unique(new_credit_train_data$MARRIAGE)
## [1] 1 2 3 0
as.data.frame(table(new_credit_train_data$MARRIAGE))
## Var1 Freq
## 1 0 38
## 2 1 10510
## 3 2 12304
## 4 3 249
# Turn the integer MARRIAGE codes into a factor and show the raw code counts.
new_credit_train_data[["MARRIAGE"]] <- as.factor(new_credit_train_data[["MARRIAGE"]])
table(new_credit_train_data[["MARRIAGE"]])
##
## 0 1 2 3
## 38 10510 12304 249
# Relabel the codes: 1 is "married", 2 is "single", and both 0 and 3 are
# merged into "others".
new_credit_train_data[["MARRIAGE"]] <- factor(
  x = new_credit_train_data[["MARRIAGE"]],
  levels = c(0, 1, 2, 3),
  labels = c("others", "married", "single", "others")
)
unique(new_credit_train_data$MARRIAGE)
## [1] married single others
## Levels: others married single
as.data.frame(table(new_credit_train_data$MARRIAGE))
## Var1 Freq
## 1 others 287
## 2 married 10510
## 3 single 12304
unique(new_credit_train_data$EDUCATION)
## [1] 2 1 3 5 4 6 0
as.data.frame(table(new_credit_train_data$EDUCATION))
## Var1 Freq
## 1 0 11
## 2 1 8192
## 3 2 10724
## 4 3 3821
## 5 4 88
## 6 5 229
## 7 6 36
# Turn the integer EDUCATION codes into a factor and show the raw code counts.
new_credit_train_data[["EDUCATION"]] <- as.factor(new_credit_train_data[["EDUCATION"]])
table(new_credit_train_data[["EDUCATION"]])
##
## 0 1 2 3 4 5 6
## 11 8192 10724 3821 88 229 36
# Relabel the codes: 1-4 get descriptive names, while 0, 5 and 6 are merged
# into a single "unknown" level.
new_credit_train_data[["EDUCATION"]] <- factor(
  x = new_credit_train_data[["EDUCATION"]],
  levels = c(0, 1, 2, 3, 4, 5, 6),
  labels = c(
    "unknown", "graduate school", "university", "high school",
    "others", "unknown", "unknown"
  )
)
unique(new_credit_train_data$EDUCATION)
## [1] university graduate school high school unknown
## [5] others
## Levels: unknown graduate school university high school others
as.data.frame(table(new_credit_train_data$EDUCATION))
## Var1 Freq
## 1 unknown 276
## 2 graduate school 8192
## 3 university 10724
## 4 high school 3821
## 5 others 88
table(new_credit_train_data$default)
##
## N Y
## 17518 5583
str(new_credit_train_data)
## 'data.frame': 23101 obs. of 17 variables:
## $ ID : int 1 2 3 4 6 7 8 9 10 11 ...
## $ LIMIT_BAL: num 20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
## $ SEX : Factor w/ 3 levels "male","female",..: 2 2 2 2 1 1 2 2 1 2 ...
## $ EDUCATION: Factor w/ 5 levels "unknown","graduate school",..: 3 3 3 3 2 2 3 4 4 4 ...
## $ MARRIAGE : Factor w/ 3 levels "others","married",..: 2 3 3 2 3 3 3 2 3 3 ...
## $ AGE : int 24 26 34 37 37 29 23 28 35 34 ...
## $ PAY_PC1 : num 0.477 -1.462 -0.393 -0.393 -0.393 ...
## $ PAY_PC2 : num -3.225 0.854 0.176 0.176 0.176 ...
## $ PAY_PC3 : num 0.14504 -0.36086 0.00489 0.00489 0.00489 ...
## $ AMT_PC1 : num -1.752 -1.663 -1.135 -0.397 -0.393 ...
## $ AMT_PC2 : num -0.224 -0.144 -0.177 -0.451 -0.5 ...
## $ AMT_PC3 : num -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
## $ AMT_PC4 : num 0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
## $ AMT_PC5 : num -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
## $ AMT_PC6 : num 0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
## $ AMT_PC7 : num -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
## $ default : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...
# Data partitioning into train and test sets. Setting the seed makes the split repeatable. ----
# Randomly assign every row to group 1 (probability 0.7, training) or group 2
# (probability 0.3, testing); the fixed seed makes the assignment repeatable.
set.seed(123)
ind_samples <- sample(
  x = 2,
  size = nrow(new_credit_train_data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
train_data <- new_credit_train_data[which(ind_samples == 1), ]
test_data <- new_credit_train_data[which(ind_samples == 2), ]
summary(train_data)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : -99 male :6453 unknown : 187
## 1st Qu.: 7456 1st Qu.: 50000 female:9734 graduate school:5706
## Median :14942 Median : 140000 other : 1 university :7588
## Mean :14930 Mean : 167033 high school :2651
## 3rd Qu.:22418 3rd Qu.: 240000 others : 56
## Max. :30000 Max. :1000000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## others : 198 Min. : 21.00 Min. :-11.859675 Min. :-4.422427
## married:7385 1st Qu.: 28.00 1st Qu.: -0.393308 1st Qu.:-0.229614
## single :8605 Median : 34.00 Median : -0.393308 Median : 0.175555
## Mean : 35.68 Mean : -0.003157 Mean : 0.001224
## 3rd Qu.: 41.00 3rd Qu.: 1.360047 3rd Qu.: 0.361123
## Max. :141.00 Max. : 3.813348 Max. : 5.441026
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-2.16157 Min. :-3.65090
## 1st Qu.:-0.283941 1st Qu.:-1.51237 1st Qu.:-0.43012
## Median : 0.004886 Median :-0.86176 Median :-0.20818
## Mean : 0.000117 Mean :-0.01655 Mean : 0.00755
## 3rd Qu.: 0.057637 3rd Qu.: 0.48514 3rd Qu.: 0.09444
## Max. : 3.192124 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.59342 Min. :-42.37665
## 1st Qu.: -0.13586 1st Qu.: -0.06955 1st Qu.: -0.08213
## Median : -0.07044 Median : 0.01831 Median : -0.03200
## Mean : 0.00016 Mean : 0.00206 Mean : -0.00546
## 3rd Qu.: 0.00248 3rd Qu.: 0.08183 3rd Qu.: 0.02559
## Max. : 19.96871 Max. : 18.95330 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:12254
## 1st Qu.: -0.04236 1st Qu.: -0.09115 Y: 3934
## Median : -0.00273 Median : -0.04034
## Mean : 0.00168 Mean : -0.00473
## 3rd Qu.: 0.06669 3rd Qu.: 0.03259
## Max. : 20.22670 Max. : 22.92727
summary(test_data)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 2 Min. : -99 male :2791 unknown : 89
## 1st Qu.: 7553 1st Qu.: 50000 female:4120 graduate school:2486
## Median :15079 Median :140000 other : 2 university :3136
## Mean :15100 Mean :168672 high school :1170
## 3rd Qu.:22571 3rd Qu.:240000 others : 32
## Max. :29994 Max. :780000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## others : 89 Min. : 21.00 Min. :-11.851683 Min. :-4.32285
## married:3125 1st Qu.: 28.00 1st Qu.: -0.393308 1st Qu.:-0.26506
## single :3699 Median : 34.00 Median : -0.393308 Median : 0.17555
## Mean : 35.77 Mean : 0.001857 Mean :-0.00878
## 3rd Qu.: 42.00 3rd Qu.: 1.360047 3rd Qu.: 0.33574
## Max. :139.00 Max. : 3.813348 Max. : 4.52335
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.085259 Min. :-3.41079 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50052 1st Qu.:-0.42752
## Median : 0.004886 Median :-0.86822 Median :-0.20665
## Mean : 0.001904 Mean : 0.05416 Mean :-0.01312
## 3rd Qu.: 0.126490 3rd Qu.: 0.52283 3rd Qu.: 0.08335
## Max. : 3.364030 Max. :20.06020 Max. :12.33953
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-6.88565 Min. :-16.09638 Min. :-13.50682
## 1st Qu.:-0.13927 1st Qu.: -0.06430 1st Qu.: -0.08283
## Median :-0.07044 Median : 0.01867 Median : -0.03200
## Mean : 0.01243 Mean : 0.01061 Mean : 0.01775
## 3rd Qu.: 0.00476 3rd Qu.: 0.08625 3rd Qu.: 0.02777
## Max. :21.98483 Max. : 21.82375 Max. : 15.95365
## AMT_PC6 AMT_PC7 default
## Min. :-22.827378 Min. :-8.404867 N:5264
## 1st Qu.: -0.042631 1st Qu.:-0.096352 Y:1649
## Median : -0.001124 Median :-0.042018
## Mean : -0.010677 Mean :-0.002592
## 3rd Qu.: 0.070155 3rd Qu.: 0.029146
## Max. : 14.565513 Max. :12.890726
names(train_data)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE"
## [6] "AGE" "PAY_PC1" "PAY_PC2" "PAY_PC3" "AMT_PC1"
## [11] "AMT_PC2" "AMT_PC3" "AMT_PC4" "AMT_PC5" "AMT_PC6"
## [16] "AMT_PC7" "default"
names(test_data)
## [1] "ID" "LIMIT_BAL" "SEX" "EDUCATION" "MARRIAGE"
## [6] "AGE" "PAY_PC1" "PAY_PC2" "PAY_PC3" "AMT_PC1"
## [11] "AMT_PC2" "AMT_PC3" "AMT_PC4" "AMT_PC5" "AMT_PC6"
## [16] "AMT_PC7" "default"
nrow(train_data)
## [1] 16188
nrow(test_data)
## [1] 6913
# Baseline random forest: `default` modelled on every other column of
# train_data. Seeding first keeps the fitted forest repeatable.
# NOTE(review): "." also pulls in the ID column as a predictor — confirm that
# is intended.
set.seed(333)
rf_all <- randomForest(formula = default ~ ., data = train_data)
rf_all
##
## Call:
## randomForest(formula = default ~ ., data = train_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 18.5%
## Confusion matrix:
## N Y class.error
## N 11465 789 0.06438714
## Y 2205 1729 0.56049822
# Classify the held-out rows with the baseline forest and cross-tabulate the
# predictions against the true labels. No positive class is given, so the
# report below uses "N".
p_rf_all <- predict(object = rf_all, newdata = test_data)
confusionMatrix(data = p_rf_all, reference = test_data$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4927 899
## Y 337 750
##
## Accuracy : 0.8212
## 95% CI : (0.812, 0.8302)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4426
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9360
## Specificity : 0.4548
## Pos Pred Value : 0.8457
## Neg Pred Value : 0.6900
## Prevalence : 0.7615
## Detection Rate : 0.7127
## Detection Prevalence : 0.8428
## Balanced Accuracy : 0.6954
##
## 'Positive' Class : N
##
prop.table(table(test_data$default))
##
## N Y
## 0.7614639 0.2385361
# Bar chart of the class proportions in the test set.
# The y-axis spans the full 0-1 proportion range: the previous upper limit of
# 0.7 was below the "N" proportion (about 0.76), so that bar was cut off.
barplot(
  prop.table(table(test_data$default)),
  col = rainbow(2),
  ylim = c(0, 1),
  main = "Class Distribution"
)
table(train_data$default)
##
## N Y
## 12254 3934
table(test_data$default)
##
## N Y
## 5264 1649
prop.table(table(train_data$default))
##
## N Y
## 0.7569805 0.2430195
prop.table(table(test_data$default))
##
## N Y
## 0.7614639 0.2385361
# Evaluate rf_all on the test set again, this time reporting the metrics with
# "Y" as the positive class. predict() is re-run here instead of reusing
# p_rf_all.
# NOTE(review): the recorded counts below differ by one from the earlier
# matrix for the same model and data — presumably random tie-breaking in
# predict(); confirm.
confusionMatrix(predict(rf_all, test_data), test_data$default, positive = 'Y')
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4926 900
## Y 338 749
##
## Accuracy : 0.8209
## 95% CI : (0.8117, 0.8299)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4417
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4542
## Specificity : 0.9358
## Pos Pred Value : 0.6891
## Neg Pred Value : 0.8455
## Prevalence : 0.2385
## Detection Rate : 0.1083
## Detection Prevalence : 0.1572
## Balanced Accuracy : 0.6950
##
## 'Positive' Class : Y
##
# Balance the training set by randomly oversampling the minority class until
# both classes have the same number of rows (all columns kept).
# The target size is derived from the data (twice the larger class count)
# instead of the hard-coded 24508, so it stays correct if the split changes,
# and a seed is passed so the resampled set is reproducible.
n_majority_all <- max(table(train_data$default))
over_sample_all <- ovun.sample(
  default ~ .,
  data = train_data,
  method = "over",
  N = 2 * n_majority_all,
  seed = 123
)$data
table(over_sample_all$default)
##
## N Y
## 12254 12254
summary(over_sample_all)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : -99 male :10040 unknown : 210
## 1st Qu.: 7375 1st Qu.: 50000 female:14465 graduate school: 7683
## Median :14678 Median : 115000 other : 3 university :11739
## Mean :14810 Mean : 150948 high school : 4813
## 3rd Qu.:22154 3rd Qu.: 220000 others : 63
## Max. :30000 Max. :1000000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## others : 290 Min. : 21.00 Min. :-11.8597 Min. :-4.42243
## married:11348 1st Qu.: 28.00 1st Qu.: -1.1331 1st Qu.:-0.57016
## single :12870 Median : 34.00 Median : -0.3933 Median : 0.17555
## Mean : 35.75 Mean : -0.3536 Mean :-0.07958
## 3rd Qu.: 42.00 3rd Qu.: 1.0595 3rd Qu.: 0.26834
## Max. :141.00 Max. : 3.8133 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-2.16157 Min. :-3.65090
## 1st Qu.:-0.283941 1st Qu.:-1.51943 1st Qu.:-0.44408
## Median : 0.004886 Median :-0.88170 Median :-0.22022
## Mean : 0.025415 Mean :-0.06528 Mean :-0.07655
## 3rd Qu.: 0.293712 3rd Qu.: 0.40786 3rd Qu.:-0.01147
## Max. : 3.192124 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.12791 1st Qu.: -0.058516 1st Qu.: -0.07982
## Median : -0.07044 Median : 0.018674 Median : -0.03200
## Mean : -0.00798 Mean : 0.000715 Mean : -0.00634
## 3rd Qu.: -0.00496 3rd Qu.: 0.078870 3rd Qu.: 0.01895
## Max. : 19.96871 Max. : 18.953301 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:12254
## 1st Qu.: -0.04154 1st Qu.: -0.08918 Y:12254
## Median : -0.00462 Median : -0.04189
## Mean : -0.00665 Mean : -0.01018
## 3rd Qu.: 0.05843 3rd Qu.: 0.02538
## Max. : 20.22670 Max. : 22.92727
# Random forest trained on the oversampled data (all columns), then evaluated
# on the untouched test set. The fit is seeded, as the rf_all fit is, so the
# forest can be reproduced from run to run.
set.seed(333)
rf_over_all <- randomForest(default ~ ., data = over_sample_all)
p_rf_over_all <- predict(rf_over_all, test_data)
confusionMatrix(p_rf_over_all, test_data$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4834 811
## Y 430 838
##
## Accuracy : 0.8205
## 95% CI : (0.8112, 0.8295)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4633
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9183
## Specificity : 0.5082
## Pos Pred Value : 0.8563
## Neg Pred Value : 0.6609
## Prevalence : 0.7615
## Detection Rate : 0.6993
## Detection Prevalence : 0.8166
## Balanced Accuracy : 0.7132
##
## 'Positive' Class : N
##
# Evaluate rf_over_all on the test set again, reporting the metrics with "Y"
# as the positive class. predict() is re-run here instead of reusing
# p_rf_over_all.
confusionMatrix(predict(rf_over_all, test_data), test_data$default, positive = 'Y')
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4833 810
## Y 431 839
##
## Accuracy : 0.8205
## 95% CI : (0.8112, 0.8295)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4635
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5088
## Specificity : 0.9181
## Pos Pred Value : 0.6606
## Neg Pred Value : 0.8565
## Prevalence : 0.2385
## Detection Rate : 0.1214
## Detection Prevalence : 0.1837
## Balanced Accuracy : 0.7135
##
## 'Positive' Class : Y
##
# Oversample the minority class using an explicit predictor list; ID and SEX
# are not in the formula, and the summary of the result shows they are absent
# from the returned data.
# The target size is derived from the data (twice the larger class count)
# instead of the hard-coded 24508, and a seed is passed so the resampled set
# is reproducible.
n_majority <- max(table(train_data$default))
over_sample <- ovun.sample(
  default ~ LIMIT_BAL + EDUCATION +
    MARRIAGE + AGE + PAY_PC1 + PAY_PC2 +
    PAY_PC3 + AMT_PC1 + AMT_PC2 + AMT_PC3 +
    AMT_PC4 + AMT_PC5 + AMT_PC6 + AMT_PC7,
  data = train_data,
  method = "over",
  N = 2 * n_majority,
  seed = 123
)$data
table(over_sample$default)
##
## N Y
## 12254 12254
summary(over_sample)
## LIMIT_BAL EDUCATION MARRIAGE
## Min. : -99 unknown : 218 others : 310
## 1st Qu.: 50000 graduate school: 7788 married:11361
## Median : 110000 university :11556 single :12837
## Mean : 151296 high school : 4887
## 3rd Qu.: 220000 others : 59
## Max. :1000000
## AGE PAY_PC1 PAY_PC2
## Min. : 21.00 Min. :-11.8597 Min. :-4.42243
## 1st Qu.: 28.00 1st Qu.: -1.1331 1st Qu.:-0.57016
## Median : 34.00 Median : -0.3933 Median : 0.17555
## Mean : 35.74 Mean : -0.3592 Mean :-0.07503
## 3rd Qu.: 42.00 3rd Qu.: 1.0595 3rd Qu.: 0.31753
## Max. :141.00 Max. : 3.8133 Max. : 4.58191
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-2.16157 Min. :-3.65090
## 1st Qu.:-0.283941 1st Qu.:-1.51637 1st Qu.:-0.44760
## Median : 0.004886 Median :-0.86916 Median :-0.22022
## Mean : 0.020150 Mean :-0.05157 Mean :-0.07630
## 3rd Qu.: 0.267527 3rd Qu.: 0.44130 3rd Qu.:-0.01309
## Max. : 3.192124 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.12658 1st Qu.: -0.058751 1st Qu.: -0.07879
## Median : -0.07044 Median : 0.018674 Median : -0.03200
## Mean : -0.00434 Mean : 0.003546 Mean : -0.00178
## 3rd Qu.: -0.00436 3rd Qu.: 0.078247 3rd Qu.: 0.01972
## Max. : 19.96871 Max. : 18.953301 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:12254
## 1st Qu.: -0.04177 1st Qu.: -0.09003 Y:12254
## Median : -0.00462 Median : -0.04168
## Mean : -0.00611 Mean : -0.01433
## 3rd Qu.: 0.05897 3rd Qu.: 0.02497
## Max. : 20.22670 Max. : 22.92727
# Random forest trained on the oversampled data with the reduced predictor
# set, then evaluated on the untouched test set. The fit is seeded, as the
# rf_all fit is, so the forest can be reproduced from run to run.
set.seed(333)
rf_over <- randomForest(default ~ ., data = over_sample)
p_rf_over <- predict(rf_over, test_data)
# Confusion matrix for the model rf_over.
confusionMatrix(p_rf_over, test_data$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4794 807
## Y 470 842
##
## Accuracy : 0.8153
## 95% CI : (0.8059, 0.8244)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4531
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9107
## Specificity : 0.5106
## Pos Pred Value : 0.8559
## Neg Pred Value : 0.6418
## Prevalence : 0.7615
## Detection Rate : 0.6935
## Detection Prevalence : 0.8102
## Balanced Accuracy : 0.7107
##
## 'Positive' Class : N
##
# Evaluate rf_over on the test set again, reporting the metrics with "Y" as
# the positive class. predict() is re-run here instead of reusing p_rf_over.
confusionMatrix(predict(rf_over, test_data), test_data$default, positive = 'Y')
## Confusion Matrix and Statistics
##
## Reference
## Prediction N Y
## N 4794 807
## Y 470 842
##
## Accuracy : 0.8153
## 95% CI : (0.8059, 0.8244)
## No Information Rate : 0.7615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4531
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5106
## Specificity : 0.9107
## Pos Pred Value : 0.6418
## Neg Pred Value : 0.8559
## Prevalence : 0.2385
## Detection Rate : 0.1218
## Detection Prevalence : 0.1898
## Balanced Accuracy : 0.7107
##
## 'Positive' Class : Y
##
# ROC curve and AUC for the rf_over model on the test set.
#
# Corrections relative to the previous version of this step:
#   * roc() expects (response, predictor). The arguments were swapped, so the
#     reported "AUC" measured how well the true label predicts the model
#     output rather than the other way round.
#   * The curve is built from the predicted probability of class "Y" instead
#     of hard 0/1 labels, which give only a single operating point.
#   * auc() is exported by both pROC and Metrics (both are attached), so the
#     call is namespaced and its arguments are passed as (actual, predicted).
pred_over <- predict(rf_over, test_data, type = "response")
# Per-row predicted probability of default ("Y"), used as the ROC score.
prob_over <- predict(rf_over, test_data, type = "prob")[, "Y"]
test_data$pred_over <- pred_over
table(test_data$pred_over)
##
## N Y
## 5602 1311
table(test_data$default)
##
## N Y
## 5264 1649
# As before, both label columns are converted to 0/1 in place.
test_data$default <- ifelse(test_data$default == "Y", 1, 0)
test_data$pred_over <- ifelse(test_data$pred_over == "Y", 1, 0)
# levels/direction are fixed so that 0 is the control class and higher
# probabilities indicate the case class, instead of letting roc() auto-detect.
ROC_over <- pROC::roc(
  response = test_data$default,
  predictor = prob_over,
  levels = c(0, 1),
  direction = "<"
)
ROC_over
# Recorded output removed: it was produced with response and predictor
# swapped (AUC 0.7482) and no longer applies. Re-run to regenerate.
plot(ROC_over, col = "blue")
Metrics::auc(actual = test_data$default, predicted = prob_over)