caret package for MLlibrary(tidyverse, quietly = T)
library(dplyr, quietly = T) #used by Caret
library(ggplot2, quietly = T)
library(corrplot, quietly =T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)
library(xgboost, quietly = T)
library(ROCR, quietly = T)
library(Matrix, quietly = T)
library(rpart.plot, quietly =T)
#setwd("~/Documents/brookstruong")
#Import the data
training.raw <- read_csv('AT2_credit_train_STUDENT.csv')
predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')
23,101 observations over 17 variables for the training/test split 6899 observations over 17 variables for the prediction set.
glimpse(training.raw)
## Observations: 23,101
## Variables: 17
## $ ID <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1 <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3 <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1 <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2 <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3 <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4 <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5 <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6 <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7 <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
training.raw$default <- as.factor(training.raw$default)
training.raw$SEX <- as.factor(training.raw$SEX)
training.raw$MARRIAGE <- as.factor(training.raw$MARRIAGE)
training.raw$AGE <- as.numeric(training.raw$AGE)
#remove random anomilies
#remove NA - 2 rows from the sex being cat/dog
training.raw <- training.raw[complete.cases(training.raw), ]
#set negative balance values to 0. Max Limit_bal for test set is 760K and validation set for 800k. Lets set these at median value.
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL <= 0] <- 0
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL >800000] <-140000
#all education factors greater than 4, set them to 4, also set 0 to 4
training.raw$EDUCATION[training.raw$EDUCATION == 5] <- 4
training.raw$EDUCATION[training.raw$EDUCATION == 6] <- 4
training.raw$EDUCATION[training.raw$EDUCATION == 0] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION)
#clean marriage
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE)
training.raw$MARRIAGE[training.raw$MARRIAGE == 0] <- 3
#lets label our factors properly
training.raw$SEX <- factor(training.raw$SEX,
levels =c('1', '2'),
labels =c('Male', 'Female'))
training.raw$EDUCATION[training.raw$EDUCATION == 2] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION,
levels =c('1', '2', '3', '4'),
labels = c('Grad_School', 'Uni', 'High_School', 'Other'))
training.raw$EDUCATION <- as.factor(training.raw$EDUCATION)
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE,
levels = c('1','2','3'),
labels = c('Married', 'Single', 'Other'))
summary(training.raw)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 0 Male : 9244 Grad_School: 8192
## 1st Qu.: 7488 1st Qu.: 50000 Female:13854 Uni : 0
## Median :14989 Median :140000 High_School: 3820
## Mean :14982 Mean :167479 Other :11086
## 3rd Qu.:22453 3rd Qu.:240000
## Max. :30000 Max. :780000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:10507 Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## Single :12304 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Other : 287 Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean : 35.7 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :141.0 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
cplot <- training.raw %>%
select(-default) %>%
select_if(is.numeric)
M <- cor(cplot)
p.mat <- cor.mtest(M)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
method = "color",
order= "hclust",
type="full",
col=col(200),
diag =F,
title="Correlation of Numeric Variables",
addCoef.col = "black",
sig.level = 0.05,
insig ="blank",
mar=c(0,0,3,0))
training.raw$AGE <- ifelse(training.raw$AGE >=100, NA, training.raw$AGE)
map_int(training.raw,~sum(is.na(.x)))
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1
## 0 0 0 0 0 50 0
## PAY_PC2 PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5
## 0 0 0 0 0 0 0
## AMT_PC6 AMT_PC7 default
## 0 0 0
'we can see that there is 50 age that are missing, out of the scheme of things, it is <1% of the total dataset, however lets play with using rpart to predict the missing age'
## [1] "we can see that there is 50 age that are missing, out of the scheme of things, it is <1% of the total dataset, however lets play with using rpart to predict the missing age"
train.imp <- training.raw[-1]
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 50 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
head(train.imp)
## # A tibble: 6 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3 AMT_PC1
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20000 Fema~ Other Married 24 0.477 -3.22 0.145 -1.75
## 2 120000 Fema~ Other Single 26 -1.46 0.854 -0.361 -1.66
## 3 90000 Fema~ Other Single 34 -0.393 0.176 0.00489 -1.13
## 4 50000 Fema~ Other Married 37 -0.393 0.176 0.00489 -0.397
## 5 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489 -0.393
## 6 500000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489 15.7
## # ... with 7 more variables: AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>,
## # AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
age.predictors <- train.imp %>%
select(LIMIT_BAL, SEX, EDUCATION, MARRIAGE, AGE, PAY_PC1) %>%
filter(complete.cases(.))
ctrl <- trainControl(method = "repeatedcv",
repeats = 5)
rpartGrid <- data.frame(maxdepth = seq(2,10,1))
rpartFit_ageimputation <- train (x=age.predictors[,-3],
y=age.predictors$AGE,
method='rpart2',
trControl = ctrl,
tuneGrid = rpartGrid)
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
rpartFit_ageimputation
## CART
##
## 23048 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 20743, 20743, 20742, 20742, 20744, 20745, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 2 4.019416 0.8085220 3.332353
## 3 2.770871 0.9090355 2.243507
## 4 2.450554 0.9288340 2.080836
## 5 2.130606 0.9462219 1.782894
## 6 1.804709 0.9614172 1.464249
## 7 1.456406 0.9748723 1.179393
## 8 1.456406 0.9748723 1.179393
## 9 1.456406 0.9748723 1.179393
## 10 1.456406 0.9748723 1.179393
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 7.
plot(rpartFit_ageimputation)
# Tree depth of 7 is optimal
rpart.plot(rpartFit_ageimputation$finalModel)
#save the model externally to load back in
saveRDS(rpartFit_ageimputation, file = 'rpartFit_ageimputation.rds')
#insert missing age back into the dataset
missing_age <- is.na(train.imp$AGE)
age.predicted <- predict(rpartFit_ageimputation, newdata = train.imp[missing_age,])
train.imp[missing_age, 'AGE'] <- age.predicted
summary(train.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male : 9244 Grad_School: 8192 Married:10507
## 1st Qu.: 50000 Female:13854 Uni : 0 Single :12304
## Median :140000 High_School: 3820 Other : 287
## Mean :167479 Other :11086
## 3rd Qu.:240000
## Max. :780000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median :34.00 Median : -0.393308 Median : 0.17555
## Mean :35.48 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :75.00 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
set.seed(42)
training_rows <- createDataPartition(y = train.imp$default, p=0.7, list=F)
test.imp <- train.imp %>% filter(!(rownames(.) %in% training_rows))
train.imp <- train.imp %>% filter(rownames(.) %in% training_rows)
dim(train.imp)
## [1] 16169 16
## [1] 6929 16
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
No missing values.
default is the response variable. The response variable is a Yes/No boolean variable therefor is appropriate for our classification problem.
round(prop.table(table(train.imp$default)),2)
##
## N Y
## 0.76 0.24
76% of the dataset do not default and 24% have defaulted. This is a classic imbalanced dataset. We will need to deal with this either prior to modelling or using packages to sample for us.
First step is to look at all variables available using the ggplot2 framework for visuals.
LIMIT_BAL The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.AGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
p1 <- ggplot(data=train.imp, aes(x=LIMIT_BAL)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip() +
scale_x_continuous(labels=comma)
p2 <- ggplot(data=train.imp, aes(x=AGE)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p1,p2, nrow=1)
ggplot(data=train.imp, aes(x=LIMIT_BAL, y=AGE)) +
geom_jitter(aes(fill=default)) +
coord_flip()
p3 <- ggplot(data=train.imp, aes(x=PAY_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p4 <- ggplot(data=train.imp, aes(x=PAY_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p5 <- ggplot(data=train.imp, aes(x=PAY_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p3,p4,p5, nrow=1)
p6 <- ggplot(data=train.imp, aes(x=AMT_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p7 <- ggplot(data=train.imp, aes(x=AMT_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p8 <- ggplot(data=train.imp, aes(x=AMT_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p9 <- ggplot(data=train.imp, aes(x=AMT_PC4)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p10 <- ggplot(data=train.imp, aes(x=AMT_PC5)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p11 <- ggplot(data=train.imp, aes(x=AMT_PC6)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p12 <- ggplot(data=train.imp, aes(x=AMT_PC7)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)
#### Categorical Variables
SEX The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.EDUCATION certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.MARRIAGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.get_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
FUN = function(x) ggplot(data = train.imp) +
aes_string(x=x, fill = 'default') +
geom_bar(position="dodge") +
theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.imp, aes(x=SEX, fill = default)) +
geom_bar())
grid.arrange(p[[1]],p[[2]],p[[3]],
legend, layout_matrix = cbind(c(1,2,3),
c(4,5,3),
c(6,6,6)),
widths=c(3,3,1))
#read in the model if it's on hand If not, follow code from 336.
xgbFitD <- readRDS('xgbFitD.rds')
#control measures
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
sampling = 'down',
savePredictions = T
)
#grid expansion
xgbGrid <- expand.grid(
nrounds=seq(100, 200, 500), #Run the model how many times?
max_depth=6, #The maximum depth of a tree
eta=c(0.01, 0.1, 0.5), #Step size shrinkage (smaller is less likely to overfit)
gamma=0, #Minimum Loss reduction (the larger the more conservative the model)
colsample_bytree=c(0.5,1), #subsample ratio of columns when constructing each tree
min_child_weight=1, #minimum sum of instance weight (hessian) needed in a child
subsample=c(0.5, 1) #Subsample ratio of the training instance; 0.5 means half the data is used.
)
xgbFitD <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
method = 'xgbTree',
trControl = ctrl,
tuneGrid = xgbGrid,
metric = "ROC"
)
saveRDS(xgbFitD, file = 'xgbFitD.rds')
print(xgbFitD, details=F)
## eXtreme Gradient Boosting
##
## 16169 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 14552, 14553, 14552, 14553, 14551, 14552, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## eta colsample_bytree subsample ROC Sens Spec
## 0.01 0.5 0.5 0.7940388 0.7748844 0.6958986
## 0.01 0.5 1.0 0.7944374 0.7778040 0.6911871
## 0.01 1.0 0.5 0.8043237 0.8009142 0.6777680
## 0.01 1.0 1.0 0.8003286 0.8013388 0.6674766
## 0.10 0.5 0.5 0.7985907 0.7728947 0.6905190
## 0.10 0.5 1.0 0.8019819 0.7819628 0.6885730
## 0.10 1.0 0.5 0.8008873 0.7738409 0.6946682
## 0.10 1.0 1.0 0.8057312 0.7848987 0.6879580
## 0.50 0.5 0.5 0.7499764 0.7000248 0.6742929
## 0.50 0.5 1.0 0.7761141 0.7360846 0.6875997
## 0.50 1.0 0.5 0.7501419 0.6958673 0.6799233
## 0.50 1.0 1.0 0.7748297 0.7337195 0.6865296
##
## Tuning parameter 'nrounds' was held constant at a value of 100
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 6,
## eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1
## and subsample = 1.
plot(xgbFitD)
xgb.importance(
model = xgbFitD$finalModel) %>%
xgb.ggplot.importance()
densityplot(xgbFitD,pch='|')
predict(xgbFitD,type = 'prob') -> train.ProbsD
histogram(~Y + N, train.ProbsD)
test.imp
## # A tibble: 6,929 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 50000 Fema~ Other Married 37 -0.393 0.176 0.00489
## 2 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489
## 3 250000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489
## 4 120000 Fema~ Other Married 39 1.71 0.268 -0.140
## 5 50000 Male High_Sch~ Single 23 -0.393 0.176 0.00489
## 6 50000 Fema~ High_Sch~ Married 47 1.71 0.268 -0.140
## 7 230000 Fema~ Grad_Sch~ Single 27 1.71 0.268 -0.140
## 8 500000 Male Grad_Sch~ Married 58 3.81 0.361 -0.284
## 9 50000 Male Grad_Sch~ Single 25 2.18 -1.82 0.560
## 10 280000 Male Grad_Sch~ Single 31 0.271 0.358 -1.40
## # ... with 6,919 more rows, and 8 more variables: AMT_PC1 <dbl>,
## # AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>,
## # AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
summary(test.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male :2805 Grad_School:2490 Married:3173
## 1st Qu.: 50000 Female:4124 Uni : 0 Single :3670
## Median :140000 High_School:1099 Other : 86
## Mean :166907 Other :3340
## 3rd Qu.:240000
## Max. :760000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.26506
## Median :34.00 Median : -0.393308 Median : 0.17555
## Mean :35.47 Mean : 0.007173 Mean :-0.00134
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.34434
## Max. :70.00 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.616800 Min. :-2.161575 Min. :-3.47545
## 1st Qu.:-0.283941 1st Qu.:-1.515179 1st Qu.:-0.43997
## Median : 0.004886 Median :-0.858795 Median :-0.21123
## Mean : 0.012680 Mean : 0.002525 Mean :-0.01860
## 3rd Qu.: 0.112994 3rd Qu.: 0.463515 3rd Qu.: 0.06759
## Max. : 3.364030 Max. :19.598795 Max. :38.06379
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-29.377842 Min. :-15.759742 Min. :-25.102463
## 1st Qu.: -0.138236 1st Qu.: -0.066254 1st Qu.: -0.082010
## Median : -0.070445 Median : 0.018397 Median : -0.032000
## Mean : -0.008069 Mean : 0.003631 Mean : 0.008541
## 3rd Qu.: 0.001689 3rd Qu.: 0.081770 3rd Qu.: 0.025525
## Max. : 15.310976 Max. : 12.231116 Max. : 13.346865
## AMT_PC6 AMT_PC7 default
## Min. :-17.599251 Min. :-15.64263 N:5255
## 1st Qu.: -0.041455 1st Qu.: -0.09317 Y:1674
## Median : -0.001935 Median : -0.04190
## Mean : 0.000567 Mean : -0.01526
## 3rd Qu.: 0.066713 3rd Qu.: 0.02631
## Max. : 15.582944 Max. : 14.03629
map_int(test.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
test.imp <- test.imp[complete.cases(test.imp), ]
xgbPredDownF <- predict(object = xgbFitD, newdata = test.imp)
library(purrr)
xtab <- table(xgbPredDownF,test.imp$default)
xgbCM <- confusionMatrix(xtab)
#CM_list <- list(xgbCM, elastinetCM, knnCM, svmCM)
#compiled_results <- tibble(
#models = c('xgb','elastinet','knn','svm'),
#accuracy = map_dbl(CM_list,~.x$overall[1]),
#kappa = map_dbl(CM_list,~.x$overall[2]),
#sensitivity = map_dbl(CM_list,~.x$byClass[1]),
#specificity = map_dbl(CM_list,~.x$byClass[2]),
#F1 = map_dbl(CM_list,~.x$byClass[7])
#)
#compiled_results %>% arrange(accuracy,kappa)
xgbCM
## Confusion Matrix and Statistics
##
##
## xgbPredDownF N Y
## N 4145 556
## Y 1110 1118
##
## Accuracy : 0.7596
## 95% CI : (0.7493, 0.7696)
## No Information Rate : 0.7584
## P-Value [Acc > NIR] : 0.4175
##
## Kappa : 0.4104
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7888
## Specificity : 0.6679
## Pos Pred Value : 0.8817
## Neg Pred Value : 0.5018
## Prevalence : 0.7584
## Detection Rate : 0.5982
## Detection Prevalence : 0.6785
## Balanced Accuracy : 0.7283
##
## 'Positive' Class : N
##
predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')
## Parsed with column specification:
## cols(
## ID = col_integer(),
## LIMIT_BAL = col_integer(),
## SEX = col_integer(),
## EDUCATION = col_integer(),
## MARRIAGE = col_integer(),
## AGE = col_integer(),
## PAY_PC1 = col_double(),
## PAY_PC2 = col_double(),
## PAY_PC3 = col_double(),
## AMT_PC1 = col_double(),
## AMT_PC2 = col_double(),
## AMT_PC3 = col_double(),
## AMT_PC4 = col_double(),
## AMT_PC5 = col_double(),
## AMT_PC6 = col_double(),
## AMT_PC7 = col_double()
## )
val.imp <- predict.raw
val.imp$SEX <- as.factor(val.imp$SEX)
val.imp$SEX <- factor(val.imp$SEX,
levels =c('1', '2'),
labels =c('Male', 'Female'))
#all education factors greater than 4, set them to 4, also set 0 to 4
val.imp$EDUCATION[val.imp$EDUCATION == 5] <- 4
val.imp$EDUCATION[val.imp$EDUCATION == 6] <- 4
val.imp$EDUCATION[val.imp$EDUCATION == 0] <- 4
val.imp$EDUCATION <- factor(val.imp$EDUCATION,
levels =c('1', '2', '3', '4'),
labels = c('Grad_School', 'Uni', 'High_School', 'Other'))
#clean marriage
val.imp$MARRIAGE[val.imp$MARRIAGE == 0] <- 3
val.imp$MARRIAGE <- as.factor(val.imp$MARRIAGE)
val.imp$MARRIAGE <- factor(val.imp$MARRIAGE,
levels = c('1','2','3'),
labels = c('Married', 'Single', 'Other'))
summary(val.imp)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 5 Min. : 10000 Male :2641 Grad_School:2393
## 1st Qu.: 7532 1st Qu.: 50000 Female:4258 Uni :3306
## Median :15062 Median :140000 High_School:1096
## Mean :15065 Mean :166217 Other : 104
## 3rd Qu.:22624 3rd Qu.:230000
## Max. :29989 Max. :800000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:3149 Min. :21.00 Min. :-13.302028 Min. :-4.110567
## Single :3660 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.210711
## Other : 90 Median :34.00 Median : -0.393308 Median : 0.175555
## Mean :35.47 Mean : 0.005547 Mean : 0.005926
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.361123
## Max. :79.00 Max. : 3.813348 Max. : 4.967609
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.341166 Min. :-2.06507 Min. :-3.535521
## 1st Qu.:-0.283941 1st Qu.:-1.50940 1st Qu.:-0.424712
## Median : 0.004886 Median :-0.85844 Median :-0.211871
## Mean :-0.002183 Mean :-0.01544 Mean :-0.004575
## 3rd Qu.: 0.057637 3rd Qu.: 0.48720 3rd Qu.: 0.067081
## Max. : 3.176797 Max. :19.26397 Max. :28.783658
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-8.19007 Min. :-19.17146 Min. :-24.108569
## 1st Qu.:-0.13215 1st Qu.: -0.06678 1st Qu.: -0.082701
## Median :-0.07044 Median : 0.01664 Median : -0.032000
## Mean :-0.01284 Mean : -0.01546 Mean : -0.004962
## 3rd Qu.:-0.00122 3rd Qu.: 0.07733 3rd Qu.: 0.020652
## Max. :17.88261 Max. : 15.07758 Max. : 12.649943
## AMT_PC6 AMT_PC7
## Min. :-19.077508 Min. :-25.90403
## 1st Qu.: -0.040660 1st Qu.: -0.09017
## Median : -0.001886 Median : -0.04039
## Mean : 0.006764 Mean : 0.01371
## 3rd Qu.: 0.069230 3rd Qu.: 0.02954
## Max. : 14.222570 Max. : 18.18085
val.imp$default <- predict(xgbFitD, val.imp)
final.output <- val.imp %>%
select(ID, default)
final.output$default <- as.character(final.output$default)
final.output$default[final.output$default == "N"] = 0 #Set N to 0 and Y to 1
final.output$default[final.output$default == "Y"] = 1 #Set Y to 1
final.output
## # A tibble: 6,899 x 2
## ID default
## <int> <chr>
## 1 5 0
## 2 17 1
## 3 19 0
## 4 23 1
## 5 27 1
## 6 30 0
## 7 32 1
## 8 34 0
## 9 37 0
## 10 38 0
## # ... with 6,889 more rows
write.csv(final.output, file="xgb_resultsEdDown.csv", row.names=FALSE)