# caret package for ML
library(tidyverse, quietly = T)
library(dplyr, quietly = T) # used by caret
library(ggplot2, quietly = T)
library(corrplot, quietly = T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)
library(xgboost, quietly = T)
library(ROCR, quietly = T)
library(Matrix, quietly = T)
library(rpart.plot, quietly = T)
library(doParallel, quietly = T)
#Import the data
training.raw <- read_csv('AT2_credit_train_STUDENT.csv')
predict.raw <- read_csv('AT2_credit_test_STUDENT.csv')
The training data contains 23,101 observations of 17 variables (this will be further split into training and test sets); the prediction set contains 6,899 observations of 16 variables (the same columns minus default).
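A quick check of those dimensions (a sketch; assumes the two read_csv calls above succeeded):
dim(training.raw) # expect: 23101 17
dim(predict.raw)  # expect: 6899 16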
glimpse(training.raw)
## Observations: 23,101
## Variables: 17
## $ ID <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1 <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3 <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1 <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2 <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3 <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4 <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5 <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6 <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7 <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
training.raw$default <- as.factor(training.raw$default)
training.raw$SEX <- as.factor(training.raw$SEX)
training.raw$MARRIAGE <- as.factor(training.raw$MARRIAGE)
training.raw$AGE <- as.numeric(training.raw$AGE)
# remove random anomalies
# remove NA - the 2 rows where SEX was recorded as cat/dog
training.raw <- training.raw[complete.cases(training.raw), ]
# floor negative balances at 0. Max LIMIT_BAL is 760K in the test set and 800K in
# the validation set, so replace anything above 750K with the median (140K).
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL <= 0] <- 0
training.raw$LIMIT_BAL[training.raw$LIMIT_BAL > 750000] <- 140000
# collapse EDUCATION: levels above 4, and 0, all become 4 ('Other')
training.raw$EDUCATION[training.raw$EDUCATION %in% c(0, 5, 6)] <- 4
training.raw$EDUCATION <- factor(training.raw$EDUCATION)
# clean MARRIAGE: recode 0 as 3 ('Other') before factoring so no empty level remains
training.raw$MARRIAGE[training.raw$MARRIAGE == 0] <- 3
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE)
# let's label our factors properly
training.raw$SEX <- factor(training.raw$SEX,
                           levels = c('1', '2'),
                           labels = c('Male', 'Female'))
training.raw$EDUCATION <- factor(training.raw$EDUCATION,
                                 levels = c('1', '2', '3', '4'),
                                 labels = c('Grad_School', 'Uni', 'High_School', 'Other'))
training.raw$MARRIAGE <- factor(training.raw$MARRIAGE,
                                levels = c('1', '2', '3'),
                                labels = c('Married', 'Single', 'Other'))
summary(training.raw)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : 0 Male : 9244 Grad_School: 8192
## 1st Qu.: 7488 1st Qu.: 50000 Female:13854 Uni :10722
## Median :14989 Median :140000 High_School: 3820
## Mean :14982 Mean :167397 Other : 364
## 3rd Qu.:22453 3rd Qu.:240000
## Max. :30000 Max. :750000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:10507 Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## Single :12304 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Other : 287 Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean : 35.7 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :141.0 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
cplot <- training.raw %>%
  select(-default) %>%
  select_if(is.numeric)
M <- cor(cplot)
p.mat <- cor.mtest(cplot) # significance tests run on the data, not the correlation matrix
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
         method = "color",
         order = "hclust",
         type = "full",
         col = col(200),
         diag = F,
         title = "Correlation of Numeric Variables",
         addCoef.col = "black",
         p.mat = p.mat$p, # without this, sig.level and insig have no effect
         sig.level = 0.05,
         insig = "blank",
         mar = c(0, 0, 3, 0))
# ages of 70 and above look implausible; set them to NA so they can be imputed
training.raw$AGE <- ifelse(training.raw$AGE >= 70, NA, training.raw$AGE)
map_int(training.raw,~sum(is.na(.x)))
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1
## 0 0 0 0 0 69 0
## PAY_PC2 PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5
## 0 0 0 0 0 0 0
## AMT_PC6 AMT_PC7 default
## 0 0 0
There are 69 missing ages - less than 1% of the dataset in the scheme of things, but let's try using rpart to predict the missing ages.
# drop the ID column before modelling
train.imp <- training.raw[-1]
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 69 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
head(train.imp)
## # A tibble: 6 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3 AMT_PC1
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20000 Fema~ Uni Married 24 0.477 -3.22 0.145 -1.75
## 2 120000 Fema~ Uni Single 26 -1.46 0.854 -0.361 -1.66
## 3 90000 Fema~ Uni Single 34 -0.393 0.176 0.00489 -1.13
## 4 50000 Fema~ Uni Married 37 -0.393 0.176 0.00489 -0.397
## 5 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489 -0.393
## 6 500000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489 15.7
## # ... with 7 more variables: AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>,
## # AMT_PC5 <dbl>, AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
# run in parallel
cl <- makeCluster(detectCores())
registerDoParallel(cl)
# (release the workers at the end of the analysis with stopCluster(cl); registerDoSEQ())
age.predictors <- train.imp %>%
  filter(complete.cases(.))
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5)
rpartGrid <- data.frame(maxdepth = seq(2, 10, 1))
rpartFit_ageimputation <- train(x = age.predictors[, -5], # drop AGE itself (column 5), the imputation target
                                y = age.predictors$AGE,
                                method = 'rpart2',
                                trControl = ctrl,
                                tuneGrid = rpartGrid)
## Warning: Setting row names on a tibble is deprecated.
rpartFit_ageimputation
plot(rpartFit_ageimputation)
# Tree depth of 7 is optimal
rpart.plot(rpartFit_ageimputation$finalModel)
#save the model externally to load back in
saveRDS(rpartFit_ageimputation, file = 'rpartFit_ageimputation.rds')
#insert missing age back into the dataset
missing_age <- is.na(train.imp$AGE)
age.predicted <- predict(rpartFit_ageimputation, newdata = train.imp[missing_age,])
train.imp[missing_age, 'AGE'] <- age.predicted
summary(train.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male : 9244 Grad_School: 8192 Married:10507
## 1st Qu.: 50000 Female:13854 Uni :10722 Single :12304
## Median :140000 High_School: 3820 Other : 287
## Mean :167397 Other : 364
## 3rd Qu.:240000
## Max. :750000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median :34.00 Median : -0.393308 Median : 0.17555
## Mean :35.45 Mean : -0.001513 Mean :-0.00187
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.36042
## Max. :69.00 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50825 1st Qu.:-0.42972
## Median : 0.004886 Median :-0.86429 Median :-0.20781
## Mean : 0.000605 Mean : 0.00431 Mean : 0.00124
## 3rd Qu.: 0.093942 3rd Qu.: 0.49754 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13709 1st Qu.: -0.068216 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00396 Mean : 0.004458 Mean : 0.00144
## 3rd Qu.: 0.00325 3rd Qu.: 0.083214 3rd Qu.: 0.02637
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09262 Y: 5580
## Median : -0.00216 Median : -0.04099
## Mean : -0.00159 Mean : -0.00406
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
set.seed(42)
training_rows <- createDataPartition(y = train.imp$default, p=0.8, list=F)
test.imp <- train.imp[-training_rows, ]
train.imp <- train.imp[training_rows, ]
dim(train.imp)
## [1] 18479 16
dim(test.imp)
## [1] 4619 16
map_int(train.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
No missing values remain.
default is the response variable: a binary Y/N variable, and therefore appropriate for our classification problem.
round(prop.table(table(train.imp$default)),2)
##
## N Y
## 0.76 0.24
round(prop.table(table(train.imp$SEX)),2)
##
## Male Female
## 0.4 0.6
round(prop.table(table(train.imp$EDUCATION)),2)
##
## Grad_School Uni High_School Other
## 0.35 0.46 0.17 0.02
round(prop.table(table(train.imp$MARRIAGE)),2)
##
## Married Single Other
## 0.45 0.53 0.01
76% of the dataset did not default and 24% defaulted - a classic imbalanced dataset. We will need to deal with this either prior to modelling or by letting caret resample for us; an explicit version is sketched below.
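caret's downSample() is the explicit version of the sampling = 'down' option passed to trainControl() later; a minimal sketch on the training split:
set.seed(42)
down.train <- downSample(x = train.imp[, setdiff(names(train.imp), 'default')],
                         y = train.imp$default,
                         yname = 'default')
round(prop.table(table(down.train$default)), 2) # classes now balanced at 0.5 / 0.5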
The first step is to look at all the available variables using the ggplot2 framework for visuals.
LIMIT_BAL: in the raw data the limit balance begins at -99, with a median of 140,000, a mean of 167,880 and a max of 1,000,000. AGE shows many outliers beyond 100: it begins at 21 with a median of 34, a mean of 35.65 and a max of 141.
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
p1 <- ggplot(data=train.imp, aes(x=LIMIT_BAL)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip() +
scale_x_continuous(labels=comma)
p2 <- ggplot(data=train.imp, aes(x=AGE)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p1,p2, nrow=1)
ggplot(data=train.imp, aes(x=LIMIT_BAL, y=AGE)) +
geom_jitter(aes(colour=default)) +
coord_flip() +
scale_x_continuous(label = comma)
p3 <- ggplot(data=train.imp, aes(x=PAY_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p4 <- ggplot(data=train.imp, aes(x=PAY_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p5 <- ggplot(data=train.imp, aes(x=PAY_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p3,p4,p5, nrow=1)
p6 <- ggplot(data=train.imp, aes(x=AMT_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p7 <- ggplot(data=train.imp, aes(x=AMT_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p8 <- ggplot(data=train.imp, aes(x=AMT_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p9 <- ggplot(data=train.imp, aes(x=AMT_PC4)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p10 <- ggplot(data=train.imp, aes(x=AMT_PC5)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p11 <- ggplot(data=train.imp, aes(x=AMT_PC6)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p12 <- ggplot(data=train.imp, aes(x=AMT_PC7)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)
#### Categorical Variables
SEX is roughly 40% male and 60% female. EDUCATION is dominated by Uni (46%) and Grad_School (35%), with High_School (17%) and Other (2%) making up the rest. MARRIAGE is mostly Single (53%) and Married (45%), with a small Other group.
get_legend <- function(myggplot) {
  tmp <- ggplot_gtable(ggplot_build(myggplot))
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
  legend <- tmp$grobs[[leg]]
  return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
FUN = function(x) ggplot(data = train.imp) +
aes_string(x=x, fill = 'default') +
geom_bar(position="dodge") +
theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.imp, aes(x=SEX, fill = default)) +
geom_bar())
grid.arrange(p[[1]],p[[2]],p[[3]],
legend, layout_matrix = cbind(c(1,2,3),
c(4,5,3),
c(6,6,6)),
widths=c(3,3,3))
# read in the saved model if it's on hand; if not, run the training code below
# xgbFitD <- readRDS('xgbFitD.rds')
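One way to make that comment executable is a file-existence guard (a sketch; assumes the saved fit sits in the working directory):
if (file.exists('xgbFitD.rds')) {
  xgbFitD <- readRDS('xgbFitD.rds') # reuse the saved fit and skip retraining
}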
# control measures
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     verboseIter = F,
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     sampling = 'down',
                     savePredictions = T)
# grid expansion
xgbGrid <- expand.grid(
  nrounds = seq(100, 200, 1000), # number of boosting rounds; this seq() evaluates to the single value 100
  max_depth = c(2, 6),           # the maximum depth of a tree
  eta = c(0.01, 0.1, 0.5),       # step size shrinkage (smaller is less likely to overfit)
  gamma = 0,                     # minimum loss reduction (larger is more conservative)
  colsample_bytree = c(0.5, 1),  # subsample ratio of columns when constructing each tree
  min_child_weight = 1,          # minimum sum of instance weights (hessian) needed in a child
  subsample = c(0.5, 1)          # subsample ratio of training instances; 0.5 uses half the data
)
xgbFitD <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
PAY_PC2 +
AMT_PC2 +
AMT_PC5 +
EDUCATION +
PAY_PC3 +
AMT_PC7 +
AMT_PC4 +
AMT_PC6 +
AMT_PC3,
train.imp,
method = 'xgbTree',
trControl = ctrl,
tuneGrid = xgbGrid,
metric = "ROC"
)
saveRDS(xgbFitD, file = 'xgbFitD.rds') # file name matched to the readRDS above
print(xgbFitD, details=F)
## eXtreme Gradient Boosting
##
## 18479 samples
## 13 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16632, 16632, 16631, 16631, 16630, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree subsample ROC Sens
## 0.01 2 0.5 0.5 0.7597945 0.7739848
## 0.01 2 0.5 1.0 0.7585042 0.7664497
## 0.01 2 1.0 0.5 0.7547281 0.8184378
## 0.01 2 1.0 1.0 0.7495721 0.8163965
## 0.01 6 0.5 0.5 0.7887507 0.7761964
## 0.01 6 0.5 1.0 0.7895069 0.7809485
## 0.01 6 1.0 0.5 0.7976870 0.8087481
## 0.01 6 1.0 1.0 0.7961267 0.8089479
## 0.10 2 0.5 0.5 0.7834335 0.7660357
## 0.10 2 0.5 1.0 0.7846638 0.7705592
## 0.10 2 1.0 0.5 0.7866898 0.7725436
## 0.10 2 1.0 1.0 0.7871512 0.7801491
## 0.10 6 0.5 0.5 0.7912204 0.7659651
## 0.10 6 0.5 1.0 0.7967229 0.7781240
## 0.10 6 1.0 0.5 0.7934491 0.7655792
## 0.10 6 1.0 1.0 0.8002317 0.7827609
## 0.50 2 0.5 0.5 0.7736018 0.7379077
## 0.50 2 0.5 1.0 0.7816663 0.7559464
## 0.50 2 1.0 0.5 0.7748588 0.7401922
## 0.50 2 1.0 1.0 0.7854065 0.7596715
## 0.50 6 0.5 0.5 0.7325908 0.6821402
## 0.50 6 0.5 1.0 0.7643370 0.7198992
## 0.50 6 1.0 0.5 0.7366725 0.6864931
## 0.50 6 1.0 1.0 0.7680423 0.7217689
## Spec
## 0.6104377
## 0.6140657
## 0.5425613
## 0.5392483
## 0.6805975
## 0.6783149
## 0.6552417
## 0.6502673
## 0.6690842
## 0.6678276
## 0.6724007
## 0.6636607
## 0.6847234
## 0.6812767
## 0.6827077
## 0.6794376
## 0.6777749
## 0.6806444
## 0.6805574
## 0.6811808
## 0.6645189
## 0.6806456
## 0.6680117
## 0.6817234
##
## Tuning parameter 'nrounds' was held constant at a value of 100
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 100, max_depth = 6,
## eta = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 1
## and subsample = 1.
plot(xgbFitD)
xgb.importance(model = xgbFitD$finalModel) %>%
  xgb.ggplot.importance()
densityplot(xgbFitD,pch='|')
predict(xgbFitD,type = 'prob') -> train.ProbsD
histogram(~Y + N, train.ProbsD)
Elastic net ("elastinet" below) - a mixture of ridge and lasso penalties: alpha = 0 is pure ridge, alpha = 1 is pure lasso, and the grid searches both alpha and the penalty strength lambda.
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
glmnetGrid <- expand.grid(.alpha = c(0,.2,.4,.6,.8,1),
.lambda = seq(10^-10,10^-1,0.02))
glmnetFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
trControl=ctrl,
method='glmnet',
tuneGrid = glmnetGrid
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(glmnetFit, file = 'glmnetFit.rds')
glmnetFit
## glmnet
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16632, 16631, 16631, 16631, 16630, 16632, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.0 1e-10 0.7277187 0.6679416 0.6555589
## 0.0 2e-02 0.7278027 0.6659010 0.6567684
## 0.0 4e-02 0.7279900 0.6600361 0.6618759
## 0.0 6e-02 0.7280595 0.6554838 0.6658196
## 0.0 8e-02 0.7280931 0.6521160 0.6685975
## 0.2 1e-10 0.7275294 0.6736923 0.6529174
## 0.2 2e-02 0.7275913 0.6652301 0.6586516
## 0.2 4e-02 0.7271482 0.6560683 0.6650140
## 0.2 6e-02 0.7263258 0.6514166 0.6705244
## 0.2 8e-02 0.7252698 0.6479205 0.6744220
## 0.4 1e-10 0.7275277 0.6750910 0.6513919
## 0.4 2e-02 0.7271131 0.6653734 0.6577088
## 0.4 4e-02 0.7252763 0.6561976 0.6669392
## 0.4 6e-02 0.7225615 0.6508603 0.6671627
## 0.4 8e-02 0.7194290 0.6454661 0.6712390
## 0.6 1e-10 0.7275992 0.6745193 0.6521539
## 0.6 2e-02 0.7260800 0.6619621 0.6579779
## 0.6 4e-02 0.7223813 0.6531431 0.6644299
## 0.6 6e-02 0.7181876 0.6464364 0.6724941
## 0.6 8e-02 0.7147847 0.6374173 0.6801585
## 0.8 1e-10 0.7273519 0.6742201 0.6531386
## 0.8 2e-02 0.7247687 0.6594079 0.6589631
## 0.8 4e-02 0.7192135 0.6498753 0.6677442
## 0.8 6e-02 0.7144486 0.6383590 0.6794395
## 0.8 8e-02 0.7033890 0.6223331 0.6768873
## 1.0 1e-10 0.7275313 0.6759043 0.6500482
## 1.0 2e-02 0.7235773 0.6616630 0.6574410
## 1.0 4e-02 0.7169133 0.6487198 0.6695377
## 1.0 6e-02 0.7070203 0.6302829 0.6766215
## 1.0 8e-02 0.6907202 0.6120300 0.6684638
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.08.
densityplot(glmnetFit, pch='|')
plot(varImp(glmnetFit),15,main='Elastinet Model')
predict(glmnetFit, type = 'prob') -> train.glmnet.Probs
histogram(~Y + N, train.glmnet.Probs)
k-nearest neighbours - expected to be the weakest of the candidate models.
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
knnGrid <- expand.grid(k=seq(3,23,2))
knnFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
method = 'knn',
trControl = ctrl,
tuneGrid = knnGrid
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(knnFit, file = 'knnFit.rds')
knnFit
## k-Nearest Neighbors
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16631, 16631, 16632, 16631, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 3 0.6896864 0.6966531 0.6066241
## 5 0.7069371 0.7267068 0.5935506
## 7 0.7111725 0.7438599 0.5825244
## 9 0.7150697 0.7600292 0.5735652
## 11 0.7176105 0.7689184 0.5653230
## 13 0.7188659 0.7763688 0.5599450
## 15 0.7196457 0.7812484 0.5542553
## 17 0.7198720 0.7859155 0.5549256
## 19 0.7207763 0.7881274 0.5490135
## 21 0.7201811 0.7908104 0.5466847
## 23 0.7196569 0.7920513 0.5412158
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
plot(knnFit)
densityplot(knnFit,pch='|')
predict(knnFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
svmFit <- train(
default~
PAY_PC1 +
AGE +
LIMIT_BAL +
AMT_PC1 +
EDUCATION +
PAY_PC2 +
PAY_PC3 +
AMT_PC2 +
AMT_PC7,
train.imp,
method = 'svmRadial',
trControl = ctrl,
tuneGrid = expand.grid(C=c(0.05,0.1,0.2,0.3), sigma=c(0.001,0.005,0.01,0.015))
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(svmFit, file = 'svmFit.rds')
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 18479 samples
## 9 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16631, 16631, 16631, 16632, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## C sigma ROC Sens Spec
## 0.05 0.001 0.7265791 0.5840472 0.7300963
## 0.05 0.005 0.7305104 0.6621624 0.6605688
## 0.05 0.010 0.7345025 0.6683700 0.6599393
## 0.05 0.015 0.7379832 0.6864644 0.6537131
## 0.10 0.001 0.7276004 0.6501473 0.6671989
## 0.10 0.005 0.7324205 0.6674142 0.6584644
## 0.10 0.010 0.7380494 0.6955119 0.6481600
## 0.10 0.015 0.7425362 0.7190735 0.6387507
## 0.20 0.001 0.7280246 0.6516168 0.6676480
## 0.20 0.005 0.7352825 0.6875067 0.6508901
## 0.20 0.010 0.7424370 0.7232112 0.6359290
## 0.20 0.015 0.7472045 0.7412357 0.6285832
## 0.30 0.001 0.7285948 0.6544702 0.6644677
## 0.30 0.005 0.7372719 0.6984234 0.6451551
## 0.30 0.010 0.7448928 0.7349714 0.6312694
## 0.30 0.015 0.7500729 0.7465300 0.6263879
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.015 and C = 0.3.
plot(svmFit)
densityplot(svmFit, pch='|')
predict(svmFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = T,
sampling = 'down'
)
RFFit <- train(
default~
PAY_PC1 +
LIMIT_BAL +
AGE +
AMT_PC1 +
AMT_PC2 +
AMT_PC3 +
AMT_PC4 +
AMT_PC5 +
AMT_PC6 +
AMT_PC7 +
PAY_PC2 +
PAY_PC3,
train.imp,
method = 'rf',
trControl = ctrl
)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was
## not in the result set. ROC will be used instead.
saveRDS(RFFit, file = 'RFFit.rds')
RFFit
## Random Forest
##
## 18479 samples
## 12 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 16630, 16631, 16631, 16631, 16631, 16631, ...
## Addtional sampling using down-sampling
##
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7689348 0.7414769 0.6753137
## 7 0.7732522 0.7488111 0.6704772
## 12 0.7719981 0.7486116 0.6686388
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 7.
plot(RFFit)
densityplot(RFFit,pch='|')
plot(varImp(RFFit),15,main='RF - Variable Importance')
predict(RFFit, type = 'prob') -> train.Probs
histogram(~Y+N, train.Probs)
# compare resampled performance across the five models
re <-
resamples(x = list(
xgb = xgbFitD,
knn = knnFit,
elastinet = glmnetFit,
svm = svmFit,
rf = RFFit
))
dotplot(re)
bwplot(re)
difValues <- diff(re)
dotplot(difValues)
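caret also provides a tabular summary of these pairwise differences, which complements the dotplot:
summary(difValues) # estimated differences (upper diagonal) and p-values (lower diagonal)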
test.imp
## # A tibble: 4,619 x 16
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2 PAY_PC3
## <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 20000 Fema~ Uni Married 24 0.477 -3.22 0.145
## 2 50000 Fema~ Uni Married 37 -0.393 0.176 0.00489
## 3 50000 Male Grad_Sch~ Single 37 -0.393 0.176 0.00489
## 4 140000 Fema~ High_Sch~ Married 28 -1.10 0.00576 -0.948
## 5 250000 Male Grad_Sch~ Single 29 -0.393 0.176 0.00489
## 6 50000 Male High_Sch~ Single 23 -0.393 0.176 0.00489
## 7 50000 Fema~ High_Sch~ Married 47 1.71 0.268 -0.140
## 8 230000 Fema~ Grad_Sch~ Single 27 1.71 0.268 -0.140
## 9 500000 Male Grad_Sch~ Married 58 3.81 0.361 -0.284
## 10 50000 Male Grad_Sch~ Single 25 2.18 -1.82 0.560
## # ... with 4,609 more rows, and 8 more variables: AMT_PC1 <dbl>,
## # AMT_PC2 <dbl>, AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>,
## # AMT_PC6 <dbl>, AMT_PC7 <dbl>, default <fct>
summary(test.imp)
## LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 0 Male :1849 Grad_School:1687 Married:2104
## 1st Qu.: 50000 Female:2770 Uni :2145 Single :2454
## Median :140000 High_School: 729 Other : 61
## Mean :168275 Other : 58
## 3rd Qu.:240000
## Max. :750000
## AGE PAY_PC1 PAY_PC2
## Min. :21.00 Min. :-11.85168 Min. :-4.42243
## 1st Qu.:28.00 1st Qu.: -0.39331 1st Qu.:-0.26901
## Median :34.00 Median : -0.39331 Median : 0.17555
## Mean :35.43 Mean : 0.01748 Mean :-0.01418
## 3rd Qu.:42.00 3rd Qu.: 1.36957 3rd Qu.: 0.26834
## Max. :69.00 Max. : 3.81335 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-2.705913 Min. :-1.970628 Min. :-3.98807
## 1st Qu.:-0.283941 1st Qu.:-1.531260 1st Qu.:-0.43463
## Median : 0.004886 Median :-0.898807 Median :-0.20994
## Mean : 0.011796 Mean :-0.005837 Mean :-0.01761
## 3rd Qu.: 0.110511 3rd Qu.: 0.448394 3rd Qu.: 0.05934
## Max. : 3.192124 Max. :19.598795 Max. :38.06379
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-29.377842 Min. :-15.759742 Min. :-25.10246
## 1st Qu.: -0.136882 1st Qu.: -0.064640 1st Qu.: -0.07890
## Median : -0.070445 Median : 0.018313 Median : -0.03200
## Mean : -0.016652 Mean : -0.004938 Mean : 0.00689
## 3rd Qu.: 0.006755 3rd Qu.: 0.080747 3rd Qu.: 0.02671
## Max. : 15.310976 Max. : 10.520814 Max. : 13.34686
## AMT_PC6 AMT_PC7 default
## Min. :-17.599251 Min. :-15.64263 N:3503
## 1st Qu.: -0.040710 1st Qu.: -0.09164 Y:1116
## Median : -0.002781 Median : -0.04212
## Mean : 0.000511 Mean : -0.01809
## 3rd Qu.: 0.065534 3rd Qu.: 0.02653
## Max. : 15.582944 Max. : 13.27408
map_int(test.imp,~sum(is.na(.x)))
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1 PAY_PC2
## 0 0 0 0 0 0 0
## PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5 AMT_PC6
## 0 0 0 0 0 0 0
## AMT_PC7 default
## 0 0
test.imp <- test.imp[complete.cases(test.imp), ]
xgbPred <- predict(object = xgbFitD, newdata = test.imp)
elastinetPred <- predict(object = glmnetFit, newdata = test.imp)
knnPred <- predict(object = knnFit, newdata = test.imp)
svmPred <- predict(object = svmFit, newdata = test.imp)
rfPred <- predict(object = RFFit, newdata = test.imp)
xtab <- table(xgbPred, test.imp$default)
xgbCM <- caret::confusionMatrix(xtab)
xtab <- table(elastinetPred, test.imp$default)
elastinetCM <- caret::confusionMatrix(xtab)
xtab <- table(knnPred, test.imp$default)
knnCM <- caret::confusionMatrix(xtab)
xtab <- table(svmPred, test.imp$default)
svmCM <- caret::confusionMatrix(xtab)
xtab <- table(rfPred, test.imp$default)
rfCM <- caret::confusionMatrix(xtab)
CM_list <- list(xgbCM, elastinetCM, knnCM, svmCM, rfCM)
compiled_results <- tibble(
models = c('xgb','elastinet','knn','svm', 'rf'),
accuracy = map_dbl(CM_list,~.x$overall[1]),
kappa = map_dbl(CM_list,~.x$overall[2]),
sensitivity = map_dbl(CM_list,~.x$byClass[1]),
specificity = map_dbl(CM_list,~.x$byClass[2]),
F1 = map_dbl(CM_list,~.x$byClass[7])
)
compiled_results %>% arrange(accuracy,kappa)
## # A tibble: 5 x 6
## models accuracy kappa sensitivity specificity F1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 elastinet 0.659 0.260 0.656 0.670 0.745
## 2 svm 0.731 0.351 0.761 0.636 0.811
## 3 rf 0.736 0.379 0.752 0.687 0.812
## 4 knn 0.738 0.336 0.790 0.572 0.820
## 5 xgb 0.767 0.433 0.788 0.699 0.837
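ROCR is loaded at the top but unused above; a sketch of an ROC/AUC check for the strongest model (xgb) on the hold-out set, assuming the fitted objects are still in scope:
xgb.test.probs <- predict(xgbFitD, newdata = test.imp, type = 'prob')
rocr.pred <- prediction(xgb.test.probs$Y, test.imp$default, label.ordering = c('N', 'Y'))
plot(performance(rocr.pred, 'tpr', 'fpr'), main = 'ROC - xgb (hold-out)')
unlist(performance(rocr.pred, 'auc')@y.values) # area under the ROC curve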
library(ggrepel)
dotplot(reorder(models,accuracy)~accuracy,compiled_results, main = 'Accuracy (Test Set Performance)')
ggplot(compiled_results, aes(F1, kappa)) +
  geom_point(color = 'blue', shape = 1) +
  geom_text_repel(aes(label = models),
                  box.padding = unit(1, 'lines'),
                  max.iter = 1e2, segment.size = .3,
                  force = 1) +
  theme_bw() +
  labs(x = 'F1', y = 'Kappa', title = 'Kappa vs F1 (Test Set Performance)')
Finally, the prediction set is imported and scored (the chunk itself is not echoed; its output follows).
## Parsed with column specification:
## cols(
## ID = col_integer(),
## LIMIT_BAL = col_integer(),
## SEX = col_integer(),
## EDUCATION = col_integer(),
## MARRIAGE = col_integer(),
## AGE = col_integer(),
## PAY_PC1 = col_double(),
## PAY_PC2 = col_double(),
## PAY_PC3 = col_double(),
## AMT_PC1 = col_double(),
## AMT_PC2 = col_double(),
## AMT_PC3 = col_double(),
## AMT_PC4 = col_double(),
## AMT_PC5 = col_double(),
## AMT_PC6 = col_double(),
## AMT_PC7 = col_double()
## )
## ID LIMIT_BAL SEX EDUCATION
## Min. : 5 Min. : 10000 Male :2641 Grad_School:2393
## 1st Qu.: 7532 1st Qu.: 50000 Female:4258 Uni :3306
## Median :15062 Median :140000 High_School:1096
## Mean :15065 Mean :166217 Other : 104
## 3rd Qu.:22624 3rd Qu.:230000
## Max. :29989 Max. :800000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Married:3149 Min. :21.00 Min. :-13.302028 Min. :-4.110567
## Single :3660 1st Qu.:28.00 1st Qu.: -0.393308 1st Qu.:-0.210711
## Other : 90 Median :34.00 Median : -0.393308 Median : 0.175555
## Mean :35.47 Mean : 0.005547 Mean : 0.005926
## 3rd Qu.:41.00 3rd Qu.: 1.360047 3rd Qu.: 0.361123
## Max. :79.00 Max. : 3.813348 Max. : 4.967609
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.341166 Min. :-2.06507 Min. :-3.535521
## 1st Qu.:-0.283941 1st Qu.:-1.50940 1st Qu.:-0.424712
## Median : 0.004886 Median :-0.85844 Median :-0.211871
## Mean :-0.002183 Mean :-0.01544 Mean :-0.004575
## 3rd Qu.: 0.057637 3rd Qu.: 0.48720 3rd Qu.: 0.067081
## Max. : 3.176797 Max. :19.26397 Max. :28.783658
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-8.19007 Min. :-19.17146 Min. :-24.108569
## 1st Qu.:-0.13215 1st Qu.: -0.06678 1st Qu.: -0.082701
## Median :-0.07044 Median : 0.01664 Median : -0.032000
## Mean :-0.01284 Mean : -0.01546 Mean : -0.004962
## 3rd Qu.:-0.00122 3rd Qu.: 0.07733 3rd Qu.: 0.020652
## Max. :17.88261 Max. : 15.07758 Max. : 12.649943
## AMT_PC6 AMT_PC7
## Min. :-19.077508 Min. :-25.90403
## 1st Qu.: -0.040660 1st Qu.: -0.09017
## Median : -0.001886 Median : -0.04039
## Mean : 0.006764 Mean : 0.01371
## 3rd Qu.: 0.069230 3rd Qu.: 0.02954
## Max. : 14.222570 Max. : 18.18085
## # A tibble: 6,899 x 2
## ID default
## <int> <chr>
## 1 5 1
## 2 17 1
## 3 19 0
## 4 23 1
## 5 27 0
## 6 30 0
## 7 32 1
## 8 34 0
## 9 37 0
## 10 38 0
## # ... with 6,889 more rows
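The scoring chunk itself is not shown above; a hypothetical reconstruction, assuming the prediction set receives the same recoding as the training data, the xgb fit does the scoring, and Y/N is mapped to 1/0 for submission (the output file name is illustrative):
predict.clean <- predict.raw %>%
  mutate(EDUCATION = replace(EDUCATION, EDUCATION %in% c(0, 5, 6), 4),
         MARRIAGE  = replace(MARRIAGE, MARRIAGE == 0, 3),
         SEX       = factor(SEX, levels = 1:2, labels = c('Male', 'Female')),
         EDUCATION = factor(EDUCATION, levels = 1:4,
                            labels = c('Grad_School', 'Uni', 'High_School', 'Other')),
         MARRIAGE  = factor(MARRIAGE, levels = 1:3,
                            labels = c('Married', 'Single', 'Other')),
         AGE       = as.numeric(AGE))
submission <- tibble(ID = predict.clean$ID,
                     default = ifelse(predict(xgbFitD, newdata = predict.clean) == 'Y', '1', '0'))
write_csv(submission, 'AT2_credit_predictions.csv') # hypothetical file name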