caret package for MLlibrary(tidyverse, quietly = T)
library(dplyr, quietly = T) #used by Caret
library(ggplot2, quietly = T)
library(corrplot, quietly =T)
library(caret, quietly = T)
library(Amelia, quietly = T)
library(gridExtra, quietly = T)
setwd("~/Documents/brookstruong")
train.raw <- read_csv("AT2_credit_train_STUDENT.csv")
test.raw <- read_csv("AT2_credit_test_STUDENT.csv")
23,101 observations over 17 variables
glimpse(train.raw)
## Observations: 23,101
## Variables: 17
## $ ID <int> 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ LIMIT_BAL <dbl> 20000, 120000, 90000, 50000, 50000, 500000, 100000, ...
## $ SEX <int> 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2...
## $ EDUCATION <int> 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 3, 1, 1, 3...
## $ MARRIAGE <int> 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2...
## $ AGE <int> 24, 26, 34, 37, 37, 29, 23, 28, 35, 34, 51, 41, 30, ...
## $ PAY_PC1 <dbl> 0.47746305, -1.46161240, -0.39330764, -0.39330764, -...
## $ PAY_PC2 <dbl> -3.224589961, 0.853866590, 0.175554996, 0.175554996,...
## $ PAY_PC3 <dbl> 0.145040802, -0.360863449, 0.004885522, 0.004885522,...
## $ AMT_PC1 <dbl> -1.7522077, -1.6633432, -1.1348380, -0.3971748, -0.3...
## $ AMT_PC2 <dbl> -0.22434761, -0.14388558, -0.17660872, -0.45109777, ...
## $ AMT_PC3 <dbl> -0.077841055, -0.054600404, 0.015954854, -0.09978950...
## $ AMT_PC4 <dbl> 0.006957244, -0.002851947, -0.129071306, -0.03533896...
## $ AMT_PC5 <dbl> -0.041356958, 0.043889122, 0.098245278, -0.055306582...
## $ AMT_PC6 <dbl> 0.0008865935, -0.0261897987, -0.0223825102, 0.050465...
## $ AMT_PC7 <dbl> -0.05626505, -0.09997756, -0.06898686, -0.02820475, ...
## $ default <chr> "Y", "Y", "N", "N", "N", "N", "N", "Y", "N", "N", "N...
train.raw$default <- as.factor(train.raw$default)
train.raw$SEX <- as.factor(train.raw$SEX)
train.raw$EDUCATION <- as.factor(train.raw$EDUCATION)
train.raw$MARRIAGE <- as.factor(train.raw$MARRIAGE)
summary(train.raw)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE
## Min. : 1 Min. : -99 1 : 9244 0: 11 0: 38
## 1st Qu.: 7489 1st Qu.: 50000 2 :13854 1: 8192 1:10510
## Median :14987 Median : 140000 NA's: 3 2:10724 2:12304
## Mean :14981 Mean : 167524 3: 3821 3: 249
## 3rd Qu.:22452 3rd Qu.: 240000 4: 88
## Max. :30000 Max. :1000000 5: 229
## 6: 36
## AGE PAY_PC1 PAY_PC2
## Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean : 35.7 Mean : -0.001656 Mean :-0.00177
## 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36112
## Max. :141.0 Max. : 3.813348 Max. : 5.44103
##
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50827 1st Qu.:-0.42961
## Median : 0.004886 Median :-0.86433 Median :-0.20780
## Mean : 0.000652 Mean : 0.00461 Mean : 0.00137
## 3rd Qu.: 0.093942 3rd Qu.: 0.49766 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
##
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13710 1st Qu.: -0.068199 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00383 Mean : 0.004618 Mean : 0.00148
## 3rd Qu.: 0.00325 3rd Qu.: 0.083236 3rd Qu.: 0.02644
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
##
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09273 Y: 5583
## Median : -0.00216 Median : -0.04099
## Mean : -0.00202 Mean : -0.00409
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
##
Looking at the dataset and how the read_csv imported the csv, we can understand that the date is imported as a
train.raw %>%
arrange(AGE>80) %>%
ggplot(aes(x=AGE)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot
## function (data = NULL, mapping = aes(), ..., environment = parent.frame())
## {
## UseMethod("ggplot")
## }
## <bytecode: 0x7feb39707070>
## <environment: namespace:ggplot2>
Missing data = 3NA for sex.
cplot <- train.raw %>%
select(-default) %>%
select_if(is.numeric)
M <- cor(cplot)
p.mat <- cor.mtest(M)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(M,
method = "color",
order= "hclust",
type="full",
col=col(200),
diag =F,
title="Correlation of Numeric Variables",
addCoef.col = "black",
sig.level = 0.05,
insig ="blank",
mar=c(0,0,3,0))
train.raw %>%
filter(SEX == "NA")
## # A tibble: 0 x 17
## # ... with 17 variables: ID <int>, LIMIT_BAL <dbl>, SEX <fct>,
## # EDUCATION <fct>, MARRIAGE <fct>, AGE <int>, PAY_PC1 <dbl>,
## # PAY_PC2 <dbl>, PAY_PC3 <dbl>, AMT_PC1 <dbl>, AMT_PC2 <dbl>,
## # AMT_PC3 <dbl>, AMT_PC4 <dbl>, AMT_PC5 <dbl>, AMT_PC6 <dbl>,
## # AMT_PC7 <dbl>, default <fct>
set.seed(42)
training_rows <- createDataPartition(y = train.raw$default, p=0.7, list=F)
test.raw <- train.raw %>% filter(!(rownames(.) %in% training_rows))
train.raw <- train.raw %>% filter(rownames(.) %in% training_rows)
dim(train.raw)
## [1] 16172 17
## [1] 6929 17
missmap(train.raw, main='Missing Values Analysis using Amelia ordered by % missing', col=c('red', 'gray'), legend =F, rank.order = T)
## Warning in if (class(obj) == "amelia") {: the condition has length > 1 and
## only the first element will be used
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.
map_int(train.raw,~sum(is.na(.x)))
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_PC1
## 0 0 2 0 0 0 0
## PAY_PC2 PAY_PC3 AMT_PC1 AMT_PC2 AMT_PC3 AMT_PC4 AMT_PC5
## 0 0 0 0 0 0 0
## AMT_PC6 AMT_PC7 default
## 0 0 0
Sex is missing 2 values.
default is the response variable. The response variable is a Yes/No boolean variable therefor is appropriate for our classification problem.
round(prop.table(table(train.raw$default)),2)
##
## N Y
## 0.76 0.24
76% of the dataset do not default and 24% have defaulted.
First step is to look at all variables available using the ggplot2 framework for visuals.
LIMIT_BAL The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.AGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.p1 <- ggplot(data=train.raw, aes(x=LIMIT_BAL)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p2 <- ggplot(data=train.raw, aes(x=AGE)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p1,p2, nrow=1)
p3 <- ggplot(data=train.raw, aes(x=PAY_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p4 <- ggplot(data=train.raw, aes(x=PAY_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p5 <- ggplot(data=train.raw, aes(x=PAY_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p3,p4,p5, nrow=1)
p6 <- ggplot(data=train.raw, aes(x=AMT_PC1)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p7 <- ggplot(data=train.raw, aes(x=AMT_PC2)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p8 <- ggplot(data=train.raw, aes(x=AMT_PC3)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p9 <- ggplot(data=train.raw, aes(x=AMT_PC4)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p10 <- ggplot(data=train.raw, aes(x=AMT_PC5)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p11 <- ggplot(data=train.raw, aes(x=AMT_PC6)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
p12 <- ggplot(data=train.raw, aes(x=AMT_PC7)) +
geom_histogram(aes(fill=default), bins = 40) +
coord_flip()
grid.arrange(p6,p7,p8,p9,p10,p11,p12, nrow=2)
#### Categorical Variables
SEX The limit Balance beings at -99 with a median of 140,000, mean of 167880 and max of 1,000,000.EDUCATION certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.MARRIAGE certainly shows many outliers beyond the 100+ range. Age begins at 21 with a median of 34, mean of 35.65 and max of 141.get_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
p <- lapply(X = c('SEX', 'EDUCATION', 'MARRIAGE'),
FUN = function(x) ggplot(data = train.raw) +
aes_string(x=x, fill = 'default') +
geom_bar(position="dodge") +
theme(legend.position="none"))
legend <- get_legend(ggplot(data = train.raw, aes(x=SEX, fill = default)) +
geom_bar())
grid.arrange(p[[1]],p[[2]],p[[3]],
legend, layout_matrix = cbind(c(1,2,3),
c(4,5,3),
c(6,6,6)),
widths=c(3,3,1))
summary(train.raw$SEX)
## 1 2 NA's
## 6461 9709 2
#train.imp <- train.raw
#train.imp$SEX[is.na(train.imp$SEX)] <- '3'
#what to do with the 2 NAs?'
repeats = 5,
verboseIter = F,
classProbs = TRUE,
summaryFunction = #twoClassSummary,
# sampling = 'down',
savePredictions = T
)
xgbGrid <- expand.grid( nrounds = seq(14,24,2), max_depth = seq(2,8,2), eta=c(0.1,0.2,0.3), gamma=1, colsample_bytree=1, min_child_weight=1, subsample=1 )
default~., train.imp, method = ‘xgbTree’, trcontrol = ctrl, tuneGrid = xgbGrid )
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.