1. Introduction

# set visualization theme
library(plyr)
library(tidyverse)
library(GGally)

my_theme <- function(base_size = 10, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
    theme(
      axis.text = element_text(size = 10),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size = 10),
      panel.grid.major = element_line(color = "grey"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "#f7fdff"),
      strip.background = element_rect(fill = "#001d60", color = "#00113a", size = 0.7),
      strip.text = element_text(face = "bold", size = 10, color = "white"),
      legend.position = "right",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}

theme_set(my_theme())
# read the data
library(readr)
train <- read_csv('train.csv')
test <- read_csv('test.csv')

# change the column names to English
column_names <- c('id','target','sex','car','realestate','kid','income','income_type',
                  'edu','marriage','house_type','res_prop','mobile','work_mobile','email',
                  'job','family','industry','age','work_year','member_year')

colnames(train) <- column_names
colnames(test) <- column_names[-2]

# change binary to factor
recode_factors <- function(data, columns) {
  for (col in columns) {
    data[[col]] <- recode_factor(data[[col]], `0` = 'No', `1` = 'Yes')
  }
  return(data)
}



columns_to_recode <- c('target','car','realestate','mobile','work_mobile','email','job')

train <- recode_factors(train,columns_to_recode)
test <- recode_factors(test,columns_to_recode[-1])

# explore data
knitr::kable(head(train))
| id | target | sex | car | realestate | kid | income | income_type | edu | marriage | house_type | res_prop | mobile | work_mobile | email | job | family | industry | age | work_year | member_year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TRAIN_00000 | No | 여성 | Yes | Yes | 2 | 18054000 | 연금수령자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004960 | Yes | No | No | Unknown | 4 | 기타 1 | 39 | 1000 | 23 |
| TRAIN_00001 | No | 남성 | Yes | No | 0 | 59472000 | 근로자 | 대학교 졸업 이상 | 기혼 | 주택 / 아파트 | 0.018029 | Yes | Yes | No | 기술직 | 2 | 사업 1 | 45 | 4 | 16 |
| TRAIN_00002 | No | 여성 | No | Yes | 0 | 29736000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.010500 | Yes | Yes | No | 단순 노동자 | 2 | 사업 0 | 32 | 3 | 9 |
| TRAIN_00003 | No | 여성 | Yes | No | 1 | 38232000 | 기타 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004849 | Yes | Yes | No | Unknown | 3 | 산업 4 | 34 | 6 | 12 |
| TRAIN_00004 | No | 여성 | No | Yes | 0 | 26550000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.025164 | Yes | Yes | No | Unknown | 2 | 사업 2 | 38 | 0 | 4 |
| TRAIN_00005 | No | 여성 | Yes | No | 0 | 21240000 | 근로자 | 고등학교 졸업 | 미혼 | 아파트 임대 | 0.018634 | Yes | Yes | No | 핵심 노동자 | 1 | 무역 0 | 30 | 8 | 1 |

2. Data Exploration

Hmisc::describe(train)
## train 
## 
##  21  Variables      60000  Observations
## --------------------------------------------------------------------------------
## id 
##        n  missing distinct 
##    60000        0    60000 
## 
## lowest : TRAIN_00000 TRAIN_00001 TRAIN_00002 TRAIN_00003 TRAIN_00004
## highest: TRAIN_59995 TRAIN_59996 TRAIN_59997 TRAIN_59998 TRAIN_59999
## --------------------------------------------------------------------------------
## target 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  53572  6428
## Proportion 0.893 0.107
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##    60000        0        3 
##                                   
## Value         기타    남성    여성
## Frequency      1   20265   39734  
## Proportion 0.000   0.338   0.662  
## --------------------------------------------------------------------------------
## car 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  39749 20251
## Proportion 0.662 0.338
## --------------------------------------------------------------------------------
## realestate 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  17482 42518
## Proportion 0.291 0.709
## --------------------------------------------------------------------------------
## kid 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       11    0.641   0.4121   0.6297        0        0 
##      .25      .50      .75      .90      .95 
##        0        0        1        2        2 
##                                                                             
## Value          0     1     2     3     4     5     6     7    11    14    19
## Frequency  42317 11638  5229   701    87    19     5     1     1     1     1
## Proportion 0.705 0.194 0.087 0.012 0.001 0.000 0.000 0.000 0.000 0.000 0.000
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## income 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0      817    0.995 39836994 21848829 15930000 19116000 
##      .25      .50      .75      .90      .95 
## 26550000 35046000 47790000 63720000 79650000 
## 
## lowest :    6265800    6372000    6510060    6584400    6690600
## highest:  531000000  691486254  902700000 1062000000 2124000000
## --------------------------------------------------------------------------------
## income_type 
##        n  missing distinct 
##    60000        0        7 
##                                                                             
## Value          공무원     근로자       기타     사업가     실직자 연금수령자
## Frequency     4146      30693      13879          2          4      11274   
## Proportion   0.069      0.512      0.231      0.000      0.000      0.188   
##                      
## Value            학생
## Frequency        2   
## Proportion   0.000   
## --------------------------------------------------------------------------------
## edu 
##        n  missing distinct 
##    60000        0        4 
##                                                                               
## Value         고등학교 졸업 대학교 졸업 이상      대학교 중퇴         저학력자
## Frequency       42910            14459             1883              748      
## Proportion      0.715            0.241            0.031            0.012      
## --------------------------------------------------------------------------------
## marriage 
##        n  missing distinct 
##    60000        0        5 
##                                                   
## Value         기혼    미혼    별거    사별  사실혼
## Frequency  38795    8300    3770    3307    5828  
## Proportion 0.647   0.138   0.063   0.055   0.097  
## --------------------------------------------------------------------------------
## house_type 
##        n  missing distinct 
##    60000        0        4 
##                                                                   
## Value           공공분양   아파트 임대      오피스텔 주택 / 아파트
## Frequency       2352          1080           526         56042    
## Proportion     0.039         0.018         0.009         0.934    
## --------------------------------------------------------------------------------
## res_prop 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       80    0.999  0.02088  0.01447 0.005002 0.006671 
##      .25      .50      .75      .90      .95 
## 0.010006 0.018850 0.028663 0.035792 0.046220 
## 
## lowest : 0.000533 0.000938 0.001276 0.001333 0.001417
## highest: 0.031329 0.032561 0.035792 0.04622  0.072508
## --------------------------------------------------------------------------------
## mobile 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency      1 59999
## Proportion     0     1
## --------------------------------------------------------------------------------
## work_mobile 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  11278 48722
## Proportion 0.188 0.812
## --------------------------------------------------------------------------------
## email 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  56619  3381
## Proportion 0.944 0.056
## --------------------------------------------------------------------------------
## job 
##        n  missing distinct 
##    60000        0       19 
## 
## lowest : Unknown          기술직           단순 노동자      핵심 노동자      운전자          
## highest: 비서             보안 업계 종사자 부동산중개업자   IT 업계 종사자   인사 담당자     
## --------------------------------------------------------------------------------
## family 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       12    0.842    2.156   0.9257        1        1 
##      .25      .50      .75      .90      .95 
##        2        2        3        3        4 
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    13    15
## Frequency  12940 31315 10139  4839   659    82    17     5     1     1     1
## Proportion 0.216 0.522 0.169 0.081 0.011 0.001 0.000 0.000 0.000 0.000 0.000
##                 
## Value         20
## Frequency      1
## Proportion 0.000
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## industry 
##        n  missing distinct 
##    60000        0       56 
## 
## lowest : 건설      경찰      광고      국가 안보 군대     , highest: 주택      통신      학교      호텔      환경     
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       49    0.999    44.02    13.66       26       28 
##      .25      .50      .75      .90      .95 
##       34       43       54       61       63 
## 
## lowest : 21 22 23 24 25, highest: 65 66 67 68 69
## --------------------------------------------------------------------------------
## work_year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       49     0.99    192.9    307.6        0        1 
##      .25      .50      .75      .90      .95 
##        2        6       16     1000     1000 
## 
## lowest :    0    1    2    3    4, highest:   44   45   46   48 1000
## --------------------------------------------------------------------------------
## member_year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       55    0.999    13.27    10.98        0        1 
##      .25      .50      .75      .90      .95 
##        5       12       20       27       31 
## 
## lowest :  0  1  2  3  4, highest: 50 52 56 57 58
## --------------------------------------------------------------------------------

Missing Values

  • Let's check whether the dataset contains any missing values.
mycolors <- c("#ff003f", "#0094ff", "#ae00ff", "#94ff00", "#ffc700", "#fc1814")

# function to visualize the missing values per column
bar_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%          # logical matrix of missingness
    melt %>%           # long format: Var2 holds the column name
    ggplot(data = .,
           aes(x = Var2)) +
    geom_bar(aes(y = after_stat(count), fill = value), alpha = 0.8) +
    scale_fill_brewer(palette = 'Set1', name = "", labels = c("Available", "Missing")) +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    labs(x = "Variables in Dataset",
         y = "Observations") +
    coord_flip() +
    theme(legend.position = "right")
}
# explore missing values 
bar_missing(train)

  • It seems that there are no missing values in our dataset.
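
  • As a quick numeric cross-check of the plot, base R can count missing cells directly:

sum(is.na(train))   # returns 0 when there are no missing cells anywhere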

Dependent Variable (Overdue)

train %>% ggplot(aes(x=target,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
  geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Dependent Variable (Default) Distribution')

  • The dataset is imbalanced, so we will need to oversample or undersample it before fitting classification models.
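
  • For reference, the class ratio can be quantified directly from the target frequencies reported above:

# roughly 89.3% 'No' vs 10.7% 'Yes'
prop.table(table(train$target))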

Categorical Variable

categorical_column1 <- c(2,3,4,5,8,9,10,11,13,14,15)
categorical_column2 <- c(2,16,18)


train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'fill',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Proportion')

train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Count')

  • We will drop the mobile column, since nearly every observation has a mobile phone and the variable carries almost no information (see the quick check below).
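
  • A quick frequency check, matching the Hmisc::describe output above:

table(train$mobile)   # 1 'No' vs 59,999 'Yes': the column is nearly constant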

train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Count')

train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'fill',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Proportion')

  • The job and industry variables have too many categories (see the quick count below). We will revise these later.
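
  • Counting the distinct levels confirms this:

# 19 job categories and 56 industry categories, per the describe() output above
c(job = dplyr::n_distinct(train$job), industry = dplyr::n_distinct(train$industry))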

Numerical Variable

numerical_column <- c(2,6,7,12,17,19,20,21)

# density plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_density(alpha=0.8) +
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title = 'Density Plot for Numerical Variable')

# box plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
  ggplot(aes(x=target,y=value,fill=target))+
  geom_boxplot(alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title = 'Box Plot for Numerical Variable')

  • The work_year column (continuous years worked in the current occupation) contains a value of 1,000, which is physically impossible. Let's see how many observations claim 1,000 years of continuous work experience.

paste('The number of observations with work_year = 1000 is:', sum(train[,'work_year']==1000))
## [1] "The number of observations with work_year = 1000 is: 11275"
  • Since this affects more than 11,000 observations, we cannot simply remove them all.

  • Let's divide the training dataset into two parts, one with work_year < 1000 and one with work_year = 1000, and investigate what this figure means.

train_no  <- train %>% filter(work_year < 1000)
train_yes <- train %>% filter(work_year == 1000)

p1 <- train_no[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=features))+
  geom_bar(alpha=0.8,color='black')+
  facet_wrap(~features,scales='free',ncol=2)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title='Work Year < 1000')

p2 <- train_yes[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=features))+
  geom_bar(alpha=0.8,color='black')+
  facet_wrap(~features,scales='free',ncol=2)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title='Work Year = 1000')

library(gridExtra)
grid.arrange(p1,p2)

  • People with work_year = 1000 show a negatively skewed age distribution compared to the normal work_year values, and for the work_year < 1000 group the distribution of work_year does not mirror that of age. This might indicate that work_year = 1000 marks people who have never changed jobs and have kept working at the same workplace. Since the test data also includes the value 1,000, we will not change it.
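
  • If we instead wanted to make the sentinel explicit for a model, a minimal sketch (not applied here; never_changed_job is a hypothetical column name) would add an indicator flag:

# hypothetical alternative, not used in this analysis:
# flag the sentinel so models can treat work_year = 1000 separately
train_alt <- train %>%
  mutate(never_changed_job = if_else(work_year == 1000, 'Yes', 'No'))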

3. Data Cleaning

# drop the mobile column (column 13), as decided in the EDA
train <- train[,-13]

# drop the single observation with sex = '기타' ("other")
train <-  train[train$sex!='기타',]

# collapse income_type into worker (employee / civil servant / businessman),
# non worker (pensioner / student / unemployed), and other
train <- transform(train, income_type = 
  ifelse(income_type %in% c("근로자", "공무원", "사업가"), "worker",
  ifelse(income_type %in% c("연금수령자", "학생", "실직자"), "non worker", "other")
  ))


# Calculate the overall mean income
total_mean_income <- mean(train$income)


# Calculate mean income for each industry and job category
industry_means <- train %>%
  group_by(industry) %>%
  summarise(mean_income = mean(income),
            industry_cat = ifelse(mean_income>total_mean_income,'above','below'))

job_means <- train %>%
  group_by(job) %>%
  summarise(mean_income = mean(income),
            job_cat = ifelse(mean_income>total_mean_income,'above','below'))

train <- train %>%
  left_join(industry_means %>% select(industry, industry_cat), by = "industry") %>%
  select(-industry) %>%
  rename(industry = industry_cat)

train <- train %>%
  left_join(job_means %>% select(job, job_cat), by = "job") %>%
  select(-job) %>%
  rename(job = job_cat)

chr_columns <- sapply(train,is.character)
train <- train %>%
  mutate_if(chr_columns,as.factor)

4. Modeling

set.seed(0)

library(caret)
# split into train/test datasets
index <- createDataPartition(train$target, p = 0.7, list = FALSE)
train_data<-train[index,]
test_data <- train[-index, ]

# undersample the majority class so both classes are equally represented
re1 <- train_data[(train_data$target=='Yes'),]
re0 <- train_data[(train_data$target=='No'),]

idx0 <- sample(1:nrow(re0), sum(train_data$target=='Yes'))
re0 <- re0[idx0,]
undersample <- rbind(re1,re0)
undersample %>% ggplot(aes(x=target,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
  geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Undersample Data Dependent Variable Distribution')

Feature Selection

  • We want to remove unnecessary variables before building our model, both to improve predictive performance and to reduce computation cost.

  • First, we can select features by thinking logically. The EDA shows that the default proportion is lower for older people than for younger people. But does having an e-mail account help predict default probability?

  • We can also drop variables that are highly correlated with one another. Let's check whether the numerical variables are correlated.

u_numerical_column <- c(6,7,12,15,16,17,18)

# Create correlation plot

plotfuncLow <- function(data,mapping){
  p <- ggplot(data = data,mapping=mapping)+geom_point(color=mycolors[2],size=0.5,alpha=0.7)+
    geom_smooth(method="lm",alpha=0.5)
  p
}

library(GGally)
ggpairs(undersample[,u_numerical_column],lower=list(continuous=plotfuncLow),diag=NULL)+
  ggtitle('Correlation Analysis')

  • We see that people with larger families tend to have more kids after marriage. The correlation is below 0.9 (computed below), so let's keep both variables for our analysis.
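
  • The exact figure behind this claim can be checked directly:

# pairwise correlation between number of kids and family size
cor(undersample$kid, undersample$family)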

  • LASSO regression is a popular method for guiding variable selection. Let's see which variables the LASSO keeps.

library(glmnet)
library(gamlr)
set.seed(0)
Xmatrix <- sparse.model.matrix(target ~., data=naref(undersample[,-1]))[,-1]
logit_lasso_model <- cv.glmnet(Xmatrix, undersample$target,family='binomial')
plot(logit_lasso_model)

# coefficients shrunk to exactly zero, i.e. dropped by the LASSO
coef(logit_lasso_model) %>% as.matrix() %>% as.data.frame() %>% filter(s1==0)
##                         s1
## realestateNo             0
## realestateYes            0
## income_typenon worker    0
## income_typeother         0
## edu대학교 중퇴           0
## edu저학력자              0
## marriage미혼             0
## marriage별거             0
## marriage사별             0
## marriage사실혼           0
## house_type공공분양       0
## house_type주택 / 아파트  0
## work_mobileNo            0
## work_mobileYes           0
## emailNo                  0
## emailYes                 0
## family                   0
## work_year                0
## jobbelow                 0
  • From the analysis, we see that the LASSO dropped realestate, work_mobile, email, family, and work_year, among other factor levels. Let's explore which features a different method selects.
# compute variable importance for RF
library(ranger)
var_imp <- ranger(target ~ ., data = undersample[,-1], write.forest = TRUE,
                  num.trees = 200, min.node.size = 5, importance = 'impurity')

library(timetk)
var_imp <- ranger::importance(var_imp) %>% sort(decreasing = TRUE)
var_imp9 <- var_imp %>% data.frame() %>% tk_tbl()

# Create the bar graph
library(RColorBrewer)
nb.cols <- 18
rfcolors <- colorRampPalette(brewer.pal(8, "Blues"))(nb.cols) # extending colors
ggplot(aes(x = reorder(index, .), y = ., fill = reorder(index, .)), data = var_imp9) +
  geom_bar(stat = 'identity', color = 'black', show.legend = FALSE) + 
  scale_fill_manual(values = rfcolors) +
  coord_flip() +
  labs(x = "Features", y = "Importance", title = "Random Forest Variable Importance")

  • work_mobile, email, house_type, job, and sex do not appear to be important factors in the random forest model.

  • In both models, email and work_mobile were unimportant. Thus, we will discard these two variables and build our models.
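
  • The models below drop these variables by column position. Assuming the column order produced by the cleaning steps above, a quick sanity check shows what the indices refer to:

names(undersample)[c(1, 13, 14)]   # expected: "id", "work_mobile", "email"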

Machine Learning Modeling

  • Now we will train several models and later combine them into ensembles to compare performance.
Control <- trainControl(method = 'repeatedcv', number = 5, classProbs = TRUE,
                        summaryFunction = twoClassSummary, savePredictions = 'final')

#1 SVM
set.seed(0)

# drop id (col 1), work_mobile (col 13), and email (col 14) per the feature selection
svm <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'svmLinear2',
                    trControl=Control,
                    tuneLength =5)
predsvm <- predict(svm,newdata=test_data[,-c(1,13,14)])
cm1 <- confusionMatrix(predsvm,test_data$target)


#2 CART
set.seed(0)
cart <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'rpart',
                    trControl=Control,
                    tuneLength =5)
predcart <- predict(cart,newdata=test_data[,-c(1,13,14)])
cm2 <- confusionMatrix(predcart,test_data$target)

#3 KNN
set.seed(0)
knn <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'knn',
                    trControl=Control,
                    tuneLength =5)
predknn <- predict(knn,newdata=test_data[,-c(1,13,14)])
cm3 <- confusionMatrix(predknn,test_data$target)

#4 OneR
set.seed(0)
one <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'OneR',
                    trControl=Control,
                    tuneLength =5)
predone <- predict(one,newdata=test_data[,-c(1,13,14)])
cm4 <- confusionMatrix(predone,test_data$target)

#5 RF
set.seed(0)
rf <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'ranger',
                    trControl=Control,
                    tuneLength =5)
predrf <- predict(rf,newdata=test_data[,-c(1,13,14)])
cm5 <- confusionMatrix(predrf,test_data$target)
# plot a confusion matrix as a heat map; each cell shows the percentage
# of its true class (row-normalized rate)
multiclasscmplot <- function(vote, truth){
  cm <- confusionMatrix(vote, truth)
  cmt <- cm$table %>% as_tibble() %>%
    group_by(Reference) %>%
    mutate(rate = 100 * n / sum(n)) %>%
    ungroup()

  cmt %>% ggplot(aes(x = Prediction, y = Reference, fill = rate)) +
    geom_tile(color = "black") + my_theme(10) +
    scale_fill_gradient2(low = "white", mid = "#008cff", high = "#ff0059", midpoint = 50) +
    theme(axis.text.x = element_text(angle = 45, hjust = 0.5)) +
    ggtitle("Ensemble") +
    geom_text(aes(label = round(rate, 1)), color = "black", size = 4) +
    coord_fixed()
}
  • Now that the models are trained, let's visualize their confusion matrices.
# Plot Confusion Matrix
m1 <- multiclasscmplot(vote=predsvm,truth = test_data$target)+ggtitle('SVM')
m2 <- multiclasscmplot(vote=predcart,truth = test_data$target)+ggtitle('CART')
m3 <- multiclasscmplot(vote=predknn,truth = test_data$target)+ggtitle('KNN')
m4 <- multiclasscmplot(vote=predone,truth = test_data$target)+ggtitle('OneR')
m5 <- multiclasscmplot(vote=predrf,truth = test_data$target)+ggtitle('RF')

grid.arrange(m1,m2,m3,m4,m5,ncol=3)

# creating function that extracts relevant statistics from the confusion matrix
calculate_metrics <- function(confusion_matrix) {
  accuracy <- confusion_matrix$overall[1]
  sensitivity <- confusion_matrix$byClass[1]
  specificity <- confusion_matrix$byClass[2]
  f1score <- confusion_matrix$byClass[7]
  # false-negative rate: share of actual 'Yes' (default) cases predicted as 'No'
  fn <- confusion_matrix$table[1,2]/(confusion_matrix$table[1,2]+confusion_matrix$table[2,2])
  return(c(accuracy, sensitivity, specificity, f1score, fn))
}

confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5)

metrics_list <- lapply(confusion_matrices, calculate_metrics)

accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF')

results <-  tibble(models=rep(models,5),
       metrics=c(rep('accuracy',5),
                 rep('sensitivity',5),
                 rep('specificity',5),
                 rep('f1score',5),
                 rep('fn',5)), scores=c(accuracy,sensitivity,specificity,f1score,fn))

results$models <- reorder(results$models, results$scores)
results %>% 
  ggplot(aes(x=models,y=scores,fill=metrics))+
  geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  facet_grid(~metrics)+
  labs(title='Model Summary')

results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF |
|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 |
  • We can see that the SVM, CART, and RF models performed best in terms of false-negative rate and accuracy.

5. Ensemble

Weighted Average

  • In this section, we use an ensemble method (combining the models) to see whether it returns a better model than any single model.

  • Since RF, SVM, and CART performed best (considering the trade-offs across all metrics), we weight the predicted probabilities accordingly: 50% for RF, 40% for SVM, 10% for CART, and zero weight for KNN and OneR.

# weight each model's predicted class probabilities (weights sum to 1)
pred1 <- predict(svm,newdata=test_data,type="prob")
pred1 <- pred1*0.4

pred2 <- predict(cart,newdata=test_data,type="prob")
pred2 <- pred2*0.1

pred3 <- predict(knn,newdata=test_data,type="prob")
pred3 <- pred3*0

pred4 <- predict(one,newdata=test_data,type="prob")
pred4 <- pred4*0

pred5 <- predict(rf,newdata=test_data,type="prob")
pred5 <- pred5*0.5

w_ensemble <- (pred1+pred2+pred3+pred4+pred5) %>% as_tibble() 

w_ensemble <- ifelse(w_ensemble$No>w_ensemble$Yes,'No','Yes')
w_ensemble <- w_ensemble %>% as.factor()
m6 <- multiclasscmplot(vote=w_ensemble,truth = test_data$target)+ggtitle('Weighted Ensemble')

GBM Stacking Ensemble

library(caretEnsemble)
library(doParallel)
algorithmlist <- c('svmLinear2','rpart','knn','OneR','ranger')

set.seed(0)
# caretList retrains the five base learners with shared resampling folds,
# which caretStack needs in order to fit the GBM meta-model on their predictions
model_list <- caretList(target ~ ., data = undersample[,-c(1,13,14)],
                        trControl = Control, methodList = algorithmlist)
stack <- caretStack(model_list, method = 'gbm', trControl = Control, tuneLength = 3)
s_ensemble <- predict(stack, newdata = test_data)
m7 <- multiclasscmplot(s_ensemble, truth = test_data$target) + ggtitle('Stacking Ensemble')

6. Results

grid.arrange(m6,m7,m1,m2,m5,ncol=3)

cm6 <- confusionMatrix(w_ensemble,reference = test_data$target)
cm7 <- confusionMatrix(s_ensemble,reference = test_data$target)

# reuse calculate_metrics() defined in Section 4

confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5,cm6,cm7)

metrics_list <- lapply(confusion_matrices, calculate_metrics)

accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF','wEN','sEN')

results <-  tibble(models=rep(models,5),
       metrics=c(rep('accuracy',7),
                 rep('sensitivity',7),
                 rep('specificity',7),
                 rep('f1score',7),
                 rep('fn',7)), scores=c(accuracy,sensitivity,specificity,f1score,fn))

results$models <- reorder(results$models, results$scores)
results %>% 
  ggplot(aes(x=models,y=scores,fill=metrics))+
  geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  facet_grid(~metrics)+
  labs(title='Model Summary')

results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF | wEN | sEN |
|---|---|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 | 0.6007 | 0.6230 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 | 0.5988 | 0.6280 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 | 0.6162 | 0.5809 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 | 0.7281 | 0.7484 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 | 0.3838 | 0.4191 |