1. Introduction

# set visualization theme
library(plyr)
library(tidyverse)
library(GGally)

my_theme <- function(base_size = 10, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
    theme(
      axis.text = element_text(size = 10),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size = 10),
      panel.grid.major = element_line(color = "grey"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "#f7fdff"),
      strip.background = element_rect(fill = "#001d60", color = "#00113a", size = 0.7),
      strip.text = element_text(face = "bold", size = 10, color = "white"),
      legend.position = "right",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}

theme_set(my_theme())
# read the data
library(readr)
train <- read_csv('train.csv')
test <- read_csv('test.csv')

# change the column names to English
column_names <- c('id','target','sex','car','realestate','kid','income','income_type',
                  'edu','marriage','house_type','res_prop','mobile','work_mobile','email',
                  'job','family','industry','age','work_year','member_year')

colnames(train) <- column_names
colnames(test) <- column_names[-2]

# change binary to factor
recode_factors <- function(data, columns) {
  for (col in columns) {
    data[[col]] <- recode_factor(data[[col]], `0` = 'No', `1` = 'Yes')
  }
  return(data)
}



columns_to_recode <- c('target','car','realestate','mobile','work_mobile','email','job')

train <- recode_factors(train,columns_to_recode)
test <- recode_factors(test,columns_to_recode[-1])

# explore data
knitr::kable(head(train))
| id | target | sex | car | realestate | kid | income | income_type | edu | marriage | house_type | res_prop | mobile | work_mobile | email | job | family | industry | age | work_year | member_year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TRAIN_00000 | No | 여성 | Yes | Yes | 2 | 18054000 | 연금수령자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004960 | Yes | No | No | Unknown | 4 | 기타 1 | 39 | 1000 | 23 |
| TRAIN_00001 | No | 남성 | Yes | No | 0 | 59472000 | 근로자 | 대학교 졸업 이상 | 기혼 | 주택 / 아파트 | 0.018029 | Yes | Yes | No | 기술직 | 2 | 사업 1 | 45 | 4 | 16 |
| TRAIN_00002 | No | 여성 | No | Yes | 0 | 29736000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.010500 | Yes | Yes | No | 단순 노동자 | 2 | 사업 0 | 32 | 3 | 9 |
| TRAIN_00003 | No | 여성 | Yes | No | 1 | 38232000 | 기타 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004849 | Yes | Yes | No | Unknown | 3 | 산업 4 | 34 | 6 | 12 |
| TRAIN_00004 | No | 여성 | No | Yes | 0 | 26550000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.025164 | Yes | Yes | No | Unknown | 2 | 사업 2 | 38 | 0 | 4 |
| TRAIN_00005 | No | 여성 | Yes | No | 0 | 21240000 | 근로자 | 고등학교 졸업 | 미혼 | 아파트 임대 | 0.018634 | Yes | Yes | No | 핵심 노동자 | 1 | 무역 0 | 30 | 8 | 1 |

2. Data Exploration

Hmisc::describe(train)
## train 
## 
##  21  Variables      60000  Observations
## --------------------------------------------------------------------------------
## id 
##        n  missing distinct 
##    60000        0    60000 
## 
## lowest : TRAIN_00000 TRAIN_00001 TRAIN_00002 TRAIN_00003 TRAIN_00004
## highest: TRAIN_59995 TRAIN_59996 TRAIN_59997 TRAIN_59998 TRAIN_59999
## --------------------------------------------------------------------------------
## target 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  53572  6428
## Proportion 0.893 0.107
## --------------------------------------------------------------------------------
## sex 
##        n  missing distinct 
##    60000        0        3 
##                                   
## Value         기타    남성    여성
## Frequency      1   20265   39734  
## Proportion 0.000   0.338   0.662  
## --------------------------------------------------------------------------------
## car 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  39749 20251
## Proportion 0.662 0.338
## --------------------------------------------------------------------------------
## realestate 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  17482 42518
## Proportion 0.291 0.709
## --------------------------------------------------------------------------------
## kid 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       11    0.641   0.4121   0.6297        0        0 
##      .25      .50      .75      .90      .95 
##        0        0        1        2        2 
##                                                                             
## Value          0     1     2     3     4     5     6     7    11    14    19
## Frequency  42317 11638  5229   701    87    19     5     1     1     1     1
## Proportion 0.705 0.194 0.087 0.012 0.001 0.000 0.000 0.000 0.000 0.000 0.000
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## income 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0      817    0.995 39836994 21848829 15930000 19116000 
##      .25      .50      .75      .90      .95 
## 26550000 35046000 47790000 63720000 79650000 
## 
## lowest :    6265800    6372000    6510060    6584400    6690600
## highest:  531000000  691486254  902700000 1062000000 2124000000
## --------------------------------------------------------------------------------
## income_type 
##        n  missing distinct 
##    60000        0        7 
##                                                                             
## Value          공무원     근로자       기타     사업가     실직자 연금수령자
## Frequency     4146      30693      13879          2          4      11274   
## Proportion   0.069      0.512      0.231      0.000      0.000      0.188   
##                      
## Value            학생
## Frequency        2   
## Proportion   0.000   
## --------------------------------------------------------------------------------
## edu 
##        n  missing distinct 
##    60000        0        4 
##                                                                               
## Value         고등학교 졸업 대학교 졸업 이상      대학교 중퇴         저학력자
## Frequency       42910            14459             1883              748      
## Proportion      0.715            0.241            0.031            0.012      
## --------------------------------------------------------------------------------
## marriage 
##        n  missing distinct 
##    60000        0        5 
##                                                   
## Value         기혼    미혼    별거    사별  사실혼
## Frequency  38795    8300    3770    3307    5828  
## Proportion 0.647   0.138   0.063   0.055   0.097  
## --------------------------------------------------------------------------------
## house_type 
##        n  missing distinct 
##    60000        0        4 
##                                                                   
## Value           공공분양   아파트 임대      오피스텔 주택 / 아파트
## Frequency       2352          1080           526         56042    
## Proportion     0.039         0.018         0.009         0.934    
## --------------------------------------------------------------------------------
## res_prop 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       80    0.999  0.02088  0.01447 0.005002 0.006671 
##      .25      .50      .75      .90      .95 
## 0.010006 0.018850 0.028663 0.035792 0.046220 
## 
## lowest : 0.000533 0.000938 0.001276 0.001333 0.001417
## highest: 0.031329 0.032561 0.035792 0.04622  0.072508
## --------------------------------------------------------------------------------
## mobile 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency      1 59999
## Proportion     0     1
## --------------------------------------------------------------------------------
## work_mobile 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  11278 48722
## Proportion 0.188 0.812
## --------------------------------------------------------------------------------
## email 
##        n  missing distinct 
##    60000        0        2 
##                       
## Value         No   Yes
## Frequency  56619  3381
## Proportion 0.944 0.056
## --------------------------------------------------------------------------------
## job 
##        n  missing distinct 
##    60000        0       19 
## 
## lowest : Unknown          기술직           단순 노동자      핵심 노동자      운전자          
## highest: 비서             보안 업계 종사자 부동산중개업자   IT 업계 종사자   인사 담당자     
## --------------------------------------------------------------------------------
## family 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       12    0.842    2.156   0.9257        1        1 
##      .25      .50      .75      .90      .95 
##        2        2        3        3        4 
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    13    15
## Frequency  12940 31315 10139  4839   659    82    17     5     1     1     1
## Proportion 0.216 0.522 0.169 0.081 0.011 0.001 0.000 0.000 0.000 0.000 0.000
##                 
## Value         20
## Frequency      1
## Proportion 0.000
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## industry 
##        n  missing distinct 
##    60000        0       56 
## 
## lowest : 건설      경찰      광고      국가 안보 군대     , highest: 주택      통신      학교      호텔      환경     
## --------------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       49    0.999    44.02    13.66       26       28 
##      .25      .50      .75      .90      .95 
##       34       43       54       61       63 
## 
## lowest : 21 22 23 24 25, highest: 65 66 67 68 69
## --------------------------------------------------------------------------------
## work_year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       49     0.99    192.9    307.6        0        1 
##      .25      .50      .75      .90      .95 
##        2        6       16     1000     1000 
## 
## lowest :    0    1    2    3    4, highest:   44   45   46   48 1000
## --------------------------------------------------------------------------------
## member_year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    60000        0       55    0.999    13.27    10.98        0        1 
##      .25      .50      .75      .90      .95 
##        5       12       20       27       31 
## 
## lowest :  0  1  2  3  4, highest: 50 52 56 57 58
## --------------------------------------------------------------------------------

Missing Values

  • Let's check whether the dataset contains any missing values.
mycolors <- c("#ff003f", "#0094ff", "#ae00ff", "#94ff00", "#ffc700", "#fc1814")

# function to visualize the missing values per column
bar_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%          # logical matrix of missingness
    melt %>%           # long format: Var2 holds the column name
    ggplot(data = .,
           aes(x = Var2)) +
    geom_bar(aes(y = after_stat(count), fill = value), alpha = 0.8) +
    scale_fill_brewer(palette = 'Set1', name = "", labels = c("Available", "Missing")) +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    labs(x = "Variables in Dataset",
         y = "Observations") +
    coord_flip() +
    theme(legend.position = "right")
}
# explore missing values 
bar_missing(train)

  • It seems that there are no missing values in our dataset.
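
  • As a quick numeric cross-check of the plot, base R can count missing cells directly:

sum(is.na(train))   # returns 0 when there are no missing cells anywhere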

Dependent Variable (Overdue)

train %>% ggplot(aes(x=target,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
  geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Dependent Variable (Default) Distribution')

  • The dataset is imbalanced, so we will need to oversample or undersample it before fitting classification models.
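
  • For reference, the class ratio can be quantified directly from the target frequencies reported above:

# roughly 89.3% 'No' vs 10.7% 'Yes'
prop.table(table(train$target))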

Categorical Variable

categorical_column1 <- c(2,3,4,5,8,9,10,11,13,14,15)
categorical_column2 <- c(2,16,18)


train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'fill',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Proportion')

train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Count')

  • We will drop the mobile column, since nearly every observation has a mobile phone and the variable carries almost no information (see the quick check below).
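
  • A quick frequency check, matching the Hmisc::describe output above:

table(train$mobile)   # 1 'No' vs 59,999 'Yes': the column is nearly constant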

train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Count')

train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_bar(position = 'fill',color='black',alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Categorical Variable Proportion')

  • The job and industry variables have too many categories (see the quick count below). We will revise these later.
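
  • Counting the distinct levels confirms this:

# 19 job categories and 56 industry categories, per the describe() output above
c(job = dplyr::n_distinct(train$job), industry = dplyr::n_distinct(train$industry))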

Numerical Variable

numerical_column <- c(2,6,7,12,17,19,20,21)

# density plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=target))+
  geom_density(alpha=0.8) +
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title = 'Density Plot for Numerical Variable')

# box plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
  ggplot(aes(x=target,y=value,fill=target))+
  geom_boxplot(alpha=0.8)+
  facet_wrap(~features,scales='free',ncol=3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title = 'Box Plot for Numerical Variable')

  • The work_year column (continuous years worked in the current occupation) contains a value of 1,000, which is physically impossible. Let's see how many observations claim 1,000 years of continuous work experience.

paste('The number of observations with work_year = 1000 is:', sum(train[,'work_year']==1000))
## [1] "The number of observations with work_year = 1000 is: 11275"
  • Since this affects more than 11,000 observations, we cannot simply remove them all.

  • Let's divide the training dataset into two parts, one with work_year < 1000 and one with work_year = 1000, and investigate what this figure means.

train_no  <- train %>% filter(work_year < 1000)
train_yes <- train %>% filter(work_year == 1000)

p1 <- train_no[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=features))+
  geom_bar(alpha=0.8,color='black')+
  facet_wrap(~features,scales='free',ncol=2)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title='Work Year < 1000')

p2 <- train_yes[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
  ggplot(aes(x=value,fill=features))+
  geom_bar(alpha=0.8,color='black')+
  facet_wrap(~features,scales='free',ncol=2)+
  scale_fill_brewer(palette = 'Set1')+
  labs(title='Work Year = 1000')

library(gridExtra)
grid.arrange(p1,p2)

  • People with work_year = 1000 show a negatively skewed age distribution compared to the normal work_year values, and for the work_year < 1000 group the distribution of work_year does not mirror that of age. This might indicate that work_year = 1000 marks people who have never changed jobs and have kept working at the same workplace. Since the test data also includes the value 1,000, we will not change it.
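
  • If we instead wanted to make the sentinel explicit for a model, a minimal sketch (not applied here; never_changed_job is a hypothetical column name) would add an indicator flag:

# hypothetical alternative, not used in this analysis:
# flag the sentinel so models can treat work_year = 1000 separately
train_alt <- train %>%
  mutate(never_changed_job = if_else(work_year == 1000, 'Yes', 'No'))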

3. Data Cleaning

# drop the mobile column (column 13), as decided in the EDA
train <- train[,-13]

# drop the single observation with sex = '기타' ("other")
train <-  train[train$sex!='기타',]

# collapse income_type into worker (employee / civil servant / businessman),
# non worker (pensioner / student / unemployed), and other
train <- transform(train, income_type = 
  ifelse(income_type %in% c("근로자", "공무원", "사업가"), "worker",
  ifelse(income_type %in% c("연금수령자", "학생", "실직자"), "non worker", "other")
  ))


# Calculate the overall mean income
total_mean_income <- mean(train$income)


# Calculate mean income for each industry and job category
industry_means <- train %>%
  group_by(industry) %>%
  summarise(mean_income = mean(income),
            industry_cat = ifelse(mean_income>total_mean_income,'above','below'))

job_means <- train %>%
  group_by(job) %>%
  summarise(mean_income = mean(income),
            job_cat = ifelse(mean_income>total_mean_income,'above','below'))

train <- train %>%
  left_join(industry_means %>% select(industry, industry_cat), by = "industry") %>%
  select(-industry) %>%
  rename(industry = industry_cat)

train <- train %>%
  left_join(job_means %>% select(job, job_cat), by = "job") %>%
  select(-job) %>%
  rename(job = job_cat)

chr_columns <- sapply(train,is.character)
train <- train %>%
  mutate_if(chr_columns,as.factor)

4. Modeling

set.seed(0)

library(caret)
# split into train/test datasets
index <- createDataPartition(train$target, p = 0.7, list = FALSE)
train_data<-train[index,]
test_data <- train[-index, ]

# undersample the majority class so both classes are equally represented
re1 <- train_data[(train_data$target=='Yes'),]
re0 <- train_data[(train_data$target=='No'),]

idx0 <- sample(1:nrow(re0), sum(train_data$target=='Yes'))
re0 <- re0[idx0,]
undersample <- rbind(re1,re0)
undersample %>% ggplot(aes(x=target,fill=target))+
  geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
  geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  labs(title='Undersample Data Dependent Variable Distribution')

Feature Selection

  • We want to remove unnecessary variables before building our model, both to improve predictive performance and to reduce computation cost.

  • First, we can select features by thinking logically. The EDA shows that the default proportion is lower for older people than for younger people. But does having an e-mail account help predict default probability?

  • We can also drop variables that are highly correlated with one another. Let's check whether the numerical variables are correlated.

u_numerical_column <- c(6,7,12,15,16,17,18)

# Create correlation plot

plotfuncLow <- function(data,mapping){
  p <- ggplot(data = data,mapping=mapping)+geom_point(color=mycolors[2],size=0.5,alpha=0.7)+
    geom_smooth(method="lm",alpha=0.5)
  p
}

library(GGally)
ggpairs(undersample[,u_numerical_column],lower=list(continuous=plotfuncLow),diag=NULL)+
  ggtitle('Correlation Analysis')

  • We see that people with larger families tend to have more kids after marriage. The correlation is below 0.9 (computed below), so let's keep both variables for our analysis.
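
  • The exact figure behind this claim can be checked directly:

# pairwise correlation between number of kids and family size
cor(undersample$kid, undersample$family)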

  • LASSO regression is a popular method for guiding variable selection. Let's see which variables the LASSO keeps.

library(glmnet)
library(gamlr)
set.seed(0)
Xmatrix <- sparse.model.matrix(target ~., data=naref(undersample[,-1]))[,-1]
logit_lasso_model <- cv.glmnet(Xmatrix, undersample$target,family='binomial')
plot(logit_lasso_model)

# coefficients shrunk to exactly zero, i.e. dropped by the LASSO
coef(logit_lasso_model) %>% as.matrix() %>% as.data.frame() %>% filter(s1==0)
##                         s1
## realestateNo             0
## realestateYes            0
## income_typenon worker    0
## income_typeother         0
## edu대학교 중퇴           0
## edu저학력자              0
## marriage미혼             0
## marriage별거             0
## marriage사별             0
## marriage사실혼           0
## house_type공공분양       0
## house_type주택 / 아파트  0
## work_mobileNo            0
## work_mobileYes           0
## emailNo                  0
## emailYes                 0
## family                   0
## work_year                0
## jobbelow                 0
  • From the analysis, we see that the LASSO dropped realestate, work_mobile, email, family, and work_year, among other factor levels. Let's explore which features a different method selects.
# compute variable importance for RF
library(ranger)
var_imp <- ranger(target ~ ., data = undersample[,-1], write.forest = TRUE,
                  num.trees = 200, min.node.size = 5, importance = 'impurity')

library(timetk)
var_imp <- ranger::importance(var_imp) %>% sort(decreasing = TRUE)
var_imp9 <- var_imp %>% data.frame() %>% tk_tbl()

# Create the bar graph
library(RColorBrewer)
nb.cols <- 18
rfcolors <- colorRampPalette(brewer.pal(8, "Blues"))(nb.cols) # extending colors
ggplot(aes(x = reorder(index, .), y = ., fill = reorder(index, .)), data = var_imp9) +
  geom_bar(stat = 'identity', color = 'black', show.legend = FALSE) + 
  scale_fill_manual(values = rfcolors) +
  coord_flip() +
  labs(x = "Features", y = "Importance", title = "Random Forest Variable Importance")

  • work_mobile, email, house_type, job, and sex do not appear to be important factors in the random forest model.

  • In both models, email and work_mobile were unimportant. Thus, we will discard these two variables and build our models.
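
  • The models below drop these variables by column position. Assuming the column order produced by the cleaning steps above, a quick sanity check shows what the indices refer to:

names(undersample)[c(1, 13, 14)]   # expected: "id", "work_mobile", "email"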

Machine Learning Modeling

  • Now we will train several models and later combine them into ensembles to compare performance.
Control <- trainControl(method = 'repeatedcv', number = 5, classProbs = TRUE,
                        summaryFunction = twoClassSummary, savePredictions = 'final')

#1 SVM
set.seed(0)

# drop id (col 1), work_mobile (col 13), and email (col 14) per the feature selection
svm <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'svmLinear2',
                    trControl=Control,
                    tuneLength =5)
predsvm <- predict(svm,newdata=test_data[,-c(1,13,14)])
cm1 <- confusionMatrix(predsvm,test_data$target)


#2 CART
set.seed(0)
cart <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'rpart',
                    trControl=Control,
                    tuneLength =5)
predcart <- predict(cart,newdata=test_data[,-c(1,13,14)])
cm2 <- confusionMatrix(predcart,test_data$target)

#3 KNN
set.seed(0)
knn <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'knn',
                    trControl=Control,
                    tuneLength =5)
predknn <- predict(knn,newdata=test_data[,-c(1,13,14)])
cm3 <- confusionMatrix(predknn,test_data$target)

#4 OneR
set.seed(0)
one <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'OneR',
                    trControl=Control,
                    tuneLength =5)
predone <- predict(one,newdata=test_data[,-c(1,13,14)])
cm4 <- confusionMatrix(predone,test_data$target)

#5 RF
set.seed(0)
rf <- caret::train(target~., data=undersample[,-c(1,13,14)],
                    method = 'ranger',
                    trControl=Control,
                    tuneLength =5)
predrf <- predict(rf,newdata=test_data[,-c(1,13,14)])
cm5 <- confusionMatrix(predrf,test_data$target)
# plot a confusion matrix as a heat map; each cell shows the percentage
# of its true class (row-normalized rate)
multiclasscmplot <- function(vote, truth){
  cm <- confusionMatrix(vote, truth)
  cmt <- cm$table %>% as_tibble() %>%
    group_by(Reference) %>%
    mutate(rate = 100 * n / sum(n)) %>%
    ungroup()

  cmt %>% ggplot(aes(x = Prediction, y = Reference, fill = rate)) +
    geom_tile(color = "black") + my_theme(10) +
    scale_fill_gradient2(low = "white", mid = "#008cff", high = "#ff0059", midpoint = 50) +
    theme(axis.text.x = element_text(angle = 45, hjust = 0.5)) +
    ggtitle("Ensemble") +
    geom_text(aes(label = round(rate, 1)), color = "black", size = 4) +
    coord_fixed()
}
  • Now that the models are trained, let's visualize their confusion matrices.
# Plot Confusion Matrix
m1 <- multiclasscmplot(vote=predsvm,truth = test_data$target)+ggtitle('SVM')
m2 <- multiclasscmplot(vote=predcart,truth = test_data$target)+ggtitle('CART')
m3 <- multiclasscmplot(vote=predknn,truth = test_data$target)+ggtitle('KNN')
m4 <- multiclasscmplot(vote=predone,truth = test_data$target)+ggtitle('OneR')
m5 <- multiclasscmplot(vote=predrf,truth = test_data$target)+ggtitle('RF')

grid.arrange(m1,m2,m3,m4,m5,ncol=3)

# creating function that extracts relevant statistics from the confusion matrix
calculate_metrics <- function(confusion_matrix) {
  accuracy <- confusion_matrix$overall[1]
  sensitivity <- confusion_matrix$byClass[1]
  specificity <- confusion_matrix$byClass[2]
  f1score <- confusion_matrix$byClass[7]
  # false-negative rate: share of actual 'Yes' (default) cases predicted as 'No'
  fn <- confusion_matrix$table[1,2]/(confusion_matrix$table[1,2]+confusion_matrix$table[2,2])
  return(c(accuracy, sensitivity, specificity, f1score, fn))
}

confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5)

metrics_list <- lapply(confusion_matrices, calculate_metrics)

accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF')

results <-  tibble(models=rep(models,5),
       metrics=c(rep('accuracy',5),
                 rep('sensitivity',5),
                 rep('specificity',5),
                 rep('f1score',5),
                 rep('fn',5)), scores=c(accuracy,sensitivity,specificity,f1score,fn))

results$models <- reorder(results$models, results$scores)
results %>% 
  ggplot(aes(x=models,y=scores,fill=metrics))+
  geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  facet_grid(~metrics)+
  labs(title='Model Summary')

results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF |
|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 |
  • We can see that the SVM, CART, and RF models performed best in terms of false-negative rate and accuracy.

5. Ensemble

Weighted Average

  • In this section, we use an ensemble method (combining the models) to see whether it returns a better model than any single model.

  • Since RF, SVM, and CART performed best (considering the trade-offs across all metrics), we weight the predicted probabilities accordingly: 50% for RF, 40% for SVM, 10% for CART, and zero weight for KNN and OneR.

# weight each model's predicted class probabilities (weights sum to 1)
pred1 <- predict(svm,newdata=test_data,type="prob")
pred1 <- pred1*0.4

pred2 <- predict(cart,newdata=test_data,type="prob")
pred2 <- pred2*0.1

pred3 <- predict(knn,newdata=test_data,type="prob")
pred3 <- pred3*0

pred4 <- predict(one,newdata=test_data,type="prob")
pred4 <- pred4*0

pred5 <- predict(rf,newdata=test_data,type="prob")
pred5 <- pred5*0.5

w_ensemble <- (pred1+pred2+pred3+pred4+pred5) %>% as_tibble() 

w_ensemble <- ifelse(w_ensemble$No>w_ensemble$Yes,'No','Yes')
w_ensemble <- w_ensemble %>% as.factor()
m6 <- multiclasscmplot(vote=w_ensemble,truth = test_data$target)+ggtitle('Weighted Ensemble')

GBM Stacking Ensemble

library(caretEnsemble)
library(doParallel)
algorithmlist <- c('svmLinear2','rpart','knn','OneR','ranger')

set.seed(0)
# caretList retrains the five base learners with shared resampling folds,
# which caretStack needs in order to fit the GBM meta-model on their predictions
model_list <- caretList(target ~ ., data = undersample[,-c(1,13,14)],
                        trControl = Control, methodList = algorithmlist)
stack <- caretStack(model_list, method = 'gbm', trControl = Control, tuneLength = 3)
s_ensemble <- predict(stack, newdata = test_data)
m7 <- multiclasscmplot(s_ensemble, truth = test_data$target) + ggtitle('Stacking Ensemble')

6. Results

grid.arrange(m6,m7,m1,m2,m5,ncol=3)

cm6 <- confusionMatrix(w_ensemble,reference = test_data$target)
cm7 <- confusionMatrix(s_ensemble,reference = test_data$target)

# reuse calculate_metrics() defined in Section 4

confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5,cm6,cm7)

metrics_list <- lapply(confusion_matrices, calculate_metrics)

accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF','wEN','sEN')

results <-  tibble(models=rep(models,5),
       metrics=c(rep('accuracy',7),
                 rep('sensitivity',7),
                 rep('specificity',7),
                 rep('f1score',7),
                 rep('fn',7)), scores=c(accuracy,sensitivity,specificity,f1score,fn))

results$models <- reorder(results$models, results$scores)
results %>% 
  ggplot(aes(x=models,y=scores,fill=metrics))+
  geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
  scale_fill_brewer(palette = 'Set1')+
  coord_flip()+
  facet_grid(~metrics)+
  labs(title='Model Summary')

results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF | wEN | sEN |
|---|---|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 | 0.6007 | 0.6230 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 | 0.5988 | 0.6280 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 | 0.6162 | 0.5809 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 | 0.7281 | 0.7484 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 | 0.3838 | 0.4191 |