# set visualization theme
library(plyr)
library(tidyverse)
library(GGally)
my_theme <- function(base_size = 10, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
    theme(
      axis.text = element_text(size = 10),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size = 10),
      panel.grid.major = element_line(color = "grey"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "#f7fdff"),
      strip.background = element_rect(fill = "#001d60", color = "#00113a", size = 0.7),
      strip.text = element_text(face = "bold", size = 10, color = "white"),
      legend.position = "right",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}
theme_set(my_theme())
# read the data
library(readr)
train <- read_csv('train.csv')
test <- read_csv('test.csv')
# change the column names to English
column_names <- c('id','target','sex','car','realestate','kid','income','income_type',
'edu','marriage','house_type','res_prop','mobile','work_mobile','email',
'job','family','industry','age','work_year','member_year')
colnames(train) <- column_names
colnames(test) <- column_names[-2]
# change binary to factor
recode_factors <- function(data, columns) {
  for (col in columns) {
    data[[col]] <- recode_factor(data[[col]], `0` = 'No', `1` = 'Yes')
  }
  return(data)
}
columns_to_recode <- c('target','car','realestate','mobile','work_mobile','email','job')
train <- recode_factors(train,columns_to_recode)
test <- recode_factors(test,columns_to_recode[-1])
# explore data
knitr::kable(head(train))
| id | target | sex | car | realestate | kid | income | income_type | edu | marriage | house_type | res_prop | mobile | work_mobile | email | job | family | industry | age | work_year | member_year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TRAIN_00000 | No | 여성 | Yes | Yes | 2 | 18054000 | 연금수령자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004960 | Yes | No | No | Unknown | 4 | 기타 1 | 39 | 1000 | 23 |
| TRAIN_00001 | No | 남성 | Yes | No | 0 | 59472000 | 근로자 | 대학교 졸업 이상 | 기혼 | 주택 / 아파트 | 0.018029 | Yes | Yes | No | 기술직 | 2 | 사업 1 | 45 | 4 | 16 |
| TRAIN_00002 | No | 여성 | No | Yes | 0 | 29736000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.010500 | Yes | Yes | No | 단순 노동자 | 2 | 사업 0 | 32 | 3 | 9 |
| TRAIN_00003 | No | 여성 | Yes | No | 1 | 38232000 | 기타 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.004849 | Yes | Yes | No | Unknown | 3 | 산업 4 | 34 | 6 | 12 |
| TRAIN_00004 | No | 여성 | No | Yes | 0 | 26550000 | 근로자 | 고등학교 졸업 | 기혼 | 주택 / 아파트 | 0.025164 | Yes | Yes | No | Unknown | 2 | 사업 2 | 38 | 0 | 4 |
| TRAIN_00005 | No | 여성 | Yes | No | 0 | 21240000 | 근로자 | 고등학교 졸업 | 미혼 | 아파트 임대 | 0.018634 | Yes | Yes | No | 핵심 노동자 | 1 | 무역 0 | 30 | 8 | 1 |
Hmisc::describe(train)
## train
##
## 21 Variables 60000 Observations
## --------------------------------------------------------------------------------
## id
## n missing distinct
## 60000 0 60000
##
## lowest : TRAIN_00000 TRAIN_00001 TRAIN_00002 TRAIN_00003 TRAIN_00004
## highest: TRAIN_59995 TRAIN_59996 TRAIN_59997 TRAIN_59998 TRAIN_59999
## --------------------------------------------------------------------------------
## target
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 53572 6428
## Proportion 0.893 0.107
## --------------------------------------------------------------------------------
## sex
## n missing distinct
## 60000 0 3
##
## Value 기타 남성 여성
## Frequency 1 20265 39734
## Proportion 0.000 0.338 0.662
## --------------------------------------------------------------------------------
## car
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 39749 20251
## Proportion 0.662 0.338
## --------------------------------------------------------------------------------
## realestate
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 17482 42518
## Proportion 0.291 0.709
## --------------------------------------------------------------------------------
## kid
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 11 0.641 0.4121 0.6297 0 0
## .25 .50 .75 .90 .95
## 0 0 1 2 2
##
## Value 0 1 2 3 4 5 6 7 11 14 19
## Frequency 42317 11638 5229 701 87 19 5 1 1 1 1
## Proportion 0.705 0.194 0.087 0.012 0.001 0.000 0.000 0.000 0.000 0.000 0.000
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## income
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 817 0.995 39836994 21848829 15930000 19116000
## .25 .50 .75 .90 .95
## 26550000 35046000 47790000 63720000 79650000
##
## lowest : 6265800 6372000 6510060 6584400 6690600
## highest: 531000000 691486254 902700000 1062000000 2124000000
## --------------------------------------------------------------------------------
## income_type
## n missing distinct
## 60000 0 7
##
## Value 공무원 근로자 기타 사업가 실직자 연금수령자
## Frequency 4146 30693 13879 2 4 11274
## Proportion 0.069 0.512 0.231 0.000 0.000 0.188
##
## Value 학생
## Frequency 2
## Proportion 0.000
## --------------------------------------------------------------------------------
## edu
## n missing distinct
## 60000 0 4
##
## Value 고등학교 졸업 대학교 졸업 이상 대학교 중퇴 저학력자
## Frequency 42910 14459 1883 748
## Proportion 0.715 0.241 0.031 0.012
## --------------------------------------------------------------------------------
## marriage
## n missing distinct
## 60000 0 5
##
## Value 기혼 미혼 별거 사별 사실혼
## Frequency 38795 8300 3770 3307 5828
## Proportion 0.647 0.138 0.063 0.055 0.097
## --------------------------------------------------------------------------------
## house_type
## n missing distinct
## 60000 0 4
##
## Value 공공분양 아파트 임대 오피스텔 주택 / 아파트
## Frequency 2352 1080 526 56042
## Proportion 0.039 0.018 0.009 0.934
## --------------------------------------------------------------------------------
## res_prop
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 80 0.999 0.02088 0.01447 0.005002 0.006671
## .25 .50 .75 .90 .95
## 0.010006 0.018850 0.028663 0.035792 0.046220
##
## lowest : 0.000533 0.000938 0.001276 0.001333 0.001417
## highest: 0.031329 0.032561 0.035792 0.04622 0.072508
## --------------------------------------------------------------------------------
## mobile
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 1 59999
## Proportion 0 1
## --------------------------------------------------------------------------------
## work_mobile
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 11278 48722
## Proportion 0.188 0.812
## --------------------------------------------------------------------------------
## email
## n missing distinct
## 60000 0 2
##
## Value No Yes
## Frequency 56619 3381
## Proportion 0.944 0.056
## --------------------------------------------------------------------------------
## job
## n missing distinct
## 60000 0 19
##
## lowest : Unknown 기술직 단순 노동자 핵심 노동자 운전자
## highest: 비서 보안 업계 종사자 부동산중개업자 IT 업계 종사자 인사 담당자
## --------------------------------------------------------------------------------
## family
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 12 0.842 2.156 0.9257 1 1
## .25 .50 .75 .90 .95
## 2 2 3 3 4
##
## Value 1 2 3 4 5 6 7 8 9 13 15
## Frequency 12940 31315 10139 4839 659 82 17 5 1 1 1
## Proportion 0.216 0.522 0.169 0.081 0.011 0.001 0.000 0.000 0.000 0.000 0.000
##
## Value 20
## Frequency 1
## Proportion 0.000
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## industry
## n missing distinct
## 60000 0 56
##
## lowest : 건설 경찰 광고 국가 안보 군대 , highest: 주택 통신 학교 호텔 환경
## --------------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 49 0.999 44.02 13.66 26 28
## .25 .50 .75 .90 .95
## 34 43 54 61 63
##
## lowest : 21 22 23 24 25, highest: 65 66 67 68 69
## --------------------------------------------------------------------------------
## work_year
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 49 0.99 192.9 307.6 0 1
## .25 .50 .75 .90 .95
## 2 6 16 1000 1000
##
## lowest : 0 1 2 3 4, highest: 44 45 46 48 1000
## --------------------------------------------------------------------------------
## member_year
## n missing distinct Info Mean Gmd .05 .10
## 60000 0 55 0.999 13.27 10.98 0 1
## .25 .50 .75 .90 .95
## 5 12 20 27 31
##
## lowest : 0 1 2 3 4, highest: 50 52 56 57 58
## --------------------------------------------------------------------------------
mycolors <- c("#ff003f", "#0094ff", "#ae00ff", "#94ff00", "#ffc700", "#fc1814")
# Creating function to see the missing values
bar_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%
    melt %>%
    ggplot(data = ., aes(x = Var2)) +
    geom_bar(aes(y = after_stat(count), fill = value), alpha = 0.8) +
    scale_fill_brewer(palette = 'Set1', name = "", labels = c("Available", "Missing")) +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    labs(x = "Variables in Dataset",
         y = "Observations") +
    coord_flip() +
    theme(legend.position = "right")
}
# explore missing values
bar_missing(train)
- It seems that there are no missing values in our dataset.
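As a quick numeric cross-check of the plot (a minimal sketch, assuming the `train` tibble read above), we can also count the NA values per column:
# count missing values per column; all zeros would confirm the plot above
colSums(is.na(train))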
train %>% ggplot(aes(x=target,fill=target))+
geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Dependent Variable (Default) Distribution')
categorical_column1 <- c(2,3,4,5,8,9,10,11,13,14,15)
categorical_column2 <- c(2,16,18)
train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
ggplot(aes(x=value,fill=target))+
geom_bar(position = 'fill',color='black',alpha=0.8)+
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Categorical Variable Proportion')
train[,categorical_column1] %>% gather(sex:email,key='features',value='value') %>%
ggplot(aes(x=value,fill=target))+
geom_bar(position = 'identity',color='black',alpha=0.8)+
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Categorical Variable Count')
- We will drop the mobile column, since nearly all observations indicate that the person owns a mobile phone (see the quick check below).
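A small tabulation backs this up (a sketch, assuming the recoded `mobile` factor from the step above):
# share of 'Yes' vs 'No' in mobile; a proportion near 1 means the column carries almost no signal
prop.table(table(train$mobile))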
train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
ggplot(aes(x=value,fill=target))+
geom_bar(position = 'identity',color='black',alpha=0.8)+
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Categorical Variable Count')
train[,categorical_column2] %>% gather(job:industry,key='features',value='value') %>%
ggplot(aes(x=value,fill=target))+
geom_bar(position = 'fill',color='black',alpha=0.8)+
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Categorical Variable Proportion')
numerical_column <- c(2,6,7,12,17,19,20,21)
# density plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
ggplot(aes(x=value,fill=target))+
geom_density(alpha=0.8) +
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
labs(title = 'Density Plot for Numerical Variable')
# box plot
train[,numerical_column] %>% gather(kid:member_year,key='features',value='value') %>%
ggplot(aes(x=target,y=value,fill=target))+
geom_boxplot(alpha=0.8)+
facet_wrap(~features,scales='free',ncol=3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title = 'Box Plot for Numerical Variable')
- In the work_year column (continuous years of work in the current occupation), we have a value of 1,000, which is physically impossible. Let's see how many observations have a continuous work experience of 1,000 years.
paste('The number of observations with 1,000 work experience is :',sum(train[,'work_year']==1000))
## [1] "The number of observations with 1,000 work experience is : 11275"
Since there are more than 10,000 such observations, we cannot simply remove them all.
Let's split the training dataset into two parts, one with work_year below 1,000 and one with work_year equal to 1,000, and see what this figure means.
train_no <- train[train$work_year < 1000,]
train_yes <- train[train$work_year == 1000,]
p1 <- train_no[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
ggplot(aes(x=value,fill=features))+
geom_bar(alpha=0.8,color='black')+
facet_wrap(~features,scales='free',ncol=2)+
scale_fill_brewer(palette = 'Set1')+
labs(title='Work Year < 1000')
p2 <- train_yes[,c('age','work_year')] %>% gather(age:work_year,key='features',value='value') %>%
ggplot(aes(x=value,fill=features))+
geom_bar(alpha=0.8,color='black')+
facet_wrap(~features,scales='free',ncol=2)+
scale_fill_brewer(palette = 'Set1')+
labs(title='Work Year = 1000')
library(gridExtra)
grid.arrange(p1,p2)
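One hedged way to probe what the 1,000 value encodes (a small sketch, run here before income_type is recoded below) is to cross-tabulate it against income_type:
# cross-tabulate income type against the 1,000-year flag
table(train$income_type, train$work_year == 1000)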
# get rid of mobile column as a result from EDA
train <- train[,-13]
# get rid of other value in sex
train <- train[train$sex!='기타',]
# collapse income_type into worker / non worker / other
train <- transform(train, income_type =
  ifelse(income_type %in% c("근로자", "공무원", "사업가"), "worker",
         ifelse(income_type %in% c("연금수령자", "학생", "실직자"), "non worker", "other")))
# Calculate the overall mean income
total_mean_income <- mean(train$income)
# Calculate mean income for each industry and job category
industry_means <- train %>%
group_by(industry) %>%
summarise(mean_income = mean(income),
industry_cat = ifelse(mean_income>total_mean_income,'above','below'))
job_means <- train %>%
group_by(job) %>%
summarise(mean_income = mean(income),
job_cat = ifelse(mean_income>total_mean_income,'above','below'))
train <- train %>%
left_join(industry_means %>% select(industry, industry_cat), by = "industry") %>%
select(-industry) %>%
rename(industry = industry_cat)
train <- train %>%
left_join(job_means %>% select(job, job_cat), by = "job") %>%
select(-job) %>%
rename(job = job_cat)
chr_columns <- sapply(train,is.character)
train <- train %>%
mutate_if(chr_columns,as.factor)
Let's divide the data into in-sample and out-of-sample sets so that we can measure out-of-sample accuracy.
Since we have an imbalanced dataset, let's also undersample the majority class to balance the training data.
set.seed(0)
library(caret)
# split into train and test datasets
index <- createDataPartition(train$target, p = 0.7, list = FALSE)
train_data<-train[index,]
test_data <- train[-index, ]
# undersample the majority class in the training data to balance the target
re1 <- train_data[(train_data$target=='Yes'),]
re0 <- train_data[(train_data$target=='No'),]
idx0 <- sample(1:nrow(re0), sum(train_data$target=='Yes'))
re0 <- re0[idx0,]
undersample <- rbind(re1,re0)
undersample %>% ggplot(aes(x=target,fill=target))+
geom_bar(position = 'identity',color='black',alpha=0.8,width = 0.7)+
geom_text(stat='prop',hjust=1.2,size=4.5,color='black')+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
labs(title='Undersample Data Dependent Variable Distribution')
We want to remove unnecessary variables before building our model, to improve predictive power and reduce computation cost.
First, we can select features by thinking logically. The EDA shows that the default proportion for older people is lower than that for younger people. However, does having an e-mail account help predict default probability?
We can also drop variables that are highly correlated with each other. Let's see whether there are correlations between the numerical variables.
u_numerical_column <- c(6,7,12,15,16,17,18)
# Create correlation plot
plotfuncLow <- function(data, mapping){
  p <- ggplot(data = data, mapping = mapping) +
    geom_point(color = mycolors[2], size = 0.5, alpha = 0.7) +
    geom_smooth(method = "lm", alpha = 0.5)
  p
}
library(GGally)
ggpairs(undersample[,u_numerical_column],lower=list(continuous=plotfuncLow),diag=NULL)+
ggtitle('Correlation Analysis')
We see that people with larger families tend to have more kids. The correlation is below 0.9, though, so let's keep both variables for the analysis.
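To back up that claim (a minimal check, assuming `family` and `kid` are still numeric in `undersample`), we can compute the correlation directly:
# Pearson correlation between family size and number of kids
cor(undersample$family, undersample$kid)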
LASSO regression is a popular method that can guide which variables are necessary for the analysis. Let's see which variables the LASSO method selects.
library(glmnet)
library(gamlr)
set.seed(0)
Xmatrix <- sparse.model.matrix(target ~., data=naref(undersample[,-1]))[,-1]
logit_lasso_model <- cv.glmnet(Xmatrix, undersample$target,family='binomial')
plot(logit_lasso_model)
# show the coefficients that LASSO shrinks to exactly zero (the variables it drops)
coef(logit_lasso_model) %>% as.matrix() %>% as.data.frame() %>% filter(s1==0)
## s1
## realestateNo 0
## realestateYes 0
## income_typenon worker 0
## income_typeother 0
## edu대학교 중퇴 0
## edu저학력자 0
## marriage미혼 0
## marriage별거 0
## marriage사별 0
## marriage사실혼 0
## house_type공공분양 0
## house_type주택 / 아파트 0
## work_mobileNo 0
## work_mobileYes 0
## emailNo 0
## emailYes 0
## family 0
## work_year 0
## jobbelow 0
# compute variable importance for RF
library(ranger)
library(tree)
var_imp <- ranger(target ~., data=undersample[,-1],write.forest = TRUE,num.trees = 200, min.node.size = 5, importance ='impurity')
library(timetk)
var_imp <- ranger::importance(var_imp) %>% sort(decreasing=TRUE)
var_imp9 <- var_imp%>%data.frame()%>%tk_tbl()
# Create the bar graph
library(RColorBrewer)
nb.cols <- 18
rfcolors <- colorRampPalette(brewer.pal(8, "Blues"))(nb.cols) # extending colors
ggplot(aes(x = reorder(index, .), y = ., fill = reorder(index, .)), data = var_imp9) +
geom_bar(stat = 'identity', color = 'black', show.legend = FALSE) +
scale_fill_manual(values = rfcolors) +
coord_flip() +
labs(x = "Features", y = "Importance", title = "Random Forest Variable Importance")
work_mobile, email, house_type, job, and sex do not seem to be important factors in the random forest model.
For both models, email and work_mobile were unimportant, so we will discard these two variables and build our models.
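Before fitting, a small sanity check (assuming the column order produced by the preprocessing above) confirms that the indices dropped below correspond to id, work_mobile, and email:
# the models below drop columns 1, 13 and 14; these should be id, work_mobile and email
names(undersample)[c(1, 13, 14)]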
Control <- trainControl(method='repeatedcv',number=5,classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = 'final')
#1 SVM
set.seed(0)
svm <- caret::train(target~., data=undersample[,-c(1,13,14)],
method = 'svmLinear2',
trControl=Control,
tuneLength =5)
predsvm <- predict(svm,newdata=test_data[,-c(1,13,14)])
cm1 <- confusionMatrix(predsvm,test_data$target)
#2 CART
set.seed(0)
cart <- caret::train(target~., data=undersample[,-c(1,13,14)],
method = 'rpart',
trControl=Control,
tuneLength =5)
predcart <- predict(cart,newdata=test_data[,-c(1,13,14)])
cm2 <- confusionMatrix(predcart,test_data$target)
#3 KNN
set.seed(0)
knn <- caret::train(target~., data=undersample[,-c(1,13,14)],
method = 'knn',
trControl=Control,
tuneLength =5)
predknn <- predict(knn,newdata=test_data[,-c(1,13,14)])
cm3 <- confusionMatrix(predknn,test_data$target)
#4 OneR
set.seed(0)
one <- caret::train(target~., data=undersample[,-c(1,13,14)],
method = 'OneR',
trControl=Control,
tuneLength =5)
predone <- predict(one,newdata=test_data[,-c(1,13,14)])
cm4 <- confusionMatrix(predone,test_data$target)
#5 RF
set.seed(0)
rf <- caret::train(target~., data=undersample[,-c(1,13,14)],
method = 'ranger',
trControl=Control,
tuneLength =5)
predrf <- predict(rf,newdata=test_data[,-c(1,13,14)])
cm5 <- confusionMatrix(predrf,test_data$target)
multiclasscmplot <- function(vote, truth){
  cm <- confusionMatrix(vote, truth)
  cmt <- cm$table %>% as_tibble()
  freq <- Hmisc::describe(truth) %>% .$values %>% .$frequency
  labels <- row.names(cm$table)
  cmt$rate <- cmt$n
  # convert each cell count into a percentage of its reference class
  for(i in 1:2){
    for(k in 1:4){
      if(cmt$Reference[k] == labels[i]){
        cmt$rate[k] <- 100 * (cmt$n[k] / freq[i])
      }
    }
  }
  cmt %>% ggplot(aes(x = Prediction, y = Reference, fill = rate)) +
    geom_tile(color = "black") + my_theme(10) +
    scale_fill_gradient2(low = "white", mid = "#008cff", high = "#ff0059", midpoint = 50) +
    theme(axis.text.x = element_text(angle = 45, hjust = 0.5)) +
    ggtitle("Ensemble") +
    geom_text(aes(y = Reference, x = Prediction, label = round(rate, 1)), color = "black", size = 4) +
    coord_fixed()
}
# Plot Confusion Matrix
m1 <- multiclasscmplot(vote=predsvm,truth = test_data$target)+ggtitle('SVM')
m2 <- multiclasscmplot(vote=predcart,truth = test_data$target)+ggtitle('CART')
m3 <- multiclasscmplot(vote=predknn,truth = test_data$target)+ggtitle('KNN')
m4 <- multiclasscmplot(vote=predone,truth = test_data$target)+ggtitle('OneR')
m5 <- multiclasscmplot(vote=predrf,truth = test_data$target)+ggtitle('RF')
grid.arrange(m1,m2,m3,m4,m5,ncol=3)
# create a function that extracts the relevant statistics from the confusion matrix
calculate_metrics <- function(confusion_matrix) {
  accuracy <- confusion_matrix$overall[1]
  sensitivity <- confusion_matrix$byClass[1]
  specificity <- confusion_matrix$byClass[2]
  f1score <- confusion_matrix$byClass[7]
  # false negative rate: share of actual defaulters ('Yes') that are predicted as 'No'
  fn <- confusion_matrix$table[1,2] / (confusion_matrix$table[1,2] + confusion_matrix$table[2,2])
  return(c(accuracy, sensitivity, specificity, f1score, fn))
}
confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5)
metrics_list <- lapply(confusion_matrices, calculate_metrics)
accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF')
results <- tibble(models = rep(models, 5),
                  metrics = c(rep('accuracy', 5),
                              rep('sensitivity', 5),
                              rep('specificity', 5),
                              rep('f1score', 5),
                              rep('fn', 5)),
                  scores = c(accuracy, sensitivity, specificity, f1score, fn))
results$models <- reorder(results$models, results$scores)
results %>%
ggplot(aes(x=models,y=scores,fill=metrics))+
geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
facet_grid(~metrics)+
labs(title='Model Summary')
results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF |
|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 |
In this section, we will use ensemble methods (combining the models) to see whether they return a better model than any single model.
Since RF, SVM, and CART performed best (considering the trade-offs across all metrics), we assign a 50% weight to RF, 40% to SVM, 10% to CART, and 0% to KNN and OneR.
pred1 <- predict(svm,newdata=test_data,type="prob")
pred1 <- pred1*0.4
pred2 <- predict(cart,newdata=test_data,type="prob")
pred2 <- pred2*0.1
pred3 <- predict(knn,newdata=test_data,type="prob")
pred3 <- pred3*0
pred4 <- predict(one,newdata=test_data,type="prob")
pred4 <- pred4*0
pred5 <- predict(rf,newdata=test_data,type="prob")
pred5 <- pred5*0.5
w_ensemble <- (pred1+pred2+pred3+pred4+pred5) %>% as_tibble()
w_ensemble <- ifelse(w_ensemble$No>w_ensemble$Yes,'No','Yes')
w_ensemble <- w_ensemble %>% as.factor()
m6 <- multiclasscmplot(vote=w_ensemble,truth = test_data$target)+ggtitle('Weighted Ensemble')
library(caretEnsemble)
library(doParallel)
algorithmlist = c('svmLinear2','rpart','knn','OneR','ranger')
set.seed(0)
# collect the fitted models into a caretList-style list so caretStack can combine them
model2 <- list(svm = svm, cart = cart, knn = knn, oner = one, rf = rf)
class(model2) <- "caretList"
stack <- caretStack(model2, method = 'gbm', trControl = Control, tuneLength = 3)
s_ensemble <- predict(stack,newdata=test_data)
m7 <- multiclasscmplot(s_ensemble,truth=test_data$target)+ggtitle('Stacking Ensemble')
grid.arrange(m6,m7,m1,m2,m5,ncol=3)
cm6 <- confusionMatrix(w_ensemble,reference = test_data$target)
cm7 <- confusionMatrix(s_ensemble,reference = test_data$target)
# create a function that extracts the relevant statistics from the confusion matrix
calculate_metrics <- function(confusion_matrix) {
  accuracy <- confusion_matrix$overall[1]
  sensitivity <- confusion_matrix$byClass[1]
  specificity <- confusion_matrix$byClass[2]
  f1score <- confusion_matrix$byClass[7]
  # false negative rate: share of actual defaulters ('Yes') that are predicted as 'No'
  fn <- confusion_matrix$table[1,2] / (confusion_matrix$table[1,2] + confusion_matrix$table[2,2])
  return(c(accuracy, sensitivity, specificity, f1score, fn))
}
confusion_matrices <- list(cm1,cm2,cm3,cm4,cm5,cm6,cm7)
metrics_list <- lapply(confusion_matrices, calculate_metrics)
accuracy <- round(sapply(metrics_list, `[`, 1),4)
sensitivity <- round(sapply(metrics_list, `[`, 2),4)
specificity <- round(sapply(metrics_list, `[`, 3),4)
f1score <- round(sapply(metrics_list, `[`, 4),4)
fn <- round(sapply(metrics_list, `[`, 5),4)
models <- c('SVM','CART','KNN','OneR','RF','wEN','sEN')
results <- tibble(models = rep(models, 5),
                  metrics = c(rep('accuracy', 7),
                              rep('sensitivity', 7),
                              rep('specificity', 7),
                              rep('f1score', 7),
                              rep('fn', 7)),
                  scores = c(accuracy, sensitivity, specificity, f1score, fn))
results$models <- reorder(results$models, results$scores)
results %>%
ggplot(aes(x=models,y=scores,fill=metrics))+
geom_bar(color='black',stat='identity',alpha=0.8,show.legend = F,width = 0.3)+
scale_fill_brewer(palette = 'Set1')+
coord_flip()+
facet_grid(~metrics)+
labs(title='Model Summary')
results_wide <- pivot_wider(results,names_from =models, values_from=scores)
knitr::kable(results_wide)
| metrics | SVM | CART | KNN | OneR | RF | wEN | sEN |
|---|---|---|---|---|---|---|---|
| accuracy | 0.5777 | 0.6016 | 0.5533 | 0.5607 | 0.6015 | 0.6007 | 0.6230 |
| sensitivity | 0.5710 | 0.6056 | 0.5539 | 0.5586 | 0.6010 | 0.5988 | 0.6280 |
| specificity | 0.6338 | 0.5685 | 0.5488 | 0.5783 | 0.6058 | 0.6162 | 0.5809 |
| f1score | 0.7071 | 0.7308 | 0.6889 | 0.6943 | 0.7292 | 0.7281 | 0.7484 |
| fn | 0.3662 | 0.4315 | 0.4512 | 0.4217 | 0.3942 | 0.3838 | 0.4191 |
By using the ensemble methods, we achieved higher accuracy and a higher F1 score. However, there is a trade-off between the false negative rate and the accuracy of the model.
Since SVM showed the lowest false negative rate and the other models showed higher ones, the ensemble methods returned a higher false negative rate than SVM alone.
Considering the trade-off between false negative rate and accuracy, the GBM stacking ensemble appears to give the best overall results.