#Description According to WHO, stroke is the second leading cause of death and the third leading cause of disability in humans in the world. It is estimated that 70% of strokes occur in low- and middle-income countries.
The Dataset link: here
The raport is structured as follow:
Introduction.
Data Preparation
Exploratory Data Analysis.
Data Preprocessing.
Modelling
Performance
# 1.1 Import Dataset
rm(list = ls())
data_df = read.csv("healthcare-dataset-stroke-data.csv")
head(data_df)
## id gender age hypertension heart_disease ever_married work_type
## 1 9046 Male 67 0 1 Yes Private
## 2 51676 Female 61 0 0 Yes Self-employed
## 3 31112 Male 80 0 1 Yes Private
## 4 60182 Female 49 0 0 Yes Private
## 5 1665 Female 79 1 0 Yes Self-employed
## 6 56669 Male 81 0 0 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 202.21 N/A never smoked 1
## 3 Rural 105.92 32.5 never smoked 1
## 4 Urban 171.23 34.4 smokes 1
## 5 Rural 174.12 24 never smoked 1
## 6 Urban 186.21 29 formerly smoked 1
## 1.2 check unique values of categorical values
table(data_df$gender) # found "other"
##
## Female Male Other
## 2994 2115 1
table(data_df$ever_married)
##
## No Yes
## 1757 3353
table(data_df$work_type)
##
## children Govt_job Never_worked Private Self-employed
## 687 657 22 2925 819
table(data_df$smoking_status)
##
## formerly smoked never smoked smokes Unknown
## 885 1892 789 1544
table(data_df$Residence_type)
##
## Rural Urban
## 2514 2596
# 1.3 Check for missing values
library(naniar)
miss_scan_count(data = data_df, search = list("N/A","Unknown","Other"))
## # A tibble: 12 × 2
## Variable n
## <chr> <int>
## 1 id 0
## 2 gender 1
## 3 age 0
## 4 hypertension 0
## 5 heart_disease 0
## 6 ever_married 0
## 7 work_type 0
## 8 Residence_type 0
## 9 avg_glucose_level 0
## 10 bmi 201
## 11 smoking_status 1544
## 12 stroke 0
# Notes
##Missing Value and unreasonable value
## 1. bmi = 201 => mean
## 2. gender = 1 => remove
# 2.1 Imputation BMI
data_df$bmi <- as.numeric(data_df$bmi)
## Warning: NAs introduced by coercion
idx <- complete.cases(data_df)
bmi_idx <- is.na(data_df$bmi)
median_bmi <- median(data_df$bmi, na.rm = TRUE)
median_bmi
## [1] 28.1
data_df[bmi_idx,]$bmi <- median_bmi
str(data_df)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : num 67 61 80 49 79 81 74 69 59 78 ...
## $ hypertension : int 0 0 0 0 1 0 1 0 0 0 ...
## $ heart_disease : int 1 0 1 0 0 0 1 0 0 0 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Self-employed" "Private" "Private" ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: num 229 202 106 171 174 ...
## $ bmi : num 36.6 28.1 32.5 34.4 24 29 27.4 22.8 28.1 24.2 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
#Affect of age group, hypertension, heart diseases, living conditions, group of bmi, group of glucose level and smoking status on risk of stroke
#For body mass index classify I used information from here
#For classificaton by average glucose level I used information from here
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
data_imp <- data_df %>%
mutate(bmi = case_when(bmi < 18.5 ~ "underweight",
bmi >= 18.5 & bmi < 25 ~ "normal weight",
bmi >= 25 & bmi < 30 ~ "overweight",
bmi >= 30 ~ "obese"),
bmi = factor(bmi, levels = c("underweight",
"normal weight",
"overweight",
"obese"), order = TRUE)) %>%
mutate(age = case_when(age < 2 ~ "baby",
age >= 2 & age < 17 ~ "child",
age >= 17 & age < 30 ~ "young adults",
age >= 30 & age < 55~ "middle-aged adults",
age >= 55 ~ "old-aged adults"),
age = factor(age, levels = c("baby",
"child",
"young adults",
"middle-aged adults",
"old-aged adults"), order = TRUE)) %>%
mutate(avg_glucose_level = case_when(avg_glucose_level < 100 ~ "normal",
avg_glucose_level >= 100 & avg_glucose_level < 125 ~ "prediabetes",
avg_glucose_level >= 125 ~ "diabetes"),
avg_glucose_level = factor(avg_glucose_level, levels = c("normal",
"prediabetes",
"diabetes"), order = TRUE))
table(data_imp$bmi)
##
## underweight normal weight overweight obese
## 337 1243 1610 1920
table(data_imp$age)
##
## baby child young adults middle-aged adults
## 120 676 719 1816
## old-aged adults
## 1779
table(data_imp$avg_glucose_level)
##
## normal prediabetes diabetes
## 3131 979 1000
# convert data to factor
data_imp$heart_disease <- factor(data_imp$heart_disease)
data_imp$hypertension <- factor(data_imp$hypertension)
data_imp$work_type <- factor(data_imp$work_type)
data_imp$stroke <- factor(data_imp$stroke,
levels = c(0,1),
labels = c("dint have a stroke","Had stroke"))
library(ggplot2)
# 3.1 Univariate Data Analysis
ggplot(data_imp, aes(stroke,))+
geom_bar(fill=c("red","blue")) +
theme_bw()+
theme(plot.title = element_text(hjust = 0.5))+
xlab("stroke")
table(data_imp$stroke)
##
## dint have a stroke Had stroke
## 4861 249
# Notes: From the data of these patients who had a stroke were 249 people
# 3.2 Biavariate Data Analysis
ggplot(data = data_imp,
aes(x=smoking_status,
fill=stroke,)) +
geom_bar() +
scale_fill_manual(values=c("aquamarine3",
"blueviolet"))
ggplot(data = data_imp,
aes(x=age,
fill=factor(stroke,))) +
geom_bar() +
scale_fill_manual(values=c("darkred",
"darkblue"))
ggplot(data = data_imp,
aes(x=heart_disease,
fill=stroke,)) +
geom_bar() +
scale_fill_manual(values=c("aquamarine4",
"darkred"))
ggplot(data = data_imp,
aes(x=hypertension,
fill=stroke,)) +
geom_bar() +
scale_fill_manual(values=c("aquamarine4",
"yellow"))
ggplot(data = data_imp,
aes(x=work_type,
fill=stroke,)) +
geom_bar() +
scale_fill_manual(values=c("aquamarine4",
"orange"))
# 3.3 Biavariate Data Analysis by Density Plot
#bmi
ggplot(data_imp, aes(x=age, fill=stroke)) +
geom_density(alpha=0.7)
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
#bmi
ggplot(data_imp, aes(x=bmi, fill=stroke)) +
geom_density(alpha=0.7)
## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf
ggplot(data_imp, aes(x=avg_glucose_level, fill=stroke)) +
geom_density(alpha=0.7)
ggplot(data_imp, aes(x=heart_disease, fill=stroke)) +
geom_density(alpha=0.7)
# Conclusions of EDA:
##1. Patients with the most strokes are old-aged adults >= 55 years old
##2. Patients who have never smoked can have a stroke
##3. Patients who have never smoked, do not have hypertension, heart disease can have a stroke, are expected to maintain a healthy body
##4. Patients with a body mass index <18.5 are advised to take better care of their health by eating nutritious and protein-rich foods.
##5.
# 4.1New Data frame
data_transformed <- data.frame(data_imp)
str(data_transformed)
## 'data.frame': 5110 obs. of 12 variables:
## $ id : int 9046 51676 31112 60182 1665 56669 53882 10434 27419 60491 ...
## $ gender : chr "Male" "Female" "Male" "Female" ...
## $ age : Ord.factor w/ 5 levels "baby"<"child"<..: 5 5 5 4 5 5 5 5 5 5 ...
## $ hypertension : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 2 1 1 1 ...
## $ heart_disease : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 1 1 1 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : Factor w/ 5 levels "children","Govt_job",..: 4 5 4 4 5 4 4 4 4 4 ...
## $ Residence_type : chr "Urban" "Rural" "Rural" "Urban" ...
## $ avg_glucose_level: Ord.factor w/ 3 levels "normal"<"prediabetes"<..: 3 3 2 3 3 3 1 1 1 1 ...
## $ bmi : Ord.factor w/ 4 levels "underweight"<..: 4 3 4 4 2 3 3 2 3 2 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "never smoked" "smokes" ...
## $ stroke : Factor w/ 2 levels "dint have a stroke",..: 2 2 2 2 2 2 2 2 2 2 ...
# 4.2 Remove
## remove id in dataframe
data_transformed$id <- NULL
## remove other in gender
table(data_transformed$gender)
##
## Female Male Other
## 2994 2115 1
idx <- which(data_transformed$gender %in% c("Other"))
idx
## [1] 3117
data_transformed <- (data_transformed)[-idx,]
table(data_transformed$gender)
##
## Female Male
## 2994 2115
# 4.3 Label Encoding
#ever married
table(data_transformed$ever_married)
##
## No Yes
## 1756 3353
data_transformed$ever_married <- ifelse(data_transformed$ever_married == "Yes", 1, 0)
table(data_transformed$ever_married)
##
## 0 1
## 1756 3353
# smoking status
table(data_transformed$smoking_status)
##
## formerly smoked never smoked smokes Unknown
## 884 1892 789 1544
data_transformed$smoking_status <- as.character(data_transformed$smoking_status)
for (i in 1:length(data_transformed$gender)) {
if (data_transformed$smoking_status[i] == "Unknown") {
data_transformed$smoking_status[i] <- 0
}
#never smoked is 0
else if (data_transformed$smoking_status[i] == "never smoked") {
data_transformed$smoking_status[i] <- 1
}
#formerly smoked is 20
else if (data_transformed$smoking_status[i] == "formerly smoked") {
data_transformed$smoking_status[i] <- 2
}
#smokes is 30
else if (data_transformed$smoking_status[i] == "smokes") {
data_transformed$smoking_status[i] <- 3
}
}
table(data_transformed$smoking_status)
##
## 0 1 2 3
## 1544 1892 884 789
#bmi
data_transformed$bmi <- as.character(data_transformed$bmi)
table(data_transformed$bmi)
##
## normal weight obese overweight underweight
## 1242 1920 1610 337
for (i in 1:length(data_transformed$bmi)) {
if (data_transformed$bmi[i] == "obese") {
data_transformed$bmi[i] <- 3
}
else if (data_transformed$bmi[i] == "overweight") {
data_transformed$bmi[i] <- 2
}
#bmi
else if (data_transformed$bmi[i] == "normal weight") {
data_transformed$bmi[i] <- 0
}
#bmi
else if (data_transformed$bmi[i] == "underweight") {
data_transformed$bmi[i] <- 1
}
}
table(data_transformed$bmi)
##
## 0 1 2 3
## 1242 337 1610 1920
# avg glucose
data_transformed$avg_glucose_level <- as.character(data_transformed$avg_glucose_level)
table(data_imp$avg_glucose_level)
##
## normal prediabetes diabetes
## 3131 979 1000
for (i in 1:length(data_transformed$gender)) {
if (data_transformed$avg_glucose_level[i] == "normal") {
data_transformed$avg_glucose_level[i] <- 0
}
else if (data_transformed$avg_glucose_level[i] == "prediabetes") {
data_transformed$avg_glucose_level[i] <- 1
}
else if (data_transformed$avg_glucose_level[i] == "diabetes") {
data_transformed$avg_glucose_level[i] <- 2
}
}
table(data_transformed$avg_glucose_level)
##
## 0 1 2
## 3131 979 999
#age
data_transformed$age <- as.character(data_transformed$age)
table(data_transformed$age)
##
## baby child middle-aged adults old-aged adults
## 120 676 1816 1779
## young adults
## 718
for (i in 1:length(data_transformed$age)) {
if (data_transformed$age[i] == "baby") {
data_transformed$age[i] <- 0
}
else if (data_transformed$age[i] == "child") {
data_transformed$age[i] <- 1
}
else if (data_transformed$age[i] == "middle-aged adults") {
data_transformed$age[i] <- 2
}
else if (data_transformed$age[i] == "old-aged adults") {
data_transformed$age[i] <- 3
}
else if (data_transformed$age[i] == "young adults") {
data_transformed$age[i] <- 4
}
}
table(data_transformed$age)
##
## 0 1 2 3 4
## 120 676 1816 1779 718
# One Hot Encoding
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# data split
df1 <- data_transformed[, 2:5]
df2 <- data_transformed[, 8:11]
df3 <- data.frame(data_transformed$gender,
data_transformed$work_type,
data_transformed$Residence_type)
df4 <- dummyVars("~.", data = df3)
df5 <- data.frame(predict(df4, df3))
# combinasi data set
final <- cbind(df1,df2,df5)
str(final)
## 'data.frame': 5109 obs. of 17 variables:
## $ age : chr "3" "3" "3" "2" ...
## $ hypertension : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 2 1 1 1 ...
## $ heart_disease : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 1 1 1 ...
## $ ever_married : num 1 1 1 1 1 1 1 0 1 1 ...
## $ avg_glucose_level : chr "2" "2" "1" "2" ...
## $ bmi : chr "3" "2" "3" "3" ...
## $ smoking_status : chr "2" "1" "1" "3" ...
## $ stroke : Factor w/ 2 levels "dint have a stroke",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ data_transformed.genderFemale : num 0 1 0 1 1 0 0 1 1 1 ...
## $ data_transformed.genderMale : num 1 0 1 0 0 1 1 0 0 0 ...
## $ data_transformed.work_type.children : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_transformed.work_type.Govt_job : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_transformed.work_type.Never_worked : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data_transformed.work_type.Private : num 1 0 1 1 0 1 1 1 1 1 ...
## $ data_transformed.work_type.Self.employed: num 0 1 0 0 1 0 0 0 0 0 ...
## $ data_transformed.Residence_typeRural : num 0 1 1 0 1 0 1 0 1 0 ...
## $ data_transformed.Residence_typeUrban : num 1 0 0 1 0 1 0 1 0 1 ...
## convert to factor
final$smoking_status <- factor(final$smoking_status)
final$avg_glucose_level <- factor(final$avg_glucose_level)
final$bmi <- factor(final$bmi)
final$age <- factor(final$age)
final$ever_married <- factor(final$ever_married)
final$data_transformed.genderFemale <- factor(final$data_transformed.genderFemale )
final$data_transformed.genderMale <- factor(final$data_transformed.genderMale)
final$data_transformed.work_type.children <- factor(final$data_transformed.work_type.children)
final$data_transformed.work_type.Govt_job <- factor(final$data_transformed.work_type.Govt_job)
final$data_transformed.work_type.Private <- factor(final$data_transformed.work_type.Private)
final$data_transformed.work_type.Self.employed <- factor(final$data_transformed.work_type.Self.employed)
final$data_transformed.work_type.Never_worked <- factor(final$data_transformed.work_type.Never_worked)
final$data_transformed.Residence_typeRural <- factor(final$data_transformed.Residence_typeRural)
final$data_transformed.Residence_typeUrban <- factor(final$data_transformed.Residence_typeUrban)
str(final)
## 'data.frame': 5109 obs. of 17 variables:
## $ age : Factor w/ 5 levels "0","1","2","3",..: 4 4 4 3 4 4 4 4 4 4 ...
## $ hypertension : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 2 1 1 1 ...
## $ heart_disease : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 2 1 1 1 ...
## $ ever_married : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 2 2 ...
## $ avg_glucose_level : Factor w/ 3 levels "0","1","2": 3 3 2 3 3 3 1 1 1 1 ...
## $ bmi : Factor w/ 4 levels "0","1","2","3": 4 3 4 4 1 3 3 1 3 1 ...
## $ smoking_status : Factor w/ 4 levels "0","1","2","3": 3 2 2 4 2 3 2 2 1 1 ...
## $ stroke : Factor w/ 2 levels "dint have a stroke",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ data_transformed.genderFemale : Factor w/ 2 levels "0","1": 1 2 1 2 2 1 1 2 2 2 ...
## $ data_transformed.genderMale : Factor w/ 2 levels "0","1": 2 1 2 1 1 2 2 1 1 1 ...
## $ data_transformed.work_type.children : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_transformed.work_type.Govt_job : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_transformed.work_type.Never_worked : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ data_transformed.work_type.Private : Factor w/ 2 levels "0","1": 2 1 2 2 1 2 2 2 2 2 ...
## $ data_transformed.work_type.Self.employed: Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 1 ...
## $ data_transformed.Residence_typeRural : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 1 2 1 ...
## $ data_transformed.Residence_typeUrban : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 1 2 1 2 ...
# Train & Test dataset
row <- dim(final)[1]
train_idx <- sample(row, 0.7 * row)
training_data <- final[train_idx,]
testing_data <- final[-train_idx,]
library(ROSE)
## Loaded ROSE 0.0-4
library(rpart)
training_data %>%
group_by(stroke) %>%
summarize(n = n()) %>%
mutate(prop = round(n / sum(n), 2))
## # A tibble: 2 × 3
## stroke n prop
## <fct> <int> <dbl>
## 1 dint have a stroke 3398 0.95
## 2 Had stroke 178 0.05
## 1. Dcision Tree
ti <- rpart(stroke~., data = training_data)
pred.ti <- predict(ti, newdata = testing_data)
answer <- testing_data$stroke
accuracy.meas(answer, pred.ti[,2])
##
## Call:
## accuracy.meas(response = answer, predicted = pred.ti[, 2])
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: NaN
## recall: 0.000
## F: NaN
# AUC ( Area under the curve)
roc.curve(answer, pred.ti[,2])
## Area under the curve (AUC): 0.500
# 2. Over Sampling
training_data %>%
group_by(stroke) %>%
summarize(n = n()) %>%
mutate(prop = round(n / sum(n), 2))
## # A tibble: 2 × 3
## stroke n prop
## <fct> <int> <dbl>
## 1 dint have a stroke 3398 0.95
## 2 Had stroke 178 0.05
table(training_data$stroke)
##
## dint have a stroke Had stroke
## 3398 178
data_balanced_over <- ovun.sample(stroke~.,
data = training_data,
method = "over",
N = 6810)$data # N = 0 x 2
data_balanced_over %>%
group_by(stroke) %>%
summarize(n = n()) %>%
mutate(prop = round(n / sum(n), 2))
## # A tibble: 2 × 3
## stroke n prop
## <fct> <int> <dbl>
## 1 dint have a stroke 3398 0.5
## 2 Had stroke 3412 0.5
# 3. Undersampling
data_balanced_under <- ovun.sample(stroke~.,
data = training_data,
method = "under",
N = 342, # data 1 x2
seed = 1)$data
table(data_balanced_under$stroke)
##
## dint have a stroke Had stroke
## 164 178
# BOth => Undersampling + Oversampling
data_balanced_both <- ovun.sample(stroke~.,
data = training_data,
p=0.5,
N = 3577, # N= data train
seed = 1)$data
table(data_balanced_both$stroke)
##
## dint have a stroke Had stroke
## 1838 1739
data.rose <- ROSE(stroke~.,
data = training_data,
seed = 1)$data
table(data.rose$stroke)
##
## dint have a stroke Had stroke
## 1838 1738
#logistic Regression
logit <- glm(formula = stroke~.,
data=data.rose,
family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
answer <- testing_data$stroke
pred.prob <- predict(logit,
testing_data,
type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# pred < 0.5 => class 0 stroke
# pred >= 0.5 => class 1 no stroke
pred.logit <- factor(pred.prob > 0.5,
levels = c(FALSE,
TRUE),
labels = c("No",
"yes"))
# Dicision Tree
#install.packages("party")
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
dt <- ctree(formula = stroke~.,
data=data_balanced_over)
pred.dt <- predict(dt,
testing_data)
[here]https://drive.google.com/file/d/1yYfse-xkBmGrgcoNSClOkLukdu33bTOy/view?usp=sharing
#Random Forest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf <- randomForest(formula=stroke~.,
data=data_balanced_both)
pred.rf <- predict(rf,
testing_data)
performance <- function(prediction, actual, nama_model){
#confusion matrix
cm <- table(actual, prediction,
dnn = c("Actual", "Prediction"))
#dnn -> The dimension Names
TP <- cm[2, 2]
TN <- cm[1, 1]
FN <- cm[2, 1]
FP <- cm[1, 2]
accuracy <- (TP + TN) / (TP + TN + FP + FN)
precision <- TP / (TP / FP)
Recall <- TP / (TP +FN)
f1_score <- (2*precision*Recall) / (precision + Recall)
result <- paste("Model : ", nama_model,
"\nAccuracy : ", round(accuracy, 3),
"\nPrecision : ", round(precision, 3),
"\nRecall : ", round(Recall, 3),
"\nf1 Score : ", round(f1_score, 3))
cat(result)
}
# Logistic Regression
performance(pred.logit, answer, "Logistic Regression")
## Model : Logistic Regression
## Accuracy : 0.7
## Precision : 449
## Recall : 0.845
## f1 Score : 1.687
# Descision Tree
performance(pred.dt, answer, "Decision Tree")
## Model : Decision Tree
## Accuracy : 0.701
## Precision : 438
## Recall : 0.718
## f1 Score : 1.434
# Random Forest
performance(pred.rf, answer, "Random Forest")
## Model : Random Forest
## Accuracy : 0.757
## Precision : 343
## Recall : 0.577
## f1 Score : 1.153
# Logistic Regression is the right choice for the model