library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(gtools)
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.2.3
library(ggplot2)
library(class)
library(tidyr)
dt2 <- read_excel("D:/SEM 6/BISMILLAH LOMBA/SATDAT JUARA PART 2/Data Cadangan(2).xlsx")
head(dt2)
## # A tibble: 6 × 14
##     age   sex    cp trestbps  chol   fbs restecg thalach exang oldpeak slope
##   <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1    69     1     0      160   234     1       2     131     0     0.1     1
## 2    69     0     0      140   239     0       0     151     0     1.8     0
## 3    66     0     0      150   226     0       0     114     0     2.6     2
## 4    65     1     0      138   282     1       2     174     0     1.4     1
## 5    64     1     0      110   211     0       2     144     1     1.8     1
## 6    64     1     0      170   227     0       2     155     0     0.6     1
## # … with 3 more variables: ca <dbl>, thal <dbl>, condition <dbl>
X1 = dt2$age
X2 = as.factor(dt2$sex)
X3 = as.factor(dt2$cp)
X4 = as.factor(dt2$trestbps)
X5 = as.factor(dt2$chol)
X6 = as.factor(dt2$fbs)
X7 = as.factor(dt2$restecg)
X8 = as.factor(dt2$thalach)
X9 = as.factor(dt2$exang)
X10 = as.factor(dt2$oldpeak)
X11 = as.factor(dt2$slope)
X12 = as.factor(dt2$ca)
X13 = as.factor(dt2$thal)
Y = as.factor(dt2$condition)
df = data.frame(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,Y)
str(df)
## 'data.frame': 297 obs. of 14 variables:
## $ X1 : num 69 69 66 65 64 64 63 61 60 59 ...
## $ X2 : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
## $ X3 : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ X4 : Factor w/ 50 levels "94","100","101",..: 41 29 35 28 9 44 32 25 35 47 ...
## $ X5 : Factor w/ 152 levels "126","131","141",..: 66 70 58 109 43 59 65 66 71 100 ...
## $ X6 : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 1 ...
## $ X7 : Factor w/ 3 levels "0","1","2": 3 1 1 3 3 3 3 1 1 3 ...
## $ X8 : Factor w/ 91 levels "71","88","90",..: 32 51 16 74 44 55 50 45 71 45 ...
## $ X9 : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ X10: Factor w/ 40 levels "0","0.1","0.2",..: 2 18 26 15 18 7 23 26 10 37 ...
## $ X11: Factor w/ 3 levels "0","1","2": 2 1 3 2 2 2 3 2 1 3 ...
## $ X12: Factor w/ 4 levels "0","1","2","3": 2 3 1 2 1 1 1 3 1 1 ...
## $ X13: Factor w/ 3 levels "0","1","2": 1 1 1 1 1 3 2 1 1 3 ...
## $ Y : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 2 1 1 ...
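The str() output shows that X4, X5, X8, and X10 come from continuous measurements (trestbps, chol, thalach, oldpeak) but were converted into factors with 50, 152, 91, and 40 levels respectively; this choice matters for the full model fitted later. A quick sketch to check those level counts:
# Sketch: level counts of the factors built from continuous measurements.
sapply(df[, c("X4", "X5", "X8", "X10")], nlevels)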
colSums(is.na(df))
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 Y
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0
table(df$Y)
##
## 0 1
## 160 137
prop.table(table(df$Y))
##
## 0 1
## 0.5387205 0.4612795
ggplot(df, mapping = aes(X1)) +
  geom_histogram(fill = "blue", col = "black", bins = 20)
ggplot(df, mapping = aes(X1, fill = Y)) +
  geom_density(alpha = 0.1)
ggplot(df,mapping = aes(Y,X1))+geom_boxplot(fill="blue",col="black")
ggplot(df, mapping = aes(X2, fill = Y)) +
  geom_bar(col = "black") +
  geom_text(stat = "count", aes(label = after_stat(count)),
            position = position_stack(0.5), color = "white", size = 4)
# Cross validation (hold-out train/test split)
set.seed(305)
# Draw 80% of the row indices for training
intrain <- sample(nrow(df), nrow(df)*0.8)
# Make train and test data
df_train <- df[intrain,]
df_test <- df[-intrain,]
# Check class proportions
prop.table(table(df_train$Y))
##
## 0 1
## 0.5527426 0.4472574
prop.table(table(df_test$Y))
##
## 0 1
## 0.4833333 0.5166667
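The class proportions drift slightly between train and test because sample() does not stratify on Y. A minimal sketch of a stratified alternative, assuming the caret package (loaded later in this document) is available; the _strat names are hypothetical:
# Sketch: stratified 80/20 split that preserves the class balance of Y.
set.seed(305)
idx_strat <- caret::createDataPartition(df$Y, p = 0.8, list = FALSE)
df_train_strat <- df[idx_strat, ]
df_test_strat <- df[-idx_strat, ]
prop.table(table(df_train_strat$Y)) # stays near the full-data 0.54/0.46 split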
The first model uses all predictor variables, both the integer-valued and the factor-valued ones; we then apply the stepwise method to find the best combination of the available predictors.
# stepwise: automatic model selection method
options(warn=-1)
log_model_all <- glm(Y ~ ., family="binomial", data= df_train)
log_model_nothing <- glm(Y ~ 1, family="binomial", data= df_train)
log_model1 <- step(log_model_nothing,
                   scope = list(lower = formula(log_model_nothing),
                                upper = formula(log_model_all)),
                   direction = "both", trace = F, test = "F")
formula(log_model1)
## Y ~ X13 + X12 + X3 + X11 + X2 + X9
log_model1 <- glm(Y ~ X13 + X12 + X3 + X11 + X2 + X9,
family = "binomial", data = df_train)
summary(log_model1)
##
## Call:
## glm(formula = Y ~ X13 + X12 + X3 + X11 + X2 + X9, family = "binomial",
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9298 -0.5220 -0.1965 0.3894 2.7542
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.9024 0.9794 -5.006 5.57e-07 ***
## X131 0.2286 0.8439 0.271 0.786454
## X132 1.7365 0.4585 3.788 0.000152 ***
## X121 2.0084 0.5167 3.887 0.000102 ***
## X122 3.2813 0.8148 4.027 5.65e-05 ***
## X123 3.1016 1.0024 3.094 0.001975 **
## X31 1.0512 0.8462 1.242 0.214164
## X32 0.1090 0.7864 0.139 0.889781
## X33 1.9250 0.7465 2.579 0.009915 **
## X111 1.5227 0.4813 3.164 0.001557 **
## X112 1.5153 0.7935 1.910 0.056196 .
## X21 1.1324 0.5206 2.175 0.029609 *
## X91 0.8554 0.4694 1.822 0.068421 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.91 on 236 degrees of freedom
## Residual deviance: 162.65 on 224 degrees of freedom
## AIC: 188.65
##
## Number of Fisher Scoring iterations: 6
Next we remove the variables whose levels have large p-values (X13 and X3).
log_model1 <- update(log_model1, .~.-X13-X3)
summary(log_model1)
##
## Call:
## glm(formula = Y ~ X12 + X11 + X2 + X9, family = "binomial", data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4885 -0.5054 -0.2375 0.6710 2.6767
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5542 0.5039 -7.053 1.75e-12 ***
## X121 1.9128 0.4451 4.298 1.73e-05 ***
## X122 3.3587 0.6794 4.944 7.67e-07 ***
## X123 3.2887 0.8308 3.958 7.54e-05 ***
## X111 1.5609 0.3962 3.939 8.17e-05 ***
## X112 1.4668 0.6540 2.243 0.024910 *
## X21 1.4570 0.4046 3.601 0.000317 ***
## X91 1.6735 0.3967 4.219 2.46e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.91 on 236 degrees of freedom
## Residual deviance: 197.72 on 229 degrees of freedom
## AIC: 213.72
##
## Number of Fisher Scoring iterations: 5
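Dropping X13 and X3 raised the residual deviance from 162.65 to 197.72 and the AIC from 188.65 to 213.72, so the removal is worth double-checking with a likelihood-ratio test. A sketch, refitting the fuller model under the hypothetical name log_model1_full so that log_model1 stays the reduced model:
# Sketch: likelihood-ratio test of the dropped terms X13 and X3.
log_model1_full <- glm(Y ~ X13 + X12 + X3 + X11 + X2 + X9,
                       family = "binomial", data = df_train)
anova(log_model1, log_model1_full, test = "Chisq")
# A deviance drop of about 35 on 5 df would be highly significant,
# suggesting the dropped terms carry information despite the large
# p-values of some of their levels.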
For the second model we use all the predictors whose class is factor rather than numeric.
log_model2 <- glm(formula = Y ~ X2 + X3 + X6 + X7 + X9 + X11 + X12 + X13, family = "binomial",
data = df_train)
summary(log_model2)
##
## Call:
## glm(formula = Y ~ X2 + X3 + X6 + X7 + X9 + X11 + X12 + X13, family = "binomial",
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8845 -0.4989 -0.1635 0.3719 2.7353
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.0358 1.0234 -4.921 8.62e-07 ***
## X21 1.1388 0.5236 2.175 0.029648 *
## X31 1.0714 0.8579 1.249 0.211711
## X32 0.1802 0.7949 0.227 0.820628
## X33 1.8761 0.7527 2.493 0.012684 *
## X61 -0.5911 0.6532 -0.905 0.365499
## X71 0.5302 3.0879 0.172 0.863659
## X72 0.3293 0.4134 0.797 0.425719
## X91 0.9108 0.4732 1.925 0.054251 .
## X111 1.5079 0.4856 3.105 0.001902 **
## X112 1.4519 0.7994 1.816 0.069339 .
## X121 1.9804 0.5209 3.802 0.000144 ***
## X122 3.4682 0.8588 4.039 5.38e-05 ***
## X123 3.2147 1.0369 3.100 0.001933 **
## X131 0.3684 0.8651 0.426 0.670202
## X132 1.7662 0.4641 3.806 0.000141 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.91 on 236 degrees of freedom
## Residual deviance: 161.22 on 221 degrees of freedom
## AIC: 193.22
##
## Number of Fisher Scoring iterations: 6
Finally, we once more use all the predictor variables and perform model selection with the stepwise AIC method. Note that this model is fitted on the full data set df rather than on df_train.
log_model3 <- glm(formula = Y ~ ., family = "binomial", data = df)
summary(log_model3)
##
## Call:
## glm(formula = Y ~ ., family = "binomial", data = df)
##
## Deviance Residuals:
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [26] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [51] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [76] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [101] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [126] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [151] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [201] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [226] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [251] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [276] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## Coefficients: (49 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.454e+03 2.018e+08 0 1
## X1 3.511e+00 2.924e+05 0 1
## X21 2.828e+02 2.199e+07 0 1
## X31 -7.631e+02 6.282e+07 0 1
## X32 1.182e+03 1.016e+08 0 1
## X33 -3.257e+02 2.810e+07 0 1
## X4100 -7.339e+03 6.270e+08 0 1
## X4101 -4.529e+03 3.858e+08 0 1
## X4102 -1.698e+03 1.557e+08 0 1
## X4104 6.716e+02 7.904e+07 0 1
## X4105 -2.489e+03 2.076e+08 0 1
## X4106 -3.112e+03 2.686e+08 0 1
## X4108 -4.329e+03 3.652e+08 0 1
## X4110 -3.179e+03 2.677e+08 0 1
## X4112 -6.624e+03 5.524e+08 0 1
## X4114 4.626e+02 3.204e+07 0 1
## X4115 -2.619e+03 1.973e+08 0 1
## X4117 -7.999e+03 6.607e+08 0 1
## X4118 -5.152e+03 4.360e+08 0 1
## X4120 -4.405e+03 3.629e+08 0 1
## X4122 -4.714e+03 4.107e+08 0 1
## X4123 -4.666e+03 3.888e+08 0 1
## X4124 -4.138e+03 3.383e+08 0 1
## X4125 -5.285e+03 4.401e+08 0 1
## X4126 -2.360e+03 2.173e+08 0 1
## X4128 -2.347e+03 1.940e+08 0 1
## X4129 1.658e+03 1.528e+08 0 1
## X4130 -2.957e+03 2.404e+08 0 1
## X4132 4.500e+03 3.986e+08 0 1
## X4134 -2.400e+03 2.012e+08 0 1
## X4135 -5.528e+03 4.607e+08 0 1
## X4136 -2.118e+03 1.682e+08 0 1
## X4138 -2.883e+03 2.375e+08 0 1
## X4140 -3.271e+03 2.730e+08 0 1
## X4142 -4.187e+03 3.428e+08 0 1
## X4144 -4.426e+03 3.670e+08 0 1
## X4145 -2.106e+03 1.776e+08 0 1
## X4146 6.247e+03 5.145e+08 0 1
## X4148 -5.138e+03 4.330e+08 0 1
## X4150 -3.160e+03 2.654e+08 0 1
## X4152 -3.164e+03 2.664e+08 0 1
## X4154 -2.047e+03 1.783e+08 0 1
## X4155 -1.263e+03 9.830e+07 0 1
## X4156 3.857e+02 3.600e+07 0 1
## X4158 -2.261e+03 1.850e+08 0 1
## X4160 -5.392e+03 4.505e+08 0 1
## X4164 -5.237e+03 4.403e+08 0 1
## X4165 -1.182e+04 9.918e+08 0 1
## X4170 2.251e+03 1.991e+08 0 1
## X4172 -4.040e+03 3.422e+08 0 1
## X4174 -7.856e+03 6.569e+08 0 1
## X4178 8.977e+01 9.282e+06 0 1
## X4180 8.497e+01 1.831e+07 0 1
## X4192 -5.661e+02 4.026e+07 0 1
## X4200 2.893e+03 2.752e+08 0 1
## X5131 1.850e+03 1.553e+08 0 1
## X5141 1.789e+03 1.567e+08 0 1
## X5149 5.220e+03 4.383e+08 0 1
## X5157 3.131e+03 2.577e+08 0 1
## X5160 -1.656e+03 1.484e+08 0 1
## X5164 6.788e+03 5.645e+08 0 1
## X5166 9.899e+02 8.037e+07 0 1
## X5167 7.835e+02 6.083e+07 0 1
## X5168 6.749e+02 5.990e+07 0 1
## X5169 3.994e+03 3.259e+08 0 1
## X5172 2.847e+03 2.405e+08 0 1
## X5174 -2.351e+02 1.770e+07 0 1
## X5175 -5.930e+02 4.033e+07 0 1
## X5176 NA NA NA NA
## X5177 9.836e+02 8.041e+07 0 1
## X5178 1.207e+03 1.008e+08 0 1
## X5180 -1.502e+02 1.853e+07 0 1
## X5182 4.345e+03 3.735e+08 0 1
## X5183 5.726e+02 5.237e+07 0 1
## X5184 -1.402e+01 1.335e+07 0 1
## X5185 3.595e+03 2.968e+08 0 1
## X5186 2.622e+03 2.319e+08 0 1
## X5187 3.209e+03 2.652e+08 0 1
## X5188 3.409e+02 2.345e+07 0 1
## X5192 4.251e+03 3.760e+08 0 1
## X5193 2.876e+03 2.366e+08 0 1
## X5195 4.073e+03 3.440e+08 0 1
## X5196 -4.546e+03 3.869e+08 0 1
## X5197 4.459e+03 3.649e+08 0 1
## X5198 2.450e+03 2.131e+08 0 1
## X5199 -1.059e+03 8.650e+07 0 1
## X5200 1.488e+03 1.141e+08 0 1
## X5201 2.772e+03 2.373e+08 0 1
## X5203 4.683e+03 3.915e+08 0 1
## X5204 1.271e+03 1.058e+08 0 1
## X5205 3.954e+03 3.536e+08 0 1
## X5206 1.368e+03 1.068e+08 0 1
## X5207 -6.026e+03 5.133e+08 0 1
## X5208 1.543e+02 4.948e+06 0 1
## X5209 2.997e+03 2.524e+08 0 1
## X5210 3.315e+03 2.872e+08 0 1
## X5211 2.734e+03 2.315e+08 0 1
## X5212 1.213e+03 1.018e+08 0 1
## X5213 2.146e+03 1.887e+08 0 1
## X5214 1.107e+03 9.433e+07 0 1
## X5215 2.022e+03 1.622e+08 0 1
## X5216 2.972e+03 2.401e+08 0 1
## X5217 -4.141e+02 3.634e+07 0 1
## X5218 -1.474e+03 1.035e+08 0 1
## X5219 2.348e+03 1.907e+08 0 1
## X5220 3.659e+03 3.026e+08 0 1
## X5221 2.409e+03 2.037e+08 0 1
## X5222 3.929e+03 3.351e+08 0 1
## X5223 1.859e+03 1.676e+08 0 1
## X5224 -5.778e+03 5.011e+08 0 1
## X5225 -1.657e+03 1.521e+08 0 1
## X5226 8.719e+02 6.893e+07 0 1
## X5227 -4.590e+02 4.297e+07 0 1
## X5228 -3.266e+03 2.783e+08 0 1
## X5229 -4.422e+02 3.499e+07 0 1
## X5230 5.874e+03 4.891e+08 0 1
## X5231 1.721e+03 1.369e+08 0 1
## X5232 2.371e+03 2.032e+08 0 1
## X5233 7.887e+02 7.120e+07 0 1
## X5234 2.775e+03 2.307e+08 0 1
## X5235 -1.098e+03 8.472e+07 0 1
## X5236 3.169e+03 2.560e+08 0 1
## X5237 -1.542e+03 1.363e+08 0 1
## X5239 2.835e+03 2.382e+08 0 1
## X5240 2.775e+03 2.298e+08 0 1
## X5241 -1.424e+03 1.265e+08 0 1
## X5242 4.385e+03 3.730e+08 0 1
## X5243 4.849e+03 4.059e+08 0 1
## X5244 3.604e+03 3.020e+08 0 1
## X5245 -1.164e+03 1.043e+08 0 1
## X5246 1.910e+03 1.613e+08 0 1
## X5247 5.903e+03 5.002e+08 0 1
## X5248 3.115e+03 2.657e+08 0 1
## X5249 5.888e+03 4.853e+08 0 1
## X5250 5.275e+03 4.434e+08 0 1
## X5252 2.721e+03 2.243e+08 0 1
## X5253 1.587e+03 1.297e+08 0 1
## X5254 2.480e+03 2.023e+08 0 1
## X5255 1.214e+03 9.631e+07 0 1
## X5256 1.362e+03 1.074e+08 0 1
## X5257 1.014e+03 6.491e+07 0 1
## X5258 3.203e+03 2.661e+08 0 1
## X5259 7.271e+02 7.233e+07 0 1
## X5260 4.390e+02 2.342e+07 0 1
## X5261 1.766e+03 1.377e+08 0 1
## X5262 4.774e+03 3.953e+08 0 1
## X5263 4.653e+03 3.942e+08 0 1
## X5264 1.985e+03 1.635e+08 0 1
## X5265 1.211e+03 1.153e+08 0 1
## X5266 6.469e+03 5.404e+08 0 1
## X5267 8.933e+02 8.095e+07 0 1
## X5268 3.185e+03 2.635e+08 0 1
## X5269 1.862e+03 1.583e+08 0 1
## X5270 6.557e+02 5.643e+07 0 1
## X5271 1.524e+03 1.271e+08 0 1
## X5273 3.891e+03 3.245e+08 0 1
## X5274 -3.258e+03 2.865e+08 0 1
## X5275 -2.557e+03 2.272e+08 0 1
## X5276 1.954e+03 1.736e+08 0 1
## X5277 -7.278e+01 6.536e+06 0 1
## X5278 -5.148e+03 4.226e+08 0 1
## X5281 2.517e+02 2.197e+07 0 1
## X5282 1.384e+03 1.189e+08 0 1
## X5283 -1.155e+03 1.017e+08 0 1
## X5284 4.488e+03 3.673e+08 0 1
## X5286 3.536e+03 2.858e+08 0 1
## X5288 -5.219e+03 4.675e+08 0 1
## X5289 6.595e+03 5.521e+08 0 1
## X5290 2.369e+03 1.993e+08 0 1
## X5293 2.876e+03 2.370e+08 0 1
## X5294 -3.591e+02 2.728e+07 0 1
## X5295 3.272e+03 2.688e+08 0 1
## X5298 1.044e+03 8.218e+07 0 1
## X5299 2.399e+03 2.086e+08 0 1
## X5300 8.444e+03 7.053e+08 0 1
## X5302 4.341e+03 3.693e+08 0 1
## X5303 1.624e+03 1.266e+08 0 1
## X5304 3.350e+03 2.769e+08 0 1
## X5305 2.509e+03 2.006e+08 0 1
## X5306 1.731e+03 1.723e+08 0 1
## X5307 -9.047e+02 8.217e+07 0 1
## X5308 2.014e+03 1.681e+08 0 1
## X5309 3.965e+03 3.201e+08 0 1
## X5311 -5.773e+02 4.534e+07 0 1
## X5313 -2.299e+03 1.993e+08 0 1
## X5315 -1.248e+02 1.858e+07 0 1
## X5318 -3.586e+02 1.820e+07 0 1
## X5319 5.508e+03 4.486e+08 0 1
## X5321 4.755e+02 3.879e+07 0 1
## X5322 5.943e+03 4.967e+08 0 1
## X5325 1.201e+03 8.706e+07 0 1
## X5326 -5.794e+03 5.034e+08 0 1
## X5327 2.590e+02 2.760e+07 0 1
## X5330 1.624e+03 1.278e+08 0 1
## X5335 9.347e+02 7.472e+07 0 1
## X5340 1.200e+03 9.324e+07 0 1
## X5341 -3.426e+03 3.096e+08 0 1
## X5342 -9.044e+03 7.854e+08 0 1
## X5353 -6.113e+03 5.365e+08 0 1
## X5354 2.925e+03 2.518e+08 0 1
## X5360 4.979e+03 4.165e+08 0 1
## X5394 3.628e+03 3.021e+08 0 1
## X5407 5.629e+03 4.644e+08 0 1
## X5409 7.924e+02 6.610e+07 0 1
## X5417 2.777e+03 2.308e+08 0 1
## X5564 9.541e+02 5.679e+07 0 1
## X61 -2.662e+02 2.185e+07 0 1
## X71 -3.350e+03 2.932e+08 0 1
## X72 -4.163e+02 3.474e+07 0 1
## X888 3.501e+03 3.108e+08 0 1
## X890 NA NA NA NA
## X895 NA NA NA NA
## X896 -4.085e+02 3.683e+07 0 1
## X897 -5.148e+03 4.461e+08 0 1
## X899 1.993e+02 9.609e+06 0 1
## X8103 1.503e+03 1.278e+08 0 1
## X8105 -7.362e+03 6.236e+08 0 1
## X8106 1.188e+03 9.463e+07 0 1
## X8108 -1.058e+03 7.466e+07 0 1
## X8109 -5.200e+03 4.376e+08 0 1
## X8111 3.971e+02 3.872e+07 0 1
## X8112 -1.319e+03 1.124e+08 0 1
## X8113 5.878e+02 5.918e+07 0 1
## X8114 -4.241e+02 2.457e+07 0 1
## X8115 -1.814e+03 1.570e+08 0 1
## X8116 -1.691e+03 1.108e+08 0 1
## X8117 NA NA NA NA
## X8118 -4.355e+02 2.300e+07 0 1
## X8120 5.920e+02 5.024e+07 0 1
## X8121 -1.412e+03 1.304e+08 0 1
## X8122 -1.047e+03 7.857e+07 0 1
## X8123 NA NA NA NA
## X8124 NA NA NA NA
## X8125 -1.000e+03 8.025e+07 0 1
## X8126 -2.942e+03 2.407e+08 0 1
## X8127 8.504e+02 7.858e+07 0 1
## X8128 -3.832e+03 3.161e+08 0 1
## X8129 1.934e+03 1.584e+08 0 1
## X8130 -1.262e+03 1.150e+08 0 1
## X8131 -3.508e+03 2.828e+08 0 1
## X8132 -1.301e+03 1.037e+08 0 1
## X8133 NA NA NA NA
## X8134 NA NA NA NA
## X8136 -3.355e+03 2.775e+08 0 1
## X8137 -9.620e+03 8.047e+08 0 1
## X8138 6.404e+03 5.487e+08 0 1
## X8139 NA NA NA NA
## X8140 6.570e+02 6.734e+07 0 1
## X8141 -8.075e+02 6.322e+07 0 1
## X8142 -2.168e+03 1.785e+08 0 1
## X8143 -1.051e+03 7.811e+07 0 1
## X8144 -2.838e+03 2.303e+08 0 1
## X8145 -3.300e+03 2.710e+08 0 1
## X8146 -2.928e+03 2.399e+08 0 1
## X8147 -2.615e+03 2.102e+08 0 1
## X8148 -3.716e+03 3.159e+08 0 1
## X8149 -4.082e+03 3.394e+08 0 1
## X8150 -9.856e+02 7.896e+07 0 1
## X8151 -2.287e+03 1.871e+08 0 1
## X8152 -4.577e+03 3.797e+08 0 1
## X8153 2.132e+03 1.794e+08 0 1
## X8154 -4.376e+03 3.563e+08 0 1
## X8155 -4.011e+03 3.319e+08 0 1
## X8156 -1.826e+03 1.316e+08 0 1
## X8157 -1.939e+03 1.571e+08 0 1
## X8158 -1.781e+03 1.456e+08 0 1
## X8159 -1.216e+03 8.812e+07 0 1
## X8160 -1.817e+03 1.475e+08 0 1
## X8161 -2.144e+03 1.719e+08 0 1
## X8162 -1.015e+03 7.905e+07 0 1
## X8163 -1.232e+03 1.058e+08 0 1
## X8164 -2.055e+03 1.677e+08 0 1
## X8165 -2.930e+03 2.441e+08 0 1
## X8166 3.007e+03 2.609e+08 0 1
## X8167 NA NA NA NA
## X8168 -1.823e+03 1.564e+08 0 1
## X8169 -5.666e+02 4.407e+07 0 1
## X8170 -1.409e+03 1.134e+08 0 1
## X8171 -5.331e+03 4.430e+08 0 1
## X8172 -6.616e+02 5.053e+07 0 1
## X8173 -2.402e+03 2.050e+08 0 1
## X8174 -1.660e+03 1.364e+08 0 1
## X8175 -9.239e+02 7.540e+07 0 1
## X8177 -3.404e+03 2.709e+08 0 1
## X8178 -1.860e+02 4.298e+06 0 1
## X8179 -2.740e+03 2.283e+08 0 1
## X8180 6.858e+02 5.427e+07 0 1
## X8181 -1.220e+03 1.085e+08 0 1
## X8182 -8.704e+02 6.771e+07 0 1
## X8184 -3.523e+03 3.138e+08 0 1
## X8185 NA NA NA NA
## X8186 -1.406e+03 9.944e+07 0 1
## X8187 -6.393e+03 5.391e+08 0 1
## X8188 -1.129e+03 8.930e+07 0 1
## X8190 NA NA NA NA
## X8192 NA NA NA NA
## X8194 -1.421e+03 1.161e+08 0 1
## X8195 NA NA NA NA
## X8202 NA NA NA NA
## X91 7.103e+02 5.889e+07 0 1
## X100.1 3.801e+03 3.153e+08 0 1
## X100.2 1.683e+03 1.503e+08 0 1
## X100.3 1.032e+03 8.642e+07 0 1
## X100.4 9.859e+01 1.850e+06 0 1
## X100.5 2.252e+03 1.891e+08 0 1
## X100.6 -3.531e+02 3.224e+07 0 1
## X100.7 NA NA NA NA
## X100.8 -7.749e+02 6.643e+07 0 1
## X100.9 3.025e+03 2.594e+08 0 1
## X101 3.327e+03 2.840e+08 0 1
## X101.1 -7.753e+01 1.369e+07 0 1
## X101.2 -3.745e+02 2.902e+07 0 1
## X101.3 NA NA NA NA
## X101.4 9.022e+02 6.901e+07 0 1
## X101.5 NA NA NA NA
## X101.6 NA NA NA NA
## X101.8 NA NA NA NA
## X101.9 NA NA NA NA
## X102 NA NA NA NA
## X102.1 NA NA NA NA
## X102.2 NA NA NA NA
## X102.3 NA NA NA NA
## X102.4 NA NA NA NA
## X102.5 NA NA NA NA
## X102.6 NA NA NA NA
## X102.8 NA NA NA NA
## X102.9 NA NA NA NA
## X103 NA NA NA NA
## X103.1 NA NA NA NA
## X103.2 NA NA NA NA
## X103.4 NA NA NA NA
## X103.5 NA NA NA NA
## X103.6 NA NA NA NA
## X103.8 NA NA NA NA
## X104 NA NA NA NA
## X104.2 NA NA NA NA
## X104.4 NA NA NA NA
## X105.6 NA NA NA NA
## X106.2 NA NA NA NA
## X111 NA NA NA NA
## X112 NA NA NA NA
## X121 NA NA NA NA
## X122 NA NA NA NA
## X123 NA NA NA NA
## X131 NA NA NA NA
## X132 NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4.0995e+02 on 296 degrees of freedom
## Residual deviance: 1.7231e-09 on 0 degrees of freedom
## AIC: 594
##
## Number of Fisher Scoring iterations: 25
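The zero residual deviance, the 25 Fisher scoring iterations, and the 49 aliased coefficients show that this full model is saturated: with trestbps, chol, thalach, and oldpeak coded as factors with up to 152 levels, the model has at least as many parameters as observations. A sketch of a better-posed alternative that keeps those four columns numeric (the names df_num and log_model3_num are hypothetical):
# Sketch: refit the full model with the continuous measurements kept numeric.
df_num <- data.frame(X1, X2, X3, X4 = dt2$trestbps, X5 = dt2$chol,
                     X6, X7, X8 = dt2$thalach, X9, X10 = dt2$oldpeak,
                     X11, X12, X13, Y)
log_model3_num <- glm(Y ~ ., family = "binomial", data = df_num)
summary(log_model3_num) # residual df stays positive, so no saturation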
df2 = data.frame(X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,Y) # note: df2 duplicates df and is not used again below
str(df2)
## 'data.frame': 297 obs. of 14 variables:
## $ X1 : num 69 69 66 65 64 64 63 61 60 59 ...
## $ X2 : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 2 ...
## $ X3 : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ X4 : Factor w/ 50 levels "94","100","101",..: 41 29 35 28 9 44 32 25 35 47 ...
## $ X5 : Factor w/ 152 levels "126","131","141",..: 66 70 58 109 43 59 65 66 71 100 ...
## $ X6 : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 2 1 1 1 ...
## $ X7 : Factor w/ 3 levels "0","1","2": 3 1 1 3 3 3 3 1 1 3 ...
## $ X8 : Factor w/ 91 levels "71","88","90",..: 32 51 16 74 44 55 50 45 71 45 ...
## $ X9 : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ X10: Factor w/ 40 levels "0","0.1","0.2",..: 2 18 26 15 18 7 23 26 10 37 ...
## $ X11: Factor w/ 3 levels "0","1","2": 2 1 3 2 2 2 3 2 1 3 ...
## $ X12: Factor w/ 4 levels "0","1","2","3": 2 3 1 2 1 1 1 3 1 1 ...
## $ X13: Factor w/ 3 levels "0","1","2": 1 1 1 1 1 3 2 1 1 3 ...
## $ Y : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 2 1 1 ...
log_back_model2 <- stepAIC(log_model2, direction = "backward", trace = F)
summary(log_back_model2)
##
## Call:
## glm(formula = Y ~ X2 + X3 + X9 + X11 + X12 + X13, family = "binomial",
## data = df_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9298 -0.5220 -0.1965 0.3894 2.7542
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.9024 0.9794 -5.006 5.57e-07 ***
## X21 1.1324 0.5206 2.175 0.029609 *
## X31 1.0512 0.8462 1.242 0.214164
## X32 0.1090 0.7864 0.139 0.889781
## X33 1.9250 0.7465 2.579 0.009915 **
## X91 0.8554 0.4694 1.822 0.068421 .
## X111 1.5227 0.4813 3.164 0.001557 **
## X112 1.5153 0.7935 1.910 0.056196 .
## X121 2.0084 0.5167 3.887 0.000102 ***
## X122 3.2813 0.8148 4.027 5.65e-05 ***
## X123 3.1016 1.0024 3.094 0.001975 **
## X131 0.2286 0.8439 0.271 0.786454
## X132 1.7365 0.4585 3.788 0.000152 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.91 on 236 degrees of freedom
## Residual deviance: 162.65 on 224 degrees of freedom
## AIC: 188.65
##
## Number of Fisher Scoring iterations: 6
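Backward AIC dropped only X6 and X7 from log_model2. Since the two models are nested, a likelihood-ratio test offers a direct check; a sketch:
# Sketch: LR test of the terms removed by backward AIC (X6 and X7).
anova(log_back_model2, log_model2, test = "Chisq")
# The residual deviances above (162.65 vs 161.22) give a chi-square of
# about 1.43 on 3 df, far from significant, agreeing with the AIC drop.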
As before, we apply the stepwise method to obtain the best combination of predictor variables; after the stepwise step, predictors that have no significant influence are eliminated.
log_back_model3 <- stepAIC(log_model3, direction = "backward", trace = F)
summary(log_back_model3)
##
## Call:
## glm(formula = Y ~ X10, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8930 -0.7948 -0.5168 0.8576 2.0393
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.99040 0.22967 -4.312 1.62e-05 ***
## X100.1 0.29725 0.89596 0.332 0.74006
## X100.2 -0.10821 0.70512 -0.153 0.87803
## X100.3 0.29725 1.24609 0.239 0.81146
## X100.4 -0.95551 1.09344 -0.874 0.38219
## X100.5 -0.39590 1.14138 -0.347 0.72870
## X100.6 0.07411 0.63462 0.117 0.90704
## X100.7 -16.57567 3956.18033 -0.004 0.99666
## X100.8 0.83625 0.60189 1.389 0.16472
## X100.9 1.68355 1.24609 1.351 0.17668
## X101 1.80133 0.64332 2.800 0.00511 **
## X101.1 -16.57567 2797.44195 -0.006 0.99527
## X101.2 1.34707 0.54370 2.478 0.01323 *
## X101.3 -16.57567 3956.18033 -0.004 0.99666
## X101.4 1.14455 0.60189 1.902 0.05722 .
## X101.5 -0.39590 1.14138 -0.347 0.72870
## X101.6 0.43078 0.66754 0.645 0.51871
## X101.8 1.83770 0.72728 2.527 0.01151 *
## X101.9 1.39586 0.94132 1.483 0.13811
## X102 2.24316 0.83403 2.690 0.00715 **
## X102.1 18.55647 3956.18033 0.005 0.99626
## X102.2 18.55647 1978.09018 0.009 0.99252
## X102.3 -16.57567 2797.44195 -0.006 0.99527
## X102.4 1.68355 1.24609 1.351 0.17668
## X102.5 18.55647 2797.44195 0.007 0.99471
## X102.6 2.59984 1.11926 2.323 0.02019 *
## X102.8 18.55647 1615.10387 0.011 0.99083
## X102.9 18.55647 3956.18033 0.005 0.99626
## X103 2.37669 1.14138 2.082 0.03732 *
## X103.1 18.55647 3956.18033 0.005 0.99626
## X103.2 18.55647 2797.44195 0.007 0.99471
## X103.4 18.55647 2284.10179 0.008 0.99352
## X103.5 -16.57567 3956.18033 -0.004 0.99666
## X103.6 18.55647 1978.09018 0.009 0.99252
## X103.8 18.55647 3956.18033 0.005 0.99626
## X104 18.55647 2284.10179 0.008 0.99352
## X104.2 0.99040 1.43274 0.691 0.48940
## X104.4 18.55647 3956.18034 0.005 0.99626
## X105.6 18.55647 3956.18033 0.005 0.99626
## X106.2 18.55647 3956.18034 0.005 0.99626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 409.95 on 296 degrees of freedom
## Residual deviance: 308.58 on 257 degrees of freedom
## AIC: 388.58
##
## Number of Fisher Scoring iterations: 16
From the three models above, we now start eliminating the ones that perform relatively worse than the others. First we look for the worst AIC value.
log_model1$aic
## [1] 213.7179
log_back_model2$aic
## [1] 188.6541
log_back_model3$aic
## [1] 388.5827
The worst AIC belongs to model 3. After checking the AIC values, we also verify model fit with a deviance goodness-of-fit statistic: under the null hypothesis that the model fits the data, the residual deviance approximately follows a chi-square distribution with the residual degrees of freedom.
pchisq(log_model1$deviance, df = log_model1$df.residual, lower.tail = T)
## [1] 0.0664693
pchisq(log_back_model2$deviance, df = log_model1$df.residual, lower.tail = T)
## [1] 0.0003003839
pchisq(log_back_model3$deviance, df = log_model1$df.residual, lower.tail = T)
## [1] 0.999649
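Two caveats about this check: models 2 and 3 are evaluated against log_model1$df.residual rather than their own residual degrees of freedom, and the conventional goodness-of-fit p-value uses the upper tail of the chi-square. A corrected sketch:
# Sketch: deviance goodness-of-fit p-values using each model's own df.
# Under H0 "the model fits", the residual deviance is approximately
# chi-square distributed; a small upper-tail p-value flags a poor fit.
pchisq(log_model1$deviance, df = log_model1$df.residual, lower.tail = FALSE)
pchisq(log_back_model2$deviance, df = log_back_model2$df.residual, lower.tail = FALSE)
pchisq(log_back_model3$deviance, df = log_back_model3$df.residual, lower.tail = FALSE)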
For model 3 the lower-tail probability is essentially 1 (0.9996), meaning its deviance is far larger than expected under a well-fitting model; weighing this together with its much larger AIC, we discard model 3.
After the deviance check we test for multicollinearity, to see whether the predictor variables are strongly related to one another.
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:gtools':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
vif(log_model1)
## GVIF Df GVIF^(1/(2*Df))
## X12 1.171791 3 1.026774
## X11 1.142280 2 1.033816
## X2 1.089833 1 1.043951
## X9 1.062685 1 1.030866
There is no multicollinearity in model 1: every GVIF^(1/(2*Df)) value is close to 1, well below the usual cutoffs.
vif(log_back_model2)
## GVIF Df GVIF^(1/(2*Df))
## X2 1.382042 1 1.175603
## X3 1.536971 3 1.074264
## X9 1.186972 1 1.089483
## X11 1.386448 2 1.085115
## X12 1.334434 3 1.049259
## X13 1.373315 2 1.082536
There is no multicollinearity in model 2 either: all GVIF^(1/(2*Df)) values are close to 1.
library(hnp)
set.seed(305)
hnp(log_model1, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
## Binomial model
hnp(log_back_model2, halfnormal = F, paint.out = T, pch = 16, sim = 500, print.on = T)
## Binomial model
# Prediction
Next we make predictions to see how well the models we have built fit new data. First we predict on the test data with model 1; with type = "response" the result is a number between 0 and 1, the predicted probability that a patient has heart disease.
df_test$prob1 <- predict(object = log_model1, type = "response", newdata = df_test)
df_test$pred.label1 <- ifelse(test = df_test$prob1 > 0.5, yes = "Not Healthy", no = "Healthy")
df_test$Y_type <- ifelse(test = df_test$Y == "1", 1, 0) # Y has levels "0"/"1", so compare against "1"
To evaluate how well the model classifies patients as having heart disease or not, we use the ROC curve. The model achieves an area under the curve of about 0.89, which is undoubtedly a very good value.
library(ROCR)
df_test$prob1 <- predict(object = log_model1, type = "response", newdata = df_test)
df_test$pred.label1 <- ifelse(df_test$prob1 > 0.5, yes = "1", no = "0")
df_test$Y_type <- ifelse(df_test$Y == "0", 0, 1)
pred_model1 <- prediction(df_test$prob1, df_test$Y_type)
perf <- performance(pred_model1, "tpr", "fpr")
plot(perf, colorize = TRUE, main = "ROC Curve")
abline(0, 1, lty = 2)
performance(pred_model1, "auc")@y.values[[1]] # area under the curve (AUC)
## [1] 0.8926585
df_test$prob2 <- predict(object = log_back_model2, type = "response", newdata = df_test)
df_test$pred.label2 <- ifelse(test = df_test$prob2 > 0.5, yes = "1", no = "0")
As with model 1, model 2 achieves an area under the curve of about 0.93, which is undoubtedly a very good value.
pred_model2 <- prediction(df_test$prob2, df_test$Y_type)
perf <- performance(pred_model2,"tpr", "fpr")
plot(perf, colorize = TRUE, main="ROC Curve")
abline(0, 1, lty = 2)
performance(pred_model2, "auc")@y.values[[1]] # area under the curve (AUC)
## [1] 0.932703
library(ggplot2)
ggplot(df_test, aes(x = prob1)) +
geom_density(lwd = 0.5) +
theme_minimal()
From the plot above, the predicted probabilities lean about equally toward both target classes, meaning the not-diseased and diseased predictions are nearly balanced.
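To see how the two classes actually separate, the same density can be conditioned on the true label; a sketch:
# Sketch: model 1's predicted-probability densities split by the true label.
ggplot(df_test, aes(x = prob1, fill = Y)) +
  geom_density(alpha = 0.3) +
  theme_minimal()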
ggplot(df_test, aes(x = prob2)) +
geom_density(lwd = 0.5) +
theme_minimal()
From the plot above, the predictions lean slightly toward 1, i.e. toward the diseased class. Below are the prediction results of the models we built.
library(dplyr)
df_test[,c("pred.label1", "Y", "pred.label2")] %>% sample() # sample() on a data frame permutes its columns, hence the reordered output
## pred.label2 Y pred.label1
## 2 0 0 0
## 3 0 0 0
## 4 0 1 1
## 8 1 1 1
## 9 0 0 0
## 18 0 0 1
## 19 0 1 0
## 20 0 0 1
## 26 0 0 0
## 41 0 0 0
## 44 0 0 0
## 45 0 0 1
## 64 0 0 0
## 65 0 0 0
## 72 0 0 0
## 91 1 0 1
## 94 0 1 0
## 99 1 1 1
## 101 1 1 1
## 103 0 0 0
## 105 0 0 0
## 106 1 0 0
## 107 1 1 1
## 110 0 0 0
## 114 0 0 0
## 115 0 0 0
## 128 0 0 0
## 140 0 0 0
## 143 0 0 0
## 145 0 0 0
## 147 0 0 0
## 155 0 0 0
## 161 1 1 1
## 163 1 1 1
## 166 1 0 0
## 173 0 0 0
## 180 1 1 1
## 185 1 1 1
## 193 1 1 1
## 195 1 1 1
## 196 0 1 0
## 201 1 1 1
## 215 1 1 1
## 217 1 1 1
## 221 1 1 0
## 225 1 1 1
## 227 1 1 1
## 229 1 1 1
## 236 1 1 1
## 243 0 1 0
## 248 1 1 1
## 249 1 1 1
## 258 1 1 0
## 261 1 1 1
## 264 1 1 0
## 274 1 1 1
## 277 0 0 0
## 278 1 1 1
## 290 0 0 0
## 297 1 1 0
library(caret)
## Loading required package: lattice
model_eval1 <- confusionMatrix(as.factor(df_test$pred.label1), as.factor(df_test$Y), positive = "1")
model_eval1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 25 8
## 1 4 23
##
## Accuracy : 0.8
## 95% CI : (0.6767, 0.8922)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 5.222e-06
##
## Kappa : 0.6013
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.7419
## Specificity : 0.8621
## Pos Pred Value : 0.8519
## Neg Pred Value : 0.7576
## Prevalence : 0.5167
## Detection Rate : 0.3833
## Detection Prevalence : 0.4500
## Balanced Accuracy : 0.8020
##
## 'Positive' Class : 1
##
model_eval2 <- confusionMatrix(as.factor(df_test$pred.label2), as.factor(df_test$Y), positive = "1")
model_eval2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 26 5
## 1 3 26
##
## Accuracy : 0.8667
## 95% CI : (0.7541, 0.9406)
## No Information Rate : 0.5167
## P-Value [Acc > NIR] : 1.105e-08
##
## Kappa : 0.7336
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.8387
## Specificity : 0.8966
## Pos Pred Value : 0.8966
## Neg Pred Value : 0.8387
## Prevalence : 0.5167
## Detection Rate : 0.4333
## Detection Prevalence : 0.4833
## Balanced Accuracy : 0.8676
##
## 'Positive' Class : 1
##
model_eval1$byClass
## Sensitivity Specificity Pos Pred Value
## 0.7419355 0.8620690 0.8518519
## Neg Pred Value Precision Recall
## 0.7575758 0.8518519 0.7419355
## F1 Prevalence Detection Rate
## 0.7931034 0.5166667 0.3833333
## Detection Prevalence Balanced Accuracy
## 0.4500000 0.8020022
# Values transcribed (in %) from model_eval1; 80.20 is the balanced
# accuracy, while the raw accuracy is 80.00.
Recall <- 74.19
Specificity <- 86.20
precision <- 85.18
Accuracy <- 80.20
performance1 <- cbind.data.frame(Accuracy, Recall, precision, Specificity)
performance1
## Accuracy Recall precision Specificity
## 1 80.2 74.19 85.18 86.2
model_eval2$byClass
## Sensitivity Specificity Pos Pred Value
## 0.8387097 0.8965517 0.8965517
## Neg Pred Value Precision Recall
## 0.8387097 0.8965517 0.8387097
## F1 Prevalence Detection Rate
## 0.8666667 0.5166667 0.4333333
## Detection Prevalence Balanced Accuracy
## 0.4833333 0.8676307
# Values transcribed (in %) from model_eval2; 86.80 is close to the
# balanced accuracy (86.76), while the raw accuracy is 86.67.
Recall <- 83.87
Specificity <- 89.65
precision <- 89.65
Accuracy <- 86.80
performance2 <- cbind.data.frame(Accuracy, Recall, precision, Specificity)
performance2
## Accuracy Recall precision Specificity
## 1 86.8 83.87 89.65 89.65
From the confusion matrices above, model 2 is better at predicting whether a patient is healthy or not, with an accuracy of about 86.7%. Among patients who are actually ill, model 2 is also better, correctly identifying 83.87% of the actual cases (recall). Among healthy patients, model 2 classifies 89.66% of the actual data correctly (specificity). And of all the positive predictions it makes, model 2 is correct on 89.66% of them (precision).
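The same comparison can be read directly from caret's byClass vectors; a sketch:
# Sketch: side-by-side F1 scores of the two models from the caret output.
c(model1 = model_eval1$byClass["F1"], model2 = model_eval2$byClass["F1"])
# Model 2's F1 (0.867) exceeds model 1's (0.793), consistent with the text.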
performa <- function(cutoff, prob, ref, posY, negY)
{
predict <- factor(ifelse(prob >= cutoff, posY, negY))
conf <- caret::confusionMatrix(predict , ref, positive = posY)
acc <- conf$overall[1]
rec <- conf$byClass[1]
prec <- conf$byClass[3]
spec <- conf$byClass[2]
mat <- t(as.matrix(c(rec , acc , prec, spec)))
colnames(mat) <- c("recall", "accuracy", "precision", "specificity")
return(mat)
}
co <- seq(0.01,0.80,length=100)
result <- matrix(0,100,4)
for(i in 1:100){
result[i,] = performa(cutoff = co[i],
prob = df_test$prob1,
ref = df_test$Y,
posY = "1",
negY = "0")
}
tibble("Recall" = result[,1], # tibble() replaces the deprecated data_frame()
"Accuracy" = result[,2],
"Precision" = result[,3],
"Specificity" = result[,4],
"Cutoff" = co) %>%
gather(key = "performa", value = "value", 1:4) %>%
ggplot(aes(x = Cutoff, y = value, col = performa)) +
geom_line(lwd = 1.5) +
scale_color_manual(values = c("darkred","darkgreen","orange", "blue")) +
scale_y_continuous(breaks = seq(0,1,0.1), limits = c(0,1)) +
scale_x_continuous(breaks = seq(0,1,0.1)) +
labs(title = "Tradeoff model performance") +
theme_minimal() +
theme(legend.position = "top",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank())
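The result matrix also lets us pick an operating cutoff numerically, for example the one maximizing Youden's J (recall + specificity - 1); a sketch:
# Sketch: cutoff maximizing Youden's J over the grid computed above.
j_stat <- result[, 1] + result[, 4] - 1 # columns 1 and 4 hold recall and specificity
co[which.max(j_stat)] # candidate operating cutoff for model 1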
# Model Interpretation
options(scipen = 999)
exp(log_model1$coefficients) %>% data.frame()
## .
## (Intercept) 0.02860413
## X121 6.77220586
## X122 28.75065039
## X123 26.80749971
## X111 4.76306390
## X112 4.33546237
## X21 4.29309161
## X91 5.33056287
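These exponentiated coefficients are odds ratios: holding the other predictors fixed, a patient with X12 = 2 (ca = 2), for example, has about 28.8 times the odds of heart disease relative to the reference level X12 = 0. A sketch adding 95% Wald confidence intervals (the names ci and or_table are hypothetical):
# Sketch: odds ratios with 95% Wald confidence intervals for log_model1.
ci <- confint.default(log_model1) # Wald intervals on the log-odds scale
or_table <- data.frame(OR = exp(coef(log_model1)),
                       lower = exp(ci[, 1]), upper = exp(ci[, 2]))
round(or_table, 2)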