Packages

library(caret)
library(pROC)
library(e1071)
library(ggplot2)

Import Data

# Load the workbook and immediately convert the result to a plain data.frame,
# then show the first rows and the column structure.
data_cancer <- as.data.frame(
  readxl::read_xlsx("C:/Users/ASUS/Downloads/Cancer vs AP_Xray_Stage_Grade_Age.xlsx")
)
head(data_cancer)
str(data_cancer)
## 'data.frame':    300 obs. of  6 variables:
##  $ AP   : num  128 46 165 102 42 52 112 146 157 174 ...
##  $ XRay : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stage: num  1 1 0 1 1 1 0 1 1 1 ...
##  $ Grade: num  1 1 1 1 0 0 0 0 1 0 ...
##  $ Age  : num  54 55 53 50 61 55 52 55 59 60 ...
##  $ Y    : num  1 0 1 1 0 0 0 1 1 1 ...

Pemilihan sampel sebanyak 200 dari 300 amatan

# Draw 200 row indices without replacement; the fixed seed keeps the draw
# reproducible.
set.seed(22058)
# Index range is derived from the loaded data instead of a literal 1:300, so a
# workbook with a different row count cannot silently produce NA rows
# (sample() now fails loudly if fewer than 200 rows are available).
angka_acak <- sample(seq_len(nrow(data_cancer)), size = 200, replace = FALSE)
# First and last three sampled indices, as a quick reproducibility check.
list(awal = head(angka_acak, n = 3), akhir = tail(angka_acak, n = 3))
## $awal
## [1]  58 107 139
## 
## $akhir
## [1] 169  97 252
# Keep only the sampled rows (in sampled order) for the rest of the analysis.
data_cancer_sample <- data_cancer[angka_acak, ]
head(data_cancer_sample, n = 3)
tail(data_cancer_sample, n = 3)

Partisi Data (Latih 80% dan Uji 20%)

# Stratified 80/20 split: 80% of each class is sampled separately so the
# training and test sets keep the class balance of the sampled data.
set.seed(22058)
kelas0 <- subset(data_cancer_sample, Y == 0)
kelas1 <- subset(data_cancer_sample, Y == 1)
# Class 0 is drawn before class 1; keep this order, the seeded result depends
# on it. seq_len() (not 1:nrow()) stays correct even if a class is empty.
k0_p <- sample(seq_len(nrow(kelas0)), size = round(0.8 * nrow(kelas0)),
               replace = FALSE)
k1_p <- sample(seq_len(nrow(kelas1)), size = round(0.8 * nrow(kelas1)),
               replace = FALSE)
latih <- rbind(kelas0[k0_p, ], kelas1[k1_p, ])
uji <- rbind(kelas0[-k0_p, ], kelas1[-k1_p, ])
# Class frequencies per data set. Fixing the factor levels guarantees exactly
# two counts (class 0, class 1) even when one class is absent.
f_data_asli <- as.vector(table(factor(data_cancer_sample$Y, levels = c(0, 1))))
f_data_latih <- as.vector(table(factor(latih$Y, levels = c(0, 1))))
f_data_uji <- as.vector(table(factor(uji$Y, levels = c(0, 1))))
nama_data <- c("Data Asli", "Data Latih", "Data Uji")
# One row per data set, one column per class.
frekuensi <- rbind(f_data_asli, f_data_latih, f_data_uji)
partisi_data <- data.frame(
  Data = nama_data,
  Kelas_0 = unname(frekuensi[, 1]),
  Kelas_1 = unname(frekuensi[, 2]),
  row.names = NULL
)
partisi_data

Model Regresi Logistik

# Logistic regression of Y on all other columns of the training set.
# family = binomial is required: glm() defaults to the gaussian family, which
# fits an ordinary linear model to the 0/1 response instead of a logit model.
model_logistik <- glm(Y ~ ., data = latih, family = binomial(link = "logit"))
summary(model_logistik)
# NOTE(review): the recorded summary below was produced by the earlier
# gaussian fit; re-run the report to refresh it and the downstream metrics.
## 
## Call:
## glm(formula = Y ~ ., data = latih)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7565  -0.3176   0.0476   0.2744   0.6630  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.4770868  0.3052766   1.563  0.12015    
## AP           0.0056538  0.0007327   7.716 1.40e-12 ***
## XRay         0.3374267  0.0709781   4.754 4.54e-06 ***
## Stage        0.1996738  0.0713534   2.798  0.00579 ** 
## Grade        0.1755707  0.0608666   2.885  0.00448 ** 
## Age         -0.0135592  0.0049648  -2.731  0.00705 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1452245)
## 
##     Null deviance: 36.094  on 159  degrees of freedom
## Residual deviance: 22.365  on 154  degrees of freedom
## AIC: 153.23
## 
## Number of Fisher Scoring iterations: 2

Confusion Matrix dan Performa Klasifikasi (Data Latih)

# Training-set performance of the logistic model at a 0.5 cut-off.
p <- predict(model_logistik, newdata = latih, type = "response")
# Pin both factor levels so the table stays 2x2 even if only one class is
# predicted.
pred <- factor(ifelse(p < 0.5, 0, 1), levels = c(0, 1))
aktual <- factor(latih$Y, levels = c(0, 1))
# caret expects data = predictions and reference = truth; passing them the
# other way round makes sensitivity/specificity use the wrong reference.
confusionMatrix(data = pred, reference = aktual, positive = "1")
# NOTE(review): the recorded output below was generated with the arguments
# swapped; re-run the report to refresh it.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 38 17
##          1 15 90
##                                          
##                Accuracy : 0.8            
##                  95% CI : (0.7296, 0.859)
##     No Information Rate : 0.6688         
##     P-Value [Acc > NIR] : 0.0001711      
##                                          
##                   Kappa : 0.5528         
##                                          
##  Mcnemar's Test P-Value : 0.8596838      
##                                          
##             Sensitivity : 0.8411         
##             Specificity : 0.7170         
##          Pos Pred Value : 0.8571         
##          Neg Pred Value : 0.6909         
##              Prevalence : 0.6687         
##          Detection Rate : 0.5625         
##    Detection Prevalence : 0.6562         
##       Balanced Accuracy : 0.7791         
##                                          
##        'Positive' Class : 1              
## 

Confusion Matrix dan Performa Klasifikasi (Data Uji)

# Test-set performance of the logistic model at a 0.5 cut-off.
p <- predict(model_logistik, newdata = uji, type = "response")
# Pin both factor levels so the table stays 2x2 even if only one class is
# predicted.
pred_log <- factor(ifelse(p < 0.5, 0, 1), levels = c(0, 1))
aktual <- factor(uji$Y, levels = c(0, 1))
# caret expects data = predictions and reference = truth; passing them the
# other way round makes sensitivity/specificity use the wrong reference.
confusionMatrix(data = pred_log, reference = aktual, positive = "1")
# NOTE(review): the recorded output below was generated with the arguments
# swapped; re-run the report to refresh it.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 11  3
##          1  2 24
##                                          
##                Accuracy : 0.875          
##                  95% CI : (0.732, 0.9581)
##     No Information Rate : 0.675          
##     P-Value [Acc > NIR] : 0.003455       
##                                          
##                   Kappa : 0.7207         
##                                          
##  Mcnemar's Test P-Value : 1.000000       
##                                          
##             Sensitivity : 0.8889         
##             Specificity : 0.8462         
##          Pos Pred Value : 0.9231         
##          Neg Pred Value : 0.7857         
##              Prevalence : 0.6750         
##          Detection Rate : 0.6000         
##    Detection Prevalence : 0.6500         
##       Balanced Accuracy : 0.8675         
##                                          
##        'Positive' Class : 1              
## 

Kurva ROC (Data Uji)

# ROC curve of the logistic model on the test set.
# The response must be the true label and the predictor the continuous model
# score; the previous call used the thresholded class as response and the
# true label as predictor, which collapses the curve to a single point.
prob_log <- predict(model_logistik, newdata = uji, type = "response")
# levels/direction are set explicitly (control = 0, case = 1, higher score =
# case) so pROC never auto-selects the direction; this also silences its
# "Setting levels/direction" messages.
roc_pk <- roc(
  response = uji$Y, predictor = prob_log,
  levels = c(0, 1), direction = "<",
  plot = TRUE, col = "violetred3", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60
)
legend("bottom",
       legend = c("Regresi Logistik"),
       col = c("violetred3"),
       lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE)

Model Naive Bayes

# Naive Bayes classifier trained on the training partition, with Y converted
# to a factor so it is treated as the class label. Printing the model shows
# the class priors and the per-class summary tables for each predictor.
model_nb <- naiveBayes(
  as.factor(Y) ~ .,
  data = latih
)
model_nb
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##       0       1 
## 0.34375 0.65625 
## 
## Conditional probabilities:
##    AP
## Y        [,1]     [,2]
##   0  82.25455 32.13756
##   1 123.29524 39.14763
## 
##    XRay
## Y        [,1]      [,2]
##   0 0.2909091 0.4583678
##   1 0.5714286 0.4972452
## 
##    Stage
## Y        [,1]      [,2]
##   0 0.5454545 0.5025189
##   1 0.5714286 0.4972452
## 
##    Grade
## Y        [,1]      [,2]
##   0 0.4000000 0.4944132
##   1 0.5333333 0.5012804
## 
##    Age
## Y       [,1]     [,2]
##   0 59.94545 6.189651
##   1 58.09524 6.043948

Confusion Matrix dan Performa Klasifikasi (Data Latih)

# Training-set performance of the Naive Bayes model.
# predict() without `type` returns the predicted class labels.
pred_nb <- as.factor(predict(model_nb, newdata = latih))
aktual <- factor(latih$Y, levels = c(0, 1))
# caret expects data = predictions and reference = truth; passing them the
# other way round makes sensitivity/specificity use the wrong reference.
confusionMatrix(data = pred_nb, reference = aktual, positive = "1")
# NOTE(review): the recorded output below was generated with the arguments
# swapped; re-run the report to refresh it.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 32 23
##          1 13 92
##                                           
##                Accuracy : 0.775           
##                  95% CI : (0.7024, 0.8372)
##     No Information Rate : 0.7188          
##     P-Value [Acc > NIR] : 0.06518         
##                                           
##                   Kappa : 0.4787          
##                                           
##  Mcnemar's Test P-Value : 0.13361         
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.7111          
##          Pos Pred Value : 0.8762          
##          Neg Pred Value : 0.5818          
##              Prevalence : 0.7188          
##          Detection Rate : 0.5750          
##    Detection Prevalence : 0.6562          
##       Balanced Accuracy : 0.7556          
##                                           
##        'Positive' Class : 1               
## 

Confusion Matrix dan Performa Klasifikasi (Data Uji)

# Test-set performance of the Naive Bayes model.
# predict() without `type` returns the predicted class labels.
pred_nb <- as.factor(predict(model_nb, newdata = uji))
aktual <- factor(uji$Y, levels = c(0, 1))
# caret expects data = predictions and reference = truth; passing them the
# other way round makes sensitivity/specificity use the wrong reference.
confusionMatrix(data = pred_nb, reference = aktual, positive = "1")
# NOTE(review): the recorded output below was generated with the arguments
# swapped; re-run the report to refresh it.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 10  4
##          1  2 24
##                                           
##                Accuracy : 0.85            
##                  95% CI : (0.7016, 0.9429)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.02376         
##                                           
##                   Kappa : 0.6591          
##                                           
##  Mcnemar's Test P-Value : 0.68309         
##                                           
##             Sensitivity : 0.8571          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.9231          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.7000          
##          Detection Rate : 0.6000          
##    Detection Prevalence : 0.6500          
##       Balanced Accuracy : 0.8452          
##                                           
##        'Positive' Class : 1               
## 

Kurva ROC (Data Uji)

# ROC curve of the Naive Bayes model on the test set.
# The response must be the true label and the predictor a continuous score;
# the previous call used the predicted class as response and the true label
# as predictor. type = "raw" returns the posterior probabilities; column "1"
# is the probability of the positive class.
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
# levels/direction are set explicitly (control = 0, case = 1, higher score =
# case) so pROC never auto-selects the direction; this also silences its
# "Setting levels/direction" messages.
roc_pk <- roc(
  response = uji$Y, predictor = prob_nb,
  levels = c(0, 1), direction = "<",
  plot = TRUE, col = "violetred3", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60
)
legend("bottom",
       legend = c("Naive Bayes"),
       col = c("violetred3"),
       lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE)

Perbandingan Performa Klasifikasi

# Side-by-side test-set comparison of both classifiers on accuracy,
# sensitivity, specificity and AUC.
aktual <- factor(uji$Y, levels = c(0, 1))

# Logistic regression: scores, 0.5-thresholded classes and metrics.
p_logistik <- predict(model_logistik, newdata = uji, type = "response")
pred_log <- factor(ifelse(p_logistik < 0.5, 0, 1), levels = c(0, 1))
# data = predictions, reference = truth (caret's argument order).
cm_log <- confusionMatrix(data = pred_log, reference = aktual, positive = "1")
# AUC is computed from the continuous scores. The previous code reported
# byClass[11], which is Balanced Accuracy, under the "AUC" label.
auc_log <- as.numeric(auc(roc(response = uji$Y, predictor = p_logistik,
                              levels = c(0, 1), direction = "<")))
# Metrics are picked by name rather than by position so the labels in
# `performa` cannot drift from the values.
nilai_log <- c(cm_log$overall["Accuracy"],
               cm_log$byClass[c("Sensitivity", "Specificity")],
               AUC = auc_log)

# Naive Bayes: predicted classes, posterior probability of class "1", metrics.
pred_nb <- as.factor(predict(model_nb, newdata = uji))
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
cm_nb <- confusionMatrix(data = pred_nb, reference = aktual, positive = "1")
auc_nb <- as.numeric(auc(roc(response = uji$Y, predictor = prob_nb,
                             levels = c(0, 1), direction = "<")))
nilai_nb <- c(cm_nb$overall["Accuracy"],
              cm_nb$byClass[c("Sensitivity", "Specificity")],
              AUC = auc_nb)

performa <- c("Akurasi", "Sensitivitas", "Spesifisitas", "AUC")
metode <- c("Regresi Logistik", "Naive Bayes")
# Long format: four metric rows per method, values expressed in percent.
perbandingan <- data.frame(
  Metode = rep(metode, each = 4),
  Performa = rep(performa, times = 2),
  Nilai = unname(round(c(nilai_log, nilai_nb), 4) * 100)
)
perbandingan
# Grouped bar chart; fill colours are named so each method keeps the colour
# it had before (Naive Bayes = sandybrown, Regresi Logistik = slategray).
ggplot(data = perbandingan, aes(x = Performa, y = Nilai, fill = Metode)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = Nilai), position = position_dodge(width = 0.9),
            vjust = -0.25, size = 3) +
  scale_fill_manual(values = c("Naive Bayes" = "sandybrown",
                               "Regresi Logistik" = "slategray"))

# Overlay the test-set ROC curves of both models.
# Each curve uses the true label as response and the model's continuous score
# as predictor; the previous calls used the predicted class as response and
# the true label as predictor.
prob_log <- predict(model_logistik, newdata = uji, type = "response")
# Column "1" of the raw Naive Bayes output is the positive-class posterior.
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
# levels/direction are set explicitly so both curves use the same convention
# (control = 0, case = 1, higher score = case); this also silences pROC's
# "Setting levels/direction" messages.
roc_pk <- roc(
  response = uji$Y, predictor = prob_log,
  levels = c(0, 1), direction = "<",
  plot = TRUE, col = "slategray", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60
)
# add = TRUE draws the second curve on the existing plot; its AUC label is
# placed lower so the two labels do not overlap.
roc_pk <- roc(
  response = uji$Y, predictor = prob_nb,
  levels = c(0, 1), direction = "<",
  plot = TRUE, col = "sandybrown", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, add = TRUE,
  print.auc = TRUE, print.auc.y = 0.52
)
legend("bottom",
       legend = c("Regresi Logistik", "Naive Bayes"),
       col = c("slategray", "sandybrown"),
       lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE)