Packages
library(caret)
library(pROC)
library(e1071)
library(ggplot2)
Import Data
# Read the Excel workbook and convert the result to a plain data.frame.
data_cancer <- as.data.frame(
  readxl::read_xlsx(
    "C:/Users/ASUS/Downloads/Cancer vs AP_Xray_Stage_Grade_Age.xlsx"
  )
)
# Show the first rows.
# NOTE(review): the output listed below has the layout of str() output,
# not head() output -- confirm which call produced it.
head(data_cancer)
## 'data.frame': 300 obs. of 6 variables:
## $ AP : num 128 46 165 102 42 52 112 146 157 174 ...
## $ XRay : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Stage: num 1 1 0 1 1 1 0 1 1 1 ...
## $ Grade: num 1 1 1 1 0 0 0 0 1 0 ...
## $ Age : num 54 55 53 50 61 55 52 55 59 60 ...
## $ Y : num 1 0 1 1 0 0 0 1 1 1 ...
Pemilihan sampel sebanyak 200 dari 300 amatan
# Fix the RNG seed so the drawn row indices are reproducible.
set.seed(22058)
# Draw 200 distinct row indices out of 1..300 (sampling without replacement).
angka_acak <- sample(seq_len(300), size = 200, replace = FALSE)
# Display the first and last three drawn indices.
list(
  awal = head(angka_acak, n = 3),
  akhir = tail(angka_acak, n = 3)
)
## $awal
## [1] 58 107 139
##
## $akhir
## [1] 169 97 252
# Keep only the rows whose indices were drawn into angka_acak.
data_cancer_sample <- data_cancer[angka_acak, ]
# Inspect the first and last three sampled rows.
head(data_cancer_sample, n = 3)
tail(data_cancer_sample, n = 3)
Partisi Data (Latih 80% dan Uji 20%)
# Stratified 80/20 split: rows are split separately within each class of Y so
# that both partitions are drawn class by class.
set.seed(22058)
kelas0 <- data_cancer_sample[which(data_cancer_sample$Y == 0), ]
kelas1 <- data_cancer_sample[which(data_cancer_sample$Y == 1), ]

# Row positions (within each class) that go to the training set.
k0_p <- sample(
  seq_len(nrow(kelas0)),
  size = round(0.8 * nrow(kelas0)),
  replace = FALSE
)
k1_p <- sample(
  seq_len(nrow(kelas1)),
  size = round(0.8 * nrow(kelas1)),
  replace = FALSE
)

# Training set = selected rows; test set = the remaining rows of each class.
latih <- rbind(kelas0[k0_p, ], kelas1[k1_p, ])
uji <- rbind(kelas0[-k0_p, ], kelas1[-k1_p, ])

# Class counts of Y in the sampled data, the training set and the test set.
f_data_asli <- as.vector(table(data_cancer_sample$Y))
f_data_latih <- as.vector(table(latih$Y))
f_data_uji <- as.vector(table(uji$Y))

# Summary table: one row per data set, one column per class.
nama_data <- c("Data Asli", "Data Latih", "Data Uji")
partisi_data <- data.frame(
  nama_data,
  rbind(f_data_asli, f_data_latih, f_data_uji),
  row.names = NULL
)
names(partisi_data) <- c("Data", "Kelas_0", "Kelas_1")
partisi_data
Model Regresi Logistik
# Fit a logistic regression of Y on every other column of the training set.
# family = binomial must be given explicitly: glm() defaults to
# family = gaussian, which fits an ordinary linear model instead (the
# "Dispersion parameter for gaussian family" line in the printed summary
# shows that this is what the original call did).
# NOTE(review): the summary output listed below came from the gaussian fit
# and has to be regenerated after this change.
model_logistik <- glm(Y ~ ., data = latih, family = binomial)
summary(model_logistik)
##
## Call:
## glm(formula = Y ~ ., data = latih)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7565 -0.3176 0.0476 0.2744 0.6630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4770868 0.3052766 1.563 0.12015
## AP 0.0056538 0.0007327 7.716 1.40e-12 ***
## XRay 0.3374267 0.0709781 4.754 4.54e-06 ***
## Stage 0.1996738 0.0713534 2.798 0.00579 **
## Grade 0.1755707 0.0608666 2.885 0.00448 **
## Age -0.0135592 0.0049648 -2.731 0.00705 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1452245)
##
## Null deviance: 36.094 on 159 degrees of freedom
## Residual deviance: 22.365 on 154 degrees of freedom
## AIC: 153.23
##
## Number of Fisher Scoring iterations: 2
Kurva ROC (Data Uji)
# ROC curve of the logistic model on the test set.
# roc() takes the observed class as the response and a continuous score as the
# predictor. The original call had the two reversed (pred_log ~ uji$Y), used
# hard 0/1 labels instead of scores, and referenced pred_log before it is
# created further down in the script. The scores are therefore computed here.
p_logistik <- predict(model_logistik, newdata = uji, type = "response")
roc_pk <- roc(
  response = uji$Y,
  predictor = p_logistik,
  plot = TRUE,
  col = "violetred3",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

Model Naive Bayes
# Fit a naive Bayes classifier with Y (converted to a factor) as the class
# label and all remaining columns of the training set as predictors.
model_nb <- naiveBayes(as.factor(Y) ~ ., data = latih)
# Print the fitted priors and per-class conditional summaries.
model_nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.34375 0.65625
##
## Conditional probabilities:
## AP
## Y [,1] [,2]
## 0 82.25455 32.13756
## 1 123.29524 39.14763
##
## XRay
## Y [,1] [,2]
## 0 0.2909091 0.4583678
## 1 0.5714286 0.4972452
##
## Stage
## Y [,1] [,2]
## 0 0.5454545 0.5025189
## 1 0.5714286 0.4972452
##
## Grade
## Y [,1] [,2]
## 0 0.4000000 0.4944132
## 1 0.5333333 0.5012804
##
## Age
## Y [,1] [,2]
## 0 59.94545 6.189651
## 1 58.09524 6.043948
Kurva ROC (Data Uji)
# ROC curve of the naive Bayes model on the test set.
# roc() takes the observed class as the response and a continuous score as the
# predictor. The original call had the two reversed (pred_nb ~ uji$Y), used
# hard class labels, and referenced pred_nb before it is created further down
# in the script. The posterior probability of class "1" is used as the score.
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
roc_pk <- roc(
  response = uji$Y,
  predictor = prob_nb,
  plot = TRUE,
  col = "violetred3",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Naive Bayes"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

Perbandingan Performa Klasifikasi
# Compare both classifiers on the test set: accuracy, sensitivity,
# specificity and AUC, all expressed in percent.
#
# Fixes relative to the original:
#  * confusionMatrix() takes the predictions as `data` and the truth as
#    `reference`; the original passed them the other way round, which changes
#    what is reported as sensitivity and specificity.
#  * The value labelled "AUC" was byClass[11], i.e. Balanced Accuracy. AUC is
#    now computed from the predicted scores with pROC.
#  * Factors get explicit levels c(0, 1) so confusionMatrix() still works when
#    every prediction falls into a single class.

# Observed classes of the test set.
aktual <- factor(uji$Y, levels = c(0, 1))

# Logistic
p_logistik <- predict(model_logistik, newdata = uji, type = "response")
# Classify with a 0.5 cut-off on the predicted score.
pred_log <- factor(ifelse(p_logistik < 0.5, 0, 1), levels = c(0, 1))
cm_log <- confusionMatrix(data = pred_log, reference = aktual, positive = "1")
auc_log <- as.numeric(auc(roc(
  response = uji$Y, predictor = p_logistik,
  levels = c(0, 1), direction = "<", quiet = TRUE
)))
nilai_log <- c(
  cm_log$overall["Accuracy"],
  cm_log$byClass[c("Sensitivity", "Specificity")],
  AUC = auc_log
)

# Naive Bayes
pred_nb <- as.factor(predict(model_nb, newdata = uji))
# Posterior probability of class "1", used as the score for the AUC.
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
cm_nb <- confusionMatrix(data = pred_nb, reference = aktual, positive = "1")
auc_nb <- as.numeric(auc(roc(
  response = uji$Y, predictor = prob_nb,
  levels = c(0, 1), direction = "<", quiet = TRUE
)))
nilai_nb <- c(
  cm_nb$overall["Accuracy"],
  cm_nb$byClass[c("Sensitivity", "Specificity")],
  AUC = auc_nb
)

performa <- c("Akurasi", "Sensitivitas", "Spesifisitas", "AUC")
metode <- c("Regresi Logistik", "Naive Bayes")
# Long-format table: four metric rows per method. Rounding is applied after
# scaling to percent so labels do not pick up floating-point noise.
perbandingan <- data.frame(
  Metode = rep(metode, each = 4),
  Performa = rep(performa, 2),
  Nilai = round(100 * unname(c(nilai_log, nilai_nb)), 2)
)
perbandingan
# Grouped bar chart of the metric values, one bar per method within each
# metric, with the value printed above every bar.
ggplot(perbandingan, aes(x = Performa, y = Nilai, fill = Metode)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(
    aes(label = Nilai),
    position = position_dodge(width = 0.9),
    vjust = -0.25,
    cex = 3
  ) +
  scale_fill_manual(values = c("sandybrown", "slategray"))

# Overlay the test-set ROC curves of both models in one plot.
# roc() takes the observed class as the response and a continuous score as the
# predictor. The original calls had the two reversed (pred_* ~ uji$Y) and used
# hard class labels, which reduces each curve to a single operating point.
# The naive Bayes score is the posterior probability of class "1".
prob_nb <- predict(model_nb, newdata = uji, type = "raw")[, "1"]
roc_pk <- roc(
  response = uji$Y,
  predictor = p_logistik,
  plot = TRUE,
  col = "slategray",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# add = TRUE draws the second curve onto the existing plot. roc_pk is
# overwritten here, exactly as in the original script.
roc_pk <- roc(
  response = uji$Y,
  predictor = prob_nb,
  plot = TRUE,
  col = "sandybrown",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  add = TRUE,
  print.auc = TRUE,
  print.auc.y = 0.52
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik", "Naive Bayes"),
  col = c("slategray", "sandybrown"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)
