Import Data

# Load the cancer data set from a semicolon-delimited CSV with a header row.
# read.csv() already returns a data.frame, so no further coercion is needed.
datas <- read.csv(
  "Cancer vs AP_Xray_Stage_Grade_Age.CSV",
  header = TRUE,
  sep = ";"
)

Pengambilan 200 sampel secara acak.

# Draw 200 distinct row indices (sampling without replacement, equal weights)
# under a fixed seed so the same sample is obtained on every run.
set.seed(221016)
idx_sampel <- sample.int(nrow(datas), size = 200)
cancers <- datas[idx_sampel, ]
# Preview the first and the last three sampled rows.
head(cancers, n = 3)
tail(cancers, n = 3)

Pembagian Data Latih (80%) dan Data Uji (20%)

# Split the 200-row sample into training and test partitions with a split
# ratio of 0.8 computed on Y; the seed keeps the split reproducible.
set.seed(221016)
sample <- sample.split(cancers$Y, SplitRatio = 0.8)
# `sample` flags the rows assigned to the training partition.
latih <- cancers[sample, ]
uji <- cancers[!sample, ]
# Print both partitions in full.
latih
uji
# Inspect the data partitions: frequency of each Y value in the full data,
# the 200-row sample, the training set and the test set.
hitung_kelas <- function(d) as.vector(table(d$Y))
f_data_asli <- hitung_kelas(datas)
f_data_sampel <- hitung_kelas(cancers)
f_data_latih <- hitung_kelas(latih)
f_data_uji <- hitung_kelas(uji)
nama_data <- c("Data Asli", "Data Sampel", "Data Latih", "Data Uji")
# One row per partition, one column per Y value (in table() order).
frekuensi <- rbind(f_data_asli, f_data_sampel, f_data_latih, f_data_uji)
partisi_data <- data.frame(nama_data, frekuensi)
rownames(partisi_data) <- NULL
colnames(partisi_data) <- c("Data", "Tidak Kanker_0", "Kanker_1")
partisi_data

Regresi Logistik

# Fit a logistic regression (binomial GLM) of Y on all other columns of the
# training data, with performance estimated by 10-fold cross-validation.
#
# The seed is the same one that is set before the decision-tree fit further
# down, so both models are resampled on identical folds; resamples() compares
# the two fits fold by fold and relies on that. `tuneLength` is omitted
# because caret's "glm" method has no tuning parameters.
#
# NOTE(review): the recorded summary reports RMSE/Rsquared/MAE, i.e. caret
# treats the numeric Y as a regression outcome. The recorded output that
# follows was produced without the seed and should be regenerated.
set.seed(501221016)
mod_logit <- train(
  Y ~ .,
  data = latih,
  method = "glm",
  family = "binomial",
  trControl = trainControl(method = "cv", number = 10)
)
mod_logit
## Generalized Linear Model 
## 
## 160 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.3639191  0.405249  0.2535064
# Evaluate the logistic model on the training data.
# `p` holds the fitted probabilities; they are cut at 0.5 (p >= 0.5 -> 1).
p <- predict(mod_logit$finalModel, newdata = latih, type = "response")
# Pinning the levels keeps both factors on the same 0/1 levels even when one
# class is never predicted.
pred <- factor(ifelse(p < 0.5, 0, 1), levels = c(0, 1))
aktual <- factor(latih$Y, levels = c(0, 1))
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`. Passing them the other way round (as before) transposes the
# table and swaps sensitivity/specificity with the predictive values, so the
# matrix recorded after this call predates the fix and must be regenerated.
confusionMatrix(data = pred, reference = aktual, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 31 18
##          1 12 99
##                                           
##                Accuracy : 0.8125          
##                  95% CI : (0.7433, 0.8698)
##     No Information Rate : 0.7312          
##     P-Value [Acc > NIR] : 0.01087         
##                                           
##                   Kappa : 0.5431          
##                                           
##  Mcnemar's Test P-Value : 0.36131         
##                                           
##             Sensitivity : 0.8462          
##             Specificity : 0.7209          
##          Pos Pred Value : 0.8919          
##          Neg Pred Value : 0.6327          
##              Prevalence : 0.7312          
##          Detection Rate : 0.6188          
##    Detection Prevalence : 0.6937          
##       Balanced Accuracy : 0.7835          
##                                           
##        'Positive' Class : 1               
## 
# Evaluate the logistic model on the held-out test data.
# `p1` holds the predicted probabilities; they are cut at 0.5 (p1 >= 0.5 -> 1).
p1 <- predict(mod_logit$finalModel, newdata = uji, type = "response")
# Pinning the levels keeps both factors on the same 0/1 levels even when one
# class is never predicted.
pred_log <- factor(ifelse(p1 < 0.5, 0, 1), levels = c(0, 1))
aktual <- factor(uji$Y, levels = c(0, 1))
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`. Passing them the other way round (as before) transposes the
# table and swaps sensitivity/specificity with the predictive values, so the
# matrix recorded after this call predates the fix and must be regenerated.
reglog <- confusionMatrix(data = pred_log, reference = aktual, positive = "1")
reglog
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  9  3
##          1  1 27
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.7634, 0.9721)
##     No Information Rate : 0.75            
##     P-Value [Acc > NIR] : 0.01604         
##                                           
##                   Kappa : 0.75            
##                                           
##  Mcnemar's Test P-Value : 0.61708         
##                                           
##             Sensitivity : 0.9000          
##             Specificity : 0.9000          
##          Pos Pred Value : 0.9643          
##          Neg Pred Value : 0.7500          
##              Prevalence : 0.7500          
##          Detection Rate : 0.6750          
##    Detection Prevalence : 0.7000          
##       Balanced Accuracy : 0.9000          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve of the logistic model on the test data.
# The observed class (uji$Y) is the response and the predicted probability
# (p1) is the predictor, so the curve is traced over every threshold. Using
# the thresholded class as the response and the truth as the predictor, as
# before, only yields a single operating point with the roles reversed.
roc_log <- roc(
  uji$Y, p1,
  plot = TRUE, col = "violetred3", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# One-row summary of the logistic model's test-set metrics.
# Metrics are looked up by name rather than by position. The AUC is computed
# from the test-set probabilities `p1`; position 11 of `byClass`, used before,
# is "Balanced Accuracy", not the area under the ROC curve.
eval_reglog <- tibble(
  Accuracy = reglog$overall[["Accuracy"]],
  Sensitivity = reglog$byClass[["Sensitivity"]],
  Specificity = reglog$byClass[["Specificity"]],
  Precision = reglog$byClass[["Pos Pred Value"]],
  AUC = as.numeric(auc(roc(uji$Y, p1, quiet = TRUE)))
)
eval_reglog

Decision Tree

# Fit a CART model (rpart) of Y on all other columns of the training data.
# Ten candidate values of the tuning parameter are evaluated with 10-fold
# cross-validation; the seed fixes the fold assignment.
set.seed(501221016)
tree <- train(
  Y ~ .,
  data = latih,
  method = "rpart",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 10)
)
tree
## CART 
## 
## 160 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE       Rsquared    MAE      
##   0.00000000  0.4478075  0.19243141  0.3269484
##   0.01532865  0.4404321  0.22130733  0.3223474
##   0.03065729  0.4438132  0.19084962  0.3385214
##   0.04598594  0.4510843  0.16620855  0.3489266
##   0.06131458  0.4506988  0.18391887  0.3582153
##   0.07664323  0.4505512  0.18406126  0.3591845
##   0.09197188  0.4650818  0.12981376  0.3754764
##   0.10730052  0.4636291  0.07097552  0.3933908
##   0.12262917  0.4691091  0.07481319  0.4008175
##   0.13795781  0.4740235  0.05797568  0.4117702
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01532865.
# Draw the selected tree.
rpart.plot(tree$finalModel, box.palette = "RdBu", shadow = "gray", tweak = 1.3)

# Evaluate the tree on the training data.
# `p_tree` holds the tree's numeric predictions; they are cut at 0.5
# (p_tree >= 0.5 -> 1).
p_tree <- predict(tree$finalModel, newdata = latih)
# Pinning the levels keeps both factors on the same 0/1 levels even when one
# class is never predicted.
pred_tree <- factor(ifelse(p_tree < 0.5, 0, 1), levels = c(0, 1))
aktual_tree <- factor(latih$Y, levels = c(0, 1))
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`. Passing them the other way round (as before) transposes the
# table and swaps sensitivity/specificity with the predictive values, so the
# matrix recorded after this call predates the fix and must be regenerated.
confusionMatrix(data = pred_tree, reference = aktual_tree, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 33 16
##          1 16 95
##                                          
##                Accuracy : 0.8            
##                  95% CI : (0.7296, 0.859)
##     No Information Rate : 0.6938         
##     P-Value [Acc > NIR] : 0.001705       
##                                          
##                   Kappa : 0.5293         
##                                          
##  Mcnemar's Test P-Value : 1.000000       
##                                          
##             Sensitivity : 0.8559         
##             Specificity : 0.6735         
##          Pos Pred Value : 0.8559         
##          Neg Pred Value : 0.6735         
##              Prevalence : 0.6937         
##          Detection Rate : 0.5938         
##    Detection Prevalence : 0.6937         
##       Balanced Accuracy : 0.7647         
##                                          
##        'Positive' Class : 1              
## 
# Evaluate the tree on the held-out test data.
# `p1_tree` holds the tree's numeric predictions; they are cut at 0.5
# (p1_tree >= 0.5 -> 1).
p1_tree <- predict(tree$finalModel, newdata = uji)
# Pinning the levels keeps both factors on the same 0/1 levels even when one
# class is never predicted.
pred1_tree <- factor(ifelse(p1_tree < 0.5, 0, 1), levels = c(0, 1))
aktual1_tree <- factor(uji$Y, levels = c(0, 1))
# confusionMatrix() expects the predictions as `data` and the observed classes
# as `reference`. Passing them the other way round (as before) transposes the
# table and swaps sensitivity/specificity with the predictive values, so the
# matrix recorded after this call predates the fix and must be regenerated.
dtree <- confusionMatrix(
  data = pred1_tree, reference = aktual1_tree, positive = "1"
)
dtree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0  7  5
##          1  4 24
##                                           
##                Accuracy : 0.775           
##                  95% CI : (0.6155, 0.8916)
##     No Information Rate : 0.725           
##     P-Value [Acc > NIR] : 0.3048          
##                                           
##                   Kappa : 0.4512          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.8276          
##             Specificity : 0.6364          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.5833          
##              Prevalence : 0.7250          
##          Detection Rate : 0.6000          
##    Detection Prevalence : 0.7000          
##       Balanced Accuracy : 0.7320          
##                                           
##        'Positive' Class : 1               
## 
# ROC curve of the decision tree on the test data.
# The observed class (uji$Y) is the response and the tree's numeric prediction
# (p1_tree) is the predictor, so the curve is traced over every threshold
# instead of the single 0.5 cut with the roles reversed.
roc_tree <- roc(
  uji$Y, p1_tree,
  plot = TRUE, col = "violetred3", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# The curve shown here belongs to the decision tree, so the legend says so
# (it previously carried the logistic-regression label).
legend(
  "bottom",
  legend = c("Decision Tree"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# One-row summary of the decision tree's test-set metrics.
# Metrics are looked up by name rather than by position. The AUC is computed
# from the test-set predictions `p1_tree`; position 11 of `byClass`, used
# before, is "Balanced Accuracy", not the area under the ROC curve.
eval_dtree <- tibble(
  Accuracy = dtree$overall[["Accuracy"]],
  Sensitivity = dtree$byClass[["Sensitivity"]],
  Specificity = dtree$byClass[["Specificity"]],
  Precision = dtree$byClass[["Pos Pred Value"]],
  AUC = as.numeric(auc(roc(uji$Y, p1_tree, quiet = TRUE)))
)
eval_dtree
# Overlay the test-set ROC curves of both models in one plot.
# Each curve uses the observed class (uji$Y) as the response and the model's
# continuous prediction as the predictor, so the curves and the printed AUCs
# cover every threshold rather than the single 0.5 cut with the roles
# reversed.
roc_log <- roc(
  uji$Y, p1,
  plot = TRUE, col = "lightblue", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.50
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# add = TRUE draws the tree's curve on top of the plot opened above.
roc_tree <- roc(
  uji$Y, p1_tree,
  plot = TRUE, col = "salmon", lwd = 3, legacy.axes = TRUE,
  main = "ROC Curves", asp = NA, print.auc = TRUE, print.auc.y = 0.60,
  add = TRUE
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik", "Decision Tree"),
  col = c("lightblue", "salmon"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# Collect the cross-validation results of the two caret fits (logistic
# regression as "glm", decision tree as "rpart") and plot them side by side.
# NOTE(review): the comparison is only fold-by-fold meaningful if both fits
# were resampled on the same folds - confirm both train() calls are seeded
# identically.
modelcompare <- resamples(list(glm=mod_logit, rpart=tree))
dotplot(modelcompare)

Perbandingan Akurasi dengan Threshold Optimal

Perbandingan Kedua Model

# Stack the one-row metric summaries of both models into a single comparison
# table, labelled by model name.
final_all <- rbind(eval_reglog, eval_dtree)
final_all <- as.data.frame(final_all)
rownames(final_all) <- c("Regresi Logistik", "Decision Tree")
final_all