# Load the raw data set (semicolon-separated CSV with a header row).
datas <- read.csv(
  "Cancer vs AP_Xray_Stage_Grade_Age.CSV",
  header = TRUE,
  sep = ";"
)
datas <- as.data.frame(datas)

# Draw a random sample of 200 rows (without replacement); the seed makes the
# draw reproducible.
set.seed(221016)
cancers <- datas[
  sample.int(nrow(datas), size = 200, replace = FALSE, prob = NULL),
]
head(cancers, n = 3)
tail(cancers, n = 3)

# Split into training data (80%) and test data (20%).
# Split `cancers` on the outcome Y with an 80/20 ratio; `sample` is the
# logical mask returned by sample.split() (TRUE = training row).
# NOTE(review): the name `sample` masks base::sample() for the rest of the
# session; it is kept here because other chunks may refer to it.
set.seed(221016)
sample <- sample.split(cancers$Y, SplitRatio = 0.8)
latih <- subset(cancers, sample == TRUE)
uji <- subset(cancers, sample == FALSE)

# Print the training set and the test set.
latih
uji

# Inspect the data partition.
# Count the observations per class of Y in each data set: the full data,
# the random sample, the training set and the test set.
f_data_asli   <- as.vector(table(datas$Y))
f_data_sampel <- as.vector(table(cancers$Y))
f_data_latih  <- as.vector(table(latih$Y))
f_data_uji    <- as.vector(table(uji$Y))

nama_data <- c("Data Asli", "Data Sampel", "Data Latih", "Data Uji")

# One row per data set; the count columns follow the level order of table().
partisi_data <- data.frame(
  nama_data,
  rbind(f_data_asli, f_data_sampel, f_data_latih, f_data_uji)
)
row.names(partisi_data) <- NULL
names(partisi_data) <- c("Data", "Tidak Kanker_0", "Kanker_1")
# Show the class counts per data set.
partisi_data

# Logistic regression (glm, binomial family) fitted through caret::train()
# with 10-fold cross-validation on the training set.
# NOTE(review): the recorded summary below reports RMSE/Rsquared/MAE, i.e.
# caret ran in regression mode - presumably because Y is stored as a numeric
# 0/1 column. Converting Y to a factor would switch caret to classification
# metrics, but the later chunks (numeric tree predictions, resamples()) rely
# on the current behaviour - TODO confirm before changing.
mod_logit <- train(
  Y ~ .,
  data = latih,
  method = "glm",
  family = "binomial",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 10)
)
mod_logit
## Generalized Linear Model
##
## 160 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3639191 0.405249 0.2535064
# Training-set evaluation of the logistic model: predicted probabilities are
# thresholded at 0.5 (p < 0.5 -> class 0, otherwise class 1).
p <- predict(mod_logit$finalModel, newdata = latih, type = "response")
# Fix both levels so the factor stays two-level even if only one class is
# ever predicted (confusionMatrix() needs matching levels).
pred <- factor(ifelse(p < 0.5, 0, 1), levels = c(0, 1))
aktual <- as.factor(latih$Y)

# caret::confusionMatrix() expects the predictions as `data` and the true
# classes as `reference`; the original call passed them the other way round,
# which swaps sensitivity/specificity with the predictive values.
confusionMatrix(data = pred, reference = aktual, positive = "1")
# Recorded output of the original run, which passed the actual classes as
# `data` and the predictions as `reference`. The corrected call prints the
# transposed table and different per-class statistics; re-run to refresh.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 18
## 1 12 99
##
## Accuracy : 0.8125
## 95% CI : (0.7433, 0.8698)
## No Information Rate : 0.7312
## P-Value [Acc > NIR] : 0.01087
##
## Kappa : 0.5431
##
## Mcnemar's Test P-Value : 0.36131
##
## Sensitivity : 0.8462
## Specificity : 0.7209
## Pos Pred Value : 0.8919
## Neg Pred Value : 0.6327
## Prevalence : 0.7312
## Detection Rate : 0.6188
## Detection Prevalence : 0.6937
## Balanced Accuracy : 0.7835
##
## 'Positive' Class : 1
##
# Test-set evaluation of the logistic model: predicted probabilities are
# thresholded at 0.5 (p1 < 0.5 -> class 0, otherwise class 1).
p1 <- predict(mod_logit$finalModel, newdata = uji, type = "response")
# Fix both levels so the factor stays two-level even if only one class is
# ever predicted (confusionMatrix() needs matching levels).
pred_log <- factor(ifelse(p1 < 0.5, 0, 1), levels = c(0, 1))
aktual <- as.factor(uji$Y)

# caret::confusionMatrix() expects the predictions as `data` and the true
# classes as `reference`; the original call passed them the other way round,
# which swaps sensitivity/specificity with the predictive values.
reglog <- confusionMatrix(data = pred_log, reference = aktual, positive = "1")
reglog
# Recorded output of the original run, which passed the actual classes as
# `data` and the predictions as `reference`. The corrected call prints the
# transposed table and different per-class statistics; re-run to refresh.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9 3
## 1 1 27
##
## Accuracy : 0.9
## 95% CI : (0.7634, 0.9721)
## No Information Rate : 0.75
## P-Value [Acc > NIR] : 0.01604
##
## Kappa : 0.75
##
## Mcnemar's Test P-Value : 0.61708
##
## Sensitivity : 0.9000
## Specificity : 0.9000
## Pos Pred Value : 0.9643
## Neg Pred Value : 0.7500
## Prevalence : 0.7500
## Detection Rate : 0.6750
## Detection Prevalence : 0.7000
## Balanced Accuracy : 0.9000
##
## 'Positive' Class : 1
##
# ROC curve of the logistic model on the test set. The curve is built from
# the predicted probabilities `p1` against the observed outcome; the original
# call used the thresholded labels as the response and the observed outcome
# as the predictor, which yields a single-point curve.
roc_log <- roc(
  response = uji$Y,
  predictor = p1,
  plot = TRUE,
  col = "violetred3",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# One-row summary of the test-set metrics for the logistic model.
# byClass[1:3] are Sensitivity, Specificity and Pos Pred Value. The AUC is
# read from the ROC object: byClass[11], used originally, is caret's
# "Balanced Accuracy", not an AUC.
eval_reglog <- tibble(
  Accuracy = reglog$overall[1],
  Sensitivity = reglog$byClass[1],
  Specificity = reglog$byClass[2],
  Precision = reglog$byClass[3],
  AUC = as.numeric(roc_log$auc)
)
# Show the test-set metrics of the logistic model.
eval_reglog

# Decision tree (caret method "rpart") tuned over 10 candidate cp values
# with 10-fold cross-validation; the seed makes the folds reproducible.
# NOTE(review): the recorded summary reports RMSE/Rsquared/MAE, so the tree
# was fitted in regression mode (presumably Y is numeric 0/1) - TODO confirm
# that this is intended.
set.seed(501221016)
tree <- train(
  Y ~ .,
  data = latih,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 10
)
tree
## CART
##
## 160 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.00000000 0.4478075 0.19243141 0.3269484
## 0.01532865 0.4404321 0.22130733 0.3223474
## 0.03065729 0.4438132 0.19084962 0.3385214
## 0.04598594 0.4510843 0.16620855 0.3489266
## 0.06131458 0.4506988 0.18391887 0.3582153
## 0.07664323 0.4505512 0.18406126 0.3591845
## 0.09197188 0.4650818 0.12981376 0.3754764
## 0.10730052 0.4636291 0.07097552 0.3933908
## 0.12262917 0.4691091 0.07481319 0.4008175
## 0.13795781 0.4740235 0.05797568 0.4117702
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.01532865.
# Draw the fitted tree.
rpart.plot(
  tree$finalModel,
  box.palette = "RdBu",
  shadow = "gray",
  tweak = 1.3
)

# Training-set evaluation of the tree: the numeric predictions are
# thresholded at 0.5 (p_tree < 0.5 -> class 0, otherwise class 1).
p_tree <- predict(tree$finalModel, newdata = latih)
# Fix both levels so the factor stays two-level even if only one class is
# ever predicted (confusionMatrix() needs matching levels).
pred_tree <- factor(ifelse(p_tree < 0.5, 0, 1), levels = c(0, 1))
aktual_tree <- as.factor(latih$Y)

# caret::confusionMatrix() expects the predictions as `data` and the true
# classes as `reference`; the original call passed them the other way round,
# which swaps sensitivity/specificity with the predictive values.
confusionMatrix(data = pred_tree, reference = aktual_tree, positive = "1")
# Recorded output of the original run, which passed the actual classes as
# `data` and the predictions as `reference`. The corrected call prints the
# transposed table and different per-class statistics; re-run to refresh.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 33 16
## 1 16 95
##
## Accuracy : 0.8
## 95% CI : (0.7296, 0.859)
## No Information Rate : 0.6938
## P-Value [Acc > NIR] : 0.001705
##
## Kappa : 0.5293
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8559
## Specificity : 0.6735
## Pos Pred Value : 0.8559
## Neg Pred Value : 0.6735
## Prevalence : 0.6937
## Detection Rate : 0.5938
## Detection Prevalence : 0.6937
## Balanced Accuracy : 0.7647
##
## 'Positive' Class : 1
##
# Test-set evaluation of the tree: the numeric predictions are thresholded
# at 0.5 (p1_tree < 0.5 -> class 0, otherwise class 1).
p1_tree <- predict(tree$finalModel, newdata = uji)
# Fix both levels so the factor stays two-level even if only one class is
# ever predicted (confusionMatrix() needs matching levels).
pred1_tree <- factor(ifelse(p1_tree < 0.5, 0, 1), levels = c(0, 1))
aktual1_tree <- as.factor(uji$Y)

# caret::confusionMatrix() expects the predictions as `data` and the true
# classes as `reference`; the original call passed them the other way round,
# which swaps sensitivity/specificity with the predictive values.
dtree <- confusionMatrix(
  data = pred1_tree,
  reference = aktual1_tree,
  positive = "1"
)
dtree
# Recorded output of the original run, which passed the actual classes as
# `data` and the predictions as `reference`. The corrected call prints the
# transposed table and different per-class statistics; re-run to refresh.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7 5
## 1 4 24
##
## Accuracy : 0.775
## 95% CI : (0.6155, 0.8916)
## No Information Rate : 0.725
## P-Value [Acc > NIR] : 0.3048
##
## Kappa : 0.4512
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.8276
## Specificity : 0.6364
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.5833
## Prevalence : 0.7250
## Detection Rate : 0.6000
## Detection Prevalence : 0.7000
## Balanced Accuracy : 0.7320
##
## 'Positive' Class : 1
##
# ROC curve of the decision tree on the test set. The curve is built from
# the numeric tree predictions `p1_tree` against the observed outcome; the
# original call used the thresholded labels as the response and the observed
# outcome as the predictor, which yields a single-point curve.
roc_tree <- roc(
  response = uji$Y,
  predictor = p1_tree,
  plot = TRUE,
  col = "violetred3",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# The legend names the model that is actually plotted here (it previously
# read "Regresi Logistik", copied from the logistic-regression plot).
legend(
  "bottom",
  legend = c("Decision Tree"),
  col = c("violetred3"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# One-row summary of the test-set metrics for the decision tree.
# byClass[1:3] are Sensitivity, Specificity and Pos Pred Value. The AUC is
# read from the ROC object: byClass[11], used originally, is caret's
# "Balanced Accuracy", not an AUC.
eval_dtree <- tibble(
  Accuracy = dtree$overall[1],
  Sensitivity = dtree$byClass[1],
  Specificity = dtree$byClass[2],
  Precision = dtree$byClass[3],
  AUC = as.numeric(roc_tree$auc)
)
# Show the test-set metrics of the decision tree.
eval_dtree

# Overlay the test-set ROC curves of both models. Each curve is built from
# the model's numeric predictions (`p1`, `p1_tree`) against the observed
# outcome rather than from thresholded labels.
roc_log <- roc(
  response = uji$Y,
  predictor = p1,
  plot = TRUE,
  col = "lightblue",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.50
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_tree <- roc(
  response = uji$Y,
  predictor = p1_tree,
  plot = TRUE,
  col = "salmon",
  lwd = 3,
  legacy.axes = TRUE,
  main = "ROC Curves",
  asp = NA,
  print.auc = TRUE,
  print.auc.y = 0.60,
  add = TRUE  # draw on top of the logistic-regression curve
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend(
  "bottom",
  legend = c("Regresi Logistik", "Decision Tree"),
  col = c("lightblue", "salmon"),
  lwd = 4, cex = 0.6, xpd = TRUE, horiz = TRUE
)

# Compare the cross-validation results of the two caret models.
# NOTE(review): the two models appear to be trained after different seeds,
# so their CV folds presumably differ - verify before reading this as a
# paired comparison.
modelcompare <- resamples(list(glm = mod_logit, rpart = tree))
dotplot(modelcompare)

# Side-by-side table of the test-set metrics of both models.
final_all <- as.data.frame(rbind(eval_reglog, eval_dtree))
rownames(final_all) <- c("Regresi Logistik", "Decision Tree")
final_all