Hợp nhất 2 dataset development và validation thành một dataframe. Dưới đây là các biến số của dataframe dùng để phân tích.
# Show the column names of the analysis data frame `d`.
print(names(d))
## [1] "name" "gender" "age" "addate" "pulse" "sys" "dia"
## [8] "tem" "br" "spo2" "gcs" "cpr" "mv" "prether"
## [15] "pread" "los" "depa" "aids" "dic" "lym" "leumye"
## [22] "canc" "rf" "crf" "cir" "dm" "hf" "fa"
## [29] "urea" "gluco" "cre" "alb" "na" "alt" "crp"
## [36] "exc" "disc" "ED" "outcome" "hos" "ID" "dis"
Recode biến outcome
# Recode the outcome (1 -> "D", 0 -> "S") into a two-level factor `oc`.
# A factor (rather than a character column) is what caret expects for a
# classification outcome and for confusionMatrix(); "D" is listed first so it
# stays the first level, which caret reports as the positive class.
# Any value other than 0 / 1 becomes NA instead of a stray extra class.
d$oc <- factor(d$outcome, levels = c(1, 0), labels = c("D", "S"))
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(123)
# Columns left out of the modelling data, selected by name so the subset does
# not silently change if the column order of `d` changes. These are the same
# columns as positions 1, 4, 16, 18-27 and 36-42 of names(d).
drop_cols <- c(
  "name", "addate", "los", "aids", "dic", "lym", "leumye", "canc", "rf",
  "crf", "cir", "dm", "hf", "exc", "disc", "ED", "outcome", "hos", "ID", "dis"
)
indx <- d[, !(names(d) %in% drop_cols)]
# Split the data on the outcome: p = 8/10 sends 80% of the rows to training
# and the remaining 20% to testing.
indxTrain <- createDataPartition(y = indx$oc, p = 8 / 10, list = FALSE)
training <- indx[indxTrain, ]
testing <- indx[-indxTrain, ]
dim(testing)
## [1] 698 23
dim(training)
## [1] 2795 23
# Shared resampling scheme for every model below: 10-fold cross-validation
# repeated 10 times, with class probabilities and the multiClassSummary metrics.
Control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)
Mô hình logistic
# Logistic regression: a binomial GLM fitted through caret on every predictor
# in `training`, evaluated with the resampling scheme stored in `Control`.
set.seed(29)
logistic <- train(
  oc ~ .,
  data = training,
  method = "glm",
  family = "binomial",
  trControl = Control
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the resampling summary of the logistic fit.
print(logistic)
## Generalized Linear Model
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ...
## Resampling results:
##
## logLoss ROC Accuracy Kappa Sensitivity Specificity
## 0.2400483 0.833652 0.9173948 0.2841245 0.2228833 0.984426
## Pos_Pred_Value Neg_Pred_Value Detection_Rate Balanced_Accuracy
## 0.5910232 0.929244 0.01960794 0.6036547
##
##
Mô hình probit
# Probit regression: a binomial GLM with a probit link, fitted through caret.
# The seed matches the one used before every other train() call in this file
# (29) so that caret draws the same cross-validation folds for all models;
# the paired comparisons made later with resamples() / diff() rely on that.
set.seed(29)
probit <- train(
  oc ~ .,
  data = training,
  method = "glm",
  family = binomial(link = "probit"),
  trControl = Control
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the resampling summary of the probit fit.
print(probit)
## Generalized Linear Model
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2516, 2515, 2516, 2517, 2515, 2515, ...
## Resampling results:
##
## logLoss ROC Accuracy Kappa Sensitivity Specificity
## 0.2385555 0.836283 0.9181767 0.2805863 0.2181 0.9857216
## Pos_Pred_Value Neg_Pred_Value Detection_Rate Balanced_Accuracy
## 0.6085739 0.9289704 0.01921124 0.6019108
##
##
Random Forest
library (randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest fitted through caret (method "rf") on centred and scaled
# predictors, using the shared resampling scheme in `Control`.
set.seed(29)
rf <- train(
  oc ~ .,
  data = training,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = Control
)
print(rf)
## Random Forest
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## Pre-processing: centered (22), scaled (22)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ...
## Resampling results across tuning parameters:
##
## mtry logLoss ROC Accuracy Kappa Sensitivity
## 2 0.2286701 0.8426736 0.9195047 0.1892300 0.1235667
## 12 0.2571539 0.8343837 0.9186110 0.2726866 0.2053667
## 22 0.2639347 0.8297559 0.9175720 0.2780082 0.2150667
## Specificity Pos_Pred_Value Neg_Pred_Value Detection_Rate
## 0.9963131 0.7751548 0.9217778 0.01087751
## 0.9874480 0.6223244 0.9279633 0.01806979
## 0.9853679 0.5932651 0.9286455 0.01892770
## Balanced_Accuracy
## 0.5599399
## 0.5964074
## 0.6002173
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Neural Network
# Single-hidden-layer neural network (caret method "nnet") on centred and
# scaled predictors; trace = FALSE is forwarded to silence the fitting log.
set.seed(29)
library(nnet)
ann <- train(
  oc ~ .,
  data = training,
  method = "nnet",
  preProcess = c("center", "scale"),
  trControl = Control,
  trace = FALSE
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
print(ann)
## Neural Network
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## Pre-processing: centered (22), scaled (22)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ...
## Resampling results across tuning parameters:
##
## size decay logLoss ROC Accuracy Kappa Sensitivity
## 1 0e+00 0.2416574 0.8085456 0.9133487 0.2191968 0.1922500
## 1 1e-04 0.2399195 0.8090348 0.9146007 0.2238847 0.1911333
## 1 1e-01 0.2309245 0.8358248 0.9171798 0.3043100 0.2472833
## 3 0e+00 0.2769452 0.7740291 0.9029353 0.2471775 0.2503333
## 3 1e-04 0.2743185 0.7893768 0.9034701 0.2639780 0.2671333
## 3 1e-01 0.2501818 0.8095434 0.9121304 0.2856183 0.2482833
## 5 0e+00 0.3434655 0.7713838 0.8974582 0.2359353 0.2540833
## 5 1e-04 0.3385270 0.7746364 0.8972796 0.2532164 0.2683167
## 5 1e-01 0.2714828 0.7917748 0.9077687 0.2733135 0.2495667
## Specificity Pos_Pred_Value Neg_Pred_Value Detection_Rate
## 0.9829335 0.5340225 0.9267985 0.01692437
## 0.9844267 0.5438530 0.9267761 0.01681697
## 0.9818362 0.5712230 0.9311490 0.02175502
## 0.9659125 0.4439109 0.9306267 0.02203829
## 0.9648442 0.4369167 0.9319932 0.02354151
## 0.9761874 0.5110014 0.9309173 0.02185934
## 0.9595537 0.3955336 0.9305656 0.02236229
## 0.9579872 0.3928036 0.9315560 0.02361433
## 0.9712825 0.4604134 0.9306970 0.02196970
## Balanced_Accuracy
## 0.5875917
## 0.5877800
## 0.6145598
## 0.6081229
## 0.6159888
## 0.6122354
## 0.6068185
## 0.6131519
## 0.6104246
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.1.
Support Vector Machine
# Support vector machine with a radial basis kernel (caret method "svmRadial")
# on centred and scaled predictors, using the shared resampling scheme.
set.seed(29)
svm <- train(
  oc ~ .,
  data = training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = Control
)
## Loading required package: kernlab
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
print(svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## Pre-processing: centered (22), scaled (22)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ...
## Resampling results across tuning parameters:
##
## C logLoss ROC Accuracy Kappa Sensitivity
## 0.25 0.2590849 0.7526951 0.9159265 0.1937638 0.1383167
## 0.50 0.2575245 0.7525731 0.9161052 0.1903534 0.1350333
## 1.00 0.2545030 0.7533965 0.9152826 0.1898710 0.1367167
## Specificity Pos_Pred_Value Neg_Pred_Value Detection_Rate
## 0.9909775 0.6324632 0.9226138 0.01216682
## 0.9914871 0.6388427 0.9223826 0.01188046
## 0.9904283 0.6006162 0.9224398 0.01202383
## Balanced_Accuracy
## 0.5646471
## 0.5632602
## 0.5635725
##
## Tuning parameter 'sigma' was held constant at a value of 0.05917051
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.05917051 and C = 0.5.
Cây CART
# CART classification tree tuned over maximum depth (caret method "rpart2"),
# on centred and scaled predictors, using the shared resampling scheme.
set.seed(29)
cart <- train(
  oc ~ .,
  data = training,
  method = "rpart2",
  preProcess = c("center", "scale"),
  trControl = Control
)
## Loading required package: rpart
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
print(cart)
## CART
##
## 2795 samples
## 22 predictor
## 2 classes: 'D', 'S'
##
## Pre-processing: centered (22), scaled (22)
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ...
## Resampling results across tuning parameters:
##
## maxdepth logLoss ROC Accuracy Kappa Sensitivity
## 2 0.2642490 0.6381930 0.9148166 0.1712111 0.1239833
## 3 0.2663270 0.6381720 0.9149602 0.2082080 0.1560833
## 4 0.2716385 0.6503692 0.9138857 0.2149223 0.1646167
## Specificity Pos_Pred_Value Neg_Pred_Value Detection_Rate
## 0.9911348 0.5683975 0.9214679 0.01091271
## 0.9881925 0.5529439 0.9239159 0.01373914
## 0.9861918 0.5364767 0.9244776 0.01448990
## Balanced_Accuracy
## 0.5575591
## 0.5721379
## 0.5754042
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was maxdepth = 3.
Đánh giá các mô hình
# Build one table of per-resample metrics for each fitted model, tagged with a
# short model label in `MoHinh`, then stack them for plotting.
#
# fit:   a caret train object; its `resample` element is used.
# label: model label stored in the `MoHinh` column.
# Returns the resample metrics plus `MoHinh`, without the fold identifier.
tag_resamples <- function(fit, label) {
  res <- as.data.frame(fit$resample)
  res$MoHinh <- label
  # Remove the fold identifier by name. The earlier positional drop (column 11)
  # only hits that column when the summary function returns exactly 10 metrics.
  res[, setdiff(names(res), "Resample"), drop = FALSE]
}

r1 <- tag_resamples(logistic, "LOG")
r2 <- tag_resamples(probit, "PRO")
r3 <- tag_resamples(rf, "RF")
r4 <- tag_resamples(ann, "ANN")
r5 <- tag_resamples(svm, "SVM")
r6 <- tag_resamples(cart, "CAR")
# Stack the six per-model tables into one data frame.
resamplemod <- rbind(r1, r2, r3, r4, r5, r6)
Đánh giá độ chính xác của mô hình qua 4 tiêu chí: Accuracy, Positive Predictive Value, Negative Predictive Value, và AUC.
library(ggplot2);library(ggthemes);library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
# For each metric: a horizontal box plot per model next to per-model density
# panels, drawn side by side. All plots read from `resamplemod`.

# Accuracy (the blue reference line is drawn at 0.7)
h1 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Accuracy, fill = MoHinh)) +
  coord_flip() +
  geom_hline(yintercept = 0.7, color = "blue") +
  theme_wsj()
h2 <- ggplot(resamplemod, aes(Accuracy, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
grid.arrange(h1, h2, ncol = 2, nrow = 1)

# Positive predictive value
h3 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Pos_Pred_Value, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()
h4 <- ggplot(resamplemod, aes(Pos_Pred_Value, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
grid.arrange(h3, h4, ncol = 2, nrow = 1)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing non-finite values (stat_density).

# Negative predictive value
h5 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Neg_Pred_Value, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()
h6 <- ggplot(resamplemod, aes(Neg_Pred_Value, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
grid.arrange(h5, h6, ncol = 2, nrow = 1)

# Area under the ROC curve
h7 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = ROC, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()
h8 <- ggplot(resamplemod, aes(ROC, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
grid.arrange(h7, h8, ncol = 2, nrow = 1)
Một cách khác đánh giá độ chính xác bằng hình ảnh
### Visual diagnosis of the differences between models ###
# Gather the resampling results of all six fits into one caret resamples
# object. NOTE(review): the object is named `list`, which shadows the base
# function's name; calls such as list(...) still resolve to the function.
list <- resamples(
  list(
    LOG = logistic,
    PRO = probit,
    RF = rf,
    ANN = ann,
    SVM = svm,
    CAR = cart
  )
)
# Box-and-whisker panels of every metric, laid out on a 2 x 5 grid.
bwplot(
  list,
  models = c("LOG", "PRO", "RF", "ANN", "SVM", "CAR"),
  layout = c(2, 5)
)
summary(list)
##
## Call:
## summary.resamples(object = list)
##
## Models: LOG, PRO, RF, ANN, SVM, CAR
## Number of resamples: 100
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.8853 0.9107 0.9176 0.9174 0.9247 0.9429 0
## PRO 0.8964 0.9107 0.9179 0.9182 0.9250 0.9424 0
## RF 0.9068 0.9143 0.9179 0.9195 0.9247 0.9393 0
## ANN 0.8853 0.9071 0.9159 0.9172 0.9247 0.9464 0
## SVM 0.8964 0.9107 0.9176 0.9161 0.9211 0.9353 0
## CAR 0.8893 0.9104 0.9143 0.9150 0.9211 0.9393 0
##
## Balanced_Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.5122 0.5755 0.5972 0.6037 0.6287 0.7174 0
## PRO 0.5141 0.5720 0.5990 0.6019 0.6229 0.7154 0
## RF 0.4980 0.5380 0.5580 0.5599 0.5800 0.6600 0
## ANN 0.5122 0.5848 0.6141 0.6146 0.6388 0.7194 0
## SVM 0.4922 0.5380 0.5580 0.5633 0.5933 0.6419 0
## CAR 0.4941 0.5457 0.5741 0.5721 0.5980 0.6961 0
##
## Detection_Rate
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.003571 0.014320 0.01792 0.01961 0.02500 0.03943 0
## PRO 0.003571 0.014290 0.01792 0.01921 0.02244 0.03943 0
## RF 0.000000 0.007143 0.01071 0.01088 0.01434 0.02857 0
## ANN 0.003571 0.017860 0.02143 0.02176 0.02511 0.03943 0
## SVM 0.000000 0.007143 0.01073 0.01188 0.01786 0.02518 0
## CAR 0.000000 0.009828 0.01429 0.01374 0.01792 0.03571 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.038040 0.2187 0.2822 0.2841 0.3514 0.5299 0
## PRO 0.045450 0.2058 0.2827 0.2806 0.3379 0.5284 0
## RF -0.006929 0.1261 0.1869 0.1892 0.2547 0.4615 0
## ANN 0.038040 0.2254 0.3054 0.3043 0.3677 0.5697 0
## SVM -0.025250 0.1261 0.1791 0.1904 0.2636 0.3958 0
## CAR -0.019510 0.1332 0.2163 0.2082 0.2933 0.5123 0
##
## logLoss
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.1801 0.2155 0.2374 0.2400 0.2562 0.3258 0
## PRO 0.1897 0.2177 0.2314 0.2386 0.2539 0.3391 0
## RF 0.1851 0.2106 0.2248 0.2287 0.2397 0.3702 0
## ANN 0.1848 0.2141 0.2310 0.2309 0.2478 0.2894 0
## SVM 0.2090 0.2398 0.2593 0.2575 0.2707 0.3321 0
## CAR 0.2190 0.2526 0.2656 0.2663 0.2824 0.3207 0
##
## Neg_Pred_Value
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.9127 0.9250 0.9291 0.9292 0.9333 0.9504 0
## PRO 0.9130 0.9234 0.9290 0.9290 0.9333 0.9502 0
## RF 0.9134 0.9173 0.9206 0.9218 0.9239 0.9375 0
## ANN 0.9127 0.9257 0.9301 0.9311 0.9363 0.9506 0
## SVM 0.9094 0.9173 0.9211 0.9224 0.9267 0.9368 0
## CAR 0.9097 0.9194 0.9234 0.9239 0.9291 0.9440 0
##
## Pos_Pred_Value
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.20 0.5000 0.5625 0.5910 0.7036 1 0
## PRO 0.25 0.5000 0.6000 0.6086 0.7036 1 0
## RF 0.00 0.5929 0.8000 0.7752 1.0000 1 0
## ANN 0.20 0.4520 0.5635 0.5712 0.6667 1 0
## SVM 0.00 0.5000 0.6667 0.6388 0.7778 1 0
## CAR 0.00 0.4444 0.5556 0.5529 0.6667 1 1
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.7118 0.8088 0.8343 0.8337 0.8658 0.9298 0
## PRO 0.7248 0.8096 0.8371 0.8363 0.8612 0.9400 0
## RF 0.7127 0.8178 0.8471 0.8427 0.8738 0.9329 0
## ANN 0.7155 0.8129 0.8355 0.8358 0.8706 0.9266 0
## SVM 0.6275 0.7000 0.7560 0.7526 0.7891 0.8794 0
## CAR 0.5542 0.5989 0.6431 0.6382 0.6704 0.7315 0
##
## Sensitivity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.04 0.1650 0.2083 0.2229 0.2800 0.4583 0
## PRO 0.04 0.1600 0.2083 0.2181 0.2575 0.4583 0
## RF 0.00 0.0800 0.1200 0.1236 0.1600 0.3200 0
## ANN 0.04 0.2000 0.2400 0.2473 0.2917 0.4583 0
## SVM 0.00 0.0800 0.1225 0.1350 0.2000 0.2917 0
## CAR 0.00 0.1108 0.1600 0.1561 0.2083 0.4000 0
##
## Specificity
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LOG 0.9606 0.9804 0.9843 0.9844 0.9921 1 0
## PRO 0.9647 0.9804 0.9882 0.9857 0.9922 1 0
## RF 0.9804 0.9961 0.9961 0.9963 1.0000 1 0
## ANN 0.9567 0.9765 0.9823 0.9818 0.9882 1 0
## SVM 0.9686 0.9882 0.9922 0.9915 0.9961 1 0
## CAR 0.9647 0.9843 0.9882 0.9882 0.9922 1 0
### Compare all 10 metrics between models, with Bonferroni correction:
# Pairwise model differences on the resampled metrics; the summary prints
# difference estimates above the diagonal and adjusted p-values below it.
summary(diff(list))
##
## Call:
## summary.diff.resamples(object = diff(list))
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## LOG PRO RF ANN SVM CAR
## LOG -0.0007818 -0.0021098 0.0002151 0.0012897 0.0024346
## PRO 1.0000000 -0.0013280 0.0009969 0.0020715 0.0032164
## RF 0.3809092 1.0000000 0.0023249 0.0033995 0.0045444
## ANN 1.0000000 1.0000000 0.2525970 0.0010746 0.0022195
## SVM 1.0000000 1.0000000 0.0005729 1.0000000 0.0011449
## CAR 0.1074827 0.2160838 5.448e-07 0.2134471 1.0000000
##
## Balanced_Accuracy
## LOG PRO RF ANN SVM CAR
## LOG 0.001744 0.043715 -0.010905 0.040394 0.031517
## PRO 1.00000 0.041971 -0.012649 0.038651 0.029773
## RF < 2.2e-16 1.904e-11 -0.054620 -0.003320 -0.012198
## ANN 2.874e-07 0.65416 < 2.2e-16 0.051300 0.042422
## SVM < 2.2e-16 8.578e-09 1.00000 < 2.2e-16 -0.008878
## CAR 3.990e-12 4.382e-05 0.00207 < 2.2e-16 0.13065
##
## Detection_Rate
## LOG PRO RF ANN SVM CAR
## LOG 0.0003967 0.0087304 -0.0021471 0.0077275 0.0058688
## PRO 1.00000 0.0083337 -0.0025438 0.0073308 0.0054721
## RF < 2.2e-16 1.391e-13 -0.0108775 -0.0010029 -0.0028616
## ANN 9.417e-09 0.31308 < 2.2e-16 0.0098746 0.0080159
## SVM < 2.2e-16 1.135e-09 0.99887 < 2.2e-16 -0.0018587
## CAR 9.203e-13 2.469e-05 1.579e-05 < 2.2e-16 0.04258
##
## Kappa
## LOG PRO RF ANN SVM CAR
## LOG 0.003538 0.094894 -0.020186 0.093771 0.075916
## PRO 1.0000000 0.091356 -0.023724 0.090233 0.072378
## RF < 2.2e-16 1.748e-08 -0.115080 -0.001123 -0.018978
## ANN 0.0001560 1.0000000 < 2.2e-16 0.113957 0.096102
## SVM 5.301e-15 1.695e-07 1.0000000 < 2.2e-16 -0.017855
## CAR 2.388e-11 0.0001056 0.3692649 < 2.2e-16 0.6602844
##
## logLoss
## LOG PRO RF ANN SVM CAR
## LOG 0.001493 0.011378 0.009124 -0.017476 -0.026279
## PRO 1.00000 0.009885 0.007631 -0.018969 -0.027771
## RF 0.00257 0.39310 -0.002254 -0.028854 -0.037657
## ANN 4.458e-05 0.89347 1.00000 -0.026600 -0.035402
## SVM 2.316e-09 5.184e-05 1.989e-15 < 2.2e-16 -0.008802
## CAR < 2.2e-16 1.439e-09 < 2.2e-16 < 2.2e-16 1.561e-06
##
## Neg_Pred_Value
## LOG PRO RF ANN SVM CAR
## LOG 0.0002736 0.0074661 -0.0019050 0.0068614 0.0053281
## PRO 1.000000 0.0071926 -0.0021786 0.0065878 0.0050545
## RF < 2.2e-16 2.018e-11 -0.0093712 -0.0006047 -0.0021380
## ANN 1.453e-07 0.672578 < 2.2e-16 0.0087664 0.0072331
## SVM < 2.2e-16 1.005e-08 1.000000 < 2.2e-16 -0.0015333
## CAR 4.474e-12 4.963e-05 0.001078 < 2.2e-16 0.102167
##
## Pos_Pred_Value
## LOG PRO RF ANN SVM CAR
## LOG -0.01755 -0.18413 0.01980 -0.04782 0.03900
## PRO 1.000000 -0.16658 0.03735 -0.03027 0.05673
## RF 6.609e-12 1.299e-06 0.20393 0.13631 0.21994
## ANN 0.332455 1.000000 2.062e-14 -0.06762 0.02026
## SVM 0.385846 1.000000 9.663e-05 0.032212 0.08562
## CAR 0.429400 0.615460 5.783e-14 1.000000 0.004981
##
## ROC
## LOG PRO RF ANN SVM CAR
## LOG -0.0026310 -0.0090215 -0.0021728 0.0810789 0.1954800
## PRO 1.00000 -0.0063905 0.0004582 0.0837100 0.1981110
## RF 0.01002 1.00000 0.0068488 0.0901005 0.2045015
## ANN 0.02594 1.00000 0.09476 0.0832517 0.1976528
## SVM < 2e-16 < 2e-16 < 2e-16 < 2e-16 0.1144011
## CAR < 2e-16 < 2e-16 < 2e-16 < 2e-16 < 2e-16
##
## Sensitivity
## LOG PRO RF ANN SVM CAR
## LOG 0.004783 0.099317 -0.024400 0.087850 0.066800
## PRO 1.00000 0.094533 -0.029183 0.083067 0.062017
## RF < 2.2e-16 1.278e-13 -0.123717 -0.011467 -0.032517
## ANN 8.569e-09 0.29544 < 2.2e-16 0.112250 0.091200
## SVM < 2.2e-16 1.198e-09 0.97323 < 2.2e-16 -0.021050
## CAR 7.630e-13 2.414e-05 1.487e-05 < 2.2e-16 0.04479
##
## Specificity
## LOG PRO RF ANN SVM CAR
## LOG -0.001296 -0.011887 0.002590 -0.007061 -0.003767
## PRO 1.0000000 -0.010591 0.003885 -0.005765 -0.002471
## RF < 2.2e-16 < 2.2e-16 0.014477 0.004826 0.008121
## ANN 6.505e-07 0.0156697 < 2.2e-16 -0.009651 -0.006356
## SVM 4.958e-13 1.956e-06 8.787e-09 < 2.2e-16 0.003295
## CAR 0.0002254 0.2390434 < 2.2e-16 1.182e-10 0.0001396
Thử đánh giá tiêu chí ROC giữa mô hình Logistic và Random Forest bằng thống kê t:
# Logistic and random forest were resampled on the same folds (same seed
# before each train() call), so their per-fold ROC values are matched pairs.
# Align the two sets of values on the fold label and run a paired t-test; an
# unpaired Welch test ignores the pairing and understates the evidence.
roc_pairs <- merge(
  logistic$resample[, c("Resample", "ROC")],
  rf$resample[, c("Resample", "ROC")],
  by = "Resample",
  suffixes = c("_LOG", "_RF")
)
t.test(roc_pairs$ROC_LOG, roc_pairs$ROC_RF, paired = TRUE)
##
## Welch Two Sample t-test
##
## data: r1$ROC and r3$ROC
## t = -1.4449, df = 198, p-value = 0.1501
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.021333916 0.003290823
## sample estimates:
## mean of x mean of y
## 0.8336520 0.8426736
# Hold-out check of the logistic model: predict classes for `testing` and
# cross-tabulate them against the observed outcome.
pred <- predict(logistic, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
a <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
a
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 14 5
## S 47 632
##
## Accuracy : 0.9255
## 95% CI : (0.9035, 0.9439)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.126
##
## Kappa : 0.3218
## Mcnemar's Test P-Value : 1.303e-08
##
## Sensitivity : 0.22951
## Specificity : 0.99215
## Pos Pred Value : 0.73684
## Neg Pred Value : 0.93078
## Prevalence : 0.08739
## Detection Rate : 0.02006
## Detection Prevalence : 0.02722
## Balanced Accuracy : 0.61083
##
## 'Positive' Class : D
##
# Hold-out check of the probit model: predict classes for `testing` and
# cross-tabulate them against the observed outcome.
pred <- predict(probit, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
b <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
b
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 14 5
## S 47 632
##
## Accuracy : 0.9255
## 95% CI : (0.9035, 0.9439)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.126
##
## Kappa : 0.3218
## Mcnemar's Test P-Value : 1.303e-08
##
## Sensitivity : 0.22951
## Specificity : 0.99215
## Pos Pred Value : 0.73684
## Neg Pred Value : 0.93078
## Prevalence : 0.08739
## Detection Rate : 0.02006
## Detection Prevalence : 0.02722
## Balanced Accuracy : 0.61083
##
## 'Positive' Class : D
##
# Hold-out check of the random forest: predict classes for `testing` and
# cross-tabulate them against the observed outcome.
pred <- predict(rf, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
# NOTE(review): the result is stored in `c`, which shadows the base function's
# name; the name is kept so any later use of `c` as this object still works.
c <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
c
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 8 4
## S 53 633
##
## Accuracy : 0.9183
## 95% CI : (0.8955, 0.9376)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.3247
##
## Kappa : 0.1961
## Mcnemar's Test P-Value : 2.047e-10
##
## Sensitivity : 0.13115
## Specificity : 0.99372
## Pos Pred Value : 0.66667
## Neg Pred Value : 0.92274
## Prevalence : 0.08739
## Detection Rate : 0.01146
## Detection Prevalence : 0.01719
## Balanced Accuracy : 0.56243
##
## 'Positive' Class : D
##
# Hold-out check of the neural network: predict classes for `testing` and
# cross-tabulate them against the observed outcome.
pred <- predict(ann, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
dd <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
dd
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 19 8
## S 42 629
##
## Accuracy : 0.9284
## 95% CI : (0.9066, 0.9464)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.0767
##
## Kappa : 0.3996
## Mcnemar's Test P-Value : 3.058e-06
##
## Sensitivity : 0.31148
## Specificity : 0.98744
## Pos Pred Value : 0.70370
## Neg Pred Value : 0.93741
## Prevalence : 0.08739
## Detection Rate : 0.02722
## Detection Prevalence : 0.03868
## Balanced Accuracy : 0.64946
##
## 'Positive' Class : D
##
# Hold-out check of the SVM: predict classes for `testing` and cross-tabulate
# them against the observed outcome.
pred <- predict(svm, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
e <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
e
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 9 5
## S 52 632
##
## Accuracy : 0.9183
## 95% CI : (0.8955, 0.9376)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.3247
##
## Kappa : 0.2144
## Mcnemar's Test P-Value : 1.109e-09
##
## Sensitivity : 0.14754
## Specificity : 0.99215
## Pos Pred Value : 0.64286
## Neg Pred Value : 0.92398
## Prevalence : 0.08739
## Detection Rate : 0.01289
## Detection Prevalence : 0.02006
## Balanced Accuracy : 0.56985
##
## 'Positive' Class : D
##
# Hold-out check of the CART model: predict classes for `testing` and
# cross-tabulate them against the observed outcome.
pred <- predict(cart, newdata = testing)
# confusionMatrix() needs both inputs as factors with the same levels, so the
# reference is coerced to the levels of the predictions; "D" is named
# explicitly as the positive class.
f <- confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
f
## Confusion Matrix and Statistics
##
## Reference
## Prediction D S
## D 15 4
## S 46 633
##
## Accuracy : 0.9284
## 95% CI : (0.9066, 0.9464)
## No Information Rate : 0.9126
## P-Value [Acc > NIR] : 0.0767
##
## Kappa : 0.3479
## Mcnemar's Test P-Value : 6.7e-09
##
## Sensitivity : 0.24590
## Specificity : 0.99372
## Pos Pred Value : 0.78947
## Neg Pred Value : 0.93225
## Prevalence : 0.08739
## Detection Rate : 0.02149
## Detection Prevalence : 0.02722
## Balanced Accuracy : 0.61981
##
## 'Positive' Class : D
##
library(randomForest)
# Standalone random forest (2000 trees) fitted on the training set to obtain
# variable-importance measures, followed by the importance plot.
# A seed is set, as before every other model fit in this file, so the forest
# and its importance ranking can be reproduced.
set.seed(29)
model <- randomForest(
  factor(oc) ~ .,
  data = training,
  importance = TRUE, # spelled out: `T` is an ordinary variable, not a constant
  ntree = 2000
)
varImpPlot(model, pch = 16)