So sánh các mô hình

Hợp nhất hai bộ dữ liệu development và validation thành một data frame. Dưới đây là các biến số của data frame dùng để phân tích.

names (d)

##  [1] "name"    "gender"  "age"     "addate"  "pulse"   "sys"     "dia"    
##  [8] "tem"     "br"      "spo2"    "gcs"     "cpr"     "mv"      "prether"
## [15] "pread"   "los"     "depa"    "aids"    "dic"     "lym"     "leumye" 
## [22] "canc"    "rf"      "crf"     "cir"     "dm"      "hf"      "fa"     
## [29] "urea"    "gluco"   "cre"     "alb"     "na"      "alt"     "crp"    
## [36] "exc"     "disc"    "ED"      "outcome" "hos"     "ID"      "dis"

Recode biến outcome

# Recode the numeric outcome into a two-level factor: 1 -> "D", 0 -> "S".
# Building the factor explicitly (instead of overwriting numeric values with
# strings) keeps the column type and the level order ("D" first) explicit
# rather than leaving them to implicit coercion. Any value other than 0 or 1
# becomes NA.
d$oc <- factor(d$outcome, levels = c(1, 0), labels = c("D", "S"))

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

set.seed(123)

# Columns excluded from the modelling data set, listed by name instead of by
# position (these are the former positions 1, 4, 16, 18-27 and 36-42), so the
# selection does not silently shift if the column order of `d` changes.
drop_cols <- c(
  "name", "addate", "los", "aids", "dic", "lym", "leumye", "canc", "rf",
  "crf", "cir", "dm", "hf", "exc", "disc", "ED", "outcome", "hos", "ID", "dis"
)
indx <- d[, !(names(d) %in% drop_cols)]

# Split on the outcome: 80% training, 20% testing (p = 8/10).
indxTrain <- createDataPartition(y = indx$oc, p = 8 / 10, list = FALSE)
training <- indx[indxTrain, ]
testing <- indx[-indxTrain, ]
dim(testing)

## [1] 698  23

dim(training)

## [1] 2795   23

# Resampling scheme passed to every train() call below: 10-fold
# cross-validation repeated 10 times, with class probabilities enabled and
# each resample summarised by multiClassSummary.
Control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

Mô hình logistic

# Logistic regression: binomial GLM of `oc` on all remaining columns of
# `training`, resampled with the `Control` scheme.
set.seed(29)
logistic <- train(
  oc ~ .,
  data = training,
  method = "glm",
  family = "binomial",
  trControl = Control
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

logistic

## Generalized Linear Model 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ... 
## Resampling results:
## 
##   logLoss    ROC       Accuracy   Kappa      Sensitivity  Specificity
##   0.2400483  0.833652  0.9173948  0.2841245  0.2228833    0.984426   
##   Pos_Pred_Value  Neg_Pred_Value  Detection_Rate  Balanced_Accuracy
##   0.5910232       0.929244        0.01960794      0.6036547        
## 
##

Mô hình probit

# Probit regression: binomial GLM with a probit link on all predictors.
# Seeded with 29, the value used before every other train() call in this
# analysis, so that probit is resampled on the same folds as the other models;
# resamples()/diff() compare the models resample by resample.
set.seed(29)
probit <- train(
  oc ~ .,
  data = training,
  method = "glm",
  family = binomial(link = "probit"),
  trControl = Control
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

probit

## Generalized Linear Model 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2516, 2515, 2516, 2517, 2515, 2515, ... 
## Resampling results:
## 
##   logLoss    ROC       Accuracy   Kappa      Sensitivity  Specificity
##   0.2385555  0.836283  0.9181767  0.2805863  0.2181       0.9857216  
##   Pos_Pred_Value  Neg_Pred_Value  Detection_Rate  Balanced_Accuracy
##   0.6085739       0.9289704       0.01921124      0.6019108        
## 
##

Random Forest

library (randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Random forest on all predictors, with predictors centered and scaled and the
# `Control` resampling scheme; the fitted object is printed afterwards.
set.seed(29)
rf <- train(
  oc ~ .,
  data = training,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = Control
)
rf

## Random Forest 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ... 
## Resampling results across tuning parameters:
## 
##   mtry  logLoss    ROC        Accuracy   Kappa      Sensitivity
##    2    0.2286701  0.8426736  0.9195047  0.1892300  0.1235667  
##   12    0.2571539  0.8343837  0.9186110  0.2726866  0.2053667  
##   22    0.2639347  0.8297559  0.9175720  0.2780082  0.2150667  
##   Specificity  Pos_Pred_Value  Neg_Pred_Value  Detection_Rate
##   0.9963131    0.7751548       0.9217778       0.01087751    
##   0.9874480    0.6223244       0.9279633       0.01806979    
##   0.9853679    0.5932651       0.9286455       0.01892770    
##   Balanced_Accuracy
##   0.5599399        
##   0.5964074        
##   0.6002173        
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

Neural Network

# Neural network (nnet) on all predictors, with predictors centered and scaled
# and the `Control` resampling scheme. `trace = FALSE` is forwarded to the
# model fit to silence its iteration log.
set.seed(29)
library(nnet)
ann <- train(
  oc ~ .,
  data = training,
  method = "nnet",
  preProcess = c("center", "scale"),
  trControl = Control,
  trace = FALSE
)

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

ann

## Neural Network 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  logLoss    ROC        Accuracy   Kappa      Sensitivity
##   1     0e+00  0.2416574  0.8085456  0.9133487  0.2191968  0.1922500  
##   1     1e-04  0.2399195  0.8090348  0.9146007  0.2238847  0.1911333  
##   1     1e-01  0.2309245  0.8358248  0.9171798  0.3043100  0.2472833  
##   3     0e+00  0.2769452  0.7740291  0.9029353  0.2471775  0.2503333  
##   3     1e-04  0.2743185  0.7893768  0.9034701  0.2639780  0.2671333  
##   3     1e-01  0.2501818  0.8095434  0.9121304  0.2856183  0.2482833  
##   5     0e+00  0.3434655  0.7713838  0.8974582  0.2359353  0.2540833  
##   5     1e-04  0.3385270  0.7746364  0.8972796  0.2532164  0.2683167  
##   5     1e-01  0.2714828  0.7917748  0.9077687  0.2733135  0.2495667  
##   Specificity  Pos_Pred_Value  Neg_Pred_Value  Detection_Rate
##   0.9829335    0.5340225       0.9267985       0.01692437    
##   0.9844267    0.5438530       0.9267761       0.01681697    
##   0.9818362    0.5712230       0.9311490       0.02175502    
##   0.9659125    0.4439109       0.9306267       0.02203829    
##   0.9648442    0.4369167       0.9319932       0.02354151    
##   0.9761874    0.5110014       0.9309173       0.02185934    
##   0.9595537    0.3955336       0.9305656       0.02236229    
##   0.9579872    0.3928036       0.9315560       0.02361433    
##   0.9712825    0.4604134       0.9306970       0.02196970    
##   Balanced_Accuracy
##   0.5875917        
##   0.5877800        
##   0.6145598        
##   0.6081229        
##   0.6159888        
##   0.6122354        
##   0.6068185        
##   0.6131519        
##   0.6104246        
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were size = 1 and decay = 0.1.

Support Vector Machine

# Support vector machine with a radial basis kernel on all predictors, with
# predictors centered and scaled and the `Control` resampling scheme.
set.seed(29)
svm <- train(
  oc ~ .,
  data = training,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = Control
)

## Loading required package: kernlab

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

svm

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ... 
## Resampling results across tuning parameters:
## 
##   C     logLoss    ROC        Accuracy   Kappa      Sensitivity
##   0.25  0.2590849  0.7526951  0.9159265  0.1937638  0.1383167  
##   0.50  0.2575245  0.7525731  0.9161052  0.1903534  0.1350333  
##   1.00  0.2545030  0.7533965  0.9152826  0.1898710  0.1367167  
##   Specificity  Pos_Pred_Value  Neg_Pred_Value  Detection_Rate
##   0.9909775    0.6324632       0.9226138       0.01216682    
##   0.9914871    0.6388427       0.9223826       0.01188046    
##   0.9904283    0.6006162       0.9224398       0.01202383    
##   Balanced_Accuracy
##   0.5646471        
##   0.5632602        
##   0.5635725        
## 
## Tuning parameter 'sigma' was held constant at a value of 0.05917051
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were sigma = 0.05917051 and C = 0.5.

Cây CART

# CART (method "rpart2") on all predictors, with predictors centered and
# scaled and the `Control` resampling scheme.
set.seed(29)
cart <- train(
  oc ~ .,
  data = training,
  method = "rpart2",
  preProcess = c("center", "scale"),
  trControl = Control
)

## Loading required package: rpart

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.

cart

## CART 
## 
## 2795 samples
##   22 predictor
##    2 classes: 'D', 'S' 
## 
## Pre-processing: centered (22), scaled (22) 
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 2515, 2516, 2515, 2515, 2516, 2517, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  logLoss    ROC        Accuracy   Kappa      Sensitivity
##   2         0.2642490  0.6381930  0.9148166  0.1712111  0.1239833  
##   3         0.2663270  0.6381720  0.9149602  0.2082080  0.1560833  
##   4         0.2716385  0.6503692  0.9138857  0.2149223  0.1646167  
##   Specificity  Pos_Pred_Value  Neg_Pred_Value  Detection_Rate
##   0.9911348    0.5683975       0.9214679       0.01091271    
##   0.9881925    0.5529439       0.9239159       0.01373914    
##   0.9861918    0.5364767       0.9244776       0.01448990    
##   Balanced_Accuracy
##   0.5575591        
##   0.5721379        
##   0.5754042        
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was maxdepth = 3.

Đánh giá các mô hình

Đánh giá bằng hình ảnh

# Extract the per-resample metrics of a fitted model and tag them with a short
# model label.
#   model: object holding a `resample` data frame (here: the train() results)
#   label: model name stored in the `MoHinh` column
# Returns the metric columns followed by `MoHinh`, without the `Resample`
# identifier column.
tag_resamples <- function(model, label) {
  res <- as.data.frame(model$resample)
  # Drop the resample identifier by name: a positional drop (column 11) would
  # remove a metric instead if the summary function returned a different
  # number of columns.
  res <- res[, names(res) != "Resample", drop = FALSE]
  res$MoHinh <- label
  res
}

r1 <- tag_resamples(logistic, "LOG")
r2 <- tag_resamples(probit, "PRO")
r3 <- tag_resamples(rf, "RF")
r4 <- tag_resamples(ann, "ANN")
r5 <- tag_resamples(svm, "SVM")
r6 <- tag_resamples(cart, "CAR")

# Stack the six per-model data frames into one long data frame for plotting.
resamplemod <- rbind(r1, r2, r3, r4, r5, r6)

Đánh giá độ chính xác của mô hình qua 4 tiêu chí: Accuracy, Positive Predictive Value, Negative Predictive Value, và AUC.

library(ggplot2);library(ggthemes);library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:randomForest':
## 
##     combine

# Accuracy across resamples, by model: horizontal box plots with a blue
# reference line at Accuracy = 0.7 ...
h1 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Accuracy, fill = MoHinh)) +
  coord_flip() +
  geom_hline(yintercept = 0.7, color = "blue") +
  theme_wsj()

# ... and one density panel per model.
h2 <- ggplot(resamplemod, aes(Accuracy, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)

# Show both plots side by side.
grid.arrange(h1, h2, ncol = 2, nrow = 1)

# Positive predictive value across resamples, by model: horizontal box
# plots ...
h3 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Pos_Pred_Value, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()

# ... and one density panel per model.
h4 <- ggplot(resamplemod, aes(Pos_Pred_Value, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)

# Show both plots side by side.
grid.arrange(h3, h4, ncol = 2, nrow = 1)

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_density).

# Negative predictive value across resamples, by model: horizontal box
# plots ...
h5 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = Neg_Pred_Value, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()

# ... and one density panel per model.
h6 <- ggplot(resamplemod, aes(Neg_Pred_Value, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
# Show both plots side by side.
grid.arrange(h5, h6, ncol = 2, nrow = 1)

# ROC (AUC) across resamples, by model: horizontal box plots ...
h7 <- ggplot(resamplemod) +
  geom_boxplot(aes(x = MoHinh, y = ROC, fill = MoHinh)) +
  coord_flip() +
  theme_wsj()

# ... and one density panel per model.
h8 <- ggplot(resamplemod, aes(ROC, fill = MoHinh)) +
  geom_density(alpha = 0.3) +
  theme_wsj() +
  facet_wrap(~MoHinh)
# Show both plots side by side.
grid.arrange(h7, h8, ncol = 2, nrow = 1)

Một cách khác đánh giá độ chính xác bằng hình ảnh

### Visual comparison of the differences between models ###
# Collect the resampling results of the six fitted models into one object.
# The variable name `list` is kept because later statements refer to it; calls
# to the base function list() still resolve to the function.
list <- resamples(
  list(
    LOG = logistic,
    PRO = probit,
    RF = rf,
    ANN = ann,
    SVM = svm,
    CAR = cart
  )
)
# Box-and-whisker plots of every metric for the six models, in a 2 x 5 layout.
bwplot(
  list,
  models = c("LOG", "PRO", "RF", "ANN", "SVM", "CAR"),
  layout = c(2, 5)
)

Tóm tắt phân phối của từng tiêu chí qua các lần tái chọn mẫu cho cả 6 mô hình (phần so sánh từng cặp mô hình với hiệu chỉnh Bonferroni được thực hiện ngay sau bảng tóm tắt này):

summary(list)

## 
## Call:
## summary.resamples(object = list)
## 
## Models: LOG, PRO, RF, ANN, SVM, CAR 
## Number of resamples: 100 
## 
## Accuracy 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.8853  0.9107 0.9176 0.9174  0.9247 0.9429    0
## PRO 0.8964  0.9107 0.9179 0.9182  0.9250 0.9424    0
## RF  0.9068  0.9143 0.9179 0.9195  0.9247 0.9393    0
## ANN 0.8853  0.9071 0.9159 0.9172  0.9247 0.9464    0
## SVM 0.8964  0.9107 0.9176 0.9161  0.9211 0.9353    0
## CAR 0.8893  0.9104 0.9143 0.9150  0.9211 0.9393    0
## 
## Balanced_Accuracy 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.5122  0.5755 0.5972 0.6037  0.6287 0.7174    0
## PRO 0.5141  0.5720 0.5990 0.6019  0.6229 0.7154    0
## RF  0.4980  0.5380 0.5580 0.5599  0.5800 0.6600    0
## ANN 0.5122  0.5848 0.6141 0.6146  0.6388 0.7194    0
## SVM 0.4922  0.5380 0.5580 0.5633  0.5933 0.6419    0
## CAR 0.4941  0.5457 0.5741 0.5721  0.5980 0.6961    0
## 
## Detection_Rate 
##         Min.  1st Qu.  Median    Mean 3rd Qu.    Max. NA's
## LOG 0.003571 0.014320 0.01792 0.01961 0.02500 0.03943    0
## PRO 0.003571 0.014290 0.01792 0.01921 0.02244 0.03943    0
## RF  0.000000 0.007143 0.01071 0.01088 0.01434 0.02857    0
## ANN 0.003571 0.017860 0.02143 0.02176 0.02511 0.03943    0
## SVM 0.000000 0.007143 0.01073 0.01188 0.01786 0.02518    0
## CAR 0.000000 0.009828 0.01429 0.01374 0.01792 0.03571    0
## 
## Kappa 
##          Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG  0.038040  0.2187 0.2822 0.2841  0.3514 0.5299    0
## PRO  0.045450  0.2058 0.2827 0.2806  0.3379 0.5284    0
## RF  -0.006929  0.1261 0.1869 0.1892  0.2547 0.4615    0
## ANN  0.038040  0.2254 0.3054 0.3043  0.3677 0.5697    0
## SVM -0.025250  0.1261 0.1791 0.1904  0.2636 0.3958    0
## CAR -0.019510  0.1332 0.2163 0.2082  0.2933 0.5123    0
## 
## logLoss 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.1801  0.2155 0.2374 0.2400  0.2562 0.3258    0
## PRO 0.1897  0.2177 0.2314 0.2386  0.2539 0.3391    0
## RF  0.1851  0.2106 0.2248 0.2287  0.2397 0.3702    0
## ANN 0.1848  0.2141 0.2310 0.2309  0.2478 0.2894    0
## SVM 0.2090  0.2398 0.2593 0.2575  0.2707 0.3321    0
## CAR 0.2190  0.2526 0.2656 0.2663  0.2824 0.3207    0
## 
## Neg_Pred_Value 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.9127  0.9250 0.9291 0.9292  0.9333 0.9504    0
## PRO 0.9130  0.9234 0.9290 0.9290  0.9333 0.9502    0
## RF  0.9134  0.9173 0.9206 0.9218  0.9239 0.9375    0
## ANN 0.9127  0.9257 0.9301 0.9311  0.9363 0.9506    0
## SVM 0.9094  0.9173 0.9211 0.9224  0.9267 0.9368    0
## CAR 0.9097  0.9194 0.9234 0.9239  0.9291 0.9440    0
## 
## Pos_Pred_Value 
##     Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## LOG 0.20  0.5000 0.5625 0.5910  0.7036    1    0
## PRO 0.25  0.5000 0.6000 0.6086  0.7036    1    0
## RF  0.00  0.5929 0.8000 0.7752  1.0000    1    0
## ANN 0.20  0.4520 0.5635 0.5712  0.6667    1    0
## SVM 0.00  0.5000 0.6667 0.6388  0.7778    1    0
## CAR 0.00  0.4444 0.5556 0.5529  0.6667    1    1
## 
## ROC 
##       Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.7118  0.8088 0.8343 0.8337  0.8658 0.9298    0
## PRO 0.7248  0.8096 0.8371 0.8363  0.8612 0.9400    0
## RF  0.7127  0.8178 0.8471 0.8427  0.8738 0.9329    0
## ANN 0.7155  0.8129 0.8355 0.8358  0.8706 0.9266    0
## SVM 0.6275  0.7000 0.7560 0.7526  0.7891 0.8794    0
## CAR 0.5542  0.5989 0.6431 0.6382  0.6704 0.7315    0
## 
## Sensitivity 
##     Min. 1st Qu. Median   Mean 3rd Qu.   Max. NA's
## LOG 0.04  0.1650 0.2083 0.2229  0.2800 0.4583    0
## PRO 0.04  0.1600 0.2083 0.2181  0.2575 0.4583    0
## RF  0.00  0.0800 0.1200 0.1236  0.1600 0.3200    0
## ANN 0.04  0.2000 0.2400 0.2473  0.2917 0.4583    0
## SVM 0.00  0.0800 0.1225 0.1350  0.2000 0.2917    0
## CAR 0.00  0.1108 0.1600 0.1561  0.2083 0.4000    0
## 
## Specificity 
##       Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## LOG 0.9606  0.9804 0.9843 0.9844  0.9921    1    0
## PRO 0.9647  0.9804 0.9882 0.9857  0.9922    1    0
## RF  0.9804  0.9961 0.9961 0.9963  1.0000    1    0
## ANN 0.9567  0.9765 0.9823 0.9818  0.9882    1    0
## SVM 0.9686  0.9882 0.9922 0.9915  0.9961    1    0
## CAR 0.9647  0.9843 0.9882 0.9882  0.9922    1    0

### Pairwise differences between models on all 10 metrics, with Bonferroni-adjusted p-values:
summary(diff(list))

## 
## Call:
## summary.diff.resamples(object = diff(list))
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##     LOG       PRO        RF         ANN        SVM        CAR       
## LOG           -0.0007818 -0.0021098  0.0002151  0.0012897  0.0024346
## PRO 1.0000000            -0.0013280  0.0009969  0.0020715  0.0032164
## RF  0.3809092 1.0000000              0.0023249  0.0033995  0.0045444
## ANN 1.0000000 1.0000000  0.2525970              0.0010746  0.0022195
## SVM 1.0000000 1.0000000  0.0005729  1.0000000              0.0011449
## CAR 0.1074827 0.2160838  5.448e-07  0.2134471  1.0000000            
## 
## Balanced_Accuracy 
##     LOG       PRO       RF        ANN       SVM       CAR      
## LOG            0.001744  0.043715 -0.010905  0.040394  0.031517
## PRO 1.00000              0.041971 -0.012649  0.038651  0.029773
## RF  < 2.2e-16 1.904e-11           -0.054620 -0.003320 -0.012198
## ANN 2.874e-07 0.65416   < 2.2e-16            0.051300  0.042422
## SVM < 2.2e-16 8.578e-09 1.00000   < 2.2e-16           -0.008878
## CAR 3.990e-12 4.382e-05 0.00207   < 2.2e-16 0.13065            
## 
## Detection_Rate 
##     LOG       PRO        RF         ANN        SVM        CAR       
## LOG            0.0003967  0.0087304 -0.0021471  0.0077275  0.0058688
## PRO 1.00000               0.0083337 -0.0025438  0.0073308  0.0054721
## RF  < 2.2e-16 1.391e-13             -0.0108775 -0.0010029 -0.0028616
## ANN 9.417e-09 0.31308    < 2.2e-16              0.0098746  0.0080159
## SVM < 2.2e-16 1.135e-09  0.99887    < 2.2e-16             -0.0018587
## CAR 9.203e-13 2.469e-05  1.579e-05  < 2.2e-16  0.04258              
## 
## Kappa 
##     LOG       PRO       RF        ANN       SVM       CAR      
## LOG            0.003538  0.094894 -0.020186  0.093771  0.075916
## PRO 1.0000000            0.091356 -0.023724  0.090233  0.072378
## RF  < 2.2e-16 1.748e-08           -0.115080 -0.001123 -0.018978
## ANN 0.0001560 1.0000000 < 2.2e-16            0.113957  0.096102
## SVM 5.301e-15 1.695e-07 1.0000000 < 2.2e-16           -0.017855
## CAR 2.388e-11 0.0001056 0.3692649 < 2.2e-16 0.6602844          
## 
## logLoss 
##     LOG       PRO       RF        ANN       SVM       CAR      
## LOG            0.001493  0.011378  0.009124 -0.017476 -0.026279
## PRO 1.00000              0.009885  0.007631 -0.018969 -0.027771
## RF  0.00257   0.39310             -0.002254 -0.028854 -0.037657
## ANN 4.458e-05 0.89347   1.00000             -0.026600 -0.035402
## SVM 2.316e-09 5.184e-05 1.989e-15 < 2.2e-16           -0.008802
## CAR < 2.2e-16 1.439e-09 < 2.2e-16 < 2.2e-16 1.561e-06          
## 
## Neg_Pred_Value 
##     LOG       PRO        RF         ANN        SVM        CAR       
## LOG            0.0002736  0.0074661 -0.0019050  0.0068614  0.0053281
## PRO 1.000000              0.0071926 -0.0021786  0.0065878  0.0050545
## RF  < 2.2e-16 2.018e-11             -0.0093712 -0.0006047 -0.0021380
## ANN 1.453e-07 0.672578   < 2.2e-16              0.0087664  0.0072331
## SVM < 2.2e-16 1.005e-08  1.000000   < 2.2e-16             -0.0015333
## CAR 4.474e-12 4.963e-05  0.001078   < 2.2e-16  0.102167             
## 
## Pos_Pred_Value 
##     LOG       PRO       RF        ANN      SVM      CAR     
## LOG           -0.01755  -0.18413   0.01980 -0.04782  0.03900
## PRO 1.000000            -0.16658   0.03735 -0.03027  0.05673
## RF  6.609e-12 1.299e-06            0.20393  0.13631  0.21994
## ANN 0.332455  1.000000  2.062e-14          -0.06762  0.02026
## SVM 0.385846  1.000000  9.663e-05 0.032212           0.08562
## CAR 0.429400  0.615460  5.783e-14 1.000000 0.004981         
## 
## ROC 
##     LOG     PRO        RF         ANN        SVM        CAR       
## LOG         -0.0026310 -0.0090215 -0.0021728  0.0810789  0.1954800
## PRO 1.00000            -0.0063905  0.0004582  0.0837100  0.1981110
## RF  0.01002 1.00000                0.0068488  0.0901005  0.2045015
## ANN 0.02594 1.00000    0.09476                0.0832517  0.1976528
## SVM < 2e-16 < 2e-16    < 2e-16    < 2e-16                0.1144011
## CAR < 2e-16 < 2e-16    < 2e-16    < 2e-16    < 2e-16              
## 
## Sensitivity 
##     LOG       PRO       RF        ANN       SVM       CAR      
## LOG            0.004783  0.099317 -0.024400  0.087850  0.066800
## PRO 1.00000              0.094533 -0.029183  0.083067  0.062017
## RF  < 2.2e-16 1.278e-13           -0.123717 -0.011467 -0.032517
## ANN 8.569e-09 0.29544   < 2.2e-16            0.112250  0.091200
## SVM < 2.2e-16 1.198e-09 0.97323   < 2.2e-16           -0.021050
## CAR 7.630e-13 2.414e-05 1.487e-05 < 2.2e-16 0.04479            
## 
## Specificity 
##     LOG       PRO       RF        ANN       SVM       CAR      
## LOG           -0.001296 -0.011887  0.002590 -0.007061 -0.003767
## PRO 1.0000000           -0.010591  0.003885 -0.005765 -0.002471
## RF  < 2.2e-16 < 2.2e-16            0.014477  0.004826  0.008121
## ANN 6.505e-07 0.0156697 < 2.2e-16           -0.009651 -0.006356
## SVM 4.958e-13 1.956e-06 8.787e-09 < 2.2e-16            0.003295
## CAR 0.0002254 0.2390434 < 2.2e-16 1.182e-10 0.0001396

Thử đánh giá tiêu chí ROC giữa mô hình Logistic và Random Forest bằng thống kê t:

# Two-sample t-test on the ROC columns of r1 and r3 (per the text above, the
# logistic and random-forest results). With no further arguments t.test()
# runs the unpaired Welch test, i.e. it treats the two columns as independent.
# NOTE(review): if r1 and r3 hold per-resample ROC values from the same
# cross-validation folds (TODO confirm where they are built), the values are
# paired and t.test(..., paired = TRUE) would be the matching analysis; the
# unpaired test can report a much larger p-value for the same mean difference.
t.test(r1$ROC,r3$ROC)

## 
##  Welch Two Sample t-test
## 
## data:  r1$ROC and r3$ROC
## t = -1.4449, df = 198, p-value = 0.1501
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.021333916  0.003290823
## sample estimates:
## mean of x mean of y 
## 0.8336520 0.8426736

Nên sử dụng mô hình nào?

# Hold-out evaluation of the logistic model: predict a class for every row of
# `testing`, then cross-tabulate predictions against the observed outcome.
pred = predict(logistic, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
a = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
a

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D  14   5
##          S  47 632
##                                           
##                Accuracy : 0.9255          
##                  95% CI : (0.9035, 0.9439)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.126           
##                                           
##                   Kappa : 0.3218          
##  Mcnemar's Test P-Value : 1.303e-08       
##                                           
##             Sensitivity : 0.22951         
##             Specificity : 0.99215         
##          Pos Pred Value : 0.73684         
##          Neg Pred Value : 0.93078         
##              Prevalence : 0.08739         
##          Detection Rate : 0.02006         
##    Detection Prevalence : 0.02722         
##       Balanced Accuracy : 0.61083         
##                                           
##        'Positive' Class : D               
##

# Hold-out evaluation of the probit model: predict a class for every row of
# `testing`, then cross-tabulate predictions against the observed outcome.
pred = predict(probit, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
b = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
b

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D  14   5
##          S  47 632
##                                           
##                Accuracy : 0.9255          
##                  95% CI : (0.9035, 0.9439)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.126           
##                                           
##                   Kappa : 0.3218          
##  Mcnemar's Test P-Value : 1.303e-08       
##                                           
##             Sensitivity : 0.22951         
##             Specificity : 0.99215         
##          Pos Pred Value : 0.73684         
##          Neg Pred Value : 0.93078         
##              Prevalence : 0.08739         
##          Detection Rate : 0.02006         
##    Detection Prevalence : 0.02722         
##       Balanced Accuracy : 0.61083         
##                                           
##        'Positive' Class : D               
##

# Hold-out evaluation of the random-forest model: predict a class for every
# row of `testing`, then cross-tabulate predictions against the observed
# outcome.
pred = predict(rf, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
# NOTE(review): the result is stored under the name `c`, which is also the
# name of base::c; the name is kept here so existing references still work.
c = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
c

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D   8   4
##          S  53 633
##                                           
##                Accuracy : 0.9183          
##                  95% CI : (0.8955, 0.9376)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.3247          
##                                           
##                   Kappa : 0.1961          
##  Mcnemar's Test P-Value : 2.047e-10       
##                                           
##             Sensitivity : 0.13115         
##             Specificity : 0.99372         
##          Pos Pred Value : 0.66667         
##          Neg Pred Value : 0.92274         
##              Prevalence : 0.08739         
##          Detection Rate : 0.01146         
##    Detection Prevalence : 0.01719         
##       Balanced Accuracy : 0.56243         
##                                           
##        'Positive' Class : D               
##

# Hold-out evaluation of the neural-network model: predict a class for every
# row of `testing`, then cross-tabulate predictions against the observed
# outcome.
pred = predict(ann, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
dd = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
dd

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D  19   8
##          S  42 629
##                                           
##                Accuracy : 0.9284          
##                  95% CI : (0.9066, 0.9464)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.0767          
##                                           
##                   Kappa : 0.3996          
##  Mcnemar's Test P-Value : 3.058e-06       
##                                           
##             Sensitivity : 0.31148         
##             Specificity : 0.98744         
##          Pos Pred Value : 0.70370         
##          Neg Pred Value : 0.93741         
##              Prevalence : 0.08739         
##          Detection Rate : 0.02722         
##    Detection Prevalence : 0.03868         
##       Balanced Accuracy : 0.64946         
##                                           
##        'Positive' Class : D               
##

# Hold-out evaluation of the SVM model: predict a class for every row of
# `testing`, then cross-tabulate predictions against the observed outcome.
pred = predict(svm, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
e = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
e

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D   9   5
##          S  52 632
##                                           
##                Accuracy : 0.9183          
##                  95% CI : (0.8955, 0.9376)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.3247          
##                                           
##                   Kappa : 0.2144          
##  Mcnemar's Test P-Value : 1.109e-09       
##                                           
##             Sensitivity : 0.14754         
##             Specificity : 0.99215         
##          Pos Pred Value : 0.64286         
##          Neg Pred Value : 0.92398         
##              Prevalence : 0.08739         
##          Detection Rate : 0.01289         
##    Detection Prevalence : 0.02006         
##       Balanced Accuracy : 0.56985         
##                                           
##        'Positive' Class : D               
##

# Hold-out evaluation of the CART model: predict a class for every row of
# `testing`, then cross-tabulate predictions against the observed outcome.
pred = predict(cart, newdata = testing)
# `oc` was recoded as a character vector, and current caret releases require
# `data` and `reference` to be factors with the same levels, so the reference
# is coerced explicitly using the levels of the predictions.
# `positive = "D"` pins the class reported as 'Positive' in the output.
f = confusionMatrix(
  data = pred,
  reference = factor(testing$oc, levels = levels(pred)),
  positive = "D"
)
f

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   D   S
##          D  15   4
##          S  46 633
##                                           
##                Accuracy : 0.9284          
##                  95% CI : (0.9066, 0.9464)
##     No Information Rate : 0.9126          
##     P-Value [Acc > NIR] : 0.0767          
##                                           
##                   Kappa : 0.3479          
##  Mcnemar's Test P-Value : 6.7e-09         
##                                           
##             Sensitivity : 0.24590         
##             Specificity : 0.99372         
##          Pos Pred Value : 0.78947         
##          Neg Pred Value : 0.93225         
##              Prevalence : 0.08739         
##          Detection Rate : 0.02149         
##    Detection Prevalence : 0.02722         
##       Balanced Accuracy : 0.61981         
##                                           
##        'Positive' Class : D               
##

Xác định biến số quan trọng

library(randomForest)
# Fit a stand-alone random forest (2000 trees) on the training set with
# importance measures enabled, then plot the variable-importance ranking.

# Seed the RNG so the forest, and therefore the ranking, is reproducible,
# in line with the seeded model fits earlier in this file.
set.seed(29)
# The outcome is converted to a factor inside the data and used directly as
# the response. In the earlier form `factor(oc) ~ .`, `oc` only appears
# wrapped in a call on the left-hand side, so `.` can expand to include the
# raw `oc` column among the predictors (outcome leakage).
# NOTE(review): compare the predictors listed in the importance plot with the
# previous run to confirm `oc` no longer appears.
model = randomForest(
  oc ~ .,
  data = transform(training, oc = factor(oc)),
  importance = TRUE,  # spelled out: `T` is an ordinary, reassignable variable
  ntree = 2000
)
varImpPlot(model, pch = 16)

So sánh các mô hình

Ha Tan Duc

May 6, 2017