library(party)
library(caret)
dat=read.csv("UrineData.csv")
dat=dat[,-1]
head(dat)
    AGP2 ESR  CRP DAS28 duration RAD
1 10.767 100 4.70  5.59        3 yes
2  4.065  92 8.54  8.27       10  no
3  5.541 103 1.40  6.60       10 yes
4  0.750  64 7.76  8.88       10 yes
5  6.381  96 5.37  5.14        2 yes
6  3.231  88 5.10  8.98        0  no
gtree <- ctree(RAD ~ ., data = dat)
plot(gtree,main="Radiologic Progression Predicted by CRP and urinary AGP2 level")

[Figure: conditional inference tree, "Radiologic Progression Predicted by CRP and urinary AGP2 level"]

plot(gtree, inner_panel = node_barplot,
     edge_panel = function(...) invisible(), tnex = 1)

[Figure: conditional inference tree with node barplots]

table(Predict(gtree), dat$RAD)
     
       no yes
  no  179  12
  yes   5  12
confusionMatrix(Predict(gtree), dat$RAD)
Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  179  12
       yes   5  12
                                        
               Accuracy : 0.918         
                 95% CI : (0.872, 0.952)
    No Information Rate : 0.885         
    P-Value [Acc > NIR] : 0.0744        
                                        
                  Kappa : 0.541         
 Mcnemar's Test P-Value : 0.1456        
                                        
            Sensitivity : 0.973         
            Specificity : 0.500         
         Pos Pred Value : 0.937         
         Neg Pred Value : 0.706         
             Prevalence : 0.885         
         Detection Rate : 0.861         
   Detection Prevalence : 0.918         
      Balanced Accuracy : 0.736         
                                        
       'Positive' Class : no            
                                        
plot(CRP~AGP2,col="black",bg=ifelse(dat$RAD=="yes",0,1)+2,pch=21,data=dat,
     main="Radiologic Progression, CRP and urinary AGP2 level",
     ylab="serum CRP level", xlab="urinary AGP2 level")
abline(h=0.96,lwd=2,lty="dotted")
abline(v=4.065,lwd=2,col="red",lty="dotted")
text(5.6,7,"AGP2=4.065",col="red")
text(9,1.2,"CRP=0.96")
legend(8,7,legend=c("Progression","No progression"),pch=21,col="black",
       pt.bg=2:3)

[Figure: scatter plot of serum CRP vs urinary AGP2 level, colored by radiologic progression]

This part covers cross-validation and a random-forest model.

library(caret)
library(tree)
library(ISLR)
library(gbm)
#tree.urine=tree(RAD~.,data=dat)
#summary(tree.urine)
#tree.urine
#plot(tree.urine)
#text(tree.urine,pretty=0)

set.seed(2)
train=sample(1:nrow(dat),nrow(dat)*0.7)
dat.test=dat[-train,]
tree.urine1=tree(RAD~.,data=dat,subset=train)
tree.pred=predict(tree.urine1,dat.test,type="class")
table(tree.pred,dat.test$RAD)
##          
## tree.pred no yes
##       no  51  10
##       yes  1   1
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  51  10
##        yes  1   1
##                                         
##                Accuracy : 0.825         
##                  95% CI : (0.709, 0.909)
##     No Information Rate : 0.825         
##     P-Value [Acc > NIR] : 0.5796        
##                                         
##                   Kappa : 0.106         
##  Mcnemar's Test P-Value : 0.0159        
##                                         
##             Sensitivity : 0.9808        
##             Specificity : 0.0909        
##          Pos Pred Value : 0.8361        
##          Neg Pred Value : 0.5000        
##              Prevalence : 0.8254        
##          Detection Rate : 0.8095        
##    Detection Prevalence : 0.9683        
##       Balanced Accuracy : 0.5358        
##                                         
##        'Positive' Class : no            
## 
# cross-validation

set.seed(3)
cv.urine=cv.tree(tree.urine1,FUN=prune.misclass)
names(cv.urine)
## [1] "size"   "dev"    "k"      "method"
cv.urine
## $size
## [1] 7 6 1
## 
## $dev
## [1] 15 15 19
## 
## $k
## [1] -Inf  0.0  1.4
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.urine$size,cv.urine$dev,type="b")

[Figure: cross-validation error (dev) vs tree size]

plot(cv.urine$k,cv.urine$dev,type="b")

[Figure: cross-validation error (dev) vs cost-complexity parameter k]

prune.urine=prune.misclass(tree.urine1,best=3)
plot(prune.urine)
text(prune.urine,pretty=0)

[Figure: pruned classification tree, best = 3]

tree.pred=predict(prune.urine,dat.test,type="class")
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  51  10
##        yes  1   1
##                                         
##                Accuracy : 0.825         
##                  95% CI : (0.709, 0.909)
##     No Information Rate : 0.825         
##     P-Value [Acc > NIR] : 0.5796        
##                                         
##                   Kappa : 0.106         
##  Mcnemar's Test P-Value : 0.0159        
##                                         
##             Sensitivity : 0.9808        
##             Specificity : 0.0909        
##          Pos Pred Value : 0.8361        
##          Neg Pred Value : 0.5000        
##              Prevalence : 0.8254        
##          Detection Rate : 0.8095        
##    Detection Prevalence : 0.9683        
##       Balanced Accuracy : 0.5358        
##                                         
##        'Positive' Class : no            
## 
prune.urine=prune.misclass(tree.urine1,best=6)
plot(prune.urine)
text(prune.urine,pretty=0)

[Figure: pruned classification tree, best = 6]

tree.pred=predict(prune.urine,dat.test,type="class")
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  51  10
##        yes  1   1
##                                         
##                Accuracy : 0.825         
##                  95% CI : (0.709, 0.909)
##     No Information Rate : 0.825         
##     P-Value [Acc > NIR] : 0.5796        
##                                         
##                   Kappa : 0.106         
##  Mcnemar's Test P-Value : 0.0159        
##                                         
##             Sensitivity : 0.9808        
##             Specificity : 0.0909        
##          Pos Pred Value : 0.8361        
##          Neg Pred Value : 0.5000        
##              Prevalence : 0.8254        
##          Detection Rate : 0.8095        
##    Detection Prevalence : 0.9683        
##       Balanced Accuracy : 0.5358        
##                                         
##        'Positive' Class : no            
## 
library(randomForest)
set.seed(1)
bag.urine=randomForest(RAD~.,data=dat,subset=train,mtry=5,importance=TRUE)
bag.urine
## 
## Call:
##  randomForest(formula = RAD ~ ., data = dat, mtry = 5, importance = TRUE,      subset = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 11.72%
## Confusion matrix:
##      no yes class.error
## no  127   5     0.03788
## yes  12   1     0.92308
bag.pred=predict(bag.urine,newdata=dat.test)
confusionMatrix(bag.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  52  10
##        yes  0   1
##                                         
##                Accuracy : 0.841         
##                  95% CI : (0.727, 0.921)
##     No Information Rate : 0.825         
##     P-Value [Acc > NIR] : 0.44827       
##                                         
##                   Kappa : 0.142         
##  Mcnemar's Test P-Value : 0.00443       
##                                         
##             Sensitivity : 1.0000        
##             Specificity : 0.0909        
##          Pos Pred Value : 0.8387        
##          Neg Pred Value : 1.0000        
##              Prevalence : 0.8254        
##          Detection Rate : 0.8254        
##    Detection Prevalence : 0.9841        
##       Balanced Accuracy : 0.5455        
##                                         
##        'Positive' Class : no            
## 
set.seed(1)
# Note: "sunset" is a typo for "subset"; randomForest() silently ignores the unknown
# argument, so this forest is fit on the full data set (208 observations, see the
# confusion matrix below). The test observations therefore overlap the training data,
# which explains the perfect test-set accuracy that follows.
rf.urine=randomForest(RAD~.,data=dat,sunset=train,mtry=4,importance=TRUE)
rf.urine
## 
## Call:
##  randomForest(formula = RAD ~ ., data = dat, sunset = train, mtry = 4,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 9.62%
## Confusion matrix:
##      no yes class.error
## no  178   6     0.03261
## yes  14  10     0.58333
plot(rf.urine)

[Figure: random-forest error rates vs number of trees]

plot(rf.urine,log="y")

[Figure: random-forest error rates vs number of trees, log-scaled y-axis]

#rf.urine=randomForest(RAD~.,data=dat,proximity=TRUE,keep.forest=FALSE)
#MDSplot(rf.urine,dat$RAD)

rf.pred=predict(rf.urine,newdata=dat.test)
confusionMatrix(rf.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  52   0
##        yes  0  11
##                                     
##                Accuracy : 1         
##                  95% CI : (0.943, 1)
##     No Information Rate : 0.825     
##     P-Value [Acc > NIR] : 5.62e-06  
##                                     
##                   Kappa : 1         
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.000     
##             Specificity : 1.000     
##          Pos Pred Value : 1.000     
##          Neg Pred Value : 1.000     
##              Prevalence : 0.825     
##          Detection Rate : 0.825     
##    Detection Prevalence : 0.825     
##       Balanced Accuracy : 1.000     
##                                     
##        'Positive' Class : no        
## 
getTree(rf.urine,1,labelVar=TRUE)
##    left daughter right daughter split var split point status prediction
## 1              2              3      AGP2       4.581      1       <NA>
## 2              4              5       CRP       1.715      1       <NA>
## 3              6              7     DAS28       3.385      1       <NA>
## 4              8              9       ESR     106.500      1       <NA>
## 5             10             11       CRP       1.790      1       <NA>
## 6              0              0      <NA>       0.000     -1         no
## 7              0              0      <NA>       0.000     -1        yes
## 8             12             13       ESR      65.000      1       <NA>
## 9              0              0      <NA>       0.000     -1        yes
## 10             0              0      <NA>       0.000     -1        yes
## 11            14             15  duration      20.500      1       <NA>
## 12             0              0      <NA>       0.000     -1         no
## 13            16             17     DAS28       3.415      1       <NA>
## 14            18             19       ESR      38.000      1       <NA>
## 15             0              0      <NA>       0.000     -1        yes
## 16             0              0      <NA>       0.000     -1        yes
## 17             0              0      <NA>       0.000     -1         no
## 18             0              0      <NA>       0.000     -1         no
## 19            20             21       ESR      47.000      1       <NA>
## 20             0              0      <NA>       0.000     -1        yes
## 21            22             23       CRP       6.100      1       <NA>
## 22            24             25       ESR      80.000      1       <NA>
## 23             0              0      <NA>       0.000     -1        yes
## 24             0              0      <NA>       0.000     -1         no
## 25            26             27     DAS28       4.785      1       <NA>
## 26             0              0      <NA>       0.000     -1        yes
## 27             0              0      <NA>       0.000     -1         no
# partialPlot(rf.urine,pred.data=dat,x.var="CRP")
# partialPlot(rf.urine,pred.data=dat,x.var="ESR")
# partialPlot(rf.urine,pred.data=dat,x.var="AGP2")

importance(rf.urine)
##             no    yes MeanDecreaseAccuracy MeanDecreaseGini
## AGP2     2.430 15.633               12.396            9.760
## ESR      4.207  7.758                7.771            7.926
## CRP      6.936 31.105               21.714           14.842
## DAS28    5.836  1.150                6.059            5.547
## duration 4.056  6.443                6.920            4.396
varImpPlot(rf.urine,main="Variable Importance Plot predicting Radiologic Progression")

[Figure: Variable Importance Plot predicting Radiologic Progression]

Two measures of variable importance are reported. The first, MeanDecreaseAccuracy, is the mean decrease in prediction accuracy on the out-of-bag samples when the values of a given variable are randomly permuted. The second, MeanDecreaseGini, is the total decrease in node impurity from splits on that variable, averaged over all trees; for regression trees node impurity is measured by the training RSS, and for classification trees by the Gini index. The varImpPlot() function plots both measures, as in the figure above.
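
As a rough illustration of the permutation idea behind MeanDecreaseAccuracy, the sketch below shuffles one predictor at a time in the held-out test set and records how much the forest's accuracy drops. This is only an approximation for illustration (randomForest computes the decrease on the out-of-bag samples, tree by tree), and the seed and use of dat.test are my own assumptions, not part of the original analysis.

# Hypothetical sketch: permutation importance approximated on the test set
set.seed(10)
base.acc <- mean(predict(rf.urine, newdata = dat.test) == dat.test$RAD)
perm.drop <- sapply(c("AGP2", "ESR", "CRP", "DAS28", "duration"), function(v) {
  perm <- dat.test
  perm[[v]] <- sample(perm[[v]])   # break the link between this variable and RAD
  base.acc - mean(predict(rf.urine, newdata = perm) == dat.test$RAD)
})
sort(perm.drop, decreasing = TRUE)  # larger drop = more important variable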

Next comes boosting.

dat$RAD1=ifelse(dat$RAD=="yes",1,0)
boost.urine=gbm(RAD1~.-RAD,data=dat[train,],distribution="bernoulli",
                n.trees=5000,interaction.depth=4)
summary(boost.urine)

[Figure: relative influence of predictors in the boosted model]

##               var rel.inf
## CRP           CRP   44.70
## ESR           ESR   19.10
## duration duration   15.11
## AGP2         AGP2   10.70
## DAS28       DAS28   10.40
plot(boost.urine)

[Figure: gbm partial dependence plot, default (first predictor)]

plot(boost.urine,i="CRP")

[Figure: partial dependence of CRP]

plot(boost.urine,i="ESR")

[Figure: partial dependence of ESR]

plot(boost.urine,i="AGP2")

[Figure: partial dependence of AGP2]

boost.pred=predict(boost.urine,newdata=dat.test,n.trees=5000)  # predictions on the log-odds scale
# Note: the confusion matrix below is computed from rf.pred (the random-forest
# predictions), not from boost.pred, so it simply repeats the random-forest result.
# A sketch of how the boosted model itself could be evaluated follows the output.
confusionMatrix(rf.pred,dat.test$RAD)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  52   0
##        yes  0  11
##                                     
##                Accuracy : 1         
##                  95% CI : (0.943, 1)
##     No Information Rate : 0.825     
##     P-Value [Acc > NIR] : 5.62e-06  
##                                     
##                   Kappa : 1         
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.000     
##             Specificity : 1.000     
##          Pos Pred Value : 1.000     
##          Neg Pred Value : 1.000     
##              Prevalence : 0.825     
##          Detection Rate : 0.825     
##    Detection Prevalence : 0.825     
##       Balanced Accuracy : 1.000     
##                                     
##        'Positive' Class : no        
##
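
To evaluate the boosted model itself, the log-odds predictions need to be converted to class labels. A minimal sketch, assuming a 0.5 probability cutoff (the cutoff and the object names below are illustrative, not taken from the original analysis):

# Hypothetical evaluation of the boosted model (not part of the original output)
boost.prob = predict(boost.urine, newdata = dat.test, n.trees = 5000, type = "response")  # P(RAD = "yes")
boost.class = factor(ifelse(boost.prob > 0.5, "yes", "no"), levels = levels(dat.test$RAD))
confusionMatrix(boost.class, dat.test$RAD)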