library(party)
library(caret)
dat=read.csv("UrineData.csv")
dat=dat[,-1]
head(dat)
AGP2 ESR CRP DAS28 duration RAD
1 10.767 100 4.70 5.59 3 yes
2 4.065 92 8.54 8.27 10 no
3 5.541 103 1.40 6.60 10 yes
4 0.750 64 7.76 8.88 10 yes
5 6.381 96 5.37 5.14 2 yes
6 3.231 88 5.10 8.98 0 no
gtree <- ctree(RAD ~ ., data = dat)
plot(gtree,main="Radiologic Progression Predicted by CRP and urinary AGP2 level")
plot(gtree, inner_panel = node_barplot,
edge_panel = function(...) invisible(), tnex = 1)
table(Predict(gtree), dat$RAD)
no yes
no 179 12
yes 5 12
confusionMatrix(Predict(gtree), dat$RAD)
Confusion Matrix and Statistics
Reference
Prediction no yes
no 179 12
yes 5 12
Accuracy : 0.918
95% CI : (0.872, 0.952)
No Information Rate : 0.885
P-Value [Acc > NIR] : 0.0744
Kappa : 0.541
Mcnemar's Test P-Value : 0.1456
Sensitivity : 0.973
Specificity : 0.500
Pos Pred Value : 0.937
Neg Pred Value : 0.706
Prevalence : 0.885
Detection Rate : 0.861
Detection Prevalence : 0.918
Balanced Accuracy : 0.736
'Positive' Class : no
plot(CRP~AGP2,col="black",bg=ifelse(dat$RAD=="yes",0,1)+2,pch=21,data=dat,
main="Radiologic Progression,CRP and urinary AGP2 level",
ylab="serum CRP level", xlab="urinary AGP2 level")
abline(h=0.96,lwd=2,lty="dotted")
abline(v=4.065,lwd=2,col="red",lty="dotted")
text(5.6,7,"AGP2=4.065",col="red")
text(9,1.2,"CRP=0.96")
legend(8,7,legend=c("Progression","No progression"),pch=21,col="black",
pt.bg=2:3)
이 부분은 cross-validation 및 random-forest model
library(caret)
library(tree)
library(ISLR)
library(gbm)
#tree.urine=tree(RAD~.,data=data3)
#summary(tree.urine)
#tree.urine
#plot(tree.urine)
#text(tree.urine,pretty=0)
set.seed(2)
train=sample(1:nrow(dat),nrow(dat)*0.7)
dat.test=dat[-train,]
tree.urine1=tree(RAD~.,data=dat,subset=train)
tree.pred=predict(tree.urine1,dat.test,type="class")
table(tree.pred,dat.test$RAD)
##
## tree.pred no yes
## no 51 10
## yes 1 1
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 51 10
## yes 1 1
##
## Accuracy : 0.825
## 95% CI : (0.709, 0.909)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.5796
##
## Kappa : 0.106
## Mcnemar's Test P-Value : 0.0159
##
## Sensitivity : 0.9808
## Specificity : 0.0909
## Pos Pred Value : 0.8361
## Neg Pred Value : 0.5000
## Prevalence : 0.8254
## Detection Rate : 0.8095
## Detection Prevalence : 0.9683
## Balanced Accuracy : 0.5358
##
## 'Positive' Class : no
##
# cross-validation
set.seed(3)
cv.urine=cv.tree(tree.urine1,FUN=prune.misclass)
names(cv.urine)
## [1] "size" "dev" "k" "method"
cv.urine
## $size
## [1] 7 6 1
##
## $dev
## [1] 15 15 19
##
## $k
## [1] -Inf 0.0 1.4
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.urine$size,cv.urine$dev,type="b")
plot(cv.urine$k,cv.urine$dev,type="b")
prune.urine=prune.misclass(tree.urine1,best=3)
plot(prune.urine)
text(prune.urine,pretty=0)
tree.pred=predict(prune.urine,dat.test,type="class")
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 51 10
## yes 1 1
##
## Accuracy : 0.825
## 95% CI : (0.709, 0.909)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.5796
##
## Kappa : 0.106
## Mcnemar's Test P-Value : 0.0159
##
## Sensitivity : 0.9808
## Specificity : 0.0909
## Pos Pred Value : 0.8361
## Neg Pred Value : 0.5000
## Prevalence : 0.8254
## Detection Rate : 0.8095
## Detection Prevalence : 0.9683
## Balanced Accuracy : 0.5358
##
## 'Positive' Class : no
##
prune.urine=prune.misclass(tree.urine1,best=6)
plot(prune.urine)
text(prune.urine,pretty=0)
tree.pred=predict(prune.urine,dat.test,type="class")
confusionMatrix(tree.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 51 10
## yes 1 1
##
## Accuracy : 0.825
## 95% CI : (0.709, 0.909)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.5796
##
## Kappa : 0.106
## Mcnemar's Test P-Value : 0.0159
##
## Sensitivity : 0.9808
## Specificity : 0.0909
## Pos Pred Value : 0.8361
## Neg Pred Value : 0.5000
## Prevalence : 0.8254
## Detection Rate : 0.8095
## Detection Prevalence : 0.9683
## Balanced Accuracy : 0.5358
##
## 'Positive' Class : no
##
library(randomForest)
set.seed(1)
bag.urine=randomForest(RAD~.,data=dat,subset=train,mtry=5,importance=TRUE)
bag.urine
##
## Call:
## randomForest(formula = RAD ~ ., data = dat, mtry = 5, importance = TRUE, subset = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 11.72%
## Confusion matrix:
## no yes class.error
## no 127 5 0.03788
## yes 12 1 0.92308
bag.pred=predict(bag.urine,newdata=dat.test)
confusionMatrix(bag.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 52 10
## yes 0 1
##
## Accuracy : 0.841
## 95% CI : (0.727, 0.921)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 0.44827
##
## Kappa : 0.142
## Mcnemar's Test P-Value : 0.00443
##
## Sensitivity : 1.0000
## Specificity : 0.0909
## Pos Pred Value : 0.8387
## Neg Pred Value : 1.0000
## Prevalence : 0.8254
## Detection Rate : 0.8254
## Detection Prevalence : 0.9841
## Balanced Accuracy : 0.5455
##
## 'Positive' Class : no
##
set.seed(1)
rf.urine=randomForest(RAD~.,data=dat,sunset=train, mtry=4,importance=TRUE)
rf.urine
##
## Call:
## randomForest(formula = RAD ~ ., data = dat, sunset = train, mtry = 4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 9.62%
## Confusion matrix:
## no yes class.error
## no 178 6 0.03261
## yes 14 10 0.58333
plot(rf.urine)
plot(rf.urine,log="y")
#rf.urine=randomForest(RAD~.,data3,proximity=TRUE,keep.forest=FALSE)
#MDSplot(rf.urine,data3$RAD)
rf.pred=predict(rf.urine,newdata=dat.test)
confusionMatrix(rf.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 52 0
## yes 0 11
##
## Accuracy : 1
## 95% CI : (0.943, 1)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 5.62e-06
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.825
## Detection Rate : 0.825
## Detection Prevalence : 0.825
## Balanced Accuracy : 1.000
##
## 'Positive' Class : no
##
getTree(rf.urine,1,labelVar=TRUE)
## left daughter right daughter split var split point status prediction
## 1 2 3 AGP2 4.581 1 <NA>
## 2 4 5 CRP 1.715 1 <NA>
## 3 6 7 DAS28 3.385 1 <NA>
## 4 8 9 ESR 106.500 1 <NA>
## 5 10 11 CRP 1.790 1 <NA>
## 6 0 0 <NA> 0.000 -1 no
## 7 0 0 <NA> 0.000 -1 yes
## 8 12 13 ESR 65.000 1 <NA>
## 9 0 0 <NA> 0.000 -1 yes
## 10 0 0 <NA> 0.000 -1 yes
## 11 14 15 duration 20.500 1 <NA>
## 12 0 0 <NA> 0.000 -1 no
## 13 16 17 DAS28 3.415 1 <NA>
## 14 18 19 ESR 38.000 1 <NA>
## 15 0 0 <NA> 0.000 -1 yes
## 16 0 0 <NA> 0.000 -1 yes
## 17 0 0 <NA> 0.000 -1 no
## 18 0 0 <NA> 0.000 -1 no
## 19 20 21 ESR 47.000 1 <NA>
## 20 0 0 <NA> 0.000 -1 yes
## 21 22 23 CRP 6.100 1 <NA>
## 22 24 25 ESR 80.000 1 <NA>
## 23 0 0 <NA> 0.000 -1 yes
## 24 0 0 <NA> 0.000 -1 no
## 25 26 27 DAS28 4.785 1 <NA>
## 26 0 0 <NA> 0.000 -1 yes
## 27 0 0 <NA> 0.000 -1 no
# partialPlot(rf.urine,pred.data=data3,x.var="CRP")
# partialPlot(rf.urine,pred.data=data3,x.var="ESR")
# partialPlot(rf.urine,pred.data=data3,x.var="AGP2")
importance(rf.urine)
## no yes MeanDecreaseAccuracy MeanDecreaseGini
## AGP2 2.430 15.633 12.396 9.760
## ESR 4.207 7.758 7.771 7.926
## CRP 6.936 31.105 21.714 14.842
## DAS28 5.836 1.150 6.059 5.547
## duration 4.056 6.443 6.920 4.396
varImpPlot(rf.urine,main="Variable Importance Plot predicting Radiologic Progression")
Two measures of variable importance are reported. The former is based upon the mean decrease of accuracy in predictions on the out of bag samples when a given variable is excluded from the model. The latter is a measure of the total decrease in node impurity that results from splits over that variable, averaged over all trees (this was plotted in Figure 8.9). In the case of regression trees, the node impurity is measured by the training RSS, and for classification trees by the deviance. Plots of these importance measures can be produced using the varImpPlot() function.
다음은 boosting
dat$RAD1=ifelse(dat$RAD=="yes",1,0)
boost.urine=gbm(RAD1~.-RAD,data=dat[train,],distribution="bernoulli",
n.trees=5000,interaction.depth=4)
summary(boost.urine)
## var rel.inf
## CRP CRP 44.70
## ESR ESR 19.10
## duration duration 15.11
## AGP2 AGP2 10.70
## DAS28 DAS28 10.40
plot(boost.urine)
plot(boost.urine,i="CRP")
plot(boost.urine,i="ESR")
plot(boost.urine,i="AGP2")
boost.pred=predict(boost.urine,newdata=dat.test,n.trees=5000)
confusionMatrix(rf.pred,dat.test$RAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 52 0
## yes 0 11
##
## Accuracy : 1
## 95% CI : (0.943, 1)
## No Information Rate : 0.825
## P-Value [Acc > NIR] : 5.62e-06
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.000
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 1.000
## Prevalence : 0.825
## Detection Rate : 0.825
## Detection Prevalence : 0.825
## Balanced Accuracy : 1.000
##
## 'Positive' Class : no
##