virco <- read.csv(file="Virco_data.csv", header=T, sep=",")
attach(virco)
VircoGeno <- data.frame(virco[,substr(names(virco),1,1)=="P"]!="-")
Trait <- as.factor(IDV.Fold > NFV.Fold)
library(rpart)
ClassTree <- rpart(Trait~., method="class", data=VircoGeno)
ClassTree
## n=976 (90 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 976 399 FALSE (0.5911885 0.4088115)
## 2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333)
## 4) P76< 0.5 466 116 FALSE (0.7510730 0.2489270) *
## 5) P76>=0.5 14 0 TRUE (0.0000000 1.0000000) *
## 3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)
## 6) P46< 0.5 158 57 FALSE (0.6392405 0.3607595)
## 12) P1< 0.5 115 31 FALSE (0.7304348 0.2695652) *
## 13) P1>=0.5 43 17 TRUE (0.3953488 0.6046512) *
## 7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189)
## 14) P10< 0.5 22 7 FALSE (0.6818182 0.3181818) *
## 15) P10>=0.5 316 111 TRUE (0.3512658 0.6487342)
## 30) P48< 0.5 278 106 TRUE (0.3812950 0.6187050)
## 60) P20< 0.5 113 55 TRUE (0.4867257 0.5132743)
## 120) P76< 0.5 92 40 FALSE (0.5652174 0.4347826) *
## 121) P76>=0.5 21 3 TRUE (0.1428571 0.8571429) *
## 61) P20>=0.5 165 51 TRUE (0.3090909 0.6909091) *
## 31) P48>=0.5 38 5 TRUE (0.1315789 0.8684211) *
plot(ClassTree)
text(ClassTree)

rpart(Trait~., method="class", parms=list(split="information"), data = VircoGeno)
## n=976 (90 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 976 399 FALSE (0.5911885 0.4088115)
## 2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333)
## 4) P76< 0.5 466 116 FALSE (0.7510730 0.2489270) *
## 5) P76>=0.5 14 0 TRUE (0.0000000 1.0000000) *
## 3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)
## 6) P46< 0.5 158 57 FALSE (0.6392405 0.3607595)
## 12) P1< 0.5 115 31 FALSE (0.7304348 0.2695652) *
## 13) P1>=0.5 43 17 TRUE (0.3953488 0.6046512) *
## 7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189)
## 14) P48< 0.5 299 120 TRUE (0.4013378 0.5986622)
## 28) P20< 0.5 125 62 FALSE (0.5040000 0.4960000)
## 56) P76< 0.5 104 44 FALSE (0.5769231 0.4230769) *
## 57) P76>=0.5 21 3 TRUE (0.1428571 0.8571429) *
## 29) P20>=0.5 174 57 TRUE (0.3275862 0.6724138) *
## 15) P48>=0.5 39 6 TRUE (0.1538462 0.8461538) *
rpart(Trait~., method="class", parms=list(split="gini"), control=rpart.control(minsplit=150, minbucket=50), data=VircoGeno)
## n=976 (90 observations deleted due to missingness)
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 976 399 FALSE (0.5911885 0.4088115)
## 2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333) *
## 3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)
## 6) P46< 0.5 158 57 FALSE (0.6392405 0.3607595) *
## 7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189) *
attach(virco)
## The following objects are masked from virco (pos = 4):
##
## APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
## IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
## MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
## P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
## P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
## P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
## P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
## P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
## P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
## P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
## P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
## P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
## SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
Trait <- NFV.Fold - IDV.Fold
library(rpart)
RegTree <- rpart(Trait~., method="anova", data=VircoGeno)
RegTree
## n=976 (90 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 976 6437933.00 4.288320
## 2) P54>=0.5 496 1247111.00 -3.916935
## 4) P46>=0.5 338 343395.20 -10.567160 *
## 5) P46< 0.5 158 856789.90 10.309490
## 10) P58< 0.5 144 110944.10 2.570139 *
## 11) P58>=0.5 14 648503.60 89.914290 *
## 3) P54< 0.5 480 5122921.00 12.767080
## 6) P73< 0.5 422 145579.90 5.706635 *
## 7) P73>=0.5 58 4803244.00 64.137930
## 14) P35< 0.5 45 26136.17 8.171111 *
## 15) P35>=0.5 13 4148242.00 257.869200 *
#Categorical and ordinal predictors in a tree
fmsURL <- "http://www.stat-gen.org/book.e1/data/FMS_data.txt"
fms <- read.delim(file=fmsURL, header=T, sep="\t")
attach(fms)
Trait <- NDRM.CH
library(rpart)
RegTree <- rpart(Trait~resistin_c30t+resistin_c398t+
resistin_g540a+resistin_c980g+resistin_c180g+
resistin_a537c, method="anova")
RegTree
## n=611 (786 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 611 665669.4 52.85352
## 2) resistin_c980g=CC,CG 510 491113.4 51.23314 *
## 3) resistin_c980g=GG 101 166455.3 61.03564 *
RegTreeOr <- rpart(Trait~as.numeric(resistin_c30t)+
as.numeric(resistin_c398t)+as.numeric(resistin_g540a)+
as.numeric(resistin_c980g)+as.numeric(resistin_c180g)+
as.numeric(resistin_a537c), method="anova")
RegTreeOr
## n=611 (786 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 611 665669.4 52.85352
## 2) as.numeric(resistin_c980g)< 2.5 510 491113.4 51.23314 *
## 3) as.numeric(resistin_c980g)>=2.5 101 166455.3 61.03564 *
#Cost-complexity pruning
attach(virco)
## The following objects are masked from virco (pos = 4):
##
## APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
## IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
## MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
## P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
## P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
## P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
## P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
## P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
## P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
## P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
## P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
## P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
## SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
## The following objects are masked from virco (pos = 6):
##
## APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
## IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
## MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
## P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
## P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
## P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
## P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
## P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
## P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
## P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
## P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
## P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
## SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
VircoGeno <- data.frame(virco[,substr(names(virco),1,1)=="P"])
library(rpart)
Tree <- rpart(APV.Fold~.,data=VircoGeno)
Tree
## n=939 (127 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 939 356632.300 12.946540
## 2) P54=-,A,L,MI,S,T,TI,TS,V,VA,VI,VIM,VL,X 889 237601.200 10.726550
## 4) P46=-,ILM,IM,LM,LMI,MI,MIL,ML,V,VIM,X 481 44960.940 4.506653
## 8) P54=-,T,TI,TS,VI,X 342 4475.893 1.980702 *
## 9) P54=A,L,MI,S,V,VA 139 32934.020 10.721580
## 18) P89=-,M,ML 132 24074.510 9.125000
## 36) P82=-,A,AT,AV,S,T,TS 122 15185.570 7.560656 *
## 37) P82=C,F,M,TA 10 4948.009 28.210000 *
## 19) P89=I,V,VL 7 2178.014 40.828570 *
## 5) P46=I,L,LI,LIM,VL 408 152093.900 18.059310
## 10) P47=- 340 107235.300 15.332650
## 20) P84=-,VI 232 56131.660 11.931900
## 40) P50=-,L 214 37976.750 10.106540
## 80) P33=-,I,LF,M,V 167 15681.720 7.290419 *
## 81) P33=F,FL,IL,MIL 47 16264.770 20.112770 *
## 41) P50=V 18 8964.740 33.633330 *
## 21) P84=A,V,X 108 42656.790 22.637960
## 42) P91=-,A,N,ST,Z 101 34293.240 20.867330
## 84) P76=- 92 25472.030 18.966300 *
## 85) P76=V 9 5090.080 40.300000 *
## 43) P91=S,SA 7 3478.089 48.185710 *
## 11) P47=A,V,VI 68 29691.790 31.692650
## 22) P20=-,M,RK,T,TI,TK,VI 35 7891.207 23.125710 *
## 23) P20=I,R,V 33 16507.440 40.778790
## 46) P53=L,LF 13 4679.171 26.661540 *
## 47) P53=- 20 7553.350 49.955000 *
## 3) P54=LI,M,MIL,VM 50 36749.950 52.418000
## 6) P20=-,M,R,TK,V 33 21075.000 43.024240
## 12) P46=- 11 5723.236 20.881820 *
## 13) P46=I,IM,V,VI 22 7262.030 54.095450 *
## 7) P20=I,QK,RK,T,VI 17 7110.222 70.652940 *
plotcp(Tree)

printcp(Tree)
##
## Regression tree:
## rpart(formula = APV.Fold ~ ., data = VircoGeno)
##
## Variables actually used in tree construction:
## [1] P20 P33 P46 P47 P50 P53 P54 P76 P82 P84 P89 P91
##
## Root node error: 356632/939 = 379.8
##
## n=939 (127 observations deleted due to missingness)
##
## CP nsplit rel error xerror xstd
## 1 0.230717 0 1.00000 1.00301 0.080948
## 2 0.113693 1 0.76928 0.80500 0.066722
## 3 0.042528 2 0.65559 0.68941 0.058177
## 4 0.024727 3 0.61306 0.67063 0.058294
## 5 0.024016 5 0.56361 0.71491 0.062660
## 6 0.022684 6 0.53959 0.70965 0.062781
## 7 0.021173 7 0.51691 0.69979 0.062105
## 8 0.018735 8 0.49574 0.68340 0.061068
## 9 0.016909 9 0.47700 0.68528 0.060556
## 10 0.014842 10 0.46009 0.68363 0.060733
## 11 0.013699 11 0.44525 0.68861 0.060901
## 12 0.011987 12 0.43155 0.66463 0.057332
## 13 0.011050 13 0.41956 0.65640 0.056768
## 14 0.010462 14 0.40851 0.65305 0.056479
## 15 0.010000 15 0.39805 0.64656 0.056241
prunedTree <- prune(Tree,cp=.03)
prunedTree
## n=939 (127 observations deleted due to missingness)
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 939 356632.30 12.946540
## 2) P54=-,A,L,MI,S,T,TI,TS,V,VA,VI,VIM,VL,X 889 237601.20 10.726550
## 4) P46=-,ILM,IM,LM,LMI,MI,MIL,ML,V,VIM,X 481 44960.94 4.506653 *
## 5) P46=I,L,LI,LIM,VL 408 152093.90 18.059310
## 10) P47=- 340 107235.30 15.332650 *
## 11) P47=A,V,VI 68 29691.79 31.692650 *
## 3) P54=LI,M,MIL,VM 50 36749.95 52.418000 *