virco <- read.csv(file="Virco_data.csv", header=T, sep=",")
attach(virco)
VircoGeno <- data.frame(virco[,substr(names(virco),1,1)=="P"]!="-")
Trait <- as.factor(IDV.Fold > NFV.Fold)
library(rpart)
ClassTree <- rpart(Trait~., method="class", data=VircoGeno)
ClassTree
## n=976 (90 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 976 399 FALSE (0.5911885 0.4088115)  
##     2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333)  
##       4) P76< 0.5 466 116 FALSE (0.7510730 0.2489270) *
##       5) P76>=0.5 14   0 TRUE (0.0000000 1.0000000) *
##     3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)  
##       6) P46< 0.5 158  57 FALSE (0.6392405 0.3607595)  
##        12) P1< 0.5 115  31 FALSE (0.7304348 0.2695652) *
##        13) P1>=0.5 43  17 TRUE (0.3953488 0.6046512) *
##       7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189)  
##        14) P10< 0.5 22   7 FALSE (0.6818182 0.3181818) *
##        15) P10>=0.5 316 111 TRUE (0.3512658 0.6487342)  
##          30) P48< 0.5 278 106 TRUE (0.3812950 0.6187050)  
##            60) P20< 0.5 113  55 TRUE (0.4867257 0.5132743)  
##             120) P76< 0.5 92  40 FALSE (0.5652174 0.4347826) *
##             121) P76>=0.5 21   3 TRUE (0.1428571 0.8571429) *
##            61) P20>=0.5 165  51 TRUE (0.3090909 0.6909091) *
##          31) P48>=0.5 38   5 TRUE (0.1315789 0.8684211) *
plot(ClassTree)
text(ClassTree)

rpart(Trait~., method="class", parms=list(split="information"), data = VircoGeno)
## n=976 (90 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 976 399 FALSE (0.5911885 0.4088115)  
##    2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333)  
##      4) P76< 0.5 466 116 FALSE (0.7510730 0.2489270) *
##      5) P76>=0.5 14   0 TRUE (0.0000000 1.0000000) *
##    3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)  
##      6) P46< 0.5 158  57 FALSE (0.6392405 0.3607595)  
##       12) P1< 0.5 115  31 FALSE (0.7304348 0.2695652) *
##       13) P1>=0.5 43  17 TRUE (0.3953488 0.6046512) *
##      7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189)  
##       14) P48< 0.5 299 120 TRUE (0.4013378 0.5986622)  
##         28) P20< 0.5 125  62 FALSE (0.5040000 0.4960000)  
##           56) P76< 0.5 104  44 FALSE (0.5769231 0.4230769) *
##           57) P76>=0.5 21   3 TRUE (0.1428571 0.8571429) *
##         29) P20>=0.5 174  57 TRUE (0.3275862 0.6724138) *
##       15) P48>=0.5 39   6 TRUE (0.1538462 0.8461538) *
rpart(Trait~., method="class", parms=list(split="gini"), control=rpart.control(minsplit=150, minbucket=50), data=VircoGeno)
## n=976 (90 observations deleted due to missingness)
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 976 399 FALSE (0.5911885 0.4088115)  
##   2) P54< 0.5 480 130 FALSE (0.7291667 0.2708333) *
##   3) P54>=0.5 496 227 TRUE (0.4576613 0.5423387)  
##     6) P46< 0.5 158  57 FALSE (0.6392405 0.3607595) *
##     7) P46>=0.5 338 126 TRUE (0.3727811 0.6272189) *
attach(virco)
## The following objects are masked from virco (pos = 4):
## 
##     APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
##     IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
##     MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
##     P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
##     P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
##     P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
##     P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
##     P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
##     P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
##     P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
##     P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
##     P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
##     SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
Trait <- NFV.Fold - IDV.Fold

library(rpart)
RegTree <- rpart(Trait~., method="anova", data=VircoGeno)
RegTree
## n=976 (90 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 976 6437933.00   4.288320  
##    2) P54>=0.5 496 1247111.00  -3.916935  
##      4) P46>=0.5 338  343395.20 -10.567160 *
##      5) P46< 0.5 158  856789.90  10.309490  
##       10) P58< 0.5 144  110944.10   2.570139 *
##       11) P58>=0.5 14  648503.60  89.914290 *
##    3) P54< 0.5 480 5122921.00  12.767080  
##      6) P73< 0.5 422  145579.90   5.706635 *
##      7) P73>=0.5 58 4803244.00  64.137930  
##       14) P35< 0.5 45   26136.17   8.171111 *
##       15) P35>=0.5 13 4148242.00 257.869200 *
#Categorical and ordinal predictors in a tree
fmsURL <- "http://www.stat-gen.org/book.e1/data/FMS_data.txt"
fms <- read.delim(file=fmsURL, header=T, sep="\t")
attach(fms)

Trait <- NDRM.CH
library(rpart)
RegTree <- rpart(Trait~resistin_c30t+resistin_c398t+
resistin_g540a+resistin_c980g+resistin_c180g+
resistin_a537c, method="anova")
RegTree
## n=611 (786 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 611 665669.4 52.85352  
##   2) resistin_c980g=CC,CG 510 491113.4 51.23314 *
##   3) resistin_c980g=GG 101 166455.3 61.03564 *
RegTreeOr <- rpart(Trait~as.numeric(resistin_c30t)+
as.numeric(resistin_c398t)+as.numeric(resistin_g540a)+
as.numeric(resistin_c980g)+as.numeric(resistin_c180g)+
as.numeric(resistin_a537c), method="anova")
RegTreeOr
## n=611 (786 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 611 665669.4 52.85352  
##   2) as.numeric(resistin_c980g)< 2.5 510 491113.4 51.23314 *
##   3) as.numeric(resistin_c980g)>=2.5 101 166455.3 61.03564 *
#Cost-complexity pruning
attach(virco)
## The following objects are masked from virco (pos = 4):
## 
##     APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
##     IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
##     MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
##     P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
##     P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
##     P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
##     P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
##     P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
##     P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
##     P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
##     P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
##     P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
##     SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
## The following objects are masked from virco (pos = 6):
## 
##     APV.Fold, APV.FoldMatch, ATV.Fold, ATV.FoldMatch, CompMutList,
##     IDV.Fold, IDV.FoldMatch, IsolateName, LPV.Fold, LPV.FoldMatch,
##     MajorMutList, Method, MinorMutList, NFV.Fold, NFV.FoldMatch,
##     P1, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P2, P20,
##     P21, P22, P23, P24, P25, P26, P27, P28, P29, P3, P30, P31,
##     P32, P33, P34, P35, P36, P37, P38, P39, P4, P40, P41, P42,
##     P43, P44, P45, P46, P47, P48, P49, P5, P50, P51, P52, P53,
##     P54, P55, P56, P57, P58, P59, P6, P60, P61, P62, P63, P64,
##     P65, P66, P67, P68, P69, P7, P70, P71, P72, P73, P74, P75,
##     P76, P77, P78, P79, P8, P80, P81, P82, P83, P84, P85, P86,
##     P87, P88, P89, P9, P90, P91, P92, P93, P94, P95, P96, P97,
##     P98, P99, RefID, RTV.Fold, RTV.FoldMatch, SeqID, SQV.Fold,
##     SQV.FoldMatch, SubType, TPV.Fold, TPV.FoldMatch, Type
VircoGeno <- data.frame(virco[,substr(names(virco),1,1)=="P"])
library(rpart)
Tree <- rpart(APV.Fold~.,data=VircoGeno)
Tree
## n=939 (127 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 939 356632.300 12.946540  
##    2) P54=-,A,L,MI,S,T,TI,TS,V,VA,VI,VIM,VL,X 889 237601.200 10.726550  
##      4) P46=-,ILM,IM,LM,LMI,MI,MIL,ML,V,VIM,X 481  44960.940  4.506653  
##        8) P54=-,T,TI,TS,VI,X 342   4475.893  1.980702 *
##        9) P54=A,L,MI,S,V,VA 139  32934.020 10.721580  
##         18) P89=-,M,ML 132  24074.510  9.125000  
##           36) P82=-,A,AT,AV,S,T,TS 122  15185.570  7.560656 *
##           37) P82=C,F,M,TA 10   4948.009 28.210000 *
##         19) P89=I,V,VL 7   2178.014 40.828570 *
##      5) P46=I,L,LI,LIM,VL 408 152093.900 18.059310  
##       10) P47=- 340 107235.300 15.332650  
##         20) P84=-,VI 232  56131.660 11.931900  
##           40) P50=-,L 214  37976.750 10.106540  
##             80) P33=-,I,LF,M,V 167  15681.720  7.290419 *
##             81) P33=F,FL,IL,MIL 47  16264.770 20.112770 *
##           41) P50=V 18   8964.740 33.633330 *
##         21) P84=A,V,X 108  42656.790 22.637960  
##           42) P91=-,A,N,ST,Z 101  34293.240 20.867330  
##             84) P76=- 92  25472.030 18.966300 *
##             85) P76=V 9   5090.080 40.300000 *
##           43) P91=S,SA 7   3478.089 48.185710 *
##       11) P47=A,V,VI 68  29691.790 31.692650  
##         22) P20=-,M,RK,T,TI,TK,VI 35   7891.207 23.125710 *
##         23) P20=I,R,V 33  16507.440 40.778790  
##           46) P53=L,LF 13   4679.171 26.661540 *
##           47) P53=- 20   7553.350 49.955000 *
##    3) P54=LI,M,MIL,VM 50  36749.950 52.418000  
##      6) P20=-,M,R,TK,V 33  21075.000 43.024240  
##       12) P46=- 11   5723.236 20.881820 *
##       13) P46=I,IM,V,VI 22   7262.030 54.095450 *
##      7) P20=I,QK,RK,T,VI 17   7110.222 70.652940 *
plotcp(Tree)

printcp(Tree)
## 
## Regression tree:
## rpart(formula = APV.Fold ~ ., data = VircoGeno)
## 
## Variables actually used in tree construction:
##  [1] P20 P33 P46 P47 P50 P53 P54 P76 P82 P84 P89 P91
## 
## Root node error: 356632/939 = 379.8
## 
## n=939 (127 observations deleted due to missingness)
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.230717      0   1.00000 1.00301 0.080948
## 2  0.113693      1   0.76928 0.80500 0.066722
## 3  0.042528      2   0.65559 0.68941 0.058177
## 4  0.024727      3   0.61306 0.67063 0.058294
## 5  0.024016      5   0.56361 0.71491 0.062660
## 6  0.022684      6   0.53959 0.70965 0.062781
## 7  0.021173      7   0.51691 0.69979 0.062105
## 8  0.018735      8   0.49574 0.68340 0.061068
## 9  0.016909      9   0.47700 0.68528 0.060556
## 10 0.014842     10   0.46009 0.68363 0.060733
## 11 0.013699     11   0.44525 0.68861 0.060901
## 12 0.011987     12   0.43155 0.66463 0.057332
## 13 0.011050     13   0.41956 0.65640 0.056768
## 14 0.010462     14   0.40851 0.65305 0.056479
## 15 0.010000     15   0.39805 0.64656 0.056241
prunedTree <- prune(Tree,cp=.03)
prunedTree
## n=939 (127 observations deleted due to missingness)
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 939 356632.30 12.946540  
##    2) P54=-,A,L,MI,S,T,TI,TS,V,VA,VI,VIM,VL,X 889 237601.20 10.726550  
##      4) P46=-,ILM,IM,LM,LMI,MI,MIL,ML,V,VIM,X 481  44960.94  4.506653 *
##      5) P46=I,L,LI,LIM,VL 408 152093.90 18.059310  
##       10) P47=- 340 107235.30 15.332650 *
##       11) P47=A,V,VI 68  29691.79 31.692650 *
##    3) P54=LI,M,MIL,VM 50  36749.95 52.418000 *