# library for exporting tables to LaTeX/HTML
library(xtable)
# library for Classification & Regression Trees; also provides the kyphosis data
library(rpart)
# library for generalized additive models
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.12
data("kyphosis")
# Data on children who have had corrective spinal surgery
kyp<-kyphosis
str(kyp)
## 'data.frame': 81 obs. of 4 variables:
## $ Kyphosis: Factor w/ 2 levels "absent","present": 1 1 2 1 1 1 1 1 1 2 ...
## $ Age : int 71 158 128 2 1 1 61 37 113 59 ...
## $ Number : int 3 3 4 5 4 2 2 3 2 6 ...
## $ Start : int 5 14 5 1 15 16 17 16 16 12 ...
v<-kyp$Kyphosis
table(v)
## v
## absent present
## 64 17
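# (Aside, not part of the original analysis) prop.table makes the class
# imbalance explicit: roughly 79% 'absent' vs 21% 'present'
prop.table(table(v))
## v
##    absent   present
## 0.7901235 0.2098765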
# Set seed to ensure reproducible results
set.seed(250)
# Splitting into training and test sets in an approximate 3:1 ratio
# (each row is flagged for training with probability 0.75)
kyp[,'train'] <- ifelse(runif(nrow(kyp))<0.75,1,0)
# Separate training and test sets
train_kyp <- kyp[kyp$train==1,]
test_kyp <- kyp[kyp$train==0,]
# Get column index of the train flag
kyp_trainColNum <- grep('train',names(train_kyp))
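# (Sketch, not the author's method) an exact 3:1 split could instead be drawn
# by sampling 75% of the row indices; left commented out so the random-number
# state, and hence the cross-validated output below, is unchanged
# idx <- sample(seq_len(nrow(kyphosis)), size = floor(0.75 * nrow(kyphosis)))
# train_exact <- kyphosis[idx, ]   # hypothetical variable names
# test_exact  <- kyphosis[-idx, ]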
str(train_kyp)
## 'data.frame': 64 obs. of 5 variables:
## $ Kyphosis: Factor w/ 2 levels "absent","present": 1 2 1 1 2 1 1 1 1 1 ...
## $ Age : int 71 128 37 113 82 148 18 1 168 1 ...
## $ Number : int 3 4 3 2 5 3 5 4 3 3 ...
## $ Start : int 5 5 16 16 14 16 2 12 18 16 ...
## $ train : num 1 1 1 1 1 1 1 1 1 1 ...
str(test_kyp)
## 'data.frame': 17 obs. of 5 variables:
## $ Kyphosis: Factor w/ 2 levels "absent","present": 1 1 1 1 1 2 2 1 1 1 ...
## $ Age : int 158 2 1 1 61 59 105 9 100 31 ...
## $ Number : int 3 5 4 2 2 6 6 5 3 3 ...
## $ Start : int 14 1 15 16 17 12 5 13 14 16 ...
## $ train : num 0 0 0 0 0 0 0 0 0 0 ...
# Remove the train flag column to obtain the final train and test sets
train_kyp <- train_kyp[,-kyp_trainColNum]
test_kyp <- test_kyp[,-kyp_trainColNum]
# Get column index of the response variable (Kyphosis)
typeColNum_kyp <- grep('Kyphosis',names(kyp))
# Constructing the decision tree model
rpart_model_kyp <- rpart(Kyphosis~.,data = train_kyp, method= 'class')
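# (Tuning sketch; the values shown are rpart's documented defaults, given only
# as an illustration, and left commented out so the fitted model and its
# cross-validated errors below are unchanged)
# rpart_model_kyp <- rpart(Kyphosis ~ ., data = train_kyp, method = 'class',
#                          control = rpart.control(minsplit = 20, cp = 0.01, xval = 10))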
# Plotting the tree
plot(rpart_model_kyp)
text(rpart_model_kyp)
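# (Optional, assuming the rpart.plot package is installed; it is not loaded
# above) rpart.plot draws a more readable tree than the base plot/text pair
# library(rpart.plot)
# rpart.plot(rpart_model_kyp)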

summary(rpart_model_kyp)
## Call:
## rpart(formula = Kyphosis ~ ., data = train_kyp, method = "class")
## n= 64
##
## CP nsplit rel error xerror xstd
## 1 0.20 0 1.0 1 0.225924
## 2 0.01 1 0.8 1 0.225924
##
## Variable importance
## Start Number
## 85 15
##
## Node number 1: 64 observations, complexity param=0.2
## predicted class=absent expected loss=0.234375 P(node) =1
## class counts: 49 15
## probabilities: 0.766 0.234
## left son=2 (47 obs) right son=3 (17 obs)
## Primary splits:
## Start < 8.5 to the right, improve=5.797286, (0 missing)
## Age < 39.5 to the left, improve=2.180267, (0 missing)
## Number < 6.5 to the left, improve=1.785793, (0 missing)
## Surrogate splits:
## Number < 6.5 to the left, agree=0.781, adj=0.176, (0 split)
##
## Node number 2: 47 observations
## predicted class=absent expected loss=0.106383 P(node) =0.734375
## class counts: 42 5
## probabilities: 0.894 0.106
##
## Node number 3: 17 observations
## predicted class=present expected loss=0.4117647 P(node) =0.265625
## class counts: 7 10
## probabilities: 0.412 0.588
# Evaluating the model's accuracy on the test set
rpart_predict_kyp<- predict(rpart_model_kyp,test_kyp[,-typeColNum_kyp],type='class')
mn_kyp <- mean(rpart_predict_kyp==test_kyp$Kyphosis)
mn_kyp
## [1] 0.8823529
# Constructing the confusion matrix to assess the model's performance
table(pred=rpart_predict_kyp,true=test_kyp$Kyphosis)
## true
## pred absent present
## absent 14 1
## present 1 1
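# Sanity check (an aside): the no-information baseline of always predicting
# the majority class 'absent' gets 15 of the 17 test cases right, matching
# the tree's accuracy on this small test set
mean(test_kyp$Kyphosis=='absent')
## [1] 0.8823529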
# Examining the cost-complexity (CP) table to decide whether pruning is needed
printcp(rpart_model_kyp)
##
## Classification tree:
## rpart(formula = Kyphosis ~ ., data = train_kyp, method = "class")
##
## Variables actually used in tree construction:
## [1] Start
##
## Root node error: 15/64 = 0.23438
##
## n= 64
##
## CP nsplit rel error xerror xstd
## 1 0.20 0 1.0 1 0.22592
## 2 0.01 1 0.8 1 0.22592
# Finding the index of the CP row with the lowest cross-validated error (xerror)
opt_kyp <- which.min(rpart_model_kyp$cptable[,'xerror'])
# Extracting the corresponding CP value
cp_kyp <- rpart_model_kyp$cptable[opt_kyp, 'CP']
cp_kyp
## [1] 0.2
# The minimum xerror already occurs at the first CP value (0.2), so pruning is not required
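# Had a later CP row minimised xerror, the tree would be pruned like this
# (a sketch using rpart's prune(); left commented out because, with
# cp_kyp = 0.2, prune() would collapse the tree's single split)
# pruned_kyp <- prune(rpart_model_kyp, cp = cp_kyp)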