library(readxl)
# Read the workbook from its absolute location on the local Windows machine.
R_Data <- read_excel(
  path = "C:/Users/Amelia/Desktop/Business Analytics/R Data.xlsx"
)

# Working copy for the analysis: na.omit() removes every row that has a
# missing value in any column, not only in the columns modelled later.
mydata <- na.omit(R_Data)
# Decision Trees
library(rpart)
library(party)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(caret)
library(pander)
# Binary target for the classifier: a life expectancy above 69 is labelled
# "high", anything else "low". The column is created in a single vectorised
# step rather than by two partial assignments into a column that does not
# exist yet (on a tibble, reading the missing column inside `$<-` raises an
# "Unknown or uninitialised column" warning).
mydata$Life <- ifelse(mydata$`Life expectancy` > 69, "high", "low")

# Classification tree (method = "class") predicting the Life label from six
# predictors. cp = 0.01 is the complexity parameter: a split is kept only if
# it improves the overall relative fit by at least this much.
mytree <- rpart(
  Life ~ Region + Status + Alcohol + BMI + GDP + Schooling,
  data = mydata,
  method = "class",
  cp = 0.01
)

# Print the CP table, variable importance and per-node split details.
summary(mytree)
## Call:
## rpart(formula = Life ~ Region + Status + Alcohol + BMI + GDP +
## Schooling, data = mydata, method = "class", cp = 0.01)
## n= 1649
##
## CP nsplit rel error xerror xstd
## 1 0.57780980 0 1.0000000 1.0000000 0.02888761
## 2 0.09942363 1 0.4221902 0.4308357 0.02254409
## 3 0.01873199 2 0.3227666 0.3386167 0.02045450
## 4 0.01248799 3 0.3040346 0.3285303 0.02019736
## 5 0.01000000 7 0.2507205 0.3025937 0.01950609
##
## Variable importance
## Schooling BMI Region GDP Alcohol Status
## 30 25 16 14 13 2
##
## Node number 1: 1649 observations, complexity param=0.5778098
## predicted class=high expected loss=0.4208611 P(node) =1
## class counts: 955 694
## probabilities: 0.579 0.421
## left son=2 (926 obs) right son=3 (723 obs)
## Primary splits:
## Schooling < 11.85 to the right, improve=327.1816, (0 missing)
## BMI < 45.05 to the right, improve=270.7555, (0 missing)
## Region splits as RLRLR, improve=245.9012, (0 missing)
## GDP < 1746.369 to the right, improve=202.8500, (0 missing)
## Alcohol < 1.235 to the right, improve=121.7241, (0 missing)
## Surrogate splits:
## BMI < 47.75 to the right, agree=0.770, adj=0.474, (0 split)
## GDP < 1472.771 to the right, agree=0.767, adj=0.469, (0 split)
## Region splits as RLRLL, agree=0.733, adj=0.391, (0 split)
## Alcohol < 3.4 to the right, agree=0.731, adj=0.387, (0 split)
## Status splits as LR, agree=0.583, adj=0.048, (0 split)
##
## Node number 2: 926 observations, complexity param=0.01248799
## predicted class=high expected loss=0.1425486 P(node) =0.5615525
## class counts: 794 132
## probabilities: 0.857 0.143
## left son=4 (556 obs) right son=5 (370 obs)
## Primary splits:
## Region splits as RLRLR, improve=21.84242, (0 missing)
## Schooling < 14.85 to the right, improve=15.91865, (0 missing)
## Status splits as LR, improve=13.16600, (0 missing)
## BMI < 49.25 to the right, improve=13.09348, (0 missing)
## Alcohol < 0.315 to the right, improve=11.11403, (0 missing)
## Surrogate splits:
## Alcohol < 3.315 to the right, agree=0.814, adj=0.535, (0 split)
## BMI < 37.5 to the right, agree=0.712, adj=0.278, (0 split)
## Schooling < 13.15 to the right, agree=0.660, adj=0.149, (0 split)
## GDP < 277.5732 to the right, agree=0.621, adj=0.051, (0 split)
##
## Node number 3: 723 observations, complexity param=0.09942363
## predicted class=low expected loss=0.2226833 P(node) =0.4384475
## class counts: 161 562
## probabilities: 0.223 0.777
## left son=6 (153 obs) right son=7 (570 obs)
## Primary splits:
## BMI < 43.45 to the right, improve=98.12674, (0 missing)
## Region splits as RLRLR, improve=56.81442, (0 missing)
## Schooling < 9.85 to the right, improve=37.02464, (0 missing)
## GDP < 1643.298 to the right, improve=35.10320, (0 missing)
## Alcohol < 1.825 to the right, improve=12.95442, (0 missing)
## Surrogate splits:
## Region splits as RLRLL, agree=0.849, adj=0.288, (0 split)
## GDP < 1640.147 to the right, agree=0.804, adj=0.072, (0 split)
## Status splits as LR, agree=0.791, adj=0.013, (0 split)
##
## Node number 4: 556 observations
## predicted class=high expected loss=0.05395683 P(node) =0.337174
## class counts: 526 30
## probabilities: 0.946 0.054
##
## Node number 5: 370 observations, complexity param=0.01248799
## predicted class=high expected loss=0.2756757 P(node) =0.2243784
## class counts: 268 102
## probabilities: 0.724 0.276
## left son=10 (157 obs) right son=11 (213 obs)
## Primary splits:
## BMI < 51.25 to the right, improve=14.143070, (0 missing)
## Schooling < 13.05 to the right, improve=12.353470, (0 missing)
## Alcohol < 0.315 to the right, improve= 6.823757, (0 missing)
## GDP < 11880.82 to the right, improve= 5.690734, (0 missing)
## Status splits as LR, improve= 4.782690, (0 missing)
## Surrogate splits:
## Region splits as R-R-L, agree=0.711, adj=0.318, (0 split)
## Schooling < 13.25 to the right, agree=0.697, adj=0.287, (0 split)
## Status splits as LR, agree=0.643, adj=0.159, (0 split)
## Alcohol < 9.03 to the right, agree=0.635, adj=0.140, (0 split)
## GDP < 12465.06 to the right, agree=0.635, adj=0.140, (0 split)
##
## Node number 6: 153 observations, complexity param=0.01873199
## predicted class=high expected loss=0.2745098 P(node) =0.09278351
## class counts: 111 42
## probabilities: 0.725 0.275
## left son=12 (118 obs) right son=13 (35 obs)
## Primary splits:
## Schooling < 10.25 to the right, improve=15.346990, (0 missing)
## Region splits as LLLLR, improve=10.369990, (0 missing)
## Alcohol < 0.825 to the right, improve= 9.067949, (0 missing)
## BMI < 57.5 to the left, improve= 4.980315, (0 missing)
## GDP < 189.0987 to the right, improve= 3.523785, (0 missing)
## Surrogate splits:
## Alcohol < 0.48 to the right, agree=0.810, adj=0.171, (0 split)
## Region splits as RLLLR, agree=0.784, adj=0.057, (0 split)
## GDP < 88.14048 to the right, agree=0.778, adj=0.029, (0 split)
##
## Node number 7: 570 observations
## predicted class=low expected loss=0.0877193 P(node) =0.345664
## class counts: 50 520
## probabilities: 0.088 0.912
##
## Node number 10: 157 observations
## predicted class=high expected loss=0.1146497 P(node) =0.09520922
## class counts: 139 18
## probabilities: 0.885 0.115
##
## Node number 11: 213 observations, complexity param=0.01248799
## predicted class=high expected loss=0.3943662 P(node) =0.1291692
## class counts: 129 84
## probabilities: 0.606 0.394
## left son=22 (137 obs) right son=23 (76 obs)
## Primary splits:
## BMI < 32.6 to the left, improve=18.091660, (0 missing)
## Alcohol < 6.35 to the left, improve=11.490320, (0 missing)
## Schooling < 13.05 to the right, improve= 5.112676, (0 missing)
## GDP < 1759.189 to the right, improve= 2.328121, (0 missing)
## Region splits as R-L-L, improve= 2.168701, (0 missing)
## Surrogate splits:
## Alcohol < 6.43 to the left, agree=0.690, adj=0.132, (0 split)
## GDP < 9658.504 to the left, agree=0.657, adj=0.039, (0 split)
##
## Node number 12: 118 observations
## predicted class=high expected loss=0.1525424 P(node) =0.07155852
## class counts: 100 18
## probabilities: 0.847 0.153
##
## Node number 13: 35 observations
## predicted class=low expected loss=0.3142857 P(node) =0.02122498
## class counts: 11 24
## probabilities: 0.314 0.686
##
## Node number 22: 137 observations
## predicted class=high expected loss=0.2408759 P(node) =0.08308065
## class counts: 104 33
## probabilities: 0.759 0.241
##
## Node number 23: 76 observations, complexity param=0.01248799
## predicted class=low expected loss=0.3289474 P(node) =0.04608854
## class counts: 25 51
## probabilities: 0.329 0.671
## left son=46 (25 obs) right son=47 (51 obs)
## Primary splits:
## Alcohol < 1.57 to the left, improve=11.3942000, (0 missing)
## GDP < 3617.107 to the left, improve= 3.6026320, (0 missing)
## BMI < 48.05 to the right, improve= 2.9444560, (0 missing)
## Schooling < 13.65 to the left, improve= 1.7192980, (0 missing)
## Region splits as R-L-R, improve= 0.4915205, (0 missing)
## Surrogate splits:
## GDP < 87.63788 to the left, agree=0.711, adj=0.12, (0 split)
## Schooling < 14.65 to the right, agree=0.684, adj=0.04, (0 split)
##
## Node number 46: 25 observations
## predicted class=high expected loss=0.28 P(node) =0.0151607
## class counts: 18 7
## probabilities: 0.720 0.280
##
## Node number 47: 51 observations
## predicted class=low expected loss=0.1372549 P(node) =0.03092784
## class counts: 7 44
## probabilities: 0.137 0.863
# Draw the fitted tree. Node boxes take their colours from the "RdBu"
# palette, and cex = 0.55 scales the text down so the labels fit.
rpart.plot(
  x = mytree,
  box.palette = "RdBu",
  cex = 0.55
)

# Resubstitution check: with no newdata supplied, predict() returns the
# fitted class for the same rows the tree was trained on.
fit <- predict(mytree, type = "class")

# Confusion matrix: rows are the observed Life labels (first argument),
# columns are the predicted classes (second argument).
cmatrix <- table(mydata$Life, fit)

# Render the matrix as a formatted table.
pander(cmatrix)