# BA Project

library(readxl)
# Import the project workbook from its location on disk.
R_Data <- read_excel(
  path = "C:/Users/Amelia/Desktop/Business Analytics/R Data.xlsx"
)

# Keep only the rows that have no missing value in any column.
mydata <- na.omit(object = R_Data)


#Decision Trees

library(rpart)
library(party)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(caret)
library(pander)

# Binary target for the classifier: "high" when life expectancy is above
# 69, "low" otherwise. The column is created in a single vectorised step
# instead of by indexing into a column that does not exist yet, which
# raises "Unknown or uninitialised column" warnings on a tibble. Storing
# it as a factor with explicit levels fixes the class order ("high", then
# "low") rather than leaving it to implicit character-to-factor coercion.
mydata$Life <- factor(
  ifelse(mydata$`Life expectancy` > 69, "high", "low"),
  levels = c("high", "low")
)

# Classification tree for the high/low label using six predictors.
# cp = 0.01 is the complexity threshold a split must reach to be kept.
mytree <- rpart(
  Life ~ Region + Status + Alcohol + BMI + GDP + Schooling,
  data = mydata,
  method = "class",
  cp = 0.01
)

# Print the CP table, variable importance and per-node split details.
summary(mytree)

## Call:
## rpart(formula = Life ~ Region + Status + Alcohol + BMI + GDP + 
##     Schooling, data = mydata, method = "class", cp = 0.01)
##   n= 1649 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.57780980      0 1.0000000 1.0000000 0.02888761
## 2 0.09942363      1 0.4221902 0.4308357 0.02254409
## 3 0.01873199      2 0.3227666 0.3386167 0.02045450
## 4 0.01248799      3 0.3040346 0.3285303 0.02019736
## 5 0.01000000      7 0.2507205 0.3025937 0.01950609
## 
## Variable importance
## Schooling       BMI    Region       GDP   Alcohol    Status 
##        30        25        16        14        13         2 
## 
## Node number 1: 1649 observations,    complexity param=0.5778098
##   predicted class=high  expected loss=0.4208611  P(node) =1
##     class counts:   955   694
##    probabilities: 0.579 0.421 
##   left son=2 (926 obs) right son=3 (723 obs)
##   Primary splits:
##       Schooling < 11.85    to the right, improve=327.1816, (0 missing)
##       BMI       < 45.05    to the right, improve=270.7555, (0 missing)
##       Region    splits as  RLRLR,        improve=245.9012, (0 missing)
##       GDP       < 1746.369 to the right, improve=202.8500, (0 missing)
##       Alcohol   < 1.235    to the right, improve=121.7241, (0 missing)
##   Surrogate splits:
##       BMI     < 47.75    to the right, agree=0.770, adj=0.474, (0 split)
##       GDP     < 1472.771 to the right, agree=0.767, adj=0.469, (0 split)
##       Region  splits as  RLRLL,        agree=0.733, adj=0.391, (0 split)
##       Alcohol < 3.4      to the right, agree=0.731, adj=0.387, (0 split)
##       Status  splits as  LR,           agree=0.583, adj=0.048, (0 split)
## 
## Node number 2: 926 observations,    complexity param=0.01248799
##   predicted class=high  expected loss=0.1425486  P(node) =0.5615525
##     class counts:   794   132
##    probabilities: 0.857 0.143 
##   left son=4 (556 obs) right son=5 (370 obs)
##   Primary splits:
##       Region    splits as  RLRLR,        improve=21.84242, (0 missing)
##       Schooling < 14.85    to the right, improve=15.91865, (0 missing)
##       Status    splits as  LR,           improve=13.16600, (0 missing)
##       BMI       < 49.25    to the right, improve=13.09348, (0 missing)
##       Alcohol   < 0.315    to the right, improve=11.11403, (0 missing)
##   Surrogate splits:
##       Alcohol   < 3.315    to the right, agree=0.814, adj=0.535, (0 split)
##       BMI       < 37.5     to the right, agree=0.712, adj=0.278, (0 split)
##       Schooling < 13.15    to the right, agree=0.660, adj=0.149, (0 split)
##       GDP       < 277.5732 to the right, agree=0.621, adj=0.051, (0 split)
## 
## Node number 3: 723 observations,    complexity param=0.09942363
##   predicted class=low   expected loss=0.2226833  P(node) =0.4384475
##     class counts:   161   562
##    probabilities: 0.223 0.777 
##   left son=6 (153 obs) right son=7 (570 obs)
##   Primary splits:
##       BMI       < 43.45    to the right, improve=98.12674, (0 missing)
##       Region    splits as  RLRLR,        improve=56.81442, (0 missing)
##       Schooling < 9.85     to the right, improve=37.02464, (0 missing)
##       GDP       < 1643.298 to the right, improve=35.10320, (0 missing)
##       Alcohol   < 1.825    to the right, improve=12.95442, (0 missing)
##   Surrogate splits:
##       Region splits as  RLRLL,        agree=0.849, adj=0.288, (0 split)
##       GDP    < 1640.147 to the right, agree=0.804, adj=0.072, (0 split)
##       Status splits as  LR,           agree=0.791, adj=0.013, (0 split)
## 
## Node number 4: 556 observations
##   predicted class=high  expected loss=0.05395683  P(node) =0.337174
##     class counts:   526    30
##    probabilities: 0.946 0.054 
## 
## Node number 5: 370 observations,    complexity param=0.01248799
##   predicted class=high  expected loss=0.2756757  P(node) =0.2243784
##     class counts:   268   102
##    probabilities: 0.724 0.276 
##   left son=10 (157 obs) right son=11 (213 obs)
##   Primary splits:
##       BMI       < 51.25    to the right, improve=14.143070, (0 missing)
##       Schooling < 13.05    to the right, improve=12.353470, (0 missing)
##       Alcohol   < 0.315    to the right, improve= 6.823757, (0 missing)
##       GDP       < 11880.82 to the right, improve= 5.690734, (0 missing)
##       Status    splits as  LR,           improve= 4.782690, (0 missing)
##   Surrogate splits:
##       Region    splits as  R-R-L,        agree=0.711, adj=0.318, (0 split)
##       Schooling < 13.25    to the right, agree=0.697, adj=0.287, (0 split)
##       Status    splits as  LR,           agree=0.643, adj=0.159, (0 split)
##       Alcohol   < 9.03     to the right, agree=0.635, adj=0.140, (0 split)
##       GDP       < 12465.06 to the right, agree=0.635, adj=0.140, (0 split)
## 
## Node number 6: 153 observations,    complexity param=0.01873199
##   predicted class=high  expected loss=0.2745098  P(node) =0.09278351
##     class counts:   111    42
##    probabilities: 0.725 0.275 
##   left son=12 (118 obs) right son=13 (35 obs)
##   Primary splits:
##       Schooling < 10.25    to the right, improve=15.346990, (0 missing)
##       Region    splits as  LLLLR,        improve=10.369990, (0 missing)
##       Alcohol   < 0.825    to the right, improve= 9.067949, (0 missing)
##       BMI       < 57.5     to the left,  improve= 4.980315, (0 missing)
##       GDP       < 189.0987 to the right, improve= 3.523785, (0 missing)
##   Surrogate splits:
##       Alcohol < 0.48     to the right, agree=0.810, adj=0.171, (0 split)
##       Region  splits as  RLLLR,        agree=0.784, adj=0.057, (0 split)
##       GDP     < 88.14048 to the right, agree=0.778, adj=0.029, (0 split)
## 
## Node number 7: 570 observations
##   predicted class=low   expected loss=0.0877193  P(node) =0.345664
##     class counts:    50   520
##    probabilities: 0.088 0.912 
## 
## Node number 10: 157 observations
##   predicted class=high  expected loss=0.1146497  P(node) =0.09520922
##     class counts:   139    18
##    probabilities: 0.885 0.115 
## 
## Node number 11: 213 observations,    complexity param=0.01248799
##   predicted class=high  expected loss=0.3943662  P(node) =0.1291692
##     class counts:   129    84
##    probabilities: 0.606 0.394 
##   left son=22 (137 obs) right son=23 (76 obs)
##   Primary splits:
##       BMI       < 32.6     to the left,  improve=18.091660, (0 missing)
##       Alcohol   < 6.35     to the left,  improve=11.490320, (0 missing)
##       Schooling < 13.05    to the right, improve= 5.112676, (0 missing)
##       GDP       < 1759.189 to the right, improve= 2.328121, (0 missing)
##       Region    splits as  R-L-L,        improve= 2.168701, (0 missing)
##   Surrogate splits:
##       Alcohol < 6.43     to the left,  agree=0.690, adj=0.132, (0 split)
##       GDP     < 9658.504 to the left,  agree=0.657, adj=0.039, (0 split)
## 
## Node number 12: 118 observations
##   predicted class=high  expected loss=0.1525424  P(node) =0.07155852
##     class counts:   100    18
##    probabilities: 0.847 0.153 
## 
## Node number 13: 35 observations
##   predicted class=low   expected loss=0.3142857  P(node) =0.02122498
##     class counts:    11    24
##    probabilities: 0.314 0.686 
## 
## Node number 22: 137 observations
##   predicted class=high  expected loss=0.2408759  P(node) =0.08308065
##     class counts:   104    33
##    probabilities: 0.759 0.241 
## 
## Node number 23: 76 observations,    complexity param=0.01248799
##   predicted class=low   expected loss=0.3289474  P(node) =0.04608854
##     class counts:    25    51
##    probabilities: 0.329 0.671 
##   left son=46 (25 obs) right son=47 (51 obs)
##   Primary splits:
##       Alcohol   < 1.57     to the left,  improve=11.3942000, (0 missing)
##       GDP       < 3617.107 to the left,  improve= 3.6026320, (0 missing)
##       BMI       < 48.05    to the right, improve= 2.9444560, (0 missing)
##       Schooling < 13.65    to the left,  improve= 1.7192980, (0 missing)
##       Region    splits as  R-L-R,        improve= 0.4915205, (0 missing)
##   Surrogate splits:
##       GDP       < 87.63788 to the left,  agree=0.711, adj=0.12, (0 split)
##       Schooling < 14.65    to the right, agree=0.684, adj=0.04, (0 split)
## 
## Node number 46: 25 observations
##   predicted class=high  expected loss=0.28  P(node) =0.0151607
##     class counts:    18     7
##    probabilities: 0.720 0.280 
## 
## Node number 47: 51 observations
##   predicted class=low   expected loss=0.1372549  P(node) =0.03092784
##     class counts:     7    44
##    probabilities: 0.137 0.863

# Draw the fitted tree with a red/blue node palette and reduced text size.
rpart.plot(
  mytree,
  box.palette = "RdBu",
  cex = 0.55
)

# Predicted class for every row the tree was fitted on (no new data is
# supplied, so this is in-sample performance).
fit <- predict(mytree, type = "class")

# Confusion matrix: observed labels in rows, predicted labels in columns.
cmatrix <- table(mydata$Life, fit)
pander(cmatrix)

# Confusion matrix output (rows: observed class; columns: predicted class)
##        high  low
## high    887   68
## low     106  588

# BA Project

# Amelia McCarthy

# 5/9/2021