library(rpart)
library(rpart.plot)
library(MASS)
library(caret)
library(tidyverse)
# Load both halves of the Pima data set, stack them into one data frame,
# drop the two source objects and show the structure of the result.
data(list = c("Pima.tr", "Pima.te"))
Pima <- rbind(Pima.tr, Pima.te)
rm(list = c("Pima.tr", "Pima.te"))
str(Pima)
## 'data.frame': 532 obs. of 8 variables:
## $ npreg: int 5 7 5 0 0 5 3 1 3 2 ...
## $ glu : int 86 195 77 165 107 97 83 193 142 128 ...
## $ bp : int 68 70 82 76 60 76 58 50 80 78 ...
## $ skin : int 28 33 41 43 25 27 31 16 15 37 ...
## $ bmi : num 30.2 25.1 35.8 47.9 26.4 35.6 34.3 25.9 32.4 43.3 ...
## $ ped : num 0.364 0.163 0.156 0.259 0.133 ...
## $ age : int 24 55 35 26 23 52 25 24 63 31 ...
## $ type : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 1 2 ...
# Print per-column summaries of the combined data frame.
summary(Pima)
## npreg glu bp skin
## Min. : 0.000 Min. : 56.00 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 98.75 1st Qu.: 64.00 1st Qu.:22.00
## Median : 2.000 Median :115.00 Median : 72.00 Median :29.00
## Mean : 3.517 Mean :121.03 Mean : 71.51 Mean :29.18
## 3rd Qu.: 5.000 3rd Qu.:141.25 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.00 Max. :110.00 Max. :99.00
## bmi ped age type
## Min. :18.20 Min. :0.0850 Min. :21.00 No :355
## 1st Qu.:27.88 1st Qu.:0.2587 1st Qu.:23.00 Yes:177
## Median :32.80 Median :0.4160 Median :28.00
## Mean :32.89 Mean :0.5030 Mean :31.61
## 3rd Qu.:36.90 3rd Qu.:0.6585 3rd Qu.:38.00
## Max. :67.10 Max. :2.4200 Max. :81.00
# Stand-alone copies of every Pima column under capitalised names.
# NOTE(review): a model whose formula refers to these globals instead of the
# data-frame columns will not read its predictors from `newdata` in predict().
Age   <- Pima[["age"]]
Npreg <- Pima[["npreg"]]
Glu   <- Pima[["glu"]]
Bp    <- Pima[["bp"]]
Skin  <- Pima[["skin"]]
Bmi   <- Pima[["bmi"]]
Ped   <- Pima[["ped"]]
Type  <- Pima[["type"]]
# Tree 1: classification tree grown with rpart's default control settings.
# The formula names the columns of `Pima` directly (not same-named global
# vectors) so that predict(..., newdata = ...) takes its predictors from
# `newdata` instead of silently falling back to the global environment.
# The seed makes rpart's randomly-folded cross-validation reproducible.
set.seed(123)
Arbol_1 <- rpart(
  type ~ age + npreg + bmi + glu + skin + bp + ped,
  data = Pima,
  method = "class"
)
# type = 2: split labels drawn below the node labels;
# extra = 101: per-class counts plus the percentage of observations per node.
rpart.plot(Arbol_1, digits = -1, type = 2, extra = 101, cex = 0.7)
- Matriz de confusión del árbol 1:
# Predicted class for every row of Pima under tree 1, compared with the
# observed `type`. The rows scored here are the same ones the tree was
# fitted on, so this is a resubstitution (in-sample) evaluation.
Prediccion1 <- predict(Arbol_1, newdata = Pima, type = "class")
caret::confusionMatrix(
  data = Prediccion1,
  reference = Pima$type,
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 326 46
## Yes 29 131
##
## Accuracy : 0.859
## 95% CI : (0.8265, 0.8875)
## No Information Rate : 0.6673
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6747
##
## Mcnemar's Test P-Value : 0.06467
##
## Sensitivity : 0.7401
## Specificity : 0.9183
## Pos Pred Value : 0.8187
## Neg Pred Value : 0.8763
## Prevalence : 0.3327
## Detection Rate : 0.2462
## Detection Prevalence : 0.3008
## Balanced Accuracy : 0.8292
##
## 'Positive' Class : Yes
##
# Tree 2: same predictors, but a node needs at least 50 observations to be
# split and every leaf must hold at least 30.
# The formula names the columns of `Pima` directly (not same-named global
# vectors) so that predict(..., newdata = ...) takes its predictors from
# `newdata` instead of silently falling back to the global environment.
Arbol_2 <- rpart(
  type ~ age + npreg + bmi + glu + skin + bp + ped,
  data = Pima,
  control = rpart.control(minsplit = 50, minbucket = 30),
  method = "class"
)
# type = 2: split labels drawn below the node labels;
# extra = 101: per-class counts plus the percentage of observations per node.
rpart.plot(Arbol_2, digits = -1, type = 2, extra = 101, cex = 0.7)
- Matriz de confusión del árbol 2:
# Predicted class for every row of Pima under tree 2, compared with the
# observed `type`. The rows scored here are the same ones the tree was
# fitted on, so this is a resubstitution (in-sample) evaluation.
Prediccion2 <- predict(Arbol_2, newdata = Pima, type = "class")
caret::confusionMatrix(
  data = Prediccion2,
  reference = Pima$type,
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 313 58
## Yes 42 119
##
## Accuracy : 0.812
## 95% CI : (0.7762, 0.8444)
## No Information Rate : 0.6673
## P-Value [Acc > NIR] : 7.571e-14
##
## Kappa : 0.5669
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 0.6723
## Specificity : 0.8817
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.8437
## Prevalence : 0.3327
## Detection Rate : 0.2237
## Detection Prevalence : 0.3026
## Balanced Accuracy : 0.7770
##
## 'Positive' Class : Yes
##
# Tree 3: same size limits as tree 2 (minsplit = 50, minbucket = 30) plus a
# complexity parameter of 0.05.
# The formula names the columns of `Pima` directly (not same-named global
# vectors) so that predict(..., newdata = ...) takes its predictors from
# `newdata` instead of silently falling back to the global environment.
Arbol_3 <- rpart(
  type ~ age + npreg + bmi + glu + skin + bp + ped,
  data = Pima,
  control = rpart.control(minsplit = 50, minbucket = 30, cp = 0.05),
  method = "class"
)
# type = 2: split labels drawn below the node labels;
# extra = 101: per-class counts plus the percentage of observations per node.
rpart.plot(Arbol_3, digits = -1, type = 2, extra = 101, cex = 0.7)
- Matriz de confusión del árbol 3:
# Predicted class for every row of Pima under tree 3, compared with the
# observed `type`. The rows scored here are the same ones the tree was
# fitted on, so this is a resubstitution (in-sample) evaluation.
Prediccion3 <- predict(Arbol_3, newdata = Pima, type = "class")
caret::confusionMatrix(
  data = Prediccion3,
  reference = Pima$type,
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 311 66
## Yes 44 111
##
## Accuracy : 0.7932
## 95% CI : (0.7563, 0.8269)
## No Information Rate : 0.6673
## P-Value [Acc > NIR] : 9.62e-11
##
## Kappa : 0.5194
##
## Mcnemar's Test P-Value : 0.04526
##
## Sensitivity : 0.6271
## Specificity : 0.8761
## Pos Pred Value : 0.7161
## Neg Pred Value : 0.8249
## Prevalence : 0.3327
## Detection Rate : 0.2086
## Detection Prevalence : 0.2914
## Balanced Accuracy : 0.7516
##
## 'Positive' Class : Yes
##
# Tree 4: tree 1 pruned back at a complexity parameter of 0.03.
Arbol_4 <- rpart::prune(Arbol_1, cp = 0.03)
# Unlike the other plots this one uses extra = 102 and nn = TRUE
# (nn = TRUE prints the node numbers).
rpart.plot(
  Arbol_4,
  digits = -1,
  type = 2,
  extra = 102,
  cex = 0.7,
  nn = TRUE
)
- Matriz de confusión del árbol 4:
# Predicted class for every row of Pima under the pruned tree 4, compared
# with the observed `type`. The rows scored here are the same ones the
# underlying tree was fitted on, so this is a resubstitution evaluation.
Prediccion4 <- predict(Arbol_4, newdata = Pima, type = "class")
caret::confusionMatrix(
  data = Prediccion4,
  reference = Pima$type,
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 311 66
## Yes 44 111
##
## Accuracy : 0.7932
## 95% CI : (0.7563, 0.8269)
## No Information Rate : 0.6673
## P-Value [Acc > NIR] : 9.62e-11
##
## Kappa : 0.5194
##
## Mcnemar's Test P-Value : 0.04526
##
## Sensitivity : 0.6271
## Specificity : 0.8761
## Pos Pred Value : 0.7161
## Neg Pred Value : 0.8249
## Prevalence : 0.3327
## Detection Rate : 0.2086
## Detection Prevalence : 0.2914
## Balanced Accuracy : 0.7516
##
## 'Positive' Class : Yes
##
Curiosamente, al podar el primer árbol con un parámetro de complejidad más laxo que el del punto anterior (cp = 0.03 en lugar de 0.05) se obtiene el mismo árbol que en el punto 3, y por eso ambas matrices de confusión coinciden. La diferencia es que el primer árbol se ajustó con los valores por defecto de minsplit y minbucket (20 y 7, respectivamente), mientras que en el punto 3 se fijaron en 50 y 30.