library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart)
library(ggplot2)
library(readr)
## Warning: package 'readr' was built under R version 4.0.4
# Load the data set from the CSV file in the working directory.
diabetes <- read_csv(file = "diabetes.csv")
##
## -- Column specification ---------------------------------------------------------------------
## cols(
## Pregnancies = col_double(),
## Glucose = col_double(),
## BloodPressure = col_double(),
## SkinThickness = col_double(),
## Insulin = col_double(),
## BMI = col_double(),
## DiabetesPedigreeFunction = col_double(),
## Age = col_double(),
## Outcome = col_double()
## )
# Split the data into a training and a test subset.
# Each row is independently assigned label 1 (training, probability 0.6) or
# label 2 (test, probability 0.4); the seed makes the assignment reproducible.
set.seed(2500)
ind <- sample(
  x = 2,
  size = nrow(diabetes),
  replace = TRUE,
  prob = c(0.6, 0.4)
)
entrena_diabetes <- diabetes[ind == 1, ]
test_diabetes <- diabetes[ind == 2, ]
# Fit a classification tree on the training subset, using every other column
# as a candidate predictor of Outcome, and draw the resulting tree.
arbolDiabetes <- rpart(
  formula = Outcome ~ .,
  data = entrena_diabetes,
  method = "class"
)
rpart.plot(arbolDiabetes)

# Print the text listing of the fitted tree (one line per node).
print(arbolDiabetes)
## n= 462
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 462 151 0 (0.67316017 0.32683983)
## 2) Glucose< 157.5 401 102 0 (0.74563591 0.25436409)
## 4) Age< 27.5 193 19 0 (0.90155440 0.09844560) *
## 5) Age>=27.5 208 83 0 (0.60096154 0.39903846)
## 10) BMI< 26.3 32 3 0 (0.90625000 0.09375000) *
## 11) BMI>=26.3 176 80 0 (0.54545455 0.45454545)
## 22) DiabetesPedigreeFunction< 0.85 152 61 0 (0.59868421 0.40131579)
## 44) BMI< 43.1 141 52 0 (0.63120567 0.36879433)
## 88) Insulin< 107.5 98 30 0 (0.69387755 0.30612245)
## 176) BloodPressure>=81 23 2 0 (0.91304348 0.08695652) *
## 177) BloodPressure< 81 75 28 0 (0.62666667 0.37333333)
## 354) Insulin>=24.5 12 1 0 (0.91666667 0.08333333) *
## 355) Insulin< 24.5 63 27 0 (0.57142857 0.42857143)
## 710) DiabetesPedigreeFunction< 0.239 19 4 0 (0.78947368 0.21052632) *
## 711) DiabetesPedigreeFunction>=0.239 44 21 1 (0.47727273 0.52272727)
## 1422) Pregnancies< 4.5 13 4 0 (0.69230769 0.30769231) *
## 1423) Pregnancies>=4.5 31 12 1 (0.38709677 0.61290323)
## 2846) SkinThickness< 26.5 23 11 0 (0.52173913 0.47826087)
## 5692) Pregnancies>=9.5 7 1 0 (0.85714286 0.14285714) *
## 5693) Pregnancies< 9.5 16 6 1 (0.37500000 0.62500000) *
## 2847) SkinThickness>=26.5 8 0 1 (0.00000000 1.00000000) *
## 89) Insulin>=107.5 43 21 1 (0.48837209 0.51162791)
## 178) Pregnancies< 7.5 30 11 0 (0.63333333 0.36666667)
## 356) Pregnancies>=3.5 20 4 0 (0.80000000 0.20000000) *
## 357) Pregnancies< 3.5 10 3 1 (0.30000000 0.70000000) *
## 179) Pregnancies>=7.5 13 2 1 (0.15384615 0.84615385) *
## 45) BMI>=43.1 11 2 1 (0.18181818 0.81818182) *
## 23) DiabetesPedigreeFunction>=0.85 24 5 1 (0.20833333 0.79166667) *
## 3) Glucose>=157.5 61 12 1 (0.19672131 0.80327869) *
# Evaluate the tree on the test subset: predict a class label for every test
# row, then cross-tabulate the predictions (rows) against the observed
# Outcome values (columns) to obtain the confusion matrix.
testdiabetesarbol <- predict(
  object = arbolDiabetes,
  newdata = test_diabetes,
  type = "class"
)
table(testdiabetesarbol, test_diabetes$Outcome)
##
## testdiabetesarbol 0 1
## 0 155 49
## 1 34 68
# Print the complexity-parameter (CP) table of the fitted tree: for each CP
# value, the number of splits, relative error, and cross-validated error.
printcp(x = arbolDiabetes)
##
## Classification tree:
## rpart(formula = Outcome ~ ., data = entrena_diabetes, method = "class")
##
## Variables actually used in tree construction:
## [1] Age BloodPressure BMI
## [4] DiabetesPedigreeFunction Glucose Insulin
## [7] Pregnancies SkinThickness
##
## Root node error: 151/462 = 0.32684
##
## n= 462
##
## CP nsplit rel error xerror xstd
## 1 0.245033 0 1.00000 1.00000 0.066768
## 2 0.030905 1 0.75497 0.81457 0.062915
## 3 0.029801 5 0.61589 0.79470 0.062417
## 4 0.026490 7 0.55629 0.78808 0.062248
## 5 0.011589 8 0.52980 0.76821 0.061726
## 6 0.010000 14 0.45033 0.80132 0.062585
# Plot the cross-validation results from the tree's CP table.
plotcp(x = arbolDiabetes)
