library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.5
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.0.5
library(rpart)
library(ggplot2)
library(readr)
## Warning: package 'readr' was built under R version 4.0.4
# Load the diabetes data set from "diabetes.csv" in the working directory.
diabetes <- read_csv(file = "diabetes.csv")
## 
## -- Column specification ---------------------------------------------------------------------
## cols(
##   Pregnancies = col_double(),
##   Glucose = col_double(),
##   BloodPressure = col_double(),
##   SkinThickness = col_double(),
##   Insulin = col_double(),
##   BMI = col_double(),
##   DiabetesPedigreeFunction = col_double(),
##   Age = col_double(),
##   Outcome = col_double()
## )
# Split the data into training and test partitions. Each row independently
# receives label 1 (probability 0.6) or label 2 (probability 0.4), so the
# partition sizes are approximate; the fixed seed makes the draw repeatable.
set.seed(2500)
ind <- sample(
  x = 2,
  size = nrow(diabetes),
  replace = TRUE,
  prob = c(0.6, 0.4)
)
entrena_diabetes <- diabetes[ind == 1, ]
test_diabetes <- diabetes[ind == 2, ]
# Fit a classification tree on the training partition, using every other
# column as a predictor of Outcome, then draw the tree and print its nodes.
arbolDiabetes <- rpart(
  formula = Outcome ~ .,
  data = entrena_diabetes,
  method = "class"
)
rpart.plot(x = arbolDiabetes)

print(x = arbolDiabetes)
## n= 462 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 462 151 0 (0.67316017 0.32683983)  
##      2) Glucose< 157.5 401 102 0 (0.74563591 0.25436409)  
##        4) Age< 27.5 193  19 0 (0.90155440 0.09844560) *
##        5) Age>=27.5 208  83 0 (0.60096154 0.39903846)  
##         10) BMI< 26.3 32   3 0 (0.90625000 0.09375000) *
##         11) BMI>=26.3 176  80 0 (0.54545455 0.45454545)  
##           22) DiabetesPedigreeFunction< 0.85 152  61 0 (0.59868421 0.40131579)  
##             44) BMI< 43.1 141  52 0 (0.63120567 0.36879433)  
##               88) Insulin< 107.5 98  30 0 (0.69387755 0.30612245)  
##                176) BloodPressure>=81 23   2 0 (0.91304348 0.08695652) *
##                177) BloodPressure< 81 75  28 0 (0.62666667 0.37333333)  
##                  354) Insulin>=24.5 12   1 0 (0.91666667 0.08333333) *
##                  355) Insulin< 24.5 63  27 0 (0.57142857 0.42857143)  
##                    710) DiabetesPedigreeFunction< 0.239 19   4 0 (0.78947368 0.21052632) *
##                    711) DiabetesPedigreeFunction>=0.239 44  21 1 (0.47727273 0.52272727)  
##                     1422) Pregnancies< 4.5 13   4 0 (0.69230769 0.30769231) *
##                     1423) Pregnancies>=4.5 31  12 1 (0.38709677 0.61290323)  
##                       2846) SkinThickness< 26.5 23  11 0 (0.52173913 0.47826087)  
##                         5692) Pregnancies>=9.5 7   1 0 (0.85714286 0.14285714) *
##                         5693) Pregnancies< 9.5 16   6 1 (0.37500000 0.62500000) *
##                       2847) SkinThickness>=26.5 8   0 1 (0.00000000 1.00000000) *
##               89) Insulin>=107.5 43  21 1 (0.48837209 0.51162791)  
##                178) Pregnancies< 7.5 30  11 0 (0.63333333 0.36666667)  
##                  356) Pregnancies>=3.5 20   4 0 (0.80000000 0.20000000) *
##                  357) Pregnancies< 3.5 10   3 1 (0.30000000 0.70000000) *
##                179) Pregnancies>=7.5 13   2 1 (0.15384615 0.84615385) *
##             45) BMI>=43.1 11   2 1 (0.18181818 0.81818182) *
##           23) DiabetesPedigreeFunction>=0.85 24   5 1 (0.20833333 0.79166667) *
##      3) Glucose>=157.5 61  12 1 (0.19672131 0.80327869) *
# Evaluate the tree on the test partition: predict a class label per row.
testdiabetesarbol <- predict(
  object = arbolDiabetes,
  newdata = test_diabetes,
  type = "class"
)
# Confusion matrix: predicted class in rows, observed Outcome in columns.
table(testdiabetesarbol, test_diabetes[["Outcome"]])
##                  
## testdiabetesarbol   0   1
##                 0 155  49
##                 1  34  68
# Print the tree's complexity-parameter table: one row per candidate subtree
# with its CP, number of splits, relative error and cross-validated error.
printcp(x = arbolDiabetes)
## 
## Classification tree:
## rpart(formula = Outcome ~ ., data = entrena_diabetes, method = "class")
## 
## Variables actually used in tree construction:
## [1] Age                      BloodPressure            BMI                     
## [4] DiabetesPedigreeFunction Glucose                  Insulin                 
## [7] Pregnancies              SkinThickness           
## 
## Root node error: 151/462 = 0.32684
## 
## n= 462 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.245033      0   1.00000 1.00000 0.066768
## 2 0.030905      1   0.75497 0.81457 0.062915
## 3 0.029801      5   0.61589 0.79470 0.062417
## 4 0.026490      7   0.55629 0.78808 0.062248
## 5 0.011589      8   0.52980 0.76821 0.061726
## 6 0.010000     14   0.45033 0.80132 0.062585
# Plot the complexity-parameter table of the fitted tree.
plotcp(x = arbolDiabetes)

# End of script