Librerias

library(readr)
library(rpart)
library(rpart.plot)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Lectura de Datos

haberman <- read_csv("C:/Users/HP/OneDrive - FEMSA Comercio/Escritorio/haberman.csv")
## Rows: 306 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): x1, x2, x3, y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Renombración de Variables

haberman <- rename(haberman,Edad = "x1", Operacion_Anio = "x2", NGangleos = "x3", Sobrevivio = "y")

Set Seed

set.seed(8)
sample <- sample(nrow(haberman),nrow(haberman)*0.80)
train <- haberman[sample,]
test <- haberman[-sample,]
haberman1 <- rpart(formula = Sobrevivio~., data = haberman)
haberman1
## n= 306 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 306 59.558820 1.264706  
##    2) NGangleos< 4.5 230 34.330430 1.182609  
##      4) NGangleos< 1.5 177 22.881360 1.152542  
##        8) Operacion_Anio< 62.5 76  7.157895 1.105263  
##         16) Edad< 61.5 58  2.844828 1.051724 *
##         17) Edad>=61.5 18  3.611111 1.277778 *
##        9) Operacion_Anio>=62.5 101 15.425740 1.188119  
##         18) NGangleos< 0.5 81 10.222220 1.148148  
##           36) Edad>=50.5 48  2.812500 1.062500 *
##           37) Edad< 50.5 33  6.545455 1.272727 *
##         19) NGangleos>=0.5 20  4.550000 1.350000  
##           38) Edad< 52 11  1.636364 1.181818 *
##           39) Edad>=52 9  2.222222 1.555556 *
##      5) NGangleos>=1.5 53 10.754720 1.283019  
##       10) Edad< 50.5 25  2.640000 1.120000 *
##       11) Edad>=50.5 28  6.857143 1.428571  
##         22) Operacion_Anio>=59.5 20  4.200000 1.300000 *
##         23) Operacion_Anio< 59.5 8  1.500000 1.750000 *
##    3) NGangleos>=4.5 76 18.986840 1.513158  
##      6) Edad< 42.5 15  2.400000 1.200000 *
##      7) Edad>=42.5 61 14.754100 1.590164  
##       14) Operacion_Anio>=65.5 12  2.916667 1.416667 *
##       15) Operacion_Anio< 65.5 49 11.387760 1.632653  
##         30) NGangleos< 9.5 23  5.739130 1.478261 *
##         31) NGangleos>=9.5 26  4.615385 1.769231  
##           62) NGangleos>=13.5 17  3.882353 1.647059 *
##           63) NGangleos< 13.5 9  0.000000 2.000000 *
haberman11 <- rpart(Sobrevivio ~ ., data = train, method = "class",
                       control = rpart.control(cp = 0))
summary(haberman11)
## Call:
## rpart(formula = Sobrevivio ~ ., data = train, method = "class", 
##     control = rpart.control(cp = 0))
##   n= 244 
## 
##           CP nsplit rel error    xerror      xstd
## 1 0.07692308      0 1.0000000 1.0000000 0.1062368
## 2 0.03076923      2 0.8461538 1.0153846 0.1067514
## 3 0.01538462      4 0.7846154 0.9538462 0.1046221
## 4 0.00000000      6 0.7538462 1.0615385 0.1082269
## 
## Variable importance
##      NGangleos           Edad Operacion_Anio 
##             65             33              2 
## 
## Node number 1: 244 observations,    complexity param=0.07692308
##   predicted class=1  expected loss=0.2663934  P(node) =1
##     class counts:   179    65
##    probabilities: 0.734 0.266 
##   left son=2 (178 obs) right son=3 (66 obs)
##   Primary splits:
##       NGangleos      < 4.5  to the left,  improve=11.1969100, (0 missing)
##       Edad           < 42.5 to the left,  improve= 2.2951350, (0 missing)
##       Operacion_Anio < 61.5 to the left,  improve= 0.4951075, (0 missing)
## 
## Node number 2: 178 observations,    complexity param=0.01538462
##   predicted class=1  expected loss=0.1741573  P(node) =0.7295082
##     class counts:   147    31
##    probabilities: 0.826 0.174 
##   left son=4 (105 obs) right son=5 (73 obs)
##   Primary splits:
##       NGangleos      < 0.5  to the left,  improve=1.2980070, (0 missing)
##       Edad           < 69.5 to the left,  improve=1.2423400, (0 missing)
##       Operacion_Anio < 68.5 to the left,  improve=0.9432664, (0 missing)
##   Surrogate splits:
##       Edad           < 71   to the left,  agree=0.607, adj=0.041, (0 split)
##       Operacion_Anio < 68.5 to the left,  agree=0.596, adj=0.014, (0 split)
## 
## Node number 3: 66 observations,    complexity param=0.07692308
##   predicted class=2  expected loss=0.4848485  P(node) =0.2704918
##     class counts:    32    34
##    probabilities: 0.485 0.515 
##   left son=6 (14 obs) right son=7 (52 obs)
##   Primary splits:
##       Edad           < 42.5 to the left,  improve=3.216950, (0 missing)
##       NGangleos      < 10.5 to the left,  improve=1.499109, (0 missing)
##       Operacion_Anio < 61.5 to the left,  improve=0.454575, (0 missing)
## 
## Node number 4: 105 observations
##   predicted class=1  expected loss=0.1238095  P(node) =0.4303279
##     class counts:    92    13
##    probabilities: 0.876 0.124 
## 
## Node number 5: 73 observations,    complexity param=0.01538462
##   predicted class=1  expected loss=0.2465753  P(node) =0.2991803
##     class counts:    55    18
##    probabilities: 0.753 0.247 
##   left son=10 (59 obs) right son=11 (14 obs)
##   Primary splits:
##       Edad           < 60.5 to the left,  improve=3.6559750, (0 missing)
##       Operacion_Anio < 64.5 to the left,  improve=1.6633950, (0 missing)
##       NGangleos      < 1.5  to the left,  improve=0.1429846, (0 missing)
## 
## Node number 6: 14 observations
##   predicted class=1  expected loss=0.2142857  P(node) =0.05737705
##     class counts:    11     3
##    probabilities: 0.786 0.214 
## 
## Node number 7: 52 observations,    complexity param=0.03076923
##   predicted class=2  expected loss=0.4038462  P(node) =0.2131148
##     class counts:    21    31
##    probabilities: 0.404 0.596 
##   left son=14 (21 obs) right son=15 (31 obs)
##   Primary splits:
##       NGangleos      < 8.5  to the left,  improve=1.0138840, (0 missing)
##       Edad           < 46.5 to the right, improve=0.7180481, (0 missing)
##       Operacion_Anio < 65.5 to the right, improve=0.5009938, (0 missing)
##   Surrogate splits:
##       Edad < 66.5 to the right, agree=0.635, adj=0.095, (0 split)
## 
## Node number 10: 59 observations
##   predicted class=1  expected loss=0.1694915  P(node) =0.2418033
##     class counts:    49    10
##    probabilities: 0.831 0.169 
## 
## Node number 11: 14 observations
##   predicted class=2  expected loss=0.4285714  P(node) =0.05737705
##     class counts:     6     8
##    probabilities: 0.429 0.571 
## 
## Node number 14: 21 observations,    complexity param=0.03076923
##   predicted class=1  expected loss=0.4761905  P(node) =0.08606557
##     class counts:    11    10
##    probabilities: 0.524 0.476 
##   left son=28 (10 obs) right son=29 (11 obs)
##   Primary splits:
##       NGangleos      < 6.5  to the right, improve=1.1852810, (0 missing)
##       Operacion_Anio < 62.5 to the left,  improve=0.6428571, (0 missing)
##       Edad           < 54.5 to the left,  improve=0.1984127, (0 missing)
##   Surrogate splits:
##       Operacion_Anio < 63.5 to the right, agree=0.714, adj=0.4, (0 split)
##       Edad           < 47.5 to the right, agree=0.667, adj=0.3, (0 split)
## 
## Node number 15: 31 observations
##   predicted class=2  expected loss=0.3225806  P(node) =0.1270492
##     class counts:    10    21
##    probabilities: 0.323 0.677 
## 
## Node number 28: 10 observations
##   predicted class=1  expected loss=0.3  P(node) =0.04098361
##     class counts:     7     3
##    probabilities: 0.700 0.300 
## 
## Node number 29: 11 observations
##   predicted class=2  expected loss=0.3636364  P(node) =0.04508197
##     class counts:     4     7
##    probabilities: 0.364 0.636
printcp(haberman11)
## 
## Classification tree:
## rpart(formula = Sobrevivio ~ ., data = train, method = "class", 
##     control = rpart.control(cp = 0))
## 
## Variables actually used in tree construction:
## [1] Edad      NGangleos
## 
## Root node error: 65/244 = 0.26639
## 
## n= 244 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.076923      0   1.00000 1.00000 0.10624
## 2 0.030769      2   0.84615 1.01538 0.10675
## 3 0.015385      4   0.78462 0.95385 0.10462
## 4 0.000000      6   0.75385 1.06154 0.10823
plotcp(haberman11)

test$pred <- predict(haberman11, test, type = "class")
base_accuracy <- mean(test$pred == test$Sobrevivio)
haberman12 <- rpart(Sobrevivio ~ ., data = train, method = "class", 
                          control = rpart.control(cp = 0, maxdepth = 8,minsplit = 100))
test$pred <- predict(haberman12, test, type = "class")
accuracy_preprun <- mean(test$pred == test$Sobrevivio)
haberman1_model_preprun <- prune(haberman11, cp = 0.0099502 )
test$pred <- predict(haberman1_model_preprun, test, type = "class")
accuracy_postprun <- mean(test$pred == test$Sobrevivio)
data.frame(base_accuracy, accuracy_preprun, accuracy_postprun)
##   base_accuracy accuracy_preprun accuracy_postprun
## 1     0.7258065        0.7419355         0.7258065
rpart.plot(haberman1_model_preprun)

Realiza una conclusión.

Se puede visualizar que los pacientes que se les removieron los gangleos son mas propensos a sobrevivir. Asimismo se puede ver que los pacientes menores de 61 años son más probables a sobrevivir que los que son mayores a 61.