library(readr)
library(rpart)
library(rpart.plot)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
haberman <- read_csv("C:/Users/HP/OneDrive - FEMSA Comercio/Escritorio/haberman.csv")
## Rows: 306 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): x1, x2, x3, y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
haberman <- rename(haberman,Edad = "x1", Operacion_Anio = "x2", NGangleos = "x3", Sobrevivio = "y")
set.seed(8)
sample <- sample(nrow(haberman),nrow(haberman)*0.80)
train <- haberman[sample,]
test <- haberman[-sample,]
haberman1 <- rpart(formula = Sobrevivio~., data = haberman)
haberman1
## n= 306
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 306 59.558820 1.264706
## 2) NGangleos< 4.5 230 34.330430 1.182609
## 4) NGangleos< 1.5 177 22.881360 1.152542
## 8) Operacion_Anio< 62.5 76 7.157895 1.105263
## 16) Edad< 61.5 58 2.844828 1.051724 *
## 17) Edad>=61.5 18 3.611111 1.277778 *
## 9) Operacion_Anio>=62.5 101 15.425740 1.188119
## 18) NGangleos< 0.5 81 10.222220 1.148148
## 36) Edad>=50.5 48 2.812500 1.062500 *
## 37) Edad< 50.5 33 6.545455 1.272727 *
## 19) NGangleos>=0.5 20 4.550000 1.350000
## 38) Edad< 52 11 1.636364 1.181818 *
## 39) Edad>=52 9 2.222222 1.555556 *
## 5) NGangleos>=1.5 53 10.754720 1.283019
## 10) Edad< 50.5 25 2.640000 1.120000 *
## 11) Edad>=50.5 28 6.857143 1.428571
## 22) Operacion_Anio>=59.5 20 4.200000 1.300000 *
## 23) Operacion_Anio< 59.5 8 1.500000 1.750000 *
## 3) NGangleos>=4.5 76 18.986840 1.513158
## 6) Edad< 42.5 15 2.400000 1.200000 *
## 7) Edad>=42.5 61 14.754100 1.590164
## 14) Operacion_Anio>=65.5 12 2.916667 1.416667 *
## 15) Operacion_Anio< 65.5 49 11.387760 1.632653
## 30) NGangleos< 9.5 23 5.739130 1.478261 *
## 31) NGangleos>=9.5 26 4.615385 1.769231
## 62) NGangleos>=13.5 17 3.882353 1.647059 *
## 63) NGangleos< 13.5 9 0.000000 2.000000 *
haberman11 <- rpart(Sobrevivio ~ ., data = train, method = "class",
control = rpart.control(cp = 0))
summary(haberman11)
## Call:
## rpart(formula = Sobrevivio ~ ., data = train, method = "class",
## control = rpart.control(cp = 0))
## n= 244
##
## CP nsplit rel error xerror xstd
## 1 0.07692308 0 1.0000000 1.0000000 0.1062368
## 2 0.03076923 2 0.8461538 1.0153846 0.1067514
## 3 0.01538462 4 0.7846154 0.9538462 0.1046221
## 4 0.00000000 6 0.7538462 1.0615385 0.1082269
##
## Variable importance
## NGangleos Edad Operacion_Anio
## 65 33 2
##
## Node number 1: 244 observations, complexity param=0.07692308
## predicted class=1 expected loss=0.2663934 P(node) =1
## class counts: 179 65
## probabilities: 0.734 0.266
## left son=2 (178 obs) right son=3 (66 obs)
## Primary splits:
## NGangleos < 4.5 to the left, improve=11.1969100, (0 missing)
## Edad < 42.5 to the left, improve= 2.2951350, (0 missing)
## Operacion_Anio < 61.5 to the left, improve= 0.4951075, (0 missing)
##
## Node number 2: 178 observations, complexity param=0.01538462
## predicted class=1 expected loss=0.1741573 P(node) =0.7295082
## class counts: 147 31
## probabilities: 0.826 0.174
## left son=4 (105 obs) right son=5 (73 obs)
## Primary splits:
## NGangleos < 0.5 to the left, improve=1.2980070, (0 missing)
## Edad < 69.5 to the left, improve=1.2423400, (0 missing)
## Operacion_Anio < 68.5 to the left, improve=0.9432664, (0 missing)
## Surrogate splits:
## Edad < 71 to the left, agree=0.607, adj=0.041, (0 split)
## Operacion_Anio < 68.5 to the left, agree=0.596, adj=0.014, (0 split)
##
## Node number 3: 66 observations, complexity param=0.07692308
## predicted class=2 expected loss=0.4848485 P(node) =0.2704918
## class counts: 32 34
## probabilities: 0.485 0.515
## left son=6 (14 obs) right son=7 (52 obs)
## Primary splits:
## Edad < 42.5 to the left, improve=3.216950, (0 missing)
## NGangleos < 10.5 to the left, improve=1.499109, (0 missing)
## Operacion_Anio < 61.5 to the left, improve=0.454575, (0 missing)
##
## Node number 4: 105 observations
## predicted class=1 expected loss=0.1238095 P(node) =0.4303279
## class counts: 92 13
## probabilities: 0.876 0.124
##
## Node number 5: 73 observations, complexity param=0.01538462
## predicted class=1 expected loss=0.2465753 P(node) =0.2991803
## class counts: 55 18
## probabilities: 0.753 0.247
## left son=10 (59 obs) right son=11 (14 obs)
## Primary splits:
## Edad < 60.5 to the left, improve=3.6559750, (0 missing)
## Operacion_Anio < 64.5 to the left, improve=1.6633950, (0 missing)
## NGangleos < 1.5 to the left, improve=0.1429846, (0 missing)
##
## Node number 6: 14 observations
## predicted class=1 expected loss=0.2142857 P(node) =0.05737705
## class counts: 11 3
## probabilities: 0.786 0.214
##
## Node number 7: 52 observations, complexity param=0.03076923
## predicted class=2 expected loss=0.4038462 P(node) =0.2131148
## class counts: 21 31
## probabilities: 0.404 0.596
## left son=14 (21 obs) right son=15 (31 obs)
## Primary splits:
## NGangleos < 8.5 to the left, improve=1.0138840, (0 missing)
## Edad < 46.5 to the right, improve=0.7180481, (0 missing)
## Operacion_Anio < 65.5 to the right, improve=0.5009938, (0 missing)
## Surrogate splits:
## Edad < 66.5 to the right, agree=0.635, adj=0.095, (0 split)
##
## Node number 10: 59 observations
## predicted class=1 expected loss=0.1694915 P(node) =0.2418033
## class counts: 49 10
## probabilities: 0.831 0.169
##
## Node number 11: 14 observations
## predicted class=2 expected loss=0.4285714 P(node) =0.05737705
## class counts: 6 8
## probabilities: 0.429 0.571
##
## Node number 14: 21 observations, complexity param=0.03076923
## predicted class=1 expected loss=0.4761905 P(node) =0.08606557
## class counts: 11 10
## probabilities: 0.524 0.476
## left son=28 (10 obs) right son=29 (11 obs)
## Primary splits:
## NGangleos < 6.5 to the right, improve=1.1852810, (0 missing)
## Operacion_Anio < 62.5 to the left, improve=0.6428571, (0 missing)
## Edad < 54.5 to the left, improve=0.1984127, (0 missing)
## Surrogate splits:
## Operacion_Anio < 63.5 to the right, agree=0.714, adj=0.4, (0 split)
## Edad < 47.5 to the right, agree=0.667, adj=0.3, (0 split)
##
## Node number 15: 31 observations
## predicted class=2 expected loss=0.3225806 P(node) =0.1270492
## class counts: 10 21
## probabilities: 0.323 0.677
##
## Node number 28: 10 observations
## predicted class=1 expected loss=0.3 P(node) =0.04098361
## class counts: 7 3
## probabilities: 0.700 0.300
##
## Node number 29: 11 observations
## predicted class=2 expected loss=0.3636364 P(node) =0.04508197
## class counts: 4 7
## probabilities: 0.364 0.636
printcp(haberman11)
##
## Classification tree:
## rpart(formula = Sobrevivio ~ ., data = train, method = "class",
## control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] Edad NGangleos
##
## Root node error: 65/244 = 0.26639
##
## n= 244
##
## CP nsplit rel error xerror xstd
## 1 0.076923 0 1.00000 1.00000 0.10624
## 2 0.030769 2 0.84615 1.01538 0.10675
## 3 0.015385 4 0.78462 0.95385 0.10462
## 4 0.000000 6 0.75385 1.06154 0.10823
plotcp(haberman11)
test$pred <- predict(haberman11, test, type = "class")
base_accuracy <- mean(test$pred == test$Sobrevivio)
haberman12 <- rpart(Sobrevivio ~ ., data = train, method = "class",
control = rpart.control(cp = 0, maxdepth = 8,minsplit = 100))
test$pred <- predict(haberman12, test, type = "class")
accuracy_preprun <- mean(test$pred == test$Sobrevivio)
haberman1_model_preprun <- prune(haberman11, cp = 0.0099502 )
test$pred <- predict(haberman1_model_preprun, test, type = "class")
accuracy_postprun <- mean(test$pred == test$Sobrevivio)
data.frame(base_accuracy, accuracy_preprun, accuracy_postprun)
## base_accuracy accuracy_preprun accuracy_postprun
## 1 0.7258065 0.7419355 0.7258065
rpart.plot(haberman1_model_preprun)
Se puede visualizar que los pacientes que se les removieron los gangleos son mas propensos a sobrevivir. Asimismo se puede ver que los pacientes menores de 61 años son más probables a sobrevivir que los que son mayores a 61.