Installar y llamar librerías

library(rpart)
library(rpart.plot)
library(tidyverse)
library(ggplot2)
library(lattice)
library(caret)
library(datasets)

Importar la base de datos

df = read.csv("C:\\Users\\erik-\\OneDrive\\Documentos\\Escuela\\Universidad\\7ºSemestre\\Modulo_2\\titanic.csv")

Entender la base de datos

summary(df)
##      pclass         survived         name               sex           
##  Min.   :1.000   Min.   :0.000   Length:1310        Length:1310       
##  1st Qu.:2.000   1st Qu.:0.000   Class :character   Class :character  
##  Median :3.000   Median :0.000   Mode  :character   Mode  :character  
##  Mean   :2.295   Mean   :0.382                                        
##  3rd Qu.:3.000   3rd Qu.:1.000                                        
##  Max.   :3.000   Max.   :1.000                                        
##  NA's   :1       NA's   :1                                            
##       age              sibsp            parch          ticket         
##  Min.   : 0.1667   Min.   :0.0000   Min.   :0.000   Length:1310       
##  1st Qu.:21.0000   1st Qu.:0.0000   1st Qu.:0.000   Class :character  
##  Median :28.0000   Median :0.0000   Median :0.000   Mode  :character  
##  Mean   :29.8811   Mean   :0.4989   Mean   :0.385                     
##  3rd Qu.:39.0000   3rd Qu.:1.0000   3rd Qu.:0.000                     
##  Max.   :80.0000   Max.   :8.0000   Max.   :9.000                     
##  NA's   :264       NA's   :1        NA's   :1                         
##       fare            cabin             embarked             boat          
##  Min.   :  0.000   Length:1310        Length:1310        Length:1310       
##  1st Qu.:  7.896   Class :character   Class :character   Class :character  
##  Median : 14.454   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 33.295                                                           
##  3rd Qu.: 31.275                                                           
##  Max.   :512.329                                                           
##  NA's   :2                                                                 
##       body        home.dest        
##  Min.   :  1.0   Length:1310       
##  1st Qu.: 72.0   Class :character  
##  Median :155.0   Mode  :character  
##  Mean   :160.8                     
##  3rd Qu.:256.0                     
##  Max.   :328.0                     
##  NA's   :1189
str(df)
## 'data.frame':    1310 obs. of  14 variables:
##  $ pclass   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : int  1 1 0 0 0 1 1 0 1 0 ...
##  $ name     : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex      : chr  "female" "male" "female" "male" ...
##  $ age      : num  29 0.917 2 30 25 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : chr  "24160" "113781" "113781" "113781" ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked : chr  "S" "S" "S" "S" ...
##  $ boat     : chr  "2" "11" "" "" ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: chr  "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
head(df)
##   pclass survived                                            name    sex
## 1      1        1                   Allen, Miss. Elisabeth Walton female
## 2      1        1                  Allison, Master. Hudson Trevor   male
## 3      1        0                    Allison, Miss. Helen Loraine female
## 4      1        0            Allison, Mr. Hudson Joshua Creighton   male
## 5      1        0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female
## 6      1        1                             Anderson, Mr. Harry   male
##       age sibsp parch ticket     fare   cabin embarked boat body
## 1 29.0000     0     0  24160 211.3375      B5        S    2   NA
## 2  0.9167     1     2 113781 151.5500 C22 C26        S   11   NA
## 3  2.0000     1     2 113781 151.5500 C22 C26        S        NA
## 4 30.0000     1     2 113781 151.5500 C22 C26        S       135
## 5 25.0000     1     2 113781 151.5500 C22 C26        S        NA
## 6 48.0000     0     0  19952  26.5500     E12        S    3   NA
##                         home.dest
## 1                    St Louis, MO
## 2 Montreal, PQ / Chesterville, ON
## 3 Montreal, PQ / Chesterville, ON
## 4 Montreal, PQ / Chesterville, ON
## 5 Montreal, PQ / Chesterville, ON
## 6                    New York, NY

Crear el árbol

df$survived <- as.factor(df$survived)
df$pclass <- as.factor(df$pclass)
str(df)
## 'data.frame':    1310 obs. of  14 variables:
##  $ pclass   : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ survived : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
##  $ name     : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
##  $ sex      : chr  "female" "male" "female" "male" ...
##  $ age      : num  29 0.917 2 30 25 ...
##  $ sibsp    : int  0 1 1 1 1 0 1 0 2 0 ...
##  $ parch    : int  0 2 2 2 2 0 0 0 0 0 ...
##  $ ticket   : chr  "24160" "113781" "113781" "113781" ...
##  $ fare     : num  211 152 152 152 152 ...
##  $ cabin    : chr  "B5" "C22 C26" "C22 C26" "C22 C26" ...
##  $ embarked : chr  "S" "S" "S" "S" ...
##  $ boat     : chr  "2" "11" "" "" ...
##  $ body     : int  NA NA NA 135 NA NA NA NA NA 22 ...
##  $ home.dest: chr  "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
titanic <- df[,c("pclass","age", "sex", "survived")]


arbol_titanic <- rpart(survived~., data = titanic)

rpart.plot(arbol_titanic)

prp(arbol_titanic, extra = 7, prefix = "fracción\n")

Conclusiones

En conclusión, las probabilidades de sobrevivir en el naurfragio del Titanic son:

  • 100%: Si eres niño varón menor de 9.5 años de 1º o 2º clase.
  • 73%: Si eres mujer.

Y por el contrario, las más bajas probabilidades de sobrevivir son:

  • 17%: Si eres hombre mayor a 9.5 años
  • 38%: Si eres niño varón menor de 9.5 años de 3º clase

Cancer de mama

df_mama <- read.csv("C:\\Users\\erik-\\OneDrive\\Documentos\\Escuela\\Universidad\\7ºSemestre\\Modulo_2\\cancer_de_mama.csv")

Separar los datos en entrenamiento y prueba

df_mama$diagnosis <- as.factor(df_mama$diagnosis)
set.seed(123)
renglones_entrenamiento <- createDataPartition(df_mama$diagnosis, p = 0.8, list = FALSE) 
# el argumento list asegura la aleatoridad en las particiones
entrenamiento <- df_mama[renglones_entrenamiento,]
prueba <- df_mama[-renglones_entrenamiento,]

Evaluación del árbol de decisiónes del cancer de mama

modelo_arbol_mama <- train(diagnosis ~ ., data = entrenamiento, #Species es la y 
                 method = "rpart", # Cambiar
                 preProcess= c("scale", "center"),
                 trControl = trainControl(method = "cv", number =10), 
                 tuneLength = 10 
                 )

resultado_entrenamiento_mama <- predict(modelo_arbol_mama, entrenamiento)
resultado_prueba_mama <- predict(modelo_arbol_mama, prueba)

# Matriz de Confusion 
# Es una tabal de evaluación que desglosa el rendimiento del modelo de clasificación. 


# Matriz de Confusión del Resultado del Entrenamiento
mcre_arbol <- confusionMatrix(resultado_entrenamiento_mama, entrenamiento$diagnosis)
mcre_arbol
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   B   M
##          B 270   9
##          M  16 161
##                                           
##                Accuracy : 0.9452          
##                  95% CI : (0.9201, 0.9642)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8837          
##                                           
##  Mcnemar's Test P-Value : 0.2301          
##                                           
##             Sensitivity : 0.9441          
##             Specificity : 0.9471          
##          Pos Pred Value : 0.9677          
##          Neg Pred Value : 0.9096          
##              Prevalence : 0.6272          
##          Detection Rate : 0.5921          
##    Detection Prevalence : 0.6118          
##       Balanced Accuracy : 0.9456          
##                                           
##        'Positive' Class : B               
## 
# Matriz de Confusión del Resultado del Prueba
mcrp_arbol <- confusionMatrix(resultado_prueba_mama, prueba$diagnosis)
mcrp_arbol
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  B  M
##          B 62  4
##          M  9 38
##                                           
##                Accuracy : 0.885           
##                  95% CI : (0.8113, 0.9373)
##     No Information Rate : 0.6283          
##     P-Value [Acc > NIR] : 8.286e-10       
##                                           
##                   Kappa : 0.7595          
##                                           
##  Mcnemar's Test P-Value : 0.2673          
##                                           
##             Sensitivity : 0.8732          
##             Specificity : 0.9048          
##          Pos Pred Value : 0.9394          
##          Neg Pred Value : 0.8085          
##              Prevalence : 0.6283          
##          Detection Rate : 0.5487          
##    Detection Prevalence : 0.5841          
##       Balanced Accuracy : 0.8890          
##                                           
##        'Positive' Class : B               
## 

Grafico el árbol de decisiones del cancer de mama

arbol_cancer_mama <- rpart(diagnosis~., data = df_mama)
rpart.plot(arbol_cancer_mama)

prp(arbol_cancer_mama, extra = 7, prefix = "fracción\n")

Conclusiones

En conclusión las probabilidades de tener un cancer maligno de acuerdo con diferentes circustancias son las siguientes:

Si el radius_worst es mayor a 17 - Existe un 94% que el cancer sea maligno.

Si el readius_worst es mayor de 17, los concave point son mayores que 0.14 y el texture_worst es mayor que 26. - Existe un 89% que el cancer sea benigno.

LS0tDQp0aXRsZTogIkFyYm9sIGRlIERlY2lzaW9uZXMgVGl0YW5pYyB5IENhbmNlciBkZSBNYW1hIg0KYXV0aG9yOiAiRXJpayBHb256YWxleiINCmRhdGU6ICIyMDI1LTA4LTIwIg0Kb3V0cHV0OiAgDQogIGh0bWxfZG9jdW1lbnQ6DQogICAgdG9jOiBUUlVFDQogICAgdG9jX2Zsb2F0OiBUcnVlIA0KICAgIGNvZGVfZG93bmxvYWQ6IFRSVUUNCiAgICB0aGVtZTogY29zbW8NCg0KLS0tDQoNCg0KPGNlbnRlcj4NCiFbXShodHRwczovL21lZGlhMS50ZW5vci5jb20vbS9xWUJiRWpBNl9jSUFBQUFDL2EtbmlnaHQtdG8tcmVtZW1iZXItbW92aWUtYS1uaWdodC10by1yZW1lbWJlci5naWYpDQo8L2NlbnRlcj4NCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IEluc3RhbGxhciB5IGxsYW1hciBsaWJyZXLDrWFzIDwvc3Bhbj4NCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9DQpsaWJyYXJ5KHJwYXJ0KQ0KbGlicmFyeShycGFydC5wbG90KQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KGxhdHRpY2UpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShkYXRhc2V0cykNCmBgYA0KDQoNCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IEltcG9ydGFyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPg0KYGBge3J9DQpkZiA9IHJlYWQuY3N2KCJDOlxcVXNlcnNcXGVyaWstXFxPbmVEcml2ZVxcRG9jdW1lbnRvc1xcRXNjdWVsYVxcVW5pdmVyc2lkYWRcXDfCulNlbWVzdHJlXFxNb2R1bG9fMlxcdGl0YW5pYy5jc3YiKQ0KYGBgDQoNCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IEVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPg0KYGBge3J9DQpzdW1tYXJ5KGRmKQ0Kc3RyKGRmKQ0KaGVhZChkZikNCmBgYA0KDQoNCiMgPHNwYW4gc3R5bGUgPSAiY29sb3I6Ymx1ZTsiPiBDcmVhciBlbCDDoXJib2wgPC9zcGFuPg0KYGBge3J9DQpkZiRzdXJ2aXZlZCA8LSBhcy5mYWN0b3IoZGYkc3Vydml2ZWQpDQpkZiRwY2xhc3MgPC0gYXMuZmFjdG9yKGRmJHBjbGFzcykNCnN0cihkZikNCg0KdGl0YW5pYyA8LSBkZlssYygicGNsYXNzIiwiYWdlIiwgInNleCIsICJzdXJ2aXZlZCIpXQ0KDQoNCmFyYm9sX3RpdGFuaWMgPC0gcnBhcnQoc3Vydml2ZWR+LiwgZGF0YSA9IHRpdGFuaWMpDQoNCnJwYXJ0LnBsb3QoYXJib2xfdGl0YW5pYykNCnBycChhcmJvbF90aXRhbmljLCBleHRyYSA9IDcsIHByZWZpeCA9ICJmcmFjY2nDs25cbiIpDQpgYGANCg0KDQojIDxzcGFuIHN0eWxlID0gImNvbG9yOmJsdWU7Ij4gQ29uY2x1c2lvbmVzPC9zcGFuPg0KDQpFbiBjb25jbHVzacOzbiwgbGFzIHByb2JhYmlsaWRhZGVzIGRlIHNvYnJldml2aXIgZW4gZWwgbmF1cmZyYWdpbyBkZWwgVGl0YW5pYyBzb246IA0KDQotIDEwMCU6IFNpIGVyZXMgbmnDsW8gdmFyw7NuIG1lbm9yIGRlIDkuNSBhw7FvcyBkZSAxwrogbyAywrogY2xhc2UuDQotIDczJTogU2kgZXJlcyBtdWplci4gDQoNClkgcG9yIGVsIGNvbnRyYXJpbywgbGFzIG3DoXMgYmFqYXMgcHJvYmFiaWxpZGFkZXMgZGUgc29icmV2aXZpciBzb246IA0KDQotIDE3JTogU2kgZXJlcyBob21icmUgbWF5b3IgYSA5LjUgYcOxb3MNCi0gMzglOiBTaSBlcmVzIG5pw7FvIHZhcsOzbiBtZW5vciBkZSA5LjUgYcOxb3MgZGUgM8K6IGNsYXNlDQoNCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IENhbmNlciBkZSBtYW1hIDwvc3Bhbj4NCmBgYHtyfQ0KZGZfbWFtYSA8LSByZWFkLmNzdigiQzpcXFVzZXJzXFxlcmlrLVxcT25lRHJpdmVcXERvY3VtZW50b3NcXEVzY3VlbGFcXFVuaXZlcnNpZGFkXFw3wrpTZW1lc3RyZVxcTW9kdWxvXzJcXGNhbmNlcl9kZV9tYW1hLmNzdiIpDQpgYGANCg0KDQojIDxzcGFuIHN0eWxlID0gImNvbG9yOmJsdWU7Ij5TZXBhcmFyIGxvcyBkYXRvcyBlbiBlbnRyZW5hbWllbnRvIHkgcHJ1ZWJhIDwvc3Bhbj4NCmBgYHtyfQ0KZGZfbWFtYSRkaWFnbm9zaXMgPC0gYXMuZmFjdG9yKGRmX21hbWEkZGlhZ25vc2lzKQ0Kc2V0LnNlZWQoMTIzKQ0KcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8gPC0gY3JlYXRlRGF0YVBhcnRpdGlvbihkZl9tYW1hJGRpYWdub3NpcywgcCA9IDAuOCwgbGlzdCA9IEZBTFNFKSANCiMgZWwgYXJndW1lbnRvIGxpc3QgYXNlZ3VyYSBsYSBhbGVhdG9yaWRhZCBlbiBsYXMgcGFydGljaW9uZXMNCmVudHJlbmFtaWVudG8gPC0gZGZfbWFtYVtyZW5nbG9uZXNfZW50cmVuYW1pZW50byxdDQpwcnVlYmEgPC0gZGZfbWFtYVstcmVuZ2xvbmVzX2VudHJlbmFtaWVudG8sXQ0KYGBgDQoNCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IEV2YWx1YWNpw7NuIGRlbCDDoXJib2wgZGUgZGVjaXNpw7NuZXMgZGVsIGNhbmNlciBkZSBtYW1hIDwvc3Bhbj4NCmBgYHtyfQ0KbW9kZWxvX2FyYm9sX21hbWEgPC0gdHJhaW4oZGlhZ25vc2lzIH4gLiwgZGF0YSA9IGVudHJlbmFtaWVudG8sICNTcGVjaWVzIGVzIGxhIHkgDQogICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJycGFydCIsICMgQ2FtYmlhcg0KICAgICAgICAgICAgICAgICBwcmVQcm9jZXNzPSBjKCJzY2FsZSIsICJjZW50ZXIiKSwNCiAgICAgICAgICAgICAgICAgdHJDb250cm9sID0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9MTApLCANCiAgICAgICAgICAgICAgICAgdHVuZUxlbmd0aCA9IDEwIA0KICAgICAgICAgICAgICAgICApDQoNCnJlc3VsdGFkb19lbnRyZW5hbWllbnRvX21hbWEgPC0gcHJlZGljdChtb2RlbG9fYXJib2xfbWFtYSwgZW50cmVuYW1pZW50bykNCnJlc3VsdGFkb19wcnVlYmFfbWFtYSA8LSBwcmVkaWN0KG1vZGVsb19hcmJvbF9tYW1hLCBwcnVlYmEpDQoNCiMgTWF0cml6IGRlIENvbmZ1c2lvbiANCiMgRXMgdW5hIHRhYmFsIGRlIGV2YWx1YWNpw7NuIHF1ZSBkZXNnbG9zYSBlbCByZW5kaW1pZW50byBkZWwgbW9kZWxvIGRlIGNsYXNpZmljYWNpw7NuLiANCg0KDQojIE1hdHJpeiBkZSBDb25mdXNpw7NuIGRlbCBSZXN1bHRhZG8gZGVsIEVudHJlbmFtaWVudG8NCm1jcmVfYXJib2wgPC0gY29uZnVzaW9uTWF0cml4KHJlc3VsdGFkb19lbnRyZW5hbWllbnRvX21hbWEsIGVudHJlbmFtaWVudG8kZGlhZ25vc2lzKQ0KbWNyZV9hcmJvbA0KDQojIE1hdHJpeiBkZSBDb25mdXNpw7NuIGRlbCBSZXN1bHRhZG8gZGVsIFBydWViYQ0KbWNycF9hcmJvbCA8LSBjb25mdXNpb25NYXRyaXgocmVzdWx0YWRvX3BydWViYV9tYW1hLCBwcnVlYmEkZGlhZ25vc2lzKQ0KbWNycF9hcmJvbA0KYGBgDQoNCg0KIyA8c3BhbiBzdHlsZSA9ICJjb2xvcjpibHVlOyI+IEdyYWZpY28gZWwgw6FyYm9sIGRlIGRlY2lzaW9uZXMgZGVsIGNhbmNlciBkZSBtYW1hIDwvc3Bhbj4NCmBgYHtyfQ0KYXJib2xfY2FuY2VyX21hbWEgPC0gcnBhcnQoZGlhZ25vc2lzfi4sIGRhdGEgPSBkZl9tYW1hKQ0KcnBhcnQucGxvdChhcmJvbF9jYW5jZXJfbWFtYSkNCnBycChhcmJvbF9jYW5jZXJfbWFtYSwgZXh0cmEgPSA3LCBwcmVmaXggPSAiZnJhY2Npw7NuXG4iKQ0KYGBgDQoNCiMgPHNwYW4gc3R5bGUgPSAiY29sb3I6Ymx1ZTsiPiBDb25jbHVzaW9uZXMgPC9zcGFuPg0KRW4gY29uY2x1c2nDs24gbGFzIHByb2JhYmlsaWRhZGVzIGRlIHRlbmVyIHVuIGNhbmNlciBtYWxpZ25vIGRlIGFjdWVyZG8gY29uIGRpZmVyZW50ZXMgY2lyY3VzdGFuY2lhcyBzb24gbGFzIHNpZ3VpZW50ZXM6IA0KDQpTaSBlbCByYWRpdXNfd29yc3QgZXMgbWF5b3IgYSAxNyANCi0gRXhpc3RlIHVuIDk0JSBxdWUgZWwgY2FuY2VyIHNlYSBtYWxpZ25vLiANCg0KU2kgZWwgcmVhZGl1c193b3JzdCBlcyBtYXlvciBkZSAxNywgbG9zIGNvbmNhdmUgcG9pbnQgc29uIG1heW9yZXMgcXVlIDAuMTQgeSBlbCB0ZXh0dXJlX3dvcnN0IGVzIG1heW9yIHF1ZSAyNi4NCi0gRXhpc3RlIHVuIDg5JSBxdWUgZWwgY2FuY2VyIHNlYSBiZW5pZ25vLiANCg==