Cargamos base de datos y hacemos análisis exploratorio

# Read the Titanic passenger CSV into `df` and print per-column summary
# statistics as a first look at the data.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
df <- read.csv(
  file = "/Users/monicagonzalez/Downloads/titanic-1.csv"
)
print(summary(df))
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# Show the compact structure of `df`: its dimensions plus each column's type
# and first few values.
utils::str(object = df)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Entender base de datos

# Print the per-column summary statistics of `df` for this section.
print(summary(object = df))
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

Filtrar base de datos

# Keep only the columns used by the model: class, age, sex and outcome.
Titanic <- df[, c("Pclass", "Age", "Sex", "Survived")]

# Recode the 0/1 outcome as a labelled factor. The levels are given explicitly
# so their order never depends on which values happen to be present, and any
# code other than 0 or 1 becomes NA instead of being silently labelled
# "Sobrevive".
Titanic$Survived <- factor(
  Titanic$Survived,
  levels = c(0, 1),
  labels = c("Murio", "Sobrevive")
)

# Treat passenger class and sex as categorical variables.
Titanic$Pclass <- as.factor(Titanic$Pclass)
Titanic$Sex <- as.factor(Titanic$Sex)

# Check the resulting column types.
str(Titanic)
## 'data.frame':    891 obs. of  4 variables:
##  $ Pclass  : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Age     : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Survived: Factor w/ 2 levels "Murio","Sobrevive": 1 2 2 2 1 1 1 1 2 2 ...
# Total number of missing cells across the whole of `Titanic`.
length(which(is.na(Titanic)))
## [1] 177
# Count missing values per column. vapply() pins the result to exactly one
# integer per column, so the return type cannot change with the input the way
# sapply()'s can (e.g. a list when there are no columns).
vapply(Titanic, function(x) sum(is.na(x)), integer(1))
##   Pclass      Age      Sex Survived 
##        0      177        0        0
# Drop every row of `Titanic` that has at least one missing value.
Titanic <- stats::na.omit(object = Titanic)

Crear árbol de decisión

# install.packages("rpart")
library(rpart)

# Fit a tree that predicts `Survived` from every other column of `Titanic`,
# then print its splits.
arbol <- rpart(Survived ~ ., data = Titanic)
print(arbol)
## n= 714 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 714 290 Murio (0.59383754 0.40616246)  
##    2) Sex=male 453  93 Murio (0.79470199 0.20529801)  
##      4) Age>=6.5 429  77 Murio (0.82051282 0.17948718) *
##      5) Age< 6.5 24   8 Sobrevive (0.33333333 0.66666667) *
##    3) Sex=female 261  64 Sobrevive (0.24521073 0.75478927)  
##      6) Pclass=3 102  47 Murio (0.53921569 0.46078431)  
##       12) Age>=38.5 12   1 Murio (0.91666667 0.08333333) *
##       13) Age< 38.5 90  44 Sobrevive (0.48888889 0.51111111)  
##         26) Age>=5.5 75  35 Murio (0.53333333 0.46666667)  
##           52) Age< 12 8   0 Murio (1.00000000 0.00000000) *
##           53) Age>=12 67  32 Sobrevive (0.47761194 0.52238806) *
##         27) Age< 5.5 15   4 Sobrevive (0.26666667 0.73333333) *
##      7) Pclass=1,2 159   9 Sobrevive (0.05660377 0.94339623) *
# install.packages("rpart.plot")
library(rpart.plot)

# Draw the fitted tree with rpart.plot's default settings.
rpart.plot(x = arbol)

# Draw it again with prp(), using `extra = 7` and the label prefix "fraccion".
# NOTE(review): the prefix has no trailing space or newline, so it presumably
# runs straight into the node text -- confirm that is intended.
prp(x = arbol, extra = 7, prefix = "fraccion")

# Conclusión: Según el árbol obtenido, las probabilidades más altas de sobrevivir en el Titanic corresponden a las mujeres de 1° y 2° clase (94%), a las niñas menores de 5.5 años de 3° clase (73%) y a los niños varones menores de 6.5 años (67%). Las probabilidades más bajas corresponden a las niñas de 3° clase de entre 5.5 y 12 años (0%, con solo 8 casos), a las mujeres de 3° clase de 38.5 años o más (8%) y a los hombres de 6.5 años o más (18%).