# Load the Titanic passenger records from a local CSV file.
# NOTE(review): the path is absolute and user-specific, so the script only
# runs on a machine where this exact file exists -- confirm whether a
# project-relative path should be used instead.
df <- read.csv(
  file = "/Users/monicagonzalez/Downloads/titanic-1.csv",
  header = TRUE,
  sep = ","
)
# Per-column overview: quartiles and mean for numeric columns, length and
# class for character columns.
summary(df)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Inspect the structure of the loaded data: one line per column showing its
# storage type and first few values.
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Print the per-column summary again. df has not been modified since the
# identical summary() call made right after loading, so the output repeats.
summary(df)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Build the modelling table: class, age and sex as predictors, plus the
# survival outcome.
Titanic <- df[c("Pclass", "Age", "Sex", "Survived")]
# Relabel the 0/1 outcome as text: 0 becomes "Murio", any other value
# becomes "Sobrevive".
Titanic$Survived <- ifelse(Titanic$Survived == 0, "Murio", "Sobrevive")
# Convert the outcome and the two categorical predictors to factors in a
# single pass; Age stays numeric.
Titanic[c("Pclass", "Sex", "Survived")] <- lapply(
  Titanic[c("Pclass", "Sex", "Survived")],
  as.factor
)
# Confirm the resulting column types and factor levels.
str(Titanic)
## 'data.frame': 891 obs. of 4 variables:
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Survived: Factor w/ 2 levels "Murio","Sobrevive": 1 2 2 2 1 1 1 1 2 2 ...
# Count every missing cell in the modelling table.
sum(is.na(Titanic))
## [1] 177
# Break the missing-cell count down by column (type-stable integer result).
vapply(Titanic, function(columna) sum(is.na(columna)), integer(1))
## Pclass Age Sex Survived
## 0 177 0 0
# Drop every row containing a missing value; per the counts above, all of
# them are in Age.
Titanic <- stats::na.omit(Titanic)
# install.packages("rpart")
library(rpart)
# Fit a classification tree predicting Survived from every other column of
# Titanic (Pclass, Age and Sex). The method is stated explicitly rather than
# left for rpart to guess from the response type; Survived is a factor, so
# this selects the same classification method as before.
arbol <- rpart(formula = Survived ~ ., data = Titanic, method = "class")
# Print the fitted tree: split rules, node sizes, losses and class
# probabilities.
arbol
## n= 714
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 714 290 Murio (0.59383754 0.40616246)
## 2) Sex=male 453 93 Murio (0.79470199 0.20529801)
## 4) Age>=6.5 429 77 Murio (0.82051282 0.17948718) *
## 5) Age< 6.5 24 8 Sobrevive (0.33333333 0.66666667) *
## 3) Sex=female 261 64 Sobrevive (0.24521073 0.75478927)
## 6) Pclass=3 102 47 Murio (0.53921569 0.46078431)
## 12) Age>=38.5 12 1 Murio (0.91666667 0.08333333) *
## 13) Age< 38.5 90 44 Sobrevive (0.48888889 0.51111111)
## 26) Age>=5.5 75 35 Murio (0.53333333 0.46666667)
## 52) Age< 12 8 0 Murio (1.00000000 0.00000000) *
## 53) Age>=12 67 32 Sobrevive (0.47761194 0.52238806) *
## 27) Age< 5.5 15 4 Sobrevive (0.26666667 0.73333333) *
## 7) Pclass=1,2 159 9 Sobrevive (0.05660377 0.94339623) *
# install.packages("rpart.plot")
library(rpart.plot)
# Draw the fitted tree with rpart.plot's default settings.
rpart.plot(x = arbol)
# Draw the tree a second time with prp().
# NOTE(review): per the rpart.plot docs, extra = 7 should label each node
# with the probability of the second class (here "Sobrevive") -- confirm.
# The prefix has no trailing space or newline, so it presumably appears glued
# to the number in each node label; verify that this is intended.
prp(x = arbol, extra = 7, prefix = "fraccion")
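# Conclusión: Las probabilidades más altas de sobrevivir en el Titanic
# corresponden a los niños varones menores de 9.5 años de 1° y 2° clase
# (100%) y a las mujeres de 1° y 2° clase (93%). Las probabilidades más
# bajas de sobrevivir en el Titanic corresponden a los hombres mayores de
# 9.5 años (18%) y a los varones menores de 9.5 años de 3° clase (38%).
# NOTE(review): these figures do not match the tree printed above, which
# splits males at Age 6.5 with no class split and gives women in 1st/2nd
# class a 94% survival probability -- confirm against the plotted tree.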