# file.choose()
titanic <- read.csv("C:\\Users\\rylun\\Downloads\\titanic(1).csv")
summary(titanic)
## pclass survived name sex
## Min. :1.000 Min. :0.000 Length:1310 Length:1310
## 1st Qu.:2.000 1st Qu.:0.000 Class :character Class :character
## Median :3.000 Median :0.000 Mode :character Mode :character
## Mean :2.295 Mean :0.382
## 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :1.000
## NA's :1 NA's :1
## age sibsp parch ticket
## Min. : 0.1667 Min. :0.0000 Min. :0.000 Length:1310
## 1st Qu.:21.0000 1st Qu.:0.0000 1st Qu.:0.000 Class :character
## Median :28.0000 Median :0.0000 Median :0.000 Mode :character
## Mean :29.8811 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.0000 Max. :8.0000 Max. :9.000
## NA's :264 NA's :1 NA's :1
## fare cabin embarked boat
## Min. : 0.000 Length:1310 Length:1310 Length:1310
## 1st Qu.: 7.896 Class :character Class :character Class :character
## Median : 14.454 Mode :character Mode :character Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :2
## body home.dest
## Min. : 1.0 Length:1310
## 1st Qu.: 72.0 Class :character
## Median :155.0 Mode :character
## Mean :160.8
## 3rd Qu.:256.0
## Max. :328.0
## NA's :1189
str(titanic)
## 'data.frame': 1310 obs. of 14 variables:
## $ pclass : int 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : chr "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
## $ sex : chr "female" "male" "female" "male" ...
## $ age : num 29 0.917 2 30 25 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : chr "24160" "113781" "113781" "113781" ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : chr "B5" "C22 C26" "C22 C26" "C22 C26" ...
## $ embarked : chr "S" "S" "S" "S" ...
## $ boat : chr "2" "11" "" "" ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: chr "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
Titanic <- titanic[,c("pclass","age","sex","survived")]
Titanic$survived <- as.factor(ifelse(Titanic$survived==0, "Murio", "Sobrevive"))
Titanic$pclass <- as.factor(Titanic$pclass)
Titanic$sex <- as.factor(Titanic$sex)
str(Titanic)
## 'data.frame': 1310 obs. of 4 variables:
## $ pclass : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 29 0.917 2 30 25 ...
## $ sex : Factor w/ 3 levels "","female","male": 2 3 2 3 2 3 2 3 2 3 ...
## $ survived: Factor w/ 2 levels "Murio","Sobrevive": 2 2 1 1 1 2 2 1 2 1 ...
sum(is.na(Titanic))
## [1] 266
sapply(Titanic, function(x) sum(is.na(x)))
## pclass age sex survived
## 1 264 0 1
Titanic <- na.omit(Titanic)
#install.packages("rpart")
library(rpart)
arbol <- rpart(formula=survived ~ ., data = Titanic)
arbol
## n= 1046
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1046 427 Murio (0.59177820 0.40822180)
## 2) sex=male 658 135 Murio (0.79483283 0.20516717)
## 4) age>=9.5 615 110 Murio (0.82113821 0.17886179) *
## 5) age< 9.5 43 18 Sobrevive (0.41860465 0.58139535)
## 10) pclass=3 29 11 Murio (0.62068966 0.37931034) *
## 11) pclass=1,2 14 0 Sobrevive (0.00000000 1.00000000) *
## 3) sex=female 388 96 Sobrevive (0.24742268 0.75257732)
## 6) pclass=3 152 72 Murio (0.52631579 0.47368421)
## 12) age>=1.5 145 66 Murio (0.54482759 0.45517241) *
## 13) age< 1.5 7 1 Sobrevive (0.14285714 0.85714286) *
## 7) pclass=1,2 236 16 Sobrevive (0.06779661 0.93220339) *
#install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(arbol)
prp(arbol,extra = 7,prefix = "fraccion")
## Conclusiones 1. Las mujeres tienen una tasa de supervivencia mucho
mayor (0,75 en la raíz del segundo árbol). 2. Si un hombre tenía más de
9,5 años y estaba en tercera clase, la probabilidad de morir era muy
alta 3. Aunque estar en tercera clase reducía las posibilidades de
sobrevivir, las mujeres tenían una mayor tasa de supervivencia incluso
en esta categoría