Titanic

1. Importar la base de datos

library(readr)
# Load the passenger data from "titanic.csv" (path is relative to the working
# directory) with readr; the result is stored in `titanic`.
titanic <- read_csv("titanic.csv")
## Rows: 1310 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (7): pclass, survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. Entender la base de datos

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-column overview of the raw data: quartiles, mean and NA counts for the
# numeric columns; length/class/mode for the character columns.
summary(titanic)
##      pclass         survived         name               sex           
##  Min.   :1.000   Min.   :0.000   Length:1310        Length:1310       
##  1st Qu.:2.000   1st Qu.:0.000   Class :character   Class :character  
##  Median :3.000   Median :0.000   Mode  :character   Mode  :character  
##  Mean   :2.295   Mean   :0.382                                        
##  3rd Qu.:3.000   3rd Qu.:1.000                                        
##  Max.   :3.000   Max.   :1.000                                        
##  NA's   :1       NA's   :1                                            
##       age              sibsp            parch          ticket         
##  Min.   : 0.1667   Min.   :0.0000   Min.   :0.000   Length:1310       
##  1st Qu.:21.0000   1st Qu.:0.0000   1st Qu.:0.000   Class :character  
##  Median :28.0000   Median :0.0000   Median :0.000   Mode  :character  
##  Mean   :29.8811   Mean   :0.4989   Mean   :0.385                     
##  3rd Qu.:39.0000   3rd Qu.:1.0000   3rd Qu.:0.000                     
##  Max.   :80.0000   Max.   :8.0000   Max.   :9.000                     
##  NA's   :264       NA's   :1        NA's   :1                         
##       fare            cabin             embarked             boat          
##  Min.   :  0.000   Length:1310        Length:1310        Length:1310       
##  1st Qu.:  7.896   Class :character   Class :character   Class :character  
##  Median : 14.454   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 33.295                                                           
##  3rd Qu.: 31.275                                                           
##  Max.   :512.329                                                           
##  NA's   :2                                                                 
##       body        home.dest        
##  Min.   :  1.0   Length:1310       
##  1st Qu.: 72.0   Class :character  
##  Median :155.0   Mode  :character  
##  Mean   :160.8                     
##  3rd Qu.:256.0                     
##  Max.   :328.0                     
##  NA's   :1189
# Disabled exploration helpers: one frequency table per character column,
# sorted with the most frequent value first (sort=TRUE). Uncomment a line to
# inspect that column, e.g. `name` to look for repeated passenger names.
#count(titanic, name, sort=TRUE)
#count(titanic, sex, sort=TRUE)
#count(titanic, ticket, sort=TRUE)
#count(titanic, cabin, sort=TRUE)
#count(titanic, embarked, sort=TRUE)
#count(titanic, boat, sort=TRUE)
#count(titanic, home.dest, sort=TRUE)

Observaciones: 1. Tenemos valores faltantes (NA) en la base de datos. 2. Un par de nombres están repetidos.

3. Limpiar la base de datos

# Rename the passenger-class column to "class".
# The column is located by its name ("pclass") rather than by position 1, so
# the rename stays correct if the column order of the CSV ever changes, and it
# is a harmless no-op when the chunk is re-run after the rename.
names(titanic)[names(titanic) == "pclass"] <- "class"

# Keep only the variables of interest for the model
Titanic <- titanic[, c("class", "age", "sex", "survived")]

# How many NA values are there in total in the selected columns?
sum(is.na(Titanic))
## [1] 267
# How many NA values does each variable have?
# vapply() fixes the result to exactly one integer per column, whereas the
# return type of sapply() depends on its input.
vapply(Titanic, function(x) sum(is.na(x)), integer(1))
##    class      age      sex survived 
##        1      264        1        1
# Drop every row that contains at least one NA
Titanic <- na.omit(Titanic)

# Store the categorical variables (class, sex, survived) as factors
Titanic[c("class", "sex", "survived")] <- lapply(
  Titanic[c("class", "sex", "survived")],
  as.factor
)

4. Crear el árbol de decisiones

library(rpart)
library(rpart.plot)

# Fit a decision tree that predicts `survived` from all remaining columns of
# Titanic (the `.` on the right-hand side of the formula).
arbol <- rpart(survived ~ ., data = Titanic)

# Draw the fitted tree with rpart.plot's default layout
rpart.plot(arbol)

# Draw it again with prp(); per the rpart.plot documentation, extra = 7 labels
# each node of a class model with the probability of the second class only.
prp(arbol, extra = 7)

LS0tCnRpdGxlOiAiVGl0YW5pYyIKYXV0aG9yOiAiUGF0cmljaW8gU2FuY2hleiIKZGF0ZTogIjIwMjMtMDktMTkiCm91dHB1dDogCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFCiAgICB0aGVtZTogInJlYWRhYmxlIgogICAgaGlnaGxpZ2h0OiAicHlnbWVudHMiCi0tLQoKPGNlbnRlcj4KIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7Ij5UaXRhbmljPC9zcGFuPgo8Y2VudGVyPgoKPGNlbnRlcj4KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+MS4gSW1wb3J0YXIgbGEgYmFzZSBkZSBkYXRvczwvc3Bhbj4KPGNlbnRlcj4KYGBge3J9CmxpYnJhcnkocmVhZHIpCnRpdGFuaWMgPC0gcmVhZF9jc3YoInRpdGFuaWMuY3N2IikKYGBgCjxjZW50ZXI+CiMjIDxzcGFuIHN0eWxlPSJjb2xvcjogYmx1ZTsiPjIuIEVudGVuZGVyIGxhIGJhc2UgZGUgZGF0b3M8L3NwYW4+CjxjZW50ZXI+CmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpzdW1tYXJ5KHRpdGFuaWMpCiNjb3VudCh0aXRhbmljLCBuYW1lLCBzb3J0PVRSVUUpCiNjb3VudCh0aXRhbmljLCBzZXgsIHNvcnQ9VFJVRSkKI2NvdW50KHRpdGFuaWMsIHRpY2tldCwgc29ydD1UUlVFKQojY291bnQodGl0YW5pYywgY2FiaW4sIHNvcnQ9VFJVRSkKI2NvdW50KHRpdGFuaWMsIGVtYmFya2VkLCBzb3J0PVRSVUUpCiNjb3VudCh0aXRhbmljLCBib2F0LCBzb3J0PVRSVUUpCiNjb3VudCh0aXRhbmljLCBob21lLmRlc3QsIHNvcnQ9VFJVRSkKYGBgCk9ic2VydmFjaW9uZXM6CjEuIFRlbmVtb3MgTkEgZW4gbGEgYmFzZSBkZSBkYXRvcy4KMi4gVW4gcGFyIGRlIG5vbWJyZXMgZXN0YW4gcmVwZXRpZG9zLgoKPGNlbnRlcj4KIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+My4gTGltcGlhciBsYSBiYXNlIGRlIGRhdG9zPC9zcGFuPgo8Y2VudGVyPgpgYGB7cn0KIyBDYW1iaWFyIGRlIG5vbWJyZSBsYSB2YXJpYWJsZSBwY2xhc3MKY29sbmFtZXModGl0YW5pYylbMV0gPC0gImNsYXNzIgoKIyBFeHRyYWVyIGxhcyB2YXJpYWJsZXMgZGUgaW50ZXJlcwpUaXRhbmljIDwtIHRpdGFuaWNbLGMoImNsYXNzIiwiYWdlIiwic2V4Iiwic3Vydml2ZWQiKV0KCiNDdWFudG9zIE5BIHRlbmVtb3MgZW4gbGEgYmFzZSBkZSBkYXRvcz8Kc3VtKGlzLm5hKFRpdGFuaWMpKQoKIyBDdWFudG9zIE5BIHRlbmdvIHBvciB2YXJpYWJsZT8Kc2FwcGx5KFRpdGFuaWMsIGZ1bmN0aW9uKHgpIHN1bShpcy5uYSh4KSkpCgojIEVsaW1pbmFyIE5BClRpdGFuaWMgPC0gbmEub21pdChUaXRhbmljKQoKIyBDb252ZXJ0aXIgbGFzIHZhcmlhYmxlcyBjYXRlZ29yaWNhcyBlbiBmYWN0b3JlcwpUaXRhbmljJGNsYXNzIDwtIGFzLmZhY3RvcihUaXRhbmljJGNsYXNzKQpUaXRhbmljJHNleCA8LSBhcy5mYWN0b3IoVGl0YW5pYyRzZXgpClRpdGFuaWMkc3Vydml2ZWQgPC0gYXMuZmFjdG9yKFRpdGFuaWMkc3Vydml2ZWQpCmBgYAoKPGNlbnRlcj4KIyMg
PHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+NC4gQ3JlYXIgZWwgYXJib2wgZGUgZGVjaXNpb25lczwvc3Bhbj4KPGNlbnRlcj4KYGBge3J9CmxpYnJhcnkocnBhcnQpCmxpYnJhcnkocnBhcnQucGxvdCkKCmFyYm9sIDwtIHJwYXJ0KGZvcm11bGE9c3Vydml2ZWR+IC4sIGRhdGE9VGl0YW5pYykKcnBhcnQucGxvdChhcmJvbCkKCnBycChhcmJvbCwgZXh0cmE9NykKYGBgCgo=