# librerias a cargar
library(mlbench)  # dataset destinados a pruebas de machine learning
library(tidyverse)# 
library(corrplot) # 
library(plotly)   #
library(caret)    # creación de entrenamiento de clasificación y regresión
library(caTools)  # 
library(reshape2) #

Clasificación

library(titanic)
datos <- titanic_train

Analisis exploratorio

En el dataset de Titanic las variables disponibles son:

PassengerId: identificador único del pasajero.
Survived: si el pasajero sobrevivió al naufragio, codificada como 0 (no) y 1 (si). Esta es la variable respuesta que interesa predecir.
Pclass: clase a la que pertenecía el pasajero: 1, 2 o 3.
Name: nombre del pasajero.
Sex: sexo del pasajero.
Age: edad del pasajero.
SibSp: número de hermanos, hermanas, hermanastros o hermanastras en el barco.
Parch: número de padres e hijos en el barco.
Ticket: identificador del billete.
Fare: precio pagado por el billete.
Cabin: identificador del camarote asignado al pasajero.
Embarked: puerto en el que embarcó el pasajero.

# Resumen del set de datos
glimpse(datos)

## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, …
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (F…
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "ma…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14,…
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, …
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, …
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "3…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625…
## $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "…
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S…

Cambio de tipo de variable en survived

datos$Survived <- if_else(datos$Survived == 1, "Si", "No")
datos$Survived <- as.factor(datos$Survived)

La variable Pclass es cualitativa ordinal. Se cambiara como factor.

datos$Pclass <- as.factor(datos$Pclass)

Las variables SibSp y Parch, Sex y Embarked son cuantitativas discretas, pueden tomar únicamente determinados valores numéricos.

datos$SibSp <- as.factor(datos$SibSp)
datos$Parch <- as.factor(datos$Parch)
datos$Sex      <- as.factor(datos$Sex)
datos$Embarked <- as.factor(datos$Embarked)

head(datos)

##   PassengerId Survived Pclass
## 1           1       No      3
## 2           2       Si      1
## 3           3       Si      3
## 4           4       Si      1
## 5           5       No      3
## 6           6       No      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

Valores ausentes

# numero de observaciones
nrow(datos)

## [1] 891

Hay filas incompletas?

# existe filas incompletas
any(!complete.cases(datos))

## [1] TRUE

# que variables presentan valores nulos o ausentes
map_dbl(datos, .f = function(x){sum(is.na(x))})

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

# que variables contienen ""
datos %>% map_lgl(.f = function(x){any(!is.na(x) & x == "")})

## PassengerId    Survived      Pclass        Name         Sex         Age 
##       FALSE       FALSE       FALSE       FALSE       FALSE       FALSE 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##       FALSE       FALSE       FALSE       FALSE        TRUE        TRUE

# Las variables Cabin y Embarked contienen al menos un valor "", se sustituyen por NA.
# La variable Cabin está almacenada como character
datos$Cabin[datos$Cabin == ""] <- NA

Para el caso de factors deben cambiar el tipo.

datos$Embarked <- as.character(datos$Embarked)
datos$Embarked[datos$Embarked == ""] <- NA
datos$Embarked <- as.factor(datos$Embarked)
levels(datos$Embarked)

## [1] "C" "Q" "S"

# Número de datos ausentes por variable
map_dbl(datos, .f = function(x){sum(is.na(x))})

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

Distribucion de respuesta

ggplot(data = datos, aes(x = Survived, y = ..count.., fill = Survived)) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = "Supervivencia") +
  theme_bw() +
  theme(legend.position = "bottom")

# frecuencias
table(datos$Survived)

## 
##  No  Si 
## 549 342

# en forma de proporción
prop.table(table(datos$Survived)) %>% round(digits = 2)

## 
##   No   Si 
## 0.62 0.38

# Porcentaje de aciertos para predicción "no sobrevivieron".
n_observaciones <- nrow(datos)
predicciones <- rep(x = "No",  n_observaciones)
mean(predicciones == datos$Survived) * 100

## [1] 61.61616

Distribución de variables continuas

Como el objetivo del estudio es predecir qué pasajeros sobrevivieron y cuáles no, el análisis de cada variable se hace en relación a la variable respuesta Survived. Analizando los datos de esta forma, se pueden empezar a extraer ideas sobre qué variables están más relacionadas con la supervivencia.

Edad

library(ggpubr)
p1 <- ggplot(data = datos, aes(x = Age, fill = Survived)) +
      geom_density(alpha = 0.5) +
      scale_fill_manual(values = c("gray50", "orangered2")) +
      geom_rug(aes(color = Survived), alpha = 0.5) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
p2 <- ggplot(data = datos, aes(x = Survived, y = Age, color = Survived)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(alpha = 0.3, width = 0.15) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
final_plot <- ggarrange(p1, p2, legend = "top")
final_plot <- annotate_figure(final_plot, top = text_grob("Age", size = 15))
final_plot

# Estadísticos de la edad de los supervivientes y fallecidos
datos %>% filter(!is.na(Age)) %>% group_by(Survived) %>%
          summarise(media = mean(Age),
                    mediana = median(Age),
                    min = min(Age),
                    max = max(Age))

## # A tibble: 2 x 5
##   Survived media mediana   min   max
##   <fct>    <dbl>   <dbl> <dbl> <dbl>
## 1 No        30.6      28  1       74
## 2 Si        28.3      28  0.42    80

Fare (costo del billete)

p1 <- ggplot(data = datos, aes(x = Fare, fill = Survived)) +
      geom_density(alpha = 0.5) +
      scale_fill_manual(values = c("gray50", "orangered2")) +
      geom_rug(aes(color = Survived), alpha = 0.5) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
p2 <- ggplot(data = datos, aes(x = Survived, y = Fare, color = Survived)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(alpha = 0.3, width = 0.15) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
final_plot <- ggarrange(p1, p2, legend = "top")
final_plot <- annotate_figure(final_plot, top = text_grob("Fare", size = 15))
final_plot

# Estadísticos del precio del billete de los supervivientes y fallecidos
datos %>% filter(!is.na(Fare)) %>% group_by(Survived) %>%
          summarise(media = mean(Fare),
                    mediana = median(Fare),
                    min = min(Fare),
                    max = max(Fare))

## # A tibble: 2 x 5
##   Survived media mediana   min   max
##   <fct>    <dbl>   <dbl> <dbl> <dbl>
## 1 No        22.1    10.5     0  263 
## 2 Si        48.4    26       0  512.

p1 <- ggplot(data = datos, aes(x = log(Fare), fill = Survived)) +
      geom_density(alpha = 0.5) +
      scale_fill_manual(values = c("gray50", "orangered2")) +
      geom_rug(aes(color = Survived), alpha = 0.5) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
p2 <- ggplot(data = datos, aes(x = Survived, y = log(Fare), color = Survived)) +
      geom_boxplot(outlier.shape = NA) +
      geom_jitter(alpha = 0.3, width = 0.15) +
      scale_color_manual(values = c("gray50", "orangered2")) +
      theme_bw()
final_plot <- ggarrange(p1, p2, legend = "top")
final_plot <- annotate_figure(final_plot, top = text_grob("Log(Fare)", size =15))
final_plot

Los datos indican que el precio medio de los billetes de las personas que sobrevivieron era superior al de los que fallecieron.

Variables cualitativas

Clase

ggplot(data = datos, aes(x = Pclass, y = ..count.., fill = Survived)) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = "Pclass") +
  theme_bw() +
  theme(legend.position = "bottom")

# Tabla de frecuencias relativas de supervivientes por clase
prop.table(table(datos$Pclass, datos$Survived), margin = 1) %>% round(digits = 2)

##    
##       No   Si
##   1 0.37 0.63
##   2 0.53 0.47
##   3 0.76 0.24

Sex

ggplot(data = datos, aes(x = Sex, y = ..count.., fill = Survived)) +
  geom_bar() +
  labs(title = "Sex") +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  theme_bw() +
  theme(legend.position = "bottom")

# Tabla de frecuencias relativas de supervivientes por sexo
prop.table(table(datos$Sex, datos$Survived), margin = 1) %>% round(digits = 2)

##         
##            No   Si
##   female 0.26 0.74
##   male   0.81 0.19

SibSp (hermanos)

ggplot(data = datos, aes(x = SibSp, y = ..count.., fill = Survived)) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = "SibSp") +
  theme_bw() +
  theme(legend.position = "bottom")

# Tabla de frecuencias relativas de supervivientes por número de familiares
prop.table(table(datos$SibSp, datos$Survived), margin = 1) %>% round(digits = 2)

##    
##       No   Si
##   0 0.65 0.35
##   1 0.46 0.54
##   2 0.54 0.46
##   3 0.75 0.25
##   4 0.83 0.17
##   5 1.00 0.00
##   8 1.00 0.00

Parch (numero de padres e hijos)

ggplot(data = datos, aes(x = Parch, y = ..count.., fill = Survived)) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = "Parch") +
  theme_bw() +
  theme(legend.position = "bottom")

# Tabla de frecuencias relativas de supervivientes por Parch
prop.table(table(datos$Parch, datos$Survived), margin = 1) %>% round(digits = 2)

##    
##       No   Si
##   0 0.66 0.34
##   1 0.45 0.55
##   2 0.50 0.50
##   3 0.40 0.60
##   4 1.00 0.00
##   5 0.80 0.20
##   6 1.00 0.00

Edad como grupo

datos <- datos %>%
         mutate(Age_grupo = case_when(Age <= 10  ~ "niño",
                                      Age > 10 & Age <= 60  ~ "adulto",
                                      Age > 60 ~ "anciano"))
datos$Age_grupo <- as.factor(datos$Age_grupo)

ggplot(data = datos, aes(x = Age_grupo, y = ..count.., fill = Survived)) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = "Age_grupo") +
  theme_bw() +
  theme(legend.position = "bottom")

# Tabla de frecuencias relativas de supervivientes por grupo de edad
prop.table(table(datos$Age_grupo, datos$Survived), margin = 1) %>% round(digits = 2)

##          
##             No   Si
##   adulto  0.61 0.39
##   anciano 0.77 0.23
##   niño    0.41 0.59

Titanic

Patricio Araneda

2021-04-20