Tarea 2: Ejercicios ‘Tu turno’ — PARSONS PROBLEMS

Author

Iván Marcelo Canio Herrera

0.1 Introducción

El presente documento desarrolla los ejercicios correspondientes a la sección “Tu turno” del tutorial de Análisis Exploratorio de Datos (EDA).

Se emplean herramientas del ecosistema tidyverse, incluyendo funciones de dplyr para manipulación de datos y ggplot2 para visualización, junto con patchwork para la composición de figuras.

En primer lugar, cargaremos las librerías.

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.1     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(patchwork)

# Load the `msleep` dataset into the workspace, then print a compact
# column-by-column overview (type and first values of each variable).
data("msleep")
glimpse(msleep)
Rows: 83
Columns: 11
$ name         <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater shor…
$ genus        <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "Bra…
$ vore         <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order        <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "Art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_total  <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1, 3.0, 5…
$ sleep_rem    <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6, 0.8, …
$ sleep_cycle  <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.3833333, N…
$ awake        <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, 21.0, 1…
$ brainwt      <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.07000, 0…
$ bodywt       <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.490, 0.04…

0.2 Ejercicio 1.1

# Append a z-score column (suffix "_z") for each of the four sleep-timing
# variables. Mean and standard deviation ignore missing values, so a row
# that is NA in the raw column stays NA in the standardised one.
msleep_sueno_z <- msleep |>
  mutate(
    across(
      .cols = c(sleep_total, sleep_rem, sleep_cycle, awake),
      .fns = function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE),
      .names = "{.col}_z"
    )
  )

# Preview the first three rows: raw total sleep next to its z-score.
msleep_sueno_z |>
  select(name, sleep_total, sleep_total_z) |>
  head(n = 3)
# A tibble: 3 × 3
  name            sleep_total sleep_total_z
  <chr>                 <dbl>         <dbl>
1 Cheetah                12.1         0.374
2 Owl monkey             17           1.48 
3 Mountain beaver        14.4         0.891

0.3 Ejercicio 1.2 — Clasificación del sueño total

# Bin total sleep into four labels. case_when() returns the first branch
# that matches, so each step only needs to state its upper bound:
#   < 6 h "short", [6, 10) "medium", [10, 16] "long", > 16 h "very_long".
# Anything that matches no branch (a missing value) is labelled NA.
msleep_categoria <- msleep |>
  mutate(
    sleep_category = case_when(
      sleep_total < 6 ~ "short",
      sleep_total < 10 ~ "medium",
      sleep_total <= 16 ~ "long",
      sleep_total > 16 ~ "very_long",
      TRUE ~ NA_character_
    )
  )

# Number of species in each category.
count(msleep_categoria, sleep_category)
# A tibble: 4 × 2
  sleep_category     n
  <chr>          <int>
1 long              37
2 medium            22
3 short             16
4 very_long          8
## La variable `sleep_category` clasifica a los mamíferos según la cantidad total de horas de sueño.

0.4 Ejercicio 1.3

# (a) Reference table: one row per diet code with its Spanish label
vore_lookup <- tribble(
  ~vore,     ~vore_es,
  "carni",   "carnívoro",
  "herbi",   "herbívoro",
  "omni",    "omnívoro",
  "insecti", "insectívoro"
)
vore_lookup
# A tibble: 4 × 2
  vore    vore_es    
  <chr>   <chr>      
1 carni   carnívoro  
2 herbi   herbívoro  
3 omni    omnívoro   
4 insecti insectívoro
# (b) Left join: every row of msleep is kept and gains the Spanish label
msleep_left <- left_join(msleep, vore_lookup, by = "vore")

# Number of rows left without a label in the new column
na_vore_es <- msleep_left |>
  pull(vore_es) |>
  is.na() |>
  sum()
na_vore_es
[1] 7
# Number of rows with a missing diet code in the original data
na_vore_original <- msleep |>
  pull(vore) |>
  is.na() |>
  sum()
na_vore_original
[1] 7
# (c) Inner join: only rows whose diet code appears in the lookup are kept
msleep_inner <- inner_join(msleep, vore_lookup, by = "vore")

nrow(msleep_inner)
[1] 76
## `inner_join()` conserva únicamente las observaciones que presentan coincidencia en ambas tablas, por lo que elimina las filas donde `vore` es NA. Por esta razón, el número total de filas disminuye de 83 a 76 respecto al conjunto de datos original.

0.5 Ejercicio 2.1 — Figura facetada de sueño REM

# REM sleep against total sleep, one facet per diet (independent y axes),
# with a straight-line fit in each facet. Species with an unknown diet are
# dropped before plotting; facet strips show Spanish diet names.
ggplot(
  data = filter(msleep, !is.na(vore)),
  mapping = aes(x = sleep_total, y = sleep_rem)
) +
  geom_point(colour = "#2C3E50", size = 2, alpha = 0.4) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    colour = "#1B3A4B",
    linewidth = 1.2
  ) +
  facet_wrap(
    vars(vore),
    scales = "free_y",
    labeller = as_labeller(
      c(
        carni = "Carnívoros",
        herbi = "Herbívoros",
        omni = "Omnívoros",
        insecti = "Insectívoros"
      )
    )
  ) +
  labs(x = "Sueño total (horas)", y = "Sueño REM (horas)") +
  theme_minimal(base_size = 12) +
  theme(
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold"),
    panel.grid.major = element_line(linewidth = 0.4, color = "grey80"),
    panel.grid.minor = element_line(linewidth = 0.2, color = "grey90")
  )
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).

0.6 Ejercicio 2.2 — Aplicación de tema personalizado

#' Shared plot theme for the figures in this document
#'
#' Starts from `theme_minimal()` and layers on: a bold plot title, a grey
#' subtitle, bold facet-strip and axis titles, a light-grey borderless strip
#' background, explicit major/minor grid lines and a legend at the bottom.
#'
#' @param base_size Base font size handed to `theme_minimal()`; the plot
#'   title is drawn 3 units larger. Defaults to 12.
#' @return A ggplot2 theme object that can be added to a plot with `+`.
theme_clase03 <- function(base_size = 12) {
  bold_text <- element_text(face = "bold")

  overrides <- theme(
    plot.title = element_text(face = "bold", size = base_size + 3),
    plot.subtitle = element_text(color = "grey40"),
    axis.title = bold_text,
    strip.text = bold_text,
    strip.background = element_rect(fill = "grey92", color = NA),
    panel.grid.major = element_line(linewidth = 0.4, color = "grey80"),
    panel.grid.minor = element_line(linewidth = 0.2, color = "grey90"),
    legend.position = "bottom"
  )

  theme_minimal(base_size = base_size) + overrides
}
# Same REM-vs-total faceted figure, now coloured by diet and styled with the
# custom theme; the trailing theme() call moves the legend to the top,
# overriding the theme's bottom default.
ggplot(
  data = filter(msleep, !is.na(vore)),
  mapping = aes(x = sleep_total, y = sleep_rem, colour = vore)
) +
  geom_point(size = 2, alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  facet_wrap(vars(vore), scales = "free_y") +
  scale_colour_brewer(palette = "Set2") +
  labs(
    x = "Sueño total (horas)",
    y = "Sueño REM (horas)",
    colour = "Dieta"
  ) +
  theme_clase03() +
  theme(legend.position = "top")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_point()`).

0.7 Ejercicio 2.3 - Figura compuesta 3 paneles

# Panel A: density of total sleep, one translucent curve per diet
# (species with an unknown diet are left out).
p1 <- ggplot(msleep %>% filter(!is.na(vore)),
             aes(x = sleep_total, fill = vore)) +
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(x = "Sueño total (horas)", y = "Densidad", fill = "Dieta") +
  theme_clase03()

# Panel B: brain mass against body mass on log10 axes with a linear fit.
p2 <- ggplot(msleep, aes(x = bodywt, y = brainwt)) +
  geom_point(alpha = 0.5, size = 2) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  labs(
    x = "Masa corporal (kg, escala log)",
    y = "Masa cerebral (kg, escala log)"
  ) +
  theme_clase03()

# Panel C: hours awake by conservation status (rows without a status are
# left out).
# The fill colour only repeats the x axis, so its legend is switched off at
# the layer with show.legend = FALSE. A theme-level legend.position = "none"
# is not enough here: the composition below applies
# `& theme(legend.position = "bottom")` to every panel, which would override
# it and let this redundant legend be collected next to the diet legend.
p3 <- ggplot(msleep %>% filter(!is.na(conservation)),
             aes(x = conservation, y = awake, fill = conservation)) +
  geom_boxplot(linewidth = 0.7, show.legend = FALSE) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    x = "Estado de conservación",
    y = "Horas despierto"
  ) +
  theme_clase03() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Final figure: A and B side by side on top, C underneath. Legends are
# collected into one shared guide area, panels are tagged A, B, C, and the
# closing `&` pushes the bottom legend position onto every panel.
((p1 | p2) / p3) +
  plot_layout(guides = "collect") +
  plot_annotation(
    tag_levels = "A",
    title = "Análisis exploratorio de patrones de sueño"
  ) &
  theme(legend.position = "bottom")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 27 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 27 rows containing missing values or values outside the scale range
(`geom_point()`).

0.8 Ejercicio 3

# Basic inspection of the data: number of rows and number of columns
c(nrow(msleep), ncol(msleep))
[1] 83 11
# Compact overview: type and first values of every column
glimpse(msleep)
Rows: 83
Columns: 11
$ name         <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater shor…
$ genus        <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "Bra…
$ vore         <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order        <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "Art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_total  <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1, 3.0, 5…
$ sleep_rem    <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6, 0.8, …
$ sleep_cycle  <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.3833333, N…
$ awake        <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, 21.0, 1…
$ brainwt      <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.07000, 0…
$ bodywt       <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.490, 0.04…
# Missingness in sleep_rem: absolute count and share of all rows (percent,
# rounded to one decimal)
summarise(
  msleep,
  n_na = sum(is.na(sleep_rem)),
  porcentaje = round(100 * n_na / n(), 1)
)
# A tibble: 1 × 2
   n_na porcentaje
  <int>      <dbl>
1    22       26.5
# Univariate view of REM sleep: overall histogram (left) next to boxplots
# split by diet (right; species with an unknown diet are left out).
p_hist <- ggplot(data = msleep, mapping = aes(x = sleep_rem)) +
  geom_histogram(
    bins = 15,
    fill = "#1C7293",
    colour = "black",
    linewidth = 0.2
  ) +
  theme_clase03() +
  labs(x = "Sueño REM (horas)", y = "Frecuencia")

p_box <- ggplot(
  data = filter(msleep, !is.na(vore)),
  mapping = aes(x = vore, y = sleep_rem, fill = vore)
) +
  geom_boxplot(linewidth = 0.7) +
  scale_fill_brewer(palette = "Set2") +
  theme_clase03() +
  theme(legend.position = "none") +
  labs(x = "Dieta", y = "Sueño REM (horas)")

p_hist | p_box
Warning: Removed 22 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 20 rows containing non-finite outside the scale range
(`stat_boxplot()`).

# REM sleep against total sleep across all species, with a linear fit
ggplot(data = msleep, mapping = aes(x = sleep_total, y = sleep_rem)) +
  geom_point(size = 2, alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  theme_clase03() +
  labs(x = "Sueño total (horas)", y = "Sueño REM (horas)")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 22 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 22 rows containing missing values or values outside the scale range
(`geom_point()`).

# El análisis muestra que el sueño REM presenta una relación positiva con el sueño total, aunque con variabilidad entre observaciones. La distribución evidencia dispersión y posibles diferencias entre grupos según dieta, lo que podría sugerir la influencia de factores biológicos adicionales.

1 Parsons problems

1.1 Problema 1 — Media de sueño por dieta

# Mean total sleep per diet group (missing diet forms its own group),
# ordered from the longest sleepers to the shortest.
msleep |>
  summarise(
    mean_sleep = mean(sleep_total, na.rm = TRUE),
    .by = vore
  ) |>
  arrange(desc(mean_sleep))
# A tibble: 5 × 2
  vore    mean_sleep
  <chr>        <dbl>
1 insecti      14.9 
2 omni         10.9 
3 carni        10.4 
4 <NA>         10.2 
5 herbi         9.51

1.2 Problema 2 — Filtrar y clasificar

# Keep species sleeping more than 14 h and flag those above 18 h as
# "extremo"; every other remaining row is "alto".
msleep |>
  filter(sleep_total > 14) |>
  mutate(
    categoria = case_when(
      sleep_total > 18 ~ "extremo",
      .default = "alto"
    )
  ) |>
  select(name, vore, sleep_total, categoria)
# A tibble: 20 × 4
   name                           vore    sleep_total categoria
   <chr>                          <chr>         <dbl> <chr>    
 1 Owl monkey                     omni           17   alto     
 2 Mountain beaver                herbi          14.4 alto     
 3 Greater short-tailed shrew     omni           14.9 alto     
 4 Three-toed sloth               herbi          14.4 alto     
 5 Long-nosed armadillo           carni          17.4 alto     
 6 North American Opossum         omni           18   alto     
 7 Big brown bat                  insecti        19.7 extremo  
 8 Western american chipmunk      herbi          14.9 alto     
 9 Thick-tailed opposum           carni          19.4 extremo  
10 Mongolian gerbil               herbi          14.2 alto     
11 Golden hamster                 herbi          14.3 alto     
12 Little brown bat               insecti        19.9 extremo  
13 Round-tailed muskrat           herbi          14.6 alto     
14 Northern grasshopper mouse     carni          14.5 alto     
15 Tiger                          carni          15.8 alto     
16 Giant armadillo                insecti        18.1 extremo  
17 Arctic ground squirrel         herbi          16.6 alto     
18 Golden-mantled ground squirrel herbi          15.9 alto     
19 Eastern american chipmunk      herbi          15.8 alto     
20 Tenrec                         omni           15.6 alto     

1.3 Problema 3 — Scatter con log y smooth

# Total sleep against body mass (log10 x axis): points coloured by diet,
# with a single linear fit across all species.
ggplot(data = msleep, mapping = aes(x = bodywt, y = sleep_total)) +
  geom_point(mapping = aes(colour = vore), size = 2, alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.1) +
  scale_x_log10() +
  labs(
    x = "Masa corporal (kg, escala log)",
    y = "Sueño total (horas)",
    colour = "Dieta"
  ) +
  theme_clase03()
`geom_smooth()` using formula = 'y ~ x'

1.4 Problema 4 — Resumen con across()

# Per-diet means of total sleep, REM sleep and brain mass, ignoring missing
# values within each column; species with an unknown diet are dropped first.
msleep |>
  drop_na(vore) |>
  group_by(vore) |>
  summarise(
    across(
      .cols = c(sleep_total, sleep_rem, brainwt),
      .fns = \(x) mean(x, na.rm = TRUE)
    )
  )
# A tibble: 4 × 4
  vore    sleep_total sleep_rem brainwt
  <chr>         <dbl>     <dbl>   <dbl>
1 carni         10.4       2.29  0.0793
2 herbi          9.51      1.37  0.622 
3 insecti       14.9       3.52  0.0216
4 omni          10.9       1.96  0.146 

1.5 Problema 5 — Figura facetada

# Total sleep against body mass (log10 x axis), one facet per diet with
# independent y axes.
ggplot(data = msleep, mapping = aes(x = bodywt, y = sleep_total)) +
  geom_point(size = 2, alpha = 0.6) +
  facet_wrap(vars(vore), scales = "free_y") +
  scale_x_log10() +
  labs(
    x = "Masa corporal (kg, escala log)",
    y = "Sueño total (horas)"
  ) +
  theme_minimal(base_size = 12)

1.6 Problema 6 — Left join

# Diet-code lookup (code -> Spanish label), rebuilt here so this exercise
# runs on its own.
vore_lookup <- tribble(
  ~vore,     ~vore_es,
  "carni",   "carnívoro",
  "herbi",   "herbívoro",
  "omni",    "omnívoro",
  "insecti", "insectívoro"
)

# Attach the label to every species and show the relevant columns.
left_join(msleep, vore_lookup, by = "vore") |>
  select(name, vore, vore_es, sleep_total)
# A tibble: 83 × 4
   name                       vore  vore_es   sleep_total
   <chr>                      <chr> <chr>           <dbl>
 1 Cheetah                    carni carnívoro        12.1
 2 Owl monkey                 omni  omnívoro         17  
 3 Mountain beaver            herbi herbívoro        14.4
 4 Greater short-tailed shrew omni  omnívoro         14.9
 5 Cow                        herbi herbívoro         4  
 6 Three-toed sloth           herbi herbívoro        14.4
 7 Northern fur seal          carni carnívoro         8.7
 8 Vesper mouse               <NA>  <NA>              7  
 9 Dog                        carni carnívoro        10.1
10 Roe deer                   herbi herbívoro         3  
# ℹ 73 more rows

1.7 Problema 7 — Patchwork

# Panel A: histogram of total sleep
p1 <- ggplot(data = msleep, mapping = aes(x = sleep_total)) +
  geom_histogram(bins = 15, colour = "black", linewidth = 0.2) +
  labs(x = "Sueño total (horas)", y = "Frecuencia") +
  theme_clase03()

# Panel B: total sleep against body mass on a log10 x axis
p2 <- ggplot(data = msleep, mapping = aes(x = bodywt, y = sleep_total)) +
  geom_point(size = 2, alpha = 0.6) +
  scale_x_log10() +
  labs(x = "Masa corporal (kg, escala log)", y = "Sueño total (horas)") +
  theme_clase03()

# Panel C: total sleep by diet
p3 <- ggplot(data = msleep, mapping = aes(x = vore, y = sleep_total)) +
  geom_boxplot(linewidth = 0.7) +
  labs(x = "Dieta", y = "Sueño total (horas)") +
  theme_clase03()

# A and B side by side on top, C underneath; panels tagged A, B, C
((p1 | p2) / p3) +
  plot_annotation(tag_levels = "A")

1.8 Problema 8 — Pipeline de missingness

# Missing-value count for every column, reshaped to one row per variable
# and sorted from the most to the least incomplete.
msleep |>
  summarise(across(.cols = everything(), .fns = \(x) sum(is.na(x)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "var",
    values_to = "n_na"
  ) |>
  arrange(desc(n_na))
# A tibble: 11 × 2
   var           n_na
   <chr>        <int>
 1 sleep_cycle     51
 2 conservation    29
 3 brainwt         27
 4 sleep_rem       22
 5 vore             7
 6 name             0
 7 genus            0
 8 order            0
 9 sleep_total      0
10 awake            0
11 bodywt           0