Lectura de datos

library(tidyverse)
# Load the raw AlimenTro CSV into a tibble, then preview its first rows and
# its column types.
datos <- read_csv(
  file = "Alimentos_del_tr_pico_para_alimentaci_n_animal_-_AlimenTro.csv"
)
head(datos)
glimpse(datos)
## Rows: 41,309
## Columns: 25
## $ ID                              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,~
## $ GrupoMuestra                    <chr> "09 Flores, frutas y subproductos", "0~
## $ Ingrediente                     <chr> "Cacao mucilago -", "Cacao cascara -",~
## $ Departamento                    <chr> "Huila", "Huila", "Huila", "Huila", "H~
## $ Municipio                       <chr> "Garzón", "Garzón", "Garzón", "Garzón"~
## $ EdadCorte                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ EpocaRecoleccion                <chr> "Lluvia", "Lluvia", "Lluvia", "Lluvia"~
## $ FechaRecoleccion                <chr> "07/05/2012 12:00:00 AM", "08/05/2012 ~
## $ ProteinaCruda                   <dbl> 5.62, 6.34, 13.43, 13.81, 15.03, 15.27~
## $ PorcentajeCeniza                <dbl> 1.53, 7.28, 3.54, 4.49, 4.11, 4.71, 9.~
## $ ExtractoEtereo                  <chr> "0.8500", "0.7200", "5.1100", "5.1500"~
## $ FDN                             <dbl> 16.53, 60.43, 62.03, 56.92, 55.78, 62.~
## $ FDA                             <dbl> 8.53, 47.17, 12.08, 11.65, 12.22, 12.4~
## $ Hemicelulosa                    <dbl> 8.00, 13.26, 49.95, 45.27, 43.56, 49.5~
## $ Lignina                         <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ PorcentajeAlmidonTotal          <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ PorcentajeCarbohidratosSolubles <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ PorcentajeCarbohNoEstructurales <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ NDT                             <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ DigestibilidadMS                <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ EDRumiantes                     <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ EMRumiantes                     <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ ENmRumiantes                    <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ ENgRumiantes                    <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~
## $ ENLRumiantes                    <chr> "NULL", "NULL", "NULL", "NULL", "NULL"~

Depuración y transformación de datos

library(janitor)
library(lubridate)
# Build the cleaned table: snake_case column names, numeric measurement
# columns, and a parsed collection date with derived month and year columns.
df_alimentos <- datos %>%
  clean_names() %>%
  # Columns that were read as text are converted to numeric; strings that are
  # not numbers (e.g. "NULL") become NA.
  mutate(across(c(extracto_etereo, lignina:enl_rumiantes), as.numeric)) %>%
  # The format string describes only the date part of values such as
  # "07/05/2012 12:00:00 AM".
  mutate(fecha = as_date(fecha_recoleccion, format = "%m/%d/%Y")) %>%
  mutate(mes = month(fecha, label = TRUE, abbr = FALSE),
         year = year(fecha)) %>%
  # The row id and the raw timestamp text are no longer needed.
  select(-id, -fecha_recoleccion) %>%
  # relocate() moves the listed columns to the front by default.
  relocate(fecha, year, mes)

head(df_alimentos)

Exportando datos depurados

# Persist the cleaned table as an xz-compressed RDS file.
write_rds(x = df_alimentos,
          file = "alimentro_depurada.rds",
          compress = "xz")

Métricas descriptivas

Descriptivo general

library(skimr)
# Print skimr's per-column summary of the cleaned table. Called directly
# (not through a pipe) so the summary header reports the object name.
skim(df_alimentos)
Data summary
Name df_alimentos
Number of rows 41309
Number of columns 26
_______________________
Column type frequency:
character 5
Date 1
factor 1
numeric 19
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
grupo_muestra 0 1 17 47 0 9 0
ingrediente 0 1 11 87 0 733 0
departamento 0 1 4 18 0 28 0
municipio 0 1 3 27 0 385 0
epoca_recoleccion 0 1 6 26 0 4 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
fecha 0 1 2012-07-05 2020-10-28 2019-02-06 1503

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
mes 0 1 TRUE 12 oct: 5068, jul: 4243, sep: 3784, ene: 3776

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2018.15 1.76 2012.00 2017.00 2019.00 2020.00 2020.00 ▁▁▁▅▇
edad_corte 0 1.00 44.72 33.51 0.00 28.00 35.00 49.00 392.00 ▇▁▁▁▁
proteina_cruda 0 1.00 13.11 5.82 0.66 8.70 11.73 16.67 62.59 ▇▅▁▁▁
porcentaje_ceniza 0 1.00 10.46 2.73 0.19 8.86 10.53 12.08 31.04 ▁▇▂▁▁
extracto_etereo 333 0.99 2.08 0.73 0.03 1.70 1.99 2.33 43.46 ▇▁▁▁▁
fdn 0 1.00 56.83 11.38 0.21 50.83 60.13 65.07 82.57 ▁▁▂▇▂
fda 0 1.00 31.36 7.34 0.03 27.24 32.84 36.26 51.82 ▁▁▅▇▁
hemicelulosa 3 1.00 25.47 5.59 0.14 22.75 26.84 29.08 54.81 ▁▂▇▁▁
lignina 598 0.99 6.73 1.88 0.05 5.49 6.97 8.09 13.81 ▁▃▇▃▁
porcentaje_almidon_total 327 0.99 7.85 7.39 0.02 5.19 6.68 8.25 72.75 ▇▁▁▁▁
porcentaje_carbohidratos_solubles 313 0.99 2.91 2.25 -0.22 1.62 2.35 3.51 33.64 ▇▁▁▁▁
porcentaje_carboh_no_estructurales 312 0.99 10.75 7.65 0.36 7.33 9.35 11.97 74.33 ▇▁▁▁▁
ndt 265 0.99 56.13 7.31 0.00 50.65 54.53 60.30 93.50 ▁▁▇▆▁
digestibilidad_ms 265 0.99 61.46 7.88 0.00 55.55 59.74 65.97 101.65 ▁▁▇▆▁
ed_rumiantes 265 0.99 2.53 0.36 0.00 2.26 2.45 2.74 6.81 ▁▇▃▁▁
em_rumiantes 265 0.99 2.03 0.32 0.00 1.79 1.96 2.21 3.68 ▁▁▇▂▁
e_nm_rumiantes 266 0.99 1.17 0.30 0.00 0.95 1.11 1.35 2.58 ▁▆▇▂▁
e_ng_rumiantes 265 0.99 0.61 0.27 0.00 0.40 0.56 0.77 1.83 ▃▇▃▁▁
enl_rumiantes 265 0.99 1.26 0.18 0.91 1.12 1.22 1.36 2.17 ▆▇▂▁▁

Ejercicio

  • Obtener métricas descriptivas para cada variable numérica de interés (media, desviación estándar y mediana) por año.
    • mean()
    • median()
    • sd()
    • No olvidar agregar el argumento na.rm = TRUE en cada función
library(DT)
# Per-year mean, median and standard deviation of every numeric measurement,
# rounded to two decimals and rendered as an interactive DT table.
df_alimentos %>%
  select(year, edad_corte, proteina_cruda:enl_rumiantes) %>%
  # Long format: one row per (year, variable, value) so a single grouped
  # summarise covers all measurements.
  pivot_longer(cols = -year, names_to = "variable", values_to = "valor") %>%
  group_by(year, variable) %>%
  summarise(media = mean(valor, na.rm = TRUE),
            mediana = median(valor, na.rm = TRUE),
            desviacion = sd(valor, na.rm = TRUE),
            # Drop the grouping explicitly instead of leaving the result
            # grouped by year (also silences the regrouping message).
            .groups = "drop") %>%
  # Explicit columns and a lambda replace the deprecated bare-predicate /
  # extra-argument form `across(is.numeric, round, digits = 2)`.
  mutate(across(c(media, mediana, desviacion), ~ round(.x, digits = 2))) %>%
  datatable(rownames = FALSE)

Ejemplos de datos desordenados

Homicidios Quindío

# Read the wide homicide-rate table and reshape it so that every column other
# than the municipality becomes a (year, homicidio) row.
df_homicidios <- read_csv(
  file = "Tasas_de_homicidios_seg_n_municipios_por_cien_mil_habitantes._A_os_1990_-_2017.csv"
)

df_homicidios2 <- df_homicidios %>%
  rename(municipio = Municipio) %>%
  pivot_longer(cols = -municipio,
               names_to = "year",
               values_to = "homicidio",
               # Former column headers are text; store the year as a number.
               names_transform = list(year = as.numeric))

head(df_homicidios2)

Visualizaciones

Cantidades

Gráfico estático

# Set theme_minimal() as the default ggplot2 theme for all plots that follow
theme_set(theme_minimal())
library(ggthemes)

# Bar chart of the mean homicide rate per year (mean taken over all rows of
# that year), with an x-axis tick every two years.
df_homicidios2 %>%
  group_by(year) %>%
  summarise(promedio = mean(homicidio)) %>%
  ggplot(aes(x = year, y = promedio)) +
  geom_col() +
  scale_x_continuous(breaks = seq(from = 1990, to = 2017, by = 2)) +
  xlab("Año") +
  ylab("Homicidios") +
  ggtitle(label = "Tasa de homicidios departamento del Quindío",
          subtitle = "Por cada 100 mil habitantes")

Gráfico interactivo

library(plotly)
# Interactive (plotly) version of the yearly mean homicide-rate bar chart,
# rendered 900 px wide.
ggplotly(
  p = df_homicidios2 %>%
    group_by(year) %>%
    summarise(promedio = mean(homicidio)) %>%
    ggplot(aes(x = year, y = promedio)) +
    geom_col() +
    scale_x_continuous(breaks = seq(from = 1990, to = 2017, by = 2)) +
    xlab("Año") +
    ylab("Homicidios") +
    ggtitle(label = "Tasa de homicidios departamento del Quindío"),
  width = 900
)

Distribuciones

  • Histogramas y densidades
  • Boxplot
  • Gráfico cuantil cuantil

Densidades

Gráfico estático

# Density of the homicide rate, one panel per municipality; each panel gets
# its own axis ranges (scales = "free").
df_homicidios2 %>%
  # Optional filter, currently disabled:
  #filter(municipio %in% c("Calarca", "Circasia", "Salento")) %>%
  ggplot(aes(x = homicidio)) +
  geom_density(alpha = 0.5, colour = "red", fill = "dodgerblue") +
  facet_wrap(~municipio, scales = "free")

Gráfico interactivo

# Interactive (plotly) version of the per-municipality density panels,
# rendered at 900 x 500 px.
ggplotly(
  p = df_homicidios2 %>%
    ggplot(aes(x = homicidio)) +
    geom_density(alpha = 0.5, colour = "red", fill = "dodgerblue") +
    facet_wrap(~municipio, scales = "free"),
  width = 900,
  height = 500
)

Boxplot

Gráfico estático

# Boxplot of the homicide rate per municipality, with municipalities ordered
# by their median rate; the red point marks each municipality's mean.
df_homicidios2 %>%
  ggplot(aes(x = fct_reorder(municipio, homicidio, median), y = homicidio)) +
  geom_boxplot() +
  stat_summary(geom = "point", fun = mean, colour = "red")

Gráfico interactivo

# Interactive (plotly) version of the per-municipality boxplot; the hover
# tooltip shows only the y aesthetic.
ggplotly(
  p = df_homicidios2 %>%
    ggplot(aes(x = fct_reorder(municipio, homicidio, median), y = homicidio)) +
    geom_boxplot() +
    stat_summary(geom = "point", fun = mean, colour = "red"),
  tooltip = "y",
  width = 900
)

Cuantil-Cuantil

Gráfico estático

# Quantile-quantile plot of the homicide rate with its reference line, one
# panel per municipality and free axis ranges in each panel.
df_homicidios2 %>%
  ggplot(aes(sample = homicidio)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~municipio, scales = "free")

Relaciones X-Y

Gráfico estático

# Homicide rate over time: one coloured line per municipality plus a single
# red smoothed trend over all rows (no confidence band).
df_homicidios2 %>%
  ggplot(aes(x = year, y = homicidio)) +
  geom_line(aes(colour = municipio)) +
  geom_smooth(colour = "red", se = FALSE)

Gráfico interactivo

# Interactive (plotly) version of the per-municipality time series with the
# red smoothed trend, rendered 900 px wide.
ggplotly(
  p = df_homicidios2 %>%
    ggplot(aes(x = year, y = homicidio)) +
    geom_line(aes(colour = municipio)) +
    geom_smooth(colour = "red", se = FALSE),
  width = 900
)

Gráfico estático 2

# Homicide rate over time in a separate panel per municipality (free axis
# ranges), each with its own smoothed trend and no confidence band.
df_homicidios2 %>%
  ggplot(aes(x = year, y = homicidio)) +
  geom_line() +
  geom_smooth(se = FALSE) +
  facet_wrap(~municipio, scales = "free")

Incertidumbre

# Yearly mean homicide rate drawn as a line over two shaded bands: the
# 5th-95th percentile range and the narrower 20th-80th percentile range.
df_homicidios2 %>%
  group_by(year) %>%
  summarise(promedio = mean(homicidio),
            p5 = quantile(homicidio, probs = 0.05),
            p95 = quantile(homicidio, probs = 0.95),
            p20 = quantile(homicidio, probs = 0.20),
            p80 = quantile(homicidio, probs = 0.80)) %>%
  ggplot(aes(x = year, y = promedio)) +
  # Wider band first so the narrower one is drawn on top of it.
  geom_ribbon(aes(ymin = p5, ymax = p95), alpha = 0.5, fill = "firebrick2") +
  geom_ribbon(aes(ymin = p20, ymax = p80), alpha = 0.6, fill = "firebrick2") +
  geom_line()