Data “Wrangling”

Los siguientes ejemplos muestran cómo manipular datos en R. Se utilizan los paquetes dplyr y tidyr, que son parte del “tidyverse”.

Importación de datos

Vamos a utilizar un archivo con datos de PM2.5 y AQI.

# leer archivo csv
library(readr)
datos_dw1 <- read_csv("ad_viz_plotval_data.csv")
## Rows: 615 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Date, Source, Units, Local Site Name, AQS Parameter Description, M...
## dbl (11): Site ID, POC, Daily Mean PM10 Concentration, Daily AQI Value, Dail...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examinar estructura de los datos y tipos de variables

# ver estructura de los datos
str(datos_dw1)
## spc_tbl_ [615 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date                         : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
##  $ Source                       : chr [1:615] "AQS" "AQS" "AQS" "AQS" ...
##  $ Site ID                      : num [1:615] 7.2e+08 7.2e+08 7.2e+08 7.2e+08 7.2e+08 ...
##  $ POC                          : num [1:615] 3 3 3 3 3 3 3 3 3 3 ...
##  $ Daily Mean PM10 Concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
##  $ Units                        : chr [1:615] "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" ...
##  $ Daily AQI Value              : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
##  $ Local Site Name              : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
##  $ Daily Obs Count              : num [1:615] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Percent Complete             : num [1:615] 100 100 100 100 100 100 100 100 100 100 ...
##  $ AQS Parameter Code           : num [1:615] 81102 81102 81102 81102 81102 ...
##  $ AQS Parameter Description    : chr [1:615] "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" ...
##  $ Method Code                  : chr [1:615] "079" "079" "079" "079" ...
##  $ CBSA Code                    : num [1:615] 41980 41980 41980 41980 41980 ...
##  $ CBSA Name                    : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
##  $ State FIPS Code              : num [1:615] 72 72 72 72 72 72 72 72 72 72 ...
##  $ State                        : chr [1:615] "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
##  $ County FIPS Code             : chr [1:615] "033" "033" "033" "033" ...
##  $ County                       : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
##  $ Site Latitude                : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
##  $ Site Longitude               : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Source = col_character(),
##   ..   `Site ID` = col_double(),
##   ..   POC = col_double(),
##   ..   `Daily Mean PM10 Concentration` = col_double(),
##   ..   Units = col_character(),
##   ..   `Daily AQI Value` = col_double(),
##   ..   `Local Site Name` = col_character(),
##   ..   `Daily Obs Count` = col_double(),
##   ..   `Percent Complete` = col_double(),
##   ..   `AQS Parameter Code` = col_double(),
##   ..   `AQS Parameter Description` = col_character(),
##   ..   `Method Code` = col_character(),
##   ..   `CBSA Code` = col_double(),
##   ..   `CBSA Name` = col_character(),
##   ..   `State FIPS Code` = col_double(),
##   ..   State = col_character(),
##   ..   `County FIPS Code` = col_character(),
##   ..   County = col_character(),
##   ..   `Site Latitude` = col_double(),
##   ..   `Site Longitude` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# ver resumen de los datos
summary(datos_dw1)
##      Date              Source             Site ID               POC       
##  Length:615         Length:615         Min.   :720330004   Min.   :3.000  
##  Class :character   Class :character   1st Qu.:720330004   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Median :720530003   Median :4.000  
##                                        Mean   :720508541   Mean   :4.132  
##                                        3rd Qu.:720610005   3rd Qu.:5.000  
##                                        Max.   :721130004   Max.   :9.000  
##  Daily Mean PM10 Concentration    Units           Daily AQI Value
##  Min.   :-21.00                Length:615         Min.   : 0.00  
##  1st Qu.: 14.00                Class :character   1st Qu.:13.00  
##  Median : 20.00                Mode  :character   Median :19.00  
##  Mean   : 23.16                                   Mean   :21.38  
##  3rd Qu.: 30.00                                   3rd Qu.:28.00  
##  Max.   :109.00                                   Max.   :78.00  
##  Local Site Name    Daily Obs Count Percent Complete AQS Parameter Code
##  Length:615         Min.   :1       Min.   :100      Min.   :81102     
##  Class :character   1st Qu.:1       1st Qu.:100      1st Qu.:81102     
##  Mode  :character   Median :1       Median :100      Median :81102     
##                     Mean   :1       Mean   :100      Mean   :81102     
##                     3rd Qu.:1       3rd Qu.:100      3rd Qu.:81102     
##                     Max.   :1       Max.   :100      Max.   :81102     
##  AQS Parameter Description Method Code          CBSA Code    
##  Length:615                Length:615         Min.   :25020  
##  Class :character          Class :character   1st Qu.:41980  
##  Mode  :character          Mode  :character   Median :41980  
##                                               Mean   :40244  
##                                               3rd Qu.:41980  
##                                               Max.   :41980  
##   CBSA Name         State FIPS Code    State           County FIPS Code  
##  Length:615         Min.   :72      Length:615         Length:615        
##  Class :character   1st Qu.:72      Class :character   Class :character  
##  Mode  :character   Median :72      Mode  :character   Mode  :character  
##                     Mean   :72                                           
##                     3rd Qu.:72                                           
##                     Max.   :72                                           
##     County          Site Latitude   Site Longitude  
##  Length:615         Min.   :17.96   Min.   :-66.63  
##  Class :character   1st Qu.:18.38   1st Qu.:-66.14  
##  Mode  :character   Median :18.43   Median :-66.14  
##                     Mean   :18.35   Mean   :-66.14  
##                     3rd Qu.:18.43   3rd Qu.:-66.12  
##                     Max.   :18.44   Max.   :-65.62
# ver tipos de variables
sapply(datos_dw1, class)
##                          Date                        Source 
##                   "character"                   "character" 
##                       Site ID                           POC 
##                     "numeric"                     "numeric" 
## Daily Mean PM10 Concentration                         Units 
##                     "numeric"                   "character" 
##               Daily AQI Value               Local Site Name 
##                     "numeric"                   "character" 
##               Daily Obs Count              Percent Complete 
##                     "numeric"                     "numeric" 
##            AQS Parameter Code     AQS Parameter Description 
##                     "numeric"                   "character" 
##                   Method Code                     CBSA Code 
##                   "character"                     "numeric" 
##                     CBSA Name               State FIPS Code 
##                   "character"                     "numeric" 
##                         State              County FIPS Code 
##                   "character"                   "character" 
##                        County                 Site Latitude 
##                   "character"                     "numeric" 
##                Site Longitude 
##                     "numeric"
# ver si hay valores NA
any(is.na(datos_dw1))
## [1] TRUE
# donde hay NA
library(naniar)
miss_var_summary(datos_dw1)
## # A tibble: 21 × 3
##    variable                      n_miss pct_miss
##    <chr>                          <int>    <num>
##  1 Local Site Name                  105     17.1
##  2 Date                               0      0  
##  3 Source                             0      0  
##  4 Site ID                            0      0  
##  5 POC                                0      0  
##  6 Daily Mean PM10 Concentration      0      0  
##  7 Units                              0      0  
##  8 Daily AQI Value                    0      0  
##  9 Daily Obs Count                    0      0  
## 10 Percent Complete                   0      0  
## # ℹ 11 more rows
# ver si hay valores duplicados
any(duplicated(datos_dw1))
## [1] FALSE

Examinar las variables categóricas

# Apply table() only to character columns
char_tables <- lapply(datos_dw1[sapply(datos_dw1, is.character) & names(datos_dw1) != "Date"], table)
# View result
for (col in names(char_tables)) {
  cat("\n---", col, "---\n")
  print(char_tables[[col]])
}
## 
## --- Source ---
## 
## AQS 
## 615 
## 
## --- Units ---
## 
## ug/m3 SC 
##      615 
## 
## --- Local Site Name ---
## 
##                        Fajardo                       Guaynabo 
##                             47                            101 
##               RESIDENTIAL AREA THE NW STREET AT THE 11 STREET 
##                             56                            306 
## 
## --- AQS Parameter Description ---
## 
## PM10 Total 0-10um STP 
##                   615 
## 
## --- Method Code ---
## 
## 063 079 
## 309 306 
## 
## --- CBSA Name ---
## 
##                  Guayama, PR                    Ponce, PR 
##                           52                           56 
## San Juan-Carolina-Caguas, PR 
##                          507 
## 
## --- State ---
## 
## Puerto Rico 
##         615 
## 
## --- County FIPS Code ---
## 
## 033 053 057 061 113 
## 306  47  52 154  56 
## 
## --- County ---
## 
##   Catano  Fajardo  Guayama Guaynabo    Ponce 
##      306       47       52      154       56

Limpiar nombres de variables

# activar paquete
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# limpiar nombres de variables
datos_dw2 <- clean_names(datos_dw1)
# crear archivo csv
write.csv(datos_dw2, "datos_dw1.csv", row.names = FALSE)
# ver nombres de variables
names(datos_dw2)
##  [1] "date"                          "source"                       
##  [3] "site_id"                       "poc"                          
##  [5] "daily_mean_pm10_concentration" "units"                        
##  [7] "daily_aqi_value"               "local_site_name"              
##  [9] "daily_obs_count"               "percent_complete"             
## [11] "aqs_parameter_code"            "aqs_parameter_description"    
## [13] "method_code"                   "cbsa_code"                    
## [15] "cbsa_name"                     "state_fips_code"              
## [17] "state"                         "county_fips_code"             
## [19] "county"                        "site_latitude"                
## [21] "site_longitude"
# ver estructura de los datos
str(datos_dw2)
## spc_tbl_ [615 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date                         : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
##  $ source                       : chr [1:615] "AQS" "AQS" "AQS" "AQS" ...
##  $ site_id                      : num [1:615] 7.2e+08 7.2e+08 7.2e+08 7.2e+08 7.2e+08 ...
##  $ poc                          : num [1:615] 3 3 3 3 3 3 3 3 3 3 ...
##  $ daily_mean_pm10_concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
##  $ units                        : chr [1:615] "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" ...
##  $ daily_aqi_value              : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
##  $ local_site_name              : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
##  $ daily_obs_count              : num [1:615] 1 1 1 1 1 1 1 1 1 1 ...
##  $ percent_complete             : num [1:615] 100 100 100 100 100 100 100 100 100 100 ...
##  $ aqs_parameter_code           : num [1:615] 81102 81102 81102 81102 81102 ...
##  $ aqs_parameter_description    : chr [1:615] "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" ...
##  $ method_code                  : chr [1:615] "079" "079" "079" "079" ...
##  $ cbsa_code                    : num [1:615] 41980 41980 41980 41980 41980 ...
##  $ cbsa_name                    : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
##  $ state_fips_code              : num [1:615] 72 72 72 72 72 72 72 72 72 72 ...
##  $ state                        : chr [1:615] "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
##  $ county_fips_code             : chr [1:615] "033" "033" "033" "033" ...
##  $ county                       : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
##  $ site_latitude                : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
##  $ site_longitude               : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_character(),
##   ..   Source = col_character(),
##   ..   `Site ID` = col_double(),
##   ..   POC = col_double(),
##   ..   `Daily Mean PM10 Concentration` = col_double(),
##   ..   Units = col_character(),
##   ..   `Daily AQI Value` = col_double(),
##   ..   `Local Site Name` = col_character(),
##   ..   `Daily Obs Count` = col_double(),
##   ..   `Percent Complete` = col_double(),
##   ..   `AQS Parameter Code` = col_double(),
##   ..   `AQS Parameter Description` = col_character(),
##   ..   `Method Code` = col_character(),
##   ..   `CBSA Code` = col_double(),
##   ..   `CBSA Name` = col_character(),
##   ..   `State FIPS Code` = col_double(),
##   ..   State = col_character(),
##   ..   `County FIPS Code` = col_character(),
##   ..   County = col_character(),
##   ..   `Site Latitude` = col_double(),
##   ..   `Site Longitude` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Seleccionar variables

# seleccionar variables
library(dplyr)
datos_dw3 <- datos_dw2 %>%
  select(date, daily_mean_pm10_concentration, daily_aqi_value, local_site_name, method_code, cbsa_code, cbsa_name, county_fips_code, county, site_latitude, site_longitude)
# ver estructura de los datos
str(datos_dw3)
## tibble [615 × 11] (S3: tbl_df/tbl/data.frame)
##  $ date                         : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
##  $ daily_mean_pm10_concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
##  $ daily_aqi_value              : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
##  $ local_site_name              : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
##  $ method_code                  : chr [1:615] "079" "079" "079" "079" ...
##  $ cbsa_code                    : num [1:615] 41980 41980 41980 41980 41980 ...
##  $ cbsa_name                    : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
##  $ county_fips_code             : chr [1:615] "033" "033" "033" "033" ...
##  $ county                       : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
##  $ site_latitude                : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
##  $ site_longitude               : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
# write csv
write.csv(datos_dw3, "datos_dw3.csv", row.names = FALSE)

Convertir variable de fecha a tipo Date

# convertir variable de fecha a tipo Date
datos_dw3$date <- as.Date(datos_dw3$date, format = "%m/%d/%Y")
# ver tipo de dato de date
str(datos_dw3$date)
##  Date[1:615], format: "2024-01-01" "2024-01-03" "2024-01-09" "2024-01-12" "2024-01-13" ...

Convertir variable de texto a factor

# convertir variable de texto a factor
datos_dw3$local_site_name <- as.factor(datos_dw3$local_site_name)
# ver tipo de dato de local_site_name
str(datos_dw3$local_site_name)
##  Factor w/ 4 levels "Fajardo","Guaynabo",..: 4 4 4 4 4 4 4 4 4 4 ...
# ver niveles de local_site_name
levels(datos_dw3$local_site_name)
## [1] "Fajardo"                        "Guaynabo"                      
## [3] "RESIDENTIAL AREA"               "THE NW STREET AT THE 11 STREET"

Filtrar datos

Seleccionar datos del municipio (county) de Ponce.

# filtrar datos
datos_dw_ponce <- datos_dw3 %>%
  filter(county == "Ponce")
# ver primeras filas
head(datos_dw_ponce)
## # A tibble: 6 × 11
##   date       daily_mean_pm10_conce…¹ daily_aqi_value local_site_name method_code
##   <date>                       <dbl>           <dbl> <fct>           <chr>      
## 1 2024-01-01                      14              13 RESIDENTIAL AR… 063        
## 2 2024-01-07                      18              17 RESIDENTIAL AR… 063        
## 3 2024-01-13                      40              37 RESIDENTIAL AR… 063        
## 4 2024-01-19                      14              13 RESIDENTIAL AR… 063        
## 5 2024-01-25                      19              18 RESIDENTIAL AR… 063        
## 6 2024-01-31                      30              28 RESIDENTIAL AR… 063        
## # ℹ abbreviated name: ¹​daily_mean_pm10_concentration
## # ℹ 6 more variables: cbsa_code <dbl>, cbsa_name <chr>, county_fips_code <chr>,
## #   county <chr>, site_latitude <dbl>, site_longitude <dbl>

Crear nuevas variables

Vamos a crear una nueva variable que sea el logaritmo de la concentración diaria de PM10.

# crear nueva variable como logaritmo natural de la concentración diaria de PM10
datos_dw_ponce <- datos_dw_ponce %>%
  mutate(log_daily_mean_pm10_concentration = log(daily_mean_pm10_concentration + 1)) # +1 para evitar log(0)
# ver primeras filas
head(datos_dw_ponce)
## # A tibble: 6 × 12
##   date       daily_mean_pm10_conce…¹ daily_aqi_value local_site_name method_code
##   <date>                       <dbl>           <dbl> <fct>           <chr>      
## 1 2024-01-01                      14              13 RESIDENTIAL AR… 063        
## 2 2024-01-07                      18              17 RESIDENTIAL AR… 063        
## 3 2024-01-13                      40              37 RESIDENTIAL AR… 063        
## 4 2024-01-19                      14              13 RESIDENTIAL AR… 063        
## 5 2024-01-25                      19              18 RESIDENTIAL AR… 063        
## 6 2024-01-31                      30              28 RESIDENTIAL AR… 063        
## # ℹ abbreviated name: ¹​daily_mean_pm10_concentration
## # ℹ 7 more variables: cbsa_code <dbl>, cbsa_name <chr>, county_fips_code <chr>,
## #   county <chr>, site_latitude <dbl>, site_longitude <dbl>,
## #   log_daily_mean_pm10_concentration <dbl>

Agrupar datos y calcular estadísticas en variables numéricas

# agrupar datos por county y calcular media y desviación estándar de la concentración diaria de PM10
datos_dw3_stat <- datos_dw3 %>%
  group_by(county) %>%
  summarise(mean_pm10 = mean(daily_mean_pm10_concentration, na.rm = TRUE),
            sd_pm10 = sd(daily_mean_pm10_concentration, na.rm = TRUE),
            mean_aqi = mean(daily_aqi_value, na.rm = TRUE),
            sd_aqi = sd(daily_aqi_value, na.rm = TRUE))
# ver resultados
print(datos_dw3_stat)
## # A tibble: 5 × 5
##   county   mean_pm10 sd_pm10 mean_aqi sd_aqi
##   <chr>        <dbl>   <dbl>    <dbl>  <dbl>
## 1 Catano        25.4    15.8     23.4  13.5 
## 2 Fajardo       20.5    10.1     19.0   9.44
## 3 Guayama       18.5    10.2     17.1   9.30
## 4 Guaynabo      21.1    10.8     19.5  10.0 
## 5 Ponce         22.9    12.4     21.2  11.2

Gráfica de boxplot de la concentración diaria de PM10 por municipio

# cargar paquete ggplot2
library(ggplot2)
# crear boxplot de la concentración diaria de PM10 por municipio
figura1 <- ggplot(datos_dw3, aes(x = county, y = daily_mean_pm10_concentration)) +
  geom_boxplot() +
  labs(x = "Municipio",
       y = "Concentración diaria de PM10 (µg/m³)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# mostrar gráfica
figura1