Los siguientes ejemplos muestran cómo manipular datos en R. Se
utilizan los paquetes dplyr
y tidyr
, que son
parte del “tidyverse”.
Vamos a utilizar un archivo con datos de PM2.5 y AQI.
# leer archivo csv
library(readr)
datos_dw1 <- read_csv("ad_viz_plotval_data.csv")
## Rows: 615 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Date, Source, Units, Local Site Name, AQS Parameter Description, M...
## dbl (11): Site ID, POC, Daily Mean PM10 Concentration, Daily AQI Value, Dail...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ver estructura de los datos
str(datos_dw1)
## spc_tbl_ [615 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Date : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
## $ Source : chr [1:615] "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : num [1:615] 7.2e+08 7.2e+08 7.2e+08 7.2e+08 7.2e+08 ...
## $ POC : num [1:615] 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM10 Concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
## $ Units : chr [1:615] "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" ...
## $ Daily AQI Value : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
## $ Local Site Name : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
## $ Daily Obs Count : num [1:615] 1 1 1 1 1 1 1 1 1 1 ...
## $ Percent Complete : num [1:615] 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS Parameter Code : num [1:615] 81102 81102 81102 81102 81102 ...
## $ AQS Parameter Description : chr [1:615] "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" ...
## $ Method Code : chr [1:615] "079" "079" "079" "079" ...
## $ CBSA Code : num [1:615] 41980 41980 41980 41980 41980 ...
## $ CBSA Name : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
## $ State FIPS Code : num [1:615] 72 72 72 72 72 72 72 72 72 72 ...
## $ State : chr [1:615] "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
## $ County FIPS Code : chr [1:615] "033" "033" "033" "033" ...
## $ County : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
## $ Site Latitude : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
## $ Site Longitude : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_character(),
## .. Source = col_character(),
## .. `Site ID` = col_double(),
## .. POC = col_double(),
## .. `Daily Mean PM10 Concentration` = col_double(),
## .. Units = col_character(),
## .. `Daily AQI Value` = col_double(),
## .. `Local Site Name` = col_character(),
## .. `Daily Obs Count` = col_double(),
## .. `Percent Complete` = col_double(),
## .. `AQS Parameter Code` = col_double(),
## .. `AQS Parameter Description` = col_character(),
## .. `Method Code` = col_character(),
## .. `CBSA Code` = col_double(),
## .. `CBSA Name` = col_character(),
## .. `State FIPS Code` = col_double(),
## .. State = col_character(),
## .. `County FIPS Code` = col_character(),
## .. County = col_character(),
## .. `Site Latitude` = col_double(),
## .. `Site Longitude` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# ver resumen de los datos
summary(datos_dw1)
## Date Source Site ID POC
## Length:615 Length:615 Min. :720330004 Min. :3.000
## Class :character Class :character 1st Qu.:720330004 1st Qu.:3.000
## Mode :character Mode :character Median :720530003 Median :4.000
## Mean :720508541 Mean :4.132
## 3rd Qu.:720610005 3rd Qu.:5.000
## Max. :721130004 Max. :9.000
## Daily Mean PM10 Concentration Units Daily AQI Value
## Min. :-21.00 Length:615 Min. : 0.00
## 1st Qu.: 14.00 Class :character 1st Qu.:13.00
## Median : 20.00 Mode :character Median :19.00
## Mean : 23.16 Mean :21.38
## 3rd Qu.: 30.00 3rd Qu.:28.00
## Max. :109.00 Max. :78.00
## Local Site Name Daily Obs Count Percent Complete AQS Parameter Code
## Length:615 Min. :1 Min. :100 Min. :81102
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:81102
## Mode :character Median :1 Median :100 Median :81102
## Mean :1 Mean :100 Mean :81102
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:81102
## Max. :1 Max. :100 Max. :81102
## AQS Parameter Description Method Code CBSA Code
## Length:615 Length:615 Min. :25020
## Class :character Class :character 1st Qu.:41980
## Mode :character Mode :character Median :41980
## Mean :40244
## 3rd Qu.:41980
## Max. :41980
## CBSA Name State FIPS Code State County FIPS Code
## Length:615 Min. :72 Length:615 Length:615
## Class :character 1st Qu.:72 Class :character Class :character
## Mode :character Median :72 Mode :character Mode :character
## Mean :72
## 3rd Qu.:72
## Max. :72
## County Site Latitude Site Longitude
## Length:615 Min. :17.96 Min. :-66.63
## Class :character 1st Qu.:18.38 1st Qu.:-66.14
## Mode :character Median :18.43 Median :-66.14
## Mean :18.35 Mean :-66.14
## 3rd Qu.:18.43 3rd Qu.:-66.12
## Max. :18.44 Max. :-65.62
# ver tipos de variables
sapply(datos_dw1, class)
## Date Source
## "character" "character"
## Site ID POC
## "numeric" "numeric"
## Daily Mean PM10 Concentration Units
## "numeric" "character"
## Daily AQI Value Local Site Name
## "numeric" "character"
## Daily Obs Count Percent Complete
## "numeric" "numeric"
## AQS Parameter Code AQS Parameter Description
## "numeric" "character"
## Method Code CBSA Code
## "character" "numeric"
## CBSA Name State FIPS Code
## "character" "numeric"
## State County FIPS Code
## "character" "character"
## County Site Latitude
## "character" "numeric"
## Site Longitude
## "numeric"
# ver si hay valores NA
any(is.na(datos_dw1))
## [1] TRUE
# donde hay NA
library(naniar)
miss_var_summary(datos_dw1)
## # A tibble: 21 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 Local Site Name 105 17.1
## 2 Date 0 0
## 3 Source 0 0
## 4 Site ID 0 0
## 5 POC 0 0
## 6 Daily Mean PM10 Concentration 0 0
## 7 Units 0 0
## 8 Daily AQI Value 0 0
## 9 Daily Obs Count 0 0
## 10 Percent Complete 0 0
## # ℹ 11 more rows
# ver si hay valores duplicados
any(duplicated(datos_dw1))
## [1] FALSE
# Apply table() only to character columns
char_tables <- lapply(datos_dw1[sapply(datos_dw1, is.character) & names(datos_dw1) != "Date"], table)
# View result
for (col in names(char_tables)) {
cat("\n---", col, "---\n")
print(char_tables[[col]])
}
##
## --- Source ---
##
## AQS
## 615
##
## --- Units ---
##
## ug/m3 SC
## 615
##
## --- Local Site Name ---
##
## Fajardo Guaynabo
## 47 101
## RESIDENTIAL AREA THE NW STREET AT THE 11 STREET
## 56 306
##
## --- AQS Parameter Description ---
##
## PM10 Total 0-10um STP
## 615
##
## --- Method Code ---
##
## 063 079
## 309 306
##
## --- CBSA Name ---
##
## Guayama, PR Ponce, PR
## 52 56
## San Juan-Carolina-Caguas, PR
## 507
##
## --- State ---
##
## Puerto Rico
## 615
##
## --- County FIPS Code ---
##
## 033 053 057 061 113
## 306 47 52 154 56
##
## --- County ---
##
## Catano Fajardo Guayama Guaynabo Ponce
## 306 47 52 154 56
# activar paquete
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# limpiar nombres de variables
datos_dw2 <- clean_names(datos_dw1)
# crear archivo csv
write.csv(datos_dw2, "datos_dw1.csv", row.names = FALSE)
# ver nombres de variables
names(datos_dw2)
## [1] "date" "source"
## [3] "site_id" "poc"
## [5] "daily_mean_pm10_concentration" "units"
## [7] "daily_aqi_value" "local_site_name"
## [9] "daily_obs_count" "percent_complete"
## [11] "aqs_parameter_code" "aqs_parameter_description"
## [13] "method_code" "cbsa_code"
## [15] "cbsa_name" "state_fips_code"
## [17] "state" "county_fips_code"
## [19] "county" "site_latitude"
## [21] "site_longitude"
# ver estructura de los datos
str(datos_dw2)
## spc_tbl_ [615 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
## $ source : chr [1:615] "AQS" "AQS" "AQS" "AQS" ...
## $ site_id : num [1:615] 7.2e+08 7.2e+08 7.2e+08 7.2e+08 7.2e+08 ...
## $ poc : num [1:615] 3 3 3 3 3 3 3 3 3 3 ...
## $ daily_mean_pm10_concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
## $ units : chr [1:615] "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" "ug/m3 SC" ...
## $ daily_aqi_value : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
## $ local_site_name : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
## $ daily_obs_count : num [1:615] 1 1 1 1 1 1 1 1 1 1 ...
## $ percent_complete : num [1:615] 100 100 100 100 100 100 100 100 100 100 ...
## $ aqs_parameter_code : num [1:615] 81102 81102 81102 81102 81102 ...
## $ aqs_parameter_description : chr [1:615] "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" "PM10 Total 0-10um STP" ...
## $ method_code : chr [1:615] "079" "079" "079" "079" ...
## $ cbsa_code : num [1:615] 41980 41980 41980 41980 41980 ...
## $ cbsa_name : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
## $ state_fips_code : num [1:615] 72 72 72 72 72 72 72 72 72 72 ...
## $ state : chr [1:615] "Puerto Rico" "Puerto Rico" "Puerto Rico" "Puerto Rico" ...
## $ county_fips_code : chr [1:615] "033" "033" "033" "033" ...
## $ county : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
## $ site_latitude : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
## $ site_longitude : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
## - attr(*, "spec")=
## .. cols(
## .. Date = col_character(),
## .. Source = col_character(),
## .. `Site ID` = col_double(),
## .. POC = col_double(),
## .. `Daily Mean PM10 Concentration` = col_double(),
## .. Units = col_character(),
## .. `Daily AQI Value` = col_double(),
## .. `Local Site Name` = col_character(),
## .. `Daily Obs Count` = col_double(),
## .. `Percent Complete` = col_double(),
## .. `AQS Parameter Code` = col_double(),
## .. `AQS Parameter Description` = col_character(),
## .. `Method Code` = col_character(),
## .. `CBSA Code` = col_double(),
## .. `CBSA Name` = col_character(),
## .. `State FIPS Code` = col_double(),
## .. State = col_character(),
## .. `County FIPS Code` = col_character(),
## .. County = col_character(),
## .. `Site Latitude` = col_double(),
## .. `Site Longitude` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# seleccionar variables
library(dplyr)
datos_dw3 <- datos_dw2 %>%
select(date, daily_mean_pm10_concentration, daily_aqi_value, local_site_name, method_code, cbsa_code, cbsa_name, county_fips_code, county, site_latitude, site_longitude)
# ver estructura de los datos
str(datos_dw3)
## tibble [615 × 11] (S3: tbl_df/tbl/data.frame)
## $ date : chr [1:615] "01/01/2024" "01/03/2024" "01/09/2024" "01/12/2024" ...
## $ daily_mean_pm10_concentration: num [1:615] 66 35 45 36 24 17 23 34 40 28 ...
## $ daily_aqi_value : num [1:615] 56 32 42 33 22 16 21 31 37 26 ...
## $ local_site_name : chr [1:615] "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" "THE NW STREET AT THE 11 STREET" ...
## $ method_code : chr [1:615] "079" "079" "079" "079" ...
## $ cbsa_code : num [1:615] 41980 41980 41980 41980 41980 ...
## $ cbsa_name : chr [1:615] "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" "San Juan-Carolina-Caguas, PR" ...
## $ county_fips_code : chr [1:615] "033" "033" "033" "033" ...
## $ county : chr [1:615] "Catano" "Catano" "Catano" "Catano" ...
## $ site_latitude : num [1:615] 18.4 18.4 18.4 18.4 18.4 ...
## $ site_longitude : num [1:615] -66.1 -66.1 -66.1 -66.1 -66.1 ...
# write csv
write.csv(datos_dw3, "datos_dw3.csv", row.names = FALSE)
# convertir variable de fecha a tipo Date
datos_dw3$date <- as.Date(datos_dw3$date, format = "%m/%d/%Y")
# ver tipo de dato de date
str(datos_dw3$date)
## Date[1:615], format: "2024-01-01" "2024-01-03" "2024-01-09" "2024-01-12" "2024-01-13" ...
# convertir variable de texto a factor
datos_dw3$local_site_name <- as.factor(datos_dw3$local_site_name)
# ver tipo de dato de local_site_name
str(datos_dw3$local_site_name)
## Factor w/ 4 levels "Fajardo","Guaynabo",..: 4 4 4 4 4 4 4 4 4 4 ...
# ver niveles de local_site_name
levels(datos_dw3$local_site_name)
## [1] "Fajardo" "Guaynabo"
## [3] "RESIDENTIAL AREA" "THE NW STREET AT THE 11 STREET"
Seleccionar datos del municipio (county) de Ponce.
# filtrar datos
datos_dw_ponce <- datos_dw3 %>%
filter(county == "Ponce")
# ver primeras filas
head(datos_dw_ponce)
## # A tibble: 6 × 11
## date daily_mean_pm10_conce…¹ daily_aqi_value local_site_name method_code
## <date> <dbl> <dbl> <fct> <chr>
## 1 2024-01-01 14 13 RESIDENTIAL AR… 063
## 2 2024-01-07 18 17 RESIDENTIAL AR… 063
## 3 2024-01-13 40 37 RESIDENTIAL AR… 063
## 4 2024-01-19 14 13 RESIDENTIAL AR… 063
## 5 2024-01-25 19 18 RESIDENTIAL AR… 063
## 6 2024-01-31 30 28 RESIDENTIAL AR… 063
## # ℹ abbreviated name: ¹daily_mean_pm10_concentration
## # ℹ 6 more variables: cbsa_code <dbl>, cbsa_name <chr>, county_fips_code <chr>,
## # county <chr>, site_latitude <dbl>, site_longitude <dbl>
Vamos a crear una nueva variable que sea el logaritmo de la concentración diaria de PM10.
# crear nueva variable como logaritmo natural de la concentración diaria de PM10
datos_dw_ponce <- datos_dw_ponce %>%
mutate(log_daily_mean_pm10_concentration = log(daily_mean_pm10_concentration + 1)) # +1 para evitar log(0)
# ver primeras filas
head(datos_dw_ponce)
## # A tibble: 6 × 12
## date daily_mean_pm10_conce…¹ daily_aqi_value local_site_name method_code
## <date> <dbl> <dbl> <fct> <chr>
## 1 2024-01-01 14 13 RESIDENTIAL AR… 063
## 2 2024-01-07 18 17 RESIDENTIAL AR… 063
## 3 2024-01-13 40 37 RESIDENTIAL AR… 063
## 4 2024-01-19 14 13 RESIDENTIAL AR… 063
## 5 2024-01-25 19 18 RESIDENTIAL AR… 063
## 6 2024-01-31 30 28 RESIDENTIAL AR… 063
## # ℹ abbreviated name: ¹daily_mean_pm10_concentration
## # ℹ 7 more variables: cbsa_code <dbl>, cbsa_name <chr>, county_fips_code <chr>,
## # county <chr>, site_latitude <dbl>, site_longitude <dbl>,
## # log_daily_mean_pm10_concentration <dbl>
# agrupar datos por county y calcular media y desviación estándar de la concentración diaria de PM10
datos_dw3_stat <- datos_dw3 %>%
group_by(county) %>%
summarise(mean_pm10 = mean(daily_mean_pm10_concentration, na.rm = TRUE),
sd_pm10 = sd(daily_mean_pm10_concentration, na.rm = TRUE),
mean_aqi = mean(daily_aqi_value, na.rm = TRUE),
sd_aqi = sd(daily_aqi_value, na.rm = TRUE))
# ver resultados
print(datos_dw3_stat)
## # A tibble: 5 × 5
## county mean_pm10 sd_pm10 mean_aqi sd_aqi
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Catano 25.4 15.8 23.4 13.5
## 2 Fajardo 20.5 10.1 19.0 9.44
## 3 Guayama 18.5 10.2 17.1 9.30
## 4 Guaynabo 21.1 10.8 19.5 10.0
## 5 Ponce 22.9 12.4 21.2 11.2
# cargar paquete ggplot2
library(ggplot2)
# crear boxplot de la concentración diaria de PM10 por municipio
figura1 <- ggplot(datos_dw3, aes(x = county, y = daily_mean_pm10_concentration)) +
geom_boxplot() +
labs(x = "Municipio",
y = "Concentración diaria de PM10 (µg/m³)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# mostrar gráfica
figura1