Loading packages

library(readr)
library(dplyr)
library(tidyr)
library(openair) # http://davidcarslaw.github.io/openair/
library(purrr)
library(lubridate)
library(ggplot2)
library(stringr)
library(knitr)
library(xts)
library(zoo)
library(gridExtra)
library(astsa)
library(rvest)
library(fpp2)
library(ranger)
library(broom)
library(RcppRoll)
library(reshape2)

Data loading

air_data_2 <- readRDS("data_rds_files/air_data_2.rds")

We take a look to the general trend of several indicators through the last 18 years

# We calcule the yearly mean of the pollutants levels.
year_avgs <- air_data_2 %>% select(station_alias, date_time_utc, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, year = year(date_time_utc)) %>%
  summarise_all(funs(mean(., na.rm = TRUE))) %>% 
  select(-date_time_utc) # We drop this variable

# We export the years_avgs as a csv file

write_csv(year_avgs, "data_final_csvs/year_avgs.csv")

# We convert the table to long format

year_avgs_long <- gather(year_avgs, contaminante, value, 3:length(year_avgs)) %>% 
                    filter(!(station_alias == 'Constitución' & year == '2006' & contaminante %in% c('BEN', 'MXIL', 'TOL'))) %>% # We filter this data because is only completed in 0.01%
                    filter(!(station_alias == 'Constitución' & year == '2008' & contaminante == 'PM25')) # We filter this data because is only completed in 0.02%

# We present the data in a grid of graphs

ggplot(year_avgs_long, aes(x = year, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

We drop the Santa Bárbara and Montevil stations. These stations have much less data and the behavior of their variables are significantly different (they are sub-urban stations). So, we take them out from the analysis for now.

air_data_3 <- air_data_2 %>% filter(station_alias != 'Montevil' , 
                                         station_alias != 'Santa Bárbara' )
# We calcule the yearly mean of the pollutants levels.
year_avgs <- air_data_3 %>% select(station_alias, date_time_utc, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, year = year(date_time_utc)) %>%
  summarise_all(funs(mean(., na.rm = TRUE))) %>% 
  select(-date_time_utc) # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

year_avgs_long <- gather(year_avgs, contaminante, value, 3:length(year_avgs)) %>%
                    filter(!(station_alias == 'Constitución' & year == '2006' & contaminante %in% c('BEN', 'MXIL', 'TOL'))) %>% # We filter this data because is only completed in 0.01%
                    filter(!(station_alias == 'Constitución' & year == '2008' & contaminante == 'PM25')) # We filter this data because is only completed in 0.02%

# We present the data in a grid of graphs

ggplot(year_avgs_long, aes(x = year, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the hourly mean of the pollutants levels.
hour_avgs <- air_data_3 %>% select(station_alias, hour, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, hour) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

hour_avgs_long <- gather(hour_avgs, contaminante, value, 3:length(hour_avgs))

# We present the data in a grid of graphs

ggplot(hour_avgs_long, aes(x = hour, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the monthly mean of the pollutants levels.
month_avgs <- air_data_3 %>% select(station_alias, month, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, month) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

month_avgs_long <- gather(month_avgs, contaminante, value, 3:length(month_avgs))

# We present the data in a grid of graphs

ggplot(month_avgs_long, aes(x = month, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

# We calcule the weekly mean of the pollutants levels.
week_day_avgs <- air_data_3 %>% select(station_alias, week_day, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, week_day) %>%
  summarise_all(funs(mean(., na.rm = TRUE)))  # quito ahora esta variable, porque no tiene sentido que salga su media.

# We convert the table to long format

week_day_avgs_long <- gather(week_day_avgs, contaminante, value, 3:length(week_day_avgs))

# We present the data in a grid of graphs

ggplot(week_day_avgs_long, aes(x = week_day, y = value)) + 
  geom_line() + 
  facet_grid(contaminante~station_alias,scales="free_y") +
   theme(axis.text = element_text(size = 6))

Relationships between variables

The Constitucion Station is the only station with meteorological data. So, we are going to focus our efforts of data exploration on this station.

Reference: http://www.sthda.com/english/wiki/ggplot2-quick-correlation-matrix-heatmap-r-software-and-data-visualization

#This is from the future ;) (tengo que copiar el codigo aqui)
constitucion_data <- readRDS("data_rds_files/constitucion_data.rds")

pollutants <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3) %>%
                                    na.omit()
cor_matrix <- round(cor(pollutants), 2)
head(cor_matrix)
##      PM10 PM25  NO2   NO  SO2   CO    O3
## PM10 1.00 0.56 0.50 0.49 0.29 0.38 -0.26
## PM25 0.56 1.00 0.37 0.37 0.21 0.29 -0.26
## NO2  0.50 0.37 1.00 0.68 0.30 0.47 -0.59
## NO   0.49 0.37 0.68 1.00 0.29 0.44 -0.45
## SO2  0.29 0.21 0.30 0.29 1.00 0.28 -0.23
## CO   0.38 0.29 0.47 0.44 0.28 1.00 -0.29
long_cor_matrix <- melt(cor_matrix)
head(long_cor_matrix)
##   Var1 Var2 value
## 1 PM10 PM10  1.00
## 2 PM25 PM10  0.56
## 3  NO2 PM10  0.50
## 4   NO PM10  0.49
## 5  SO2 PM10  0.29
## 6   CO PM10  0.38
ggplot(data = long_cor_matrix, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()

# Get lower triangle of the correlation matrix
  get_lower_tri<-function(cor_matrix){
    cor_matrix[upper.tri(cor_matrix)] <- NA
    return(cor_matrix)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(cor_matrix){
    cor_matrix[lower.tri(cor_matrix)]<- NA
    return(cor_matrix)
  }
  
  upper_tri <- get_upper_tri(cor_matrix)
upper_tri
##      PM10 PM25  NO2   NO  SO2   CO    O3
## PM10    1 0.56 0.50 0.49 0.29 0.38 -0.26
## PM25   NA 1.00 0.37 0.37 0.21 0.29 -0.26
## NO2    NA   NA 1.00 0.68 0.30 0.47 -0.59
## NO     NA   NA   NA 1.00 0.29 0.44 -0.45
## SO2    NA   NA   NA   NA 1.00 0.28 -0.23
## CO     NA   NA   NA   NA   NA 1.00 -0.29
## O3     NA   NA   NA   NA   NA   NA  1.00
# Melt the correlation matrix
library(reshape2)
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)
# Heatmap
library(ggplot2)
ggplot(data = long_cor_matrix, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()

reorder_cor_matrix <- function(cor_matrix){
# Use correlation between variables as distance
dd <- as.dist((1-cor_matrix)/2)
hc <- hclust(dd)
cor_matrix <-cor_matrix[hc$order, hc$order]
}

# Reorder the correlation matrix
cor_matrix <- reorder_cor_matrix(cor_matrix)
upper_tri <- get_upper_tri(cor_matrix)
# Melt the correlation matrix
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()
# Print the heatmap
print(ggheatmap)

ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))

Including information about force and direction of the wind

#This is from the future ;) (tengo que copiar el codigo aqui)
constitucion_data <- readRDS("data_rds_files/constitucion_data.rds")

pollutants <- constitucion_data %>% select(PM10, PM25, NO2, NO, SO2, CO, O3, dd, vv, TMP, PRB, LL, HR, RS) %>%
                                    na.omit()

cor_matrix <- round(cor(pollutants), 2)
long_cor_matrix <- melt(cor_matrix)
head(long_cor_matrix)
##   Var1 Var2 value
## 1 PM10 PM10  1.00
## 2 PM25 PM10  0.56
## 3  NO2 PM10  0.50
## 4   NO PM10  0.49
## 5  SO2 PM10  0.29
## 6   CO PM10  0.38
# Get lower triangle of the correlation matrix
  get_lower_tri<-function(cor_matrix){
    cor_matrix[upper.tri(cor_matrix)] <- NA
    return(cor_matrix)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(cor_matrix){
    cor_matrix[lower.tri(cor_matrix)]<- NA
    return(cor_matrix)
  }
  
  upper_tri <- get_upper_tri(cor_matrix)
upper_tri
##      PM10 PM25  NO2   NO  SO2   CO    O3    dd    vv   TMP   PRB    LL
## PM10    1 0.56 0.50 0.49 0.29 0.38 -0.26  0.01 -0.14  0.03  0.00 -0.09
## PM25   NA 1.00 0.37 0.37 0.21 0.29 -0.26  0.02 -0.18 -0.06  0.04 -0.07
## NO2    NA   NA 1.00 0.68 0.30 0.47 -0.59  0.13 -0.35 -0.23 -0.09  0.01
## NO     NA   NA   NA 1.00 0.29 0.44 -0.45  0.09 -0.23 -0.17 -0.03 -0.03
## SO2    NA   NA   NA   NA 1.00 0.28 -0.23  0.11 -0.10 -0.05 -0.01 -0.03
## CO     NA   NA   NA   NA   NA 1.00 -0.29  0.07 -0.17 -0.15 -0.04  0.01
## O3     NA   NA   NA   NA   NA   NA  1.00 -0.30  0.61  0.27 -0.06  0.02
## dd     NA   NA   NA   NA   NA   NA    NA  1.00 -0.35 -0.23  0.02  0.04
## vv     NA   NA   NA   NA   NA   NA    NA    NA  1.00  0.24 -0.12  0.03
## TMP    NA   NA   NA   NA   NA   NA    NA    NA    NA  1.00 -0.03 -0.12
## PRB    NA   NA   NA   NA   NA   NA    NA    NA    NA    NA  1.00 -0.13
## LL     NA   NA   NA   NA   NA   NA    NA    NA    NA    NA    NA  1.00
## HR     NA   NA   NA   NA   NA   NA    NA    NA    NA    NA    NA    NA
## RS     NA   NA   NA   NA   NA   NA    NA    NA    NA    NA    NA    NA
##         HR    RS
## PM10  0.06  0.00
## PM25  0.08 -0.02
## NO2   0.04 -0.19
## NO    0.01 -0.08
## SO2  -0.12  0.04
## CO    0.10 -0.11
## O3   -0.20  0.35
## dd    0.02 -0.25
## vv   -0.30  0.53
## TMP  -0.15  0.39
## PRB   0.01  0.06
## LL    0.15 -0.08
## HR    1.00 -0.36
## RS      NA  1.00
reorder_cor_matrix <- function(cor_matrix){
# Use correlation between variables as distance
dd <- as.dist((1-cor_matrix)/2)
hc <- hclust(dd)
cor_matrix <-cor_matrix[hc$order, hc$order]
}

# Reorder the correlation matrix
cor_matrix <- reorder_cor_matrix(cor_matrix)
upper_tri <- get_upper_tri(cor_matrix)
# Melt the correlation matrix
long_cor_matrix <- melt(upper_tri, na.rm = TRUE)


# Create a ggheatmap
ggheatmap <- ggplot(long_cor_matrix, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()




ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))

Data Exploration PM10 Constitucion Station

We create the dataset pm10 with PM10 values from the Constitución Station and we execute a summary

pm10 <- air_data_3 %>% filter(station_alias == 'Constitución') %>%
                        select(date_time_utc, PM10)

summary(pm10)
##  date_time_utc                      PM10       
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 19.00  
##  Median :2009-01-02 00:30:00   Median : 29.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 34.39  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 44.00  
##  Max.   :2017-12-31 23:00:00   Max.   :888.00  
##                                NA's   :3106

25% of the values are between 44.00 and 888.00. 888.00 is a value really extreme. How many extreme values (outliers) do we have in this series? We plot all the values to visualise this:

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

We have very few values greater than 250. So, it doesn’t seem we have a problem with the outliers (Pending: A PM10 level of 880 is something possible or is it likely to be a monitoring error?).

Daily averages

We create a new dataset with the PM10 daily averages and we plot them in a new graphic. We add a trend line too. There is a clear downward trend in the measurements and we have many fewer extreme values during the last decade. It seems like we have two very clear “epochs” in the data, before and after the year 2008.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(x = day, y = day_avg, , colour = day_avg)) + 
         geom_point(alpha = 0.5) +
         geom_smooth(color = "grey", alpha = 0.2) +
         scale_colour_gradientn(colours = terrain.colors(10)) +
         theme(legend.position = c(0.3, 0.9),
                legend.background = element_rect(colour = "transparent", fill = NA), legend.direction = "horizontal") +
         labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average", title = "PM10 daily average - 2000-2017 evolution (Constitución Station)")

We identify a very clear trend through the years on the last graph. But, as we already saw before on the grid graphs there are other things happening at the same time.

year_const <- year_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)