Loading packages

library(readr)
library(dplyr)
library(tidyr)
library(openair) # http://davidcarslaw.github.io/openair/
library(purrr)
library(lubridate)
library(ggplot2)
library(stringr)
library(knitr)
library(xts)
library(zoo)
library(gridExtra)
library(astsa)
library(rvest)
library(fpp2)
library(ranger)
library(broom)
library(RcppRoll)

Data loading

air_data_2 <- readRDS("air_data_2.rds")

We take a look at the general trend of several indicators over the last 18 years.

# We calculate the yearly mean of the pollutant levels.
year_avgs <- air_data_2 %>%
  select(station_alias, date_time_utc, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, year = year(date_time_utc)) %>%
  summarise_all(~ mean(.x, na.rm = TRUE)) %>% 
  select(-date_time_utc) # We drop this variable, since averaging a timestamp makes no sense

# We export year_avgs as a CSV file

write_csv(year_avgs, "final_csvs/year_avgs.csv")

# We convert the table to long format

year_avgs_long <- gather(year_avgs, contaminante, value, 3:length(year_avgs)) %>% 
                    filter(!(station_alias == 'Constitución' & year == 2006 & contaminante %in% c('BEN', 'MXIL', 'TOL'))) %>% # We filter out these records because the series is only 0.01% complete
                    filter(!(station_alias == 'Constitución' & year == 2008 & contaminante == 'PM25')) # We filter out these records because the series is only 0.02% complete

# We present the data in a grid of graphs

ggplot(year_avgs_long, aes(x = year, y = value)) + 
  geom_line() + 
  facet_grid(contaminante ~ station_alias, scales = "free_y") +
  theme(axis.text = element_text(size = 6))

We drop the Santa Bárbara and Montevil stations. These stations have much less data, and the behaviour of their variables is significantly different (they are suburban stations), so we exclude them from the analysis for now.

air_data_3 <- air_data_2 %>% filter(station_alias != 'Montevil',
                                    station_alias != 'Santa Bárbara')
# We calculate the yearly mean of the pollutant levels.
year_avgs <- air_data_3 %>%
  select(station_alias, date_time_utc, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, year = year(date_time_utc)) %>%
  summarise_all(~ mean(.x, na.rm = TRUE)) %>% 
  select(-date_time_utc) # We drop this variable now, since its mean makes no sense

# We convert the table to long format

year_avgs_long <- gather(year_avgs, contaminante, value, 3:length(year_avgs)) %>%
                    filter(!(station_alias == 'Constitución' & year == 2006 & contaminante %in% c('BEN', 'MXIL', 'TOL'))) %>% # We filter out these records because the series is only 0.01% complete
                    filter(!(station_alias == 'Constitución' & year == 2008 & contaminante == 'PM25')) # We filter out these records because the series is only 0.02% complete

# We present the data in a grid of graphs

ggplot(year_avgs_long, aes(x = year, y = value)) + 
  geom_line() + 
  facet_grid(contaminante ~ station_alias, scales = "free_y") +
  theme(axis.text = element_text(size = 6))

# We calculate the hourly mean of the pollutant levels.
hour_avgs <- air_data_3 %>%
  select(station_alias, hour, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, hour) %>%
  summarise_all(~ mean(.x, na.rm = TRUE))

# We convert the table to long format

hour_avgs_long <- gather(hour_avgs, contaminante, value, 3:length(hour_avgs))

# We present the data in a grid of graphs

ggplot(hour_avgs_long, aes(x = hour, y = value)) + 
  geom_line() + 
  facet_grid(contaminante ~ station_alias, scales = "free_y") +
  theme(axis.text = element_text(size = 6))

# We calculate the monthly mean of the pollutant levels.
month_avgs <- air_data_3 %>%
  select(station_alias, month, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, month) %>%
  summarise_all(~ mean(.x, na.rm = TRUE))

# We convert the table to long format

month_avgs_long <- gather(month_avgs, contaminante, value, 3:length(month_avgs))

# We present the data in a grid of graphs

ggplot(month_avgs_long, aes(x = month, y = value)) + 
  geom_line() + 
  facet_grid(contaminante ~ station_alias, scales = "free_y") +
  theme(axis.text = element_text(size = 6))

# We calculate the mean of the pollutant levels by day of the week.
week_day_avgs <- air_data_3 %>%
  select(station_alias, week_day, PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
  group_by(station_alias, week_day) %>%
  summarise_all(~ mean(.x, na.rm = TRUE))

# We convert the table to long format

week_day_avgs_long <- gather(week_day_avgs, contaminante, value, 3:length(week_day_avgs))

# We present the data in a grid of graphs

ggplot(week_day_avgs_long, aes(x = week_day, y = value)) + 
  geom_line() + 
  facet_grid(contaminante ~ station_alias, scales = "free_y") +
  theme(axis.text = element_text(size = 6))
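
The hourly, monthly and day-of-week summaries above repeat the same aggregate-then-gather pattern. Since purrr is already loaded, a hypothetical helper like the following could generate all three long tables from a single function. This is a sketch only; avgs_long_by is an illustrative name, it is not part of the original analysis, and it assumes dplyr >= 1.0 for across().

# Hypothetical helper (sketch): builds the long-format averages table for any
# grouping variable, matching the pattern used above.
avgs_long_by <- function(data, time_var) {
  data %>%
    select(station_alias, all_of(time_var),
           PM10, PM25, SO2, NO2, NO, O3, BEN, CO, MXIL, TOL) %>%
    group_by(station_alias, across(all_of(time_var))) %>%
    summarise_all(~ mean(.x, na.rm = TRUE)) %>%
    gather(contaminante, value, -(1:2)) # gather everything except the two grouping columns
}

# Usage sketch: one long table per period, named by the grouping variable
# avgs_by_period <- map(set_names(c("hour", "month", "week_day")),
#                       ~ avgs_long_by(air_data_3, .x))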

Prediction models

We are going to use the ARIMA method as the base model for our predictions. As a first step, we will try to predict the values of the PM10 pollutant for the Constitución station.

We create the pm10 dataset with the PM10 values from the Constitución station and run a summary:

pm10 <- air_data_3 %>% filter(station_alias == 'Constitución') %>%
                        select(date_time_utc, PM10)

summary(pm10)
##  date_time_utc                      PM10       
##  Min.   :2000-01-01 00:00:00   Min.   :  0.00  
##  1st Qu.:2004-06-30 23:15:00   1st Qu.: 19.00  
##  Median :2009-01-02 00:30:00   Median : 29.00  
##  Mean   :2008-12-31 18:50:45   Mean   : 34.39  
##  3rd Qu.:2013-07-02 23:45:00   3rd Qu.: 44.00  
##  Max.   :2017-12-31 23:00:00   Max.   :888.00  
##                                NA's   :3106

25% of the values lie between 44.00 and 888.00, and 888.00 is an extremely high value. How many extreme values (outliers) do we have in this series? We plot all the values to visualise this:

ggplot(pm10, aes(x = date_time_utc, y = PM10)) + 
         geom_point(alpha = 0.1)

We have very few values greater than 250, so outliers do not seem to be a problem. (Pending: is a PM10 level of 888 physically plausible, or is it more likely a monitoring error?)
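
To put a number on that impression, we could count the readings above 250 (a sketch, not part of the original analysis; the exact figures depend on the data):

# Sketch: how many hourly PM10 readings exceed 250, and what share of the
# non-missing observations do they represent?
pm10 %>%
  summarise(n_total = sum(!is.na(PM10)),
            n_over_250 = sum(PM10 > 250, na.rm = TRUE),
            pct_over_250 = 100 * n_over_250 / n_total)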

Daily averages

We create a new dataset with the PM10 daily averages and plot them, adding a trend line. There is a clear downward trend in the measurements, and there are many fewer extreme values during the last decade. It looks like there are two very clear “epochs” in the data, before and after the year 2008.

pm10_day_avg <- pm10 %>% group_by(day = date(date_time_utc)) %>%
                          summarise(day_avg = mean(PM10, na.rm = TRUE))

ggplot(pm10_day_avg, aes(x = day, y = day_avg, colour = day_avg)) + 
  geom_point(alpha = 0.5) +
  geom_smooth(color = "grey", alpha = 0.2) +
  scale_colour_gradientn(colours = terrain.colors(10)) +
  theme(legend.position = c(0.3, 0.9),
        legend.background = element_rect(colour = "transparent", fill = NA),
        legend.direction = "horizontal") +
  labs(colour = "PM10 daily average (colour scale)", x = "Year", y = "PM10 daily average",
       title = "PM10 daily average - 2000-2017 evolution (Constitución Station)")
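
The “two epochs” impression can also be checked numerically. As a sketch (this comparison is not part of the original analysis), we could compare the level and the upper tail of the daily averages before and after 2008:

# Sketch: compare daily PM10 averages before and after 2008
pm10_day_avg %>%
  mutate(epoch = if_else(year(day) < 2008, "2000-2007", "2008-2017")) %>%
  group_by(epoch) %>%
  summarise(mean_pm10 = mean(day_avg, na.rm = TRUE),
            p95_pm10  = quantile(day_avg, 0.95, na.rm = TRUE))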

The last graph shows a very clear trend across the years. But, as we already saw in the grid plots, there are other patterns at play at the same time.

year_const <- year_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot1 <- ggplot(year_const, aes(x = year, y = value)) + 
  geom_line()

month_const <- month_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot2 <- ggplot(month_const, aes(x = month, y = value)) + 
  geom_line()

week_day_const <- week_day_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot3 <- ggplot(week_day_const, aes(x = week_day, y = value)) + 
  geom_line()

hour_const <- hour_avgs_long %>% filter(station_alias == "Constitución", contaminante == 'PM10')
plot4 <- ggplot(hour_const, aes(x = hour, y = value)) + 
  geom_line()

grid.arrange(plot1, plot2, plot3, plot4, ncol = 2)
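
As a first look at the ARIMA approach announced at the start of this section, here is a minimal sketch using auto.arima from the forecast package (loaded via fpp2). It assumes the daily series is regularly spaced, fills missing daily averages with na.interp, and picks a weekly frequency and a 30-day horizon purely as illustrative starting points; none of these choices come from the original analysis.

# Sketch only: interpolate missing daily averages, fit an automatic ARIMA
# model and forecast the next 30 days.
pm10_ts <- na.interp(ts(pm10_day_avg$day_avg, frequency = 7))

fit_pm10 <- auto.arima(pm10_ts)
summary(fit_pm10)

autoplot(forecast(fit_pm10, h = 30))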