Loading the libraries we gonna use

library(dplyr)
library(tidyjson)
library(rjson)
library(lubridate)
library(GGally)
library(ggplot2)
library(xts)
library(zoo)
library(data.table)
library(heatmaply)
library(reshape2)
library(factoextra)
library(pca3d)
library(forecast)
library(caret)
library(corrplot)

Loading the datasets, one for each month

jan <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-01-01 00:00/2020-01-31 23:59")
feb <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-02-01 00:00/2020-02-31 23:59")
mar <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-03-01 00:00/2020-03-31 23:59")
apr <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-04-01 00:00/2020-04-31 23:59")
may <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-05-01 00:00/2020-05-31 23:59")
jun <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-06-01 00:00/2020-06-31 23:59")

Loading the datasets, one for each month for 2019

jan19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-01-01 00:00/2019-01-31 23:59")
feb19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-02-01 00:00/2019-02-31 23:59")
mar19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-03-01 00:00/2019-03-31 23:59")
apr19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-04-01 00:00/2019-04-31 23:59")
may19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-05-01 00:00/2019-05-31 23:59")
jun19<- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-06-01 00:00/2019-06-31 23:59")

Extracting the values for 2020

We gonna extract the values from our dataset. We gonna create tables for every station for all six months.

We gonna use the tidyjson library and spread_all function to spread the values and create a matrix in which

every row is a hourly measures and details for all types of pollution. Then we gonna save the matrix as .csv file.

Station 1

# creating the list of values for station 1 for all six months
station1 <- c(jan$data$station_1$values,feb$data$station_1$values,mar$data$station_1$values,
              apr$data$station_1$values,may$data$station_1$values,jun$data$station_1$values)
station1 <- station1 %>% spread_all() # spreading the list to one data.frame
station1$..JSON <- NULL # deleting the last column
write.csv(station1,"station1.csv",row.names = F) # saving the data.frame

Station 2

station2 <- c(jan$data$station_2$values,feb$data$station_2$values,mar$data$station_2$values,
              apr$data$station_2$values,may$data$station_2$values,jun$data$station_2$values)
station2 <- station2 %>% spread_all()
station2$..JSON <- NULL
write.csv(station2,"station2.csv", row.names = F)

Station 3

station3 <- c(jan$data$station_3$values,feb$data$station_3$values,mar$data$station_3$values,
              apr$data$station_3$values,may$data$station_3$values,jun$data$station_3$values)
station3 <- station3 %>% spread_all()
station3$..JSON <- NULL
write.csv(station3,"station3.csv", row.names = F)

Station 5

station5 <- c(jan$data$station_5$values,feb$data$station_5$values,mar$data$station_5$values,
              apr$data$station_5$values,may$data$station_5$values,jun$data$station_5$values)
station5 <- station5 %>% spread_all()
station5$..JSON <- NULL
write.csv(station5,"station5.csv", row.names = F)

Station 7

station7 <- c(jan$data$station_7$values,feb$data$station_7$values,mar$data$station_7$values,
              apr$data$station_7$values,may$data$station_7$values,jun$data$station_7$values)
station7 <- station7 %>% spread_all()
station7$..JSON <- NULL
write.csv(station7,"station7.csv", row.names = F)

Station 8

station8 <- c(jan$data$station_8$values,feb$data$station_8$values,mar$data$station_8$values,
              apr$data$station_8$values,may$data$station_8$values,jun$data$station_8$values)
station8 <- station8 %>% spread_all()
station8$..JSON <- NULL
write.csv(station8,"station8.csv", row.names = F)

Station 9

station9 <- c(jan$data$station_9$values,feb$data$station_9$values,mar$data$station_9$values,
              apr$data$station_9$values,may$data$station_9$values,jun$data$station_9$values)
station9 <- station9 %>% spread_all()
station9$..JSON <- NULL
write.csv(station9,"station9.csv", row.names = F)

Station 10

station10 <- c(jan$data$station_10$values,feb$data$station_10$values,mar$data$station_10$values,
               apr$data$station_10$values,may$data$station_10$values,jun$data$station_10$values)
station10 <- station10 %>% spread_all()
station10$..JSON <- NULL
write.csv(station10,"station10.csv", row.names = F)

Station 11

station11 <- c(jan$data$station_11$values,feb$data$station_11$values,mar$data$station_11$values,
               apr$data$station_11$values,may$data$station_11$values,jun$data$station_11$values)
station11 <- station11 %>% spread_all()
station11$..JSON <- NULL
write.csv(station11,"station11.csv", row.names = F)

Extracting the values for 2019

Station 1 2019

station1_19 <- c(jan19$data$station_1$values,feb19$data$station_1$values,mar19$data$station_1$values,
                 apr19$data$station_1$values,may19$data$station_1$values,jun19$data$station_1$values)
station1_19 <- station1_19 %>% spread_all()
station1_19$..JSON <- NULL
write.csv(station1_19,"station1_19.csv",row.names = F)

Station 2 2019

station2_19 <- c(jan19$data$station_2$values,feb19$data$station_2$values,mar19$data$station_2$values,
                 apr19$data$station_2$values,may19$data$station_2$values,jun19$data$station_2$values)
station2_19 <- station2_19 %>% spread_all()
station2_19$..JSON <- NULL
write.csv(station2_19,"station2_19.csv", row.names = F)

Station 3 2019

station3_19 <- c(jan19$data$station_3$values,feb19$data$station_3$values,mar19$data$station_3$values,
                 apr19$data$station_3$values,may19$data$station_3$values,jun19$data$station_3$values)
station3_19 <- station3_19 %>% spread_all()
station3_19$..JSON <- NULL
write.csv(station3_19,"station3_19.csv", row.names = F)

Station 5 2019

station5_19 <- c(jan19$data$station_5$values,feb19$data$station_5$values,mar19$data$station_5$values,
                 apr19$data$station_5$values,may19$data$station_5$values,jun19$data$station_5$values)
station5_19 <- station5_19 %>% spread_all()
station5_19$..JSON <- NULL
write.csv(station5_19,"station5_19.csv", row.names = F)

Station 7 2019

station7_19 <- c(jan19$data$station_7$values,feb19$data$station_7$values,mar19$data$station_7$values,
                 apr19$data$station_7$values,may19$data$station_7$values,jun19$data$station_7$values)
station7_19 <- station7_19 %>% spread_all()
station7_19$..JSON <- NULL
write.csv(station7_19,"station7_19.csv", row.names = F)

Station 8 2019

station8_19 <- c(jan19$data$station_8$values,feb19$data$station_8$values,mar19$data$station_8$values,
                 apr19$data$station_8$values,may19$data$station_8$values,jun19$data$station_8$values)
station8_19 <- station8_19 %>% spread_all()
station8_19$..JSON <- NULL
write.csv(station8_19,"station8_19.csv", row.names = F)

Station 9 2019

station9_19 <- c(jan19$data$station_9$values,feb19$data$station_9$values,mar19$data$station_9$values,
                 apr19$data$station_9$values,may19$data$station_9$values,jun19$data$station_9$values)
station9_19 <- station9_19 %>% spread_all()
station9_19$..JSON <- NULL
write.csv(station9_19,"station9_19.csv", row.names = F)

Station 10 2019

station10_19 <- c(jan19$data$station_10$values,feb19$data$station_10$values,mar19$data$station_10$values,
                  apr19$data$station_10$values,may19$data$station_10$values,jun19$data$station_10$values)
station10_19 <- station10_19 %>% spread_all()
station10_19$..JSON <- NULL
write.csv(station10_19,"station10_19.csv", row.names = F)

Station 11 2019

station11_19 <- c(jan19$data$station_11$values,feb19$data$station_11$values,mar19$data$station_11$values,
                  apr19$data$station_11$values,may19$data$station_11$values,jun19$data$station_11$values)
station11_19 <- station11_19 %>% spread_all()
station11_19$..JSON <- NULL
write.csv(station11_19,"station11_19.csv", row.names = F)

creating a vector with stations names

stations <- c("Nicosia_Traf","Nicosia_Resi","Limassol_Traf","Larnaca_Traf","Paphos_Traf",
              "Zygi_Indu","Mari_Indu","Ayia_Marina_Xylianou_Back","Paralimni_Traf")

Data preprocessing

We gonna load the datasets for every station, filter the columns and keep the values for every pollutant type. We’ll change the class of time to POSIXct, check for NAs and fill them using na.approx function

Station 1

s1 <- read.csv("station1.csv") # Loading the .csv file we previous create
s1 <- s1 %>% select(c(seq(2,38,4))) # Filtering the columns with pollutant values
names(s1) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25") #adding the column names
s1$time <- ymd_hms(s1$time) # changing the type of time
sapply(s1, function(x) sum(is.na(x))) # checking for NA
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0   99  675   35  104  104  375  103  104  675
# Filling NA with na.approx function
s1[,2:10] <- sapply(s1[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = T), x))
sapply(s1, function(x) sum(is.na(x))) # zero NA
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0    0    0    0    0    0    0    0    0    0
knitr::kable(head(s1,3)) # printing the first 3 rows
time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
2020-01-01 00:00:00 4.213296 113.18510 5.422678 41.66263 64.42079 1045.5090 5.304624 14.83504 89.80502
2020-01-01 01:00:00 3.986767 49.54911 7.720096 41.01508 90.58797 977.6353 2.203252 32.31437 35.75103
2020-01-01 02:00:00 4.940188 64.78768 4.493460 50.22812 119.87520 1263.8940 3.149906 45.39985 35.52024

Station 2

s2 <- read.csv("station2.csv")
s2 <- s2 %>% select(c(seq(2,26,4)))
names(s2) <- c("time","SO2","O3","NO2","NOx","CO","NO")
s2$time <- ymd_hms(s2$time)
sapply(s2, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO   NO 
##    0  179   44  100   84   89   50
s2[,2:7] <- sapply(s2[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 3

s3 <- read.csv("station3.csv")
s3 <- s3 %>% select(c(seq(2,30,4)))
names(s3) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3$time <- ymd_hms(s3$time)
sapply(s3, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO   NO 
##    0  684  133  124  228  228 1194  254
s3[,2:8] <- sapply(s3[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 5

s5 <- read.csv("station5.csv")
s5 <- s5 %>% select(c(seq(2,38,4)))
names(s5) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5$time <- ymd_hms(s5$time)
sapply(s5, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  199  210   97   64   64   98  185   95  221
s5[,2:10] <- sapply(s5[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 7

s7 <- read.csv("station7.csv")
s7 <- s7 %>% select(c(seq(2,38,4)))
names(s7) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s7$time <- ymd_hms(s7$time)
sapply(s7, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  656  406  121  165  164  320  539  206  479
s7[,2:10] <- sapply(s7[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 8

s8 <- read.csv("station8.csv")
s8 <- s8 %>% select(c(seq(2,34,4)))
names(s8) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8$time <- ymd_hms(s8$time)
sapply(s8, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx C6H6   NO PM25 
##    0  202  139   76   83   83 1027   91  142
s8[,2:9] <- sapply(s8[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 9

s9 <- read.csv("station9.csv")
s9 <- s9 %>% select(c(seq(2,30,4)))
names(s9) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9$time <- ymd_hms(s9$time)
sapply(s9, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO C6H6   NO 
##    0  180   44  114  114  626 1943  235
s9[,2:8] <- sapply(s9[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 10

s10 <- read.csv("station10.csv")
s10 <- s10 %>% select(c(seq(2,38,4)))
names(s10) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","PM25","NO")
s10$time <- ymd_hms(s10$time)
sapply(s10, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6 PM25   NO 
##    0  291 1144  108  180  180 1195  288 1167  429
s10[,2:10] <- sapply(s10[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 11

s11 <- read.csv("station11.csv")
s11 <- s11 %>% select(c(seq(2,34,4)))
names(s11) <- c("time","SO2","O3","NO2","NOx","CO","NO","PM10","PM25")
s11$time <- ymd_hms(s11$time)
sapply(s11, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO   NO PM10 PM25 
##    0  262  113  124  124   94  275  864  864
s11[,2:9] <- sapply(s11[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 1 2019

s1_19 <- read.csv("station1_19.csv")
s1_19 <- s1_19 %>% select(c(seq(2,38,4)))
names(s1_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s1_19$time <- ymd_hms(s1_19$time)
sapply(s1_19, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  156  654   63  113  113   84  125  115  654
s1_19[,2:10] <- sapply(s1_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 2 2019

s2_19 <- read.csv("station2_19.csv")
s2_19 <- s2_19 %>% select(c(seq(2,26,4)))
names(s2_19) <- c("time","SO2","CO","NO2","NOx","NO","O3")
s2_19$time <- ymd_hms(s2_19$time)
sapply(s2_19, function(x) sum(is.na(x)))
## time  SO2   CO  NO2  NOx   NO   O3 
##    0  820   85   94   94   96  115
s2_19[,2:7] <- sapply(s2_19[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 3 2019

s3_19 <- read.csv("station3_19.csv")
s3_19 <- s3_19 %>% select(c(seq(2,30,4)))
names(s3_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3_19$time <- ymd_hms(s3_19$time)
sapply(s3_19, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO   NO 
##    0  505   66  225  116  116  490  116
s3_19[,2:8] <- sapply(s3_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 5 2019

s5_19 <- read.csv("station5_19.csv")
s5_19 <- s5_19 %>% select(c(seq(2,38,4)))
names(s5_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5_19$time <- ymd_hms(s5_19$time)
sapply(s5_19, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  102  276  123   69   69   73   66   75  276
s5_19[,2:10] <- sapply(s5_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 7 2019

s7_19 <- read.csv("station7_19.csv")
s7_19 <- s7_19 %>% select(c(seq(2,38,4)))
names(s7_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s7_19$time <- ymd_hms(s7_19$time)
sapply(s7_19, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO C6H6   NO PM10 PM25 
##    0  410   26   70   65   80  354   73 1695 3680
s7_19[,2:10] <- sapply(s7_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 8 2019

s8_19 <- read.csv("station8_19.csv")
s8_19 <- s8_19 %>% select(c(seq(2,34,4)))
names(s8_19) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8_19$time <- ymd_hms(s8_19$time)
sapply(s8_19, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx C6H6   NO PM25 
##    0  547  153  135  183   78  212   78  153
s8_19[,2:9] <- sapply(s8_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

#### Station 9 2019

s9_19 <- read.csv("station9_19.csv")
s9_19<- s9_19 %>% select(c(seq(2,30,4)))
names(s9_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9_19$time <- ymd_hms(s9_19$time)
sapply(s9_19, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO C6H6   NO 
##    0  352  503  221  221  297 1052  225
s9_19[,2:8] <- sapply(s9_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 10 2019

s10_19 <- read.csv("station10_19.csv")
s10_19 <- s10_19 %>% select(c(seq(2,38,4)))
names(s10_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s10_19$time <- ymd_hms(s10_19$time)
sapply(s10_19, function(x) sum(is.na(x)))
## time  SO2   O3  NO2  NOx   CO C6H6   NO PM10 PM25 
##    0  922   88  209  209  321  276  406 1032 1030
s10_19[,2:10] <- sapply(s10_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 11 2019

s11_19 <- read.csv("station11_19.csv")
s11_19 <- s11_19 %>% select(c(seq(2,34,4)))
names(s11_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO","PM25")
s11_19$time <- ymd_hms(s11_19$time)
sapply(s11_19, function(x) sum(is.na(x)))
## time  SO2 PM10   O3  NO2  NOx   CO   NO PM25 
##    0  208  126   21   64   64   75   64  126
s11_19[,2:9] <- sapply(s11_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Changing the time

Next step is gonna be to change the time. Our dataset is in hours so we gonna create two new datasets for every station in which we gonna store mean value for each day and for each month. First we’ll transform the datasets to xts type and then create .d and .m datasets for days and months.

Station 1

s1.xts <- xts(s1, order.by = s1$time) # creating the xts type of s1 dataset
s1.d <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "days"), mean) #finding the mean for every day
s1.m <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "months"), mean) #finding the mean for every month
s1.d <- as.data.frame(s1.d) # changing for xts to data.frame type
s1.m <- as.data.frame(s1.m) # changing for xts to data.frame type

Station 1 2019

s1_19.xts <- xts(s1_19, order.by = s1_19$time) # creating the xts type of s1 dataset
s1_19.d <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "days"), mean) #finding the mean for every day
s1_19.m <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "months"), mean) #finding the mean for every month
s1_19.d <- as.data.frame(s1_19.d) # changing for xts to data.frame type
s1_19.m <- as.data.frame(s1_19.m) # changing for xts to data.frame type

Station 2

s2.xts <- xts(s2, order.by = s2$time)
s2.d <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "days"), mean)
s2.m <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "months"), mean)
s2.d <- as.data.frame(s2.d)
s2.m <- as.array.default(s2.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2.d$time <- as.POSIXct(row.names(s2.d),format = "%Y-%m-%d")

Station 2 2019

s2_19.xts <- xts(s2_19, order.by = s2_19$time)
s2_19.d <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "days"), mean)
s2_19.m <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "months"), mean)
s2_19.d <- as.data.frame(s2_19.d)
s2_19.m <- as.array.default(s2_19.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2_19.d$time <- as.POSIXct(row.names(s2_19.d),format = "%Y-%m-%d")

Station 3

s3.xts <- xts(s3, order.by = s3$time)
s3.d <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "days"), mean)
s3.m <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "months"), mean)
s3.d <- as.data.frame(s3.d)
s3.m <- as.data.frame(s3.m)

Station 3 2019

s3_19.xts <- xts(s3_19, order.by = s3_19$time)
s3_19.d <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "days"), mean)
s3_19.m <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "months"), mean)
s3_19.d <- as.data.frame(s3_19.d)
s3_19.m <- as.data.frame(s3_19.m)

Station 5

s5.xts <- xts(s5, order.by = s5$time)
s5.d <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "days"), mean)
s5.m <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "months"), mean)
s5.d <- as.data.frame(s5.d)
s5.m <- as.data.frame(s5.m)

Station 5 2019

s5_19.xts <- xts(s5_19, order.by = s5_19$time)
s5_19.d <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "days"), mean)
s5_19.m <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "months"), mean)
s5_19.d <- as.data.frame(s5_19.d)
s5_19.m <- as.data.frame(s5_19.m)

Station 7

s7.xts <- xts(s7, order.by = s7$time)
s7.d <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "days"), mean)
s7.m <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "months"), mean)
s7.d <- as.data.frame(s7.d)
s7.m <- as.data.frame(s7.m)
s7.d$time <- as.POSIXct(row.names(s7.d),format = "%Y-%m-%d")

Station 7 2019

s7_19.xts <- xts(s7_19, order.by = s7_19$time)
s7_19.d <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "days"), mean)
s7_19.m <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "months"), mean)
s7_19.d <- as.data.frame(s7_19.d)
s7_19.m <- as.data.frame(s7_19.m)
s7_19.d$time <- as.POSIXct(row.names(s7_19.d),format = "%Y-%m-%d")

Station 8

s8.xts <- xts(s8, order.by = s8$time)
s8.d <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "days"), mean)
s8.m <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "months"), mean)
s8.d <- as.data.frame(s8.d)
s8.m <- as.data.frame(s8.m)
s8.d$time <- as.POSIXct(row.names(s8.d),format = "%Y-%m-%d")

Station 8 2019

s8_19.xts <- xts(s8_19, order.by = s8_19$time)
s8_19.d <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "days"), mean)
s8_19.m <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "months"), mean)
s8_19.d <- as.data.frame(s8_19.d)
s8_19.m <- as.data.frame(s8_19.m)
s8_19.d$time <- as.POSIXct(row.names(s8_19.d),format = "%Y-%m-%d")

Station 9

s9.xts <- xts(s9, order.by = s9$time)
s9.d <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "days"), mean)
s9.m <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "months"), mean)
s9.d <- as.data.frame(s9.d)
s9.m <- as.data.frame(s9.m)

Station 9 2019

s9_19.xts <- xts(s9_19, order.by = s9_19$time)
s9_19.d <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "days"), mean)
s9_19.m <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "months"), mean)
s9_19.d <- as.data.frame(s9_19.d)
s9_19.m <- as.data.frame(s9_19.m)
s9_19.d$time <- as.POSIXct(row.names(s9_19.d),format = "%Y-%m-%d")

Station 10

s10.xts <- xts(s10, order.by = s10$time)
s10.d <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "days"), mean)
s10.m <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "months"), mean)
s10.d <- as.data.frame(s10.d)
s10.m <- as.data.frame(s10.m)

Station 10 2019

s10_19.xts <- xts(s10_19, order.by = s10_19$time)
s10_19.d <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "days"), mean)
s10_19.m <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "months"), mean)
s10_19.d <- as.data.frame(s1_19.d)
s10_19.m <- as.data.frame(s10_19.m)

Station 11

s11.xts <- xts(s11, order.by = s11$time)
s11.d <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "days"), mean)
s11.m <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "months"), mean)
s11.d <- as.data.frame(s11.d)
s11.m <- as.data.frame(s11.m)

Station 11 2019

s11_19.xts <- xts(s11_19, order.by = s11_19$time)
s11_19.d <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "days"), mean)
s11_19.m <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "months"), mean)
s11_19.d <- as.data.frame(s11_19.d)
s11_19.m <- as.data.frame(s11_19.m)

deleting the datasets we dont gonna use

rm(s1,s1.xts,s10, s10.xts, s11, s11.xts,s2,s2.xts,s3, s3.xts, s5, s5.xts, 
   s7, s7.xts, s8, s8.xts,s9, s9.xts, s10, s10.xts, s11, s11.xts, s1_19,s1_19.xts,
   s10_19,s10_19.xts,s11_19,s11_19.xts,s2_19,s2_19.xts,s3_19,s3_19.xts,s5_19,
   s5_19.xts,s7_19,s7_19.xts,s8_19,s8_19.xts,s9_19,s9_19.xts)

PM10 Analysis

Creating the dataframe for pm10 with day columns

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10 <- data.frame("station1"=s1.d$PM10,"station3"=s3.d$PM10,"station5"=s5.d$PM10,
                   "station10"=s10.d$PM10,"station11"=s11.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
pm10 <- full_join(pm10,s7.d[,c("time","PM10")],by="time")
# Full join pm10 and station8 by time column
pm10 <- full_join(pm10,s8.d[,c("time","PM10")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10 <- pm10 %>% rename(station7=PM10.x, station8=PM10.y) %>%
  select(time,station1,station3,station5,station7,station8,station10,station11) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
pm10$mean <- rowMeans(pm10[,2:8])
knitr::kable(head(pm10,3)) # printing the first 3 columns
time station1 station3 station5 station7 station8 station10 station11 mean
2020-01-01 42.89521 18.62904 34.34300 20.03441 26.316686 3.927083 26.23751 24.62613
2020-01-02 20.59653 10.59705 23.11284 17.06729 10.038982 3.721084 30.37896 16.50182
2020-01-03 10.83952 11.74851 13.33836 16.47911 6.818125 3.705833 30.29477 13.31775
### PM10 2019
# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10_19 <- data.frame("station1"=s1_19.d$PM10,"station3"=s3_19.d$PM10,"station5"=s5_19.d$PM10,
                  "station7"=s7_19.d$PM10,"station8"=s8_19.d$PM10,"station10"=s10_19.d$PM10,
                  "station11"=s11_19.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10_19 <- pm10_19 %>% select(time,station1,station3,station5,station7,station8,station10,station11)
# Adding a mean column with mean value of PM10 for every day
pm10_19$mean <- rowMeans(pm10_19[,2:8])
knitr::kable(head(pm10_19,3)) # printing the first 3 columns
time station1 station3 station5 station7 station8 station10 station11 mean
2019-01-01 17.65954 25.25458 27.18125 24.36375 10.75981 17.65954 24.47333 21.05026
2019-01-02 30.83935 18.45208 27.98375 27.76417 20.74846 30.83935 26.08583 26.10186
2019-01-03 26.35819 15.31958 24.06979 20.94458 13.82719 26.35819 21.59292 21.21006

Creating the dataframe for pm10 with monthly columns

# adding the values to dataframe
pm10.m <- data.frame(s1.m$PM10,s3.m$PM10,s5.m$PM10,s7.m$PM10,
                     s8.m$PM10,s10.m$PM10,s11.m$PM10)
# Creating a column with mean values for every month
pm10.m$mean <- rowMeans(pm10.m)
# Adding row and column names
rownames(pm10.m) <- c("January","February","March","April","May","June") 
colnames(pm10.m) <- c(stations[c(1,3:6,8:9)],"mean")
knitr::kable(pm10.m, digits = 4) #printing the table
Nicosia_Traf Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
January 36.3907 25.0343 30.1276 25.8627 16.7807 7.6372 28.1558 24.2841
February 30.6833 27.2613 28.6184 26.1846 21.0343 11.2538 28.1471 24.7404
March 30.3524 27.4642 33.2767 26.4542 25.0162 20.2982 29.5287 27.4844
April 25.1905 23.7127 27.7513 21.3241 25.6438 19.1190 25.2619 24.0005
May 21.0555 35.3394 36.1480 35.4592 36.0006 25.1626 32.7273 31.6990
June 25.9583 26.7145 34.0229 23.4378 32.5035 18.4995 31.6976 27.5477

Summary statistics of PM10 for each station

colnames(pm10) <- c("time",stations[c(1,3:6,8:9)],"mean")
knitr::kable(do.call(cbind, lapply(pm10[2:9], summary)))
Nicosia_Traf Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
Min. 10.83952 10.00244 11.00115 9.065349 4.911398 3.240060 12.13500 12.47557
1st Qu. 20.87687 19.09575 22.17966 18.909459 17.613646 9.727647 21.06490 20.33870
Median 24.97054 25.34259 28.60686 24.131823 22.768948 14.381458 26.49583 24.25839
Mean 28.29714 27.61801 31.69190 26.171577 26.316686 17.084898 29.26734 26.63536
3rd Qu. 35.07800 32.26869 36.29274 29.643809 30.876087 19.562292 33.07865 29.54169
Max. 61.74035 94.58720 96.06582 89.734498 120.352083 82.874167 82.96542 79.25038

Data preperation

pm10.t <- as.data.frame(t(as.matrix(pm10))) # transpose the pm10 matrix
# changing row and column names
rownames(pm10.t) <- c("time",stations[c(1,3:6,8:9)],"mean")
colnames(pm10.t) <- pm10$time
pm10.bar2 <- pm10.t[2:8,]
pm10.bar2$row.names <- rownames(pm10.bar2)
# changing the data from wide format to a single column of data
pm10.bar2<-melt(pm10.bar2,id=c("row.names"))
# Changing the type of class
pm10.bar2$value <- as.numeric(pm10.bar2$value)
pm10.bar2$variable <- ymd(pm10.bar2$variable)
pm10.bar2$row.names <- as.factor(pm10.bar2$row.names)

Boxplot for each station

ggplot(data = pm10.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

PM10 time series.

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. We can see there are some peaks as a result of dust episodes. We can see that COVID19 situation did not affect the values of PM10.

ggplot() + geom_line(data=pm10, aes(x=time,y=mean)) + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  labs(y="PM10 value", x="Month",title="January to June 2020 Time series for PM10") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notation", values = c(Lockdown="red",EU_limit_value="magenta",
                           WHO_air_quality_guideline="green",Flight_ban="orange")) +
  theme_classic() +
  annotate("text",x=pm10$time[140],y = 80,label="Dust Episode", size =4, fontface="bold", color="brown" )

pm10$meansel <- rowMeans(pm10[,c(2,3,5,6)])
pm10_19$meansel <- rowMeans(pm10_19[,c(2,3,5,6)])
pm10$index <- yday(pm10$time)
pm10_19$index <- yday(pm10_19$time)
ggplot() + geom_line(data=pm10, aes(x=index,y=meansel,color="2020"), size=1)+
  geom_line(data=pm10_19, aes(x=index,y=meansel, color="2019"), size=1) + 
  geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic() + labs(color="Legend text")

ggplot() +  
  geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic() + labs(color="Legend text")

Time Series plot for every station

sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) +
  geom_area(aes(color = row.names, fill = row.names),alpha = 0.7) + theme_bw() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(fill='Station') + labs(color="Station") 

ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
    alpha = 0.6, size=1) + theme_classic() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(color="Station") 

PM10 daily difference

COVID19 situation doesn’t change the stability of PM10 and the differences seems to be almost the same before and after lockdown. The highest differences can be spotted the first 2 weeks after lockdown.

pm10.ts <- ts(pm10[9], start=c(2020,1) , frequency = 365)
pm10.diff <- diff(pm10.ts)
pm10.dif <- data.frame("time"=pm10$time,"diff"=c(0,pm10.diff)) 
ggplot(pm10.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("PM10 difference") +
  ggtitle("PM10 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values

pm10.mt <- as.data.frame(t(as.matrix(pm10.m))) ##transpose of month matrix
rownames(pm10.mt) <- c(stations[c(1,3:6,8:9)],"mean") #adding row names
heatmaply(pm10.mt, grid_color = "white", dendrogram = "none", main = "PM10 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(pm10.mt,2))

Barplot of PM10 for every month and visual contribution of each station

pm10.bar <- pm10.mt[1:7,] # row filtering
pm10.bar$row.names <- rownames(pm10.bar) # adding column with row names
# changing the data from wide format to a single column of data
pm10.bar<-melt(pm10.bar,id=c("row.names"))
# creating the plot
ggplot(pm10.bar,aes(x=variable,y=value/7,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "PM10 per month and station contribution", y = "PM10 value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

I choose to create three clusters. The clusters seems to be very reasonal. The green cluster includes four of five traffic stations. The red includes only Ayia_Marina_Xyliaroy station, the only Rural Station of our dataset and the blue includes Paphos traffic station and Zygi industrial station.

pm10.t2 <- as.data.frame(t(as.matrix(pm10[,2:8])))
row.names(pm10.t2)<- stations[c(1,3:6,8:9)]
pm10.t2$station <- stations[c(1,3:6,8:9)]
pm10.scaled <- as.data.frame(scale(pm10.t2[,1:182])) # scaling
dist_pm10 <- dist(pm10.scaled, method = 'euclidean') # calculating the distance
hclust_pm10 <- hclust(dist_pm10, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_pm10,labels=pm10.t2$station)
rect.hclust(hclust_pm10 ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule.

We can see that a good choice would be to choose k=2

fviz_nbclust(pm10.scaled, kmeans, method = "wss", k.max=6)

PCA

pm10_pca <- prcomp(pm10.t2[,-183], center = TRUE, scale. = TRUE)
summary(pm10_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6       PC7
## Standard deviation     8.9216 5.8958 5.1523 3.95726 3.69668 3.43124 3.983e-15
## Proportion of Variance 0.4373 0.1910 0.1459 0.08604 0.07508 0.06469 0.000e+00
## Cumulative Proportion  0.4373 0.6283 0.7742 0.86023 0.93531 1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 64% of the total variance

sub_grp <- cutree(hclust_pm10, k = 3)
fviz_cluster(list(data = pm10.scaled, cluster = sub_grp))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 77% of the total variance

pca3d(pm10_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 1.4701407 0.7154049 0.6374044
## Creating new device
PCA 3d

PCA 3d

NOx Analysis

Creating the dataframe for NOx with day columns

nox <- data.frame("station1"=s1.d$NOx,"station3"=s3.d$NOx ,"station5"=s5.d$NOx , 
                   "station9"=s9.d$NOx,"station10"=s10.d$NOx,"station11"=s11.d$NOx)
nox$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
nox <- full_join(nox,s2.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s7.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s8.d[,c("time","NOx")],by="time")

nox <- nox %>% rename(station2=NOx.x, station7=NOx.y, station8=NOx) %>%
  select(time,station1,station2,station3,station5,station7,station8,
         station9,station10,station11) %>%
  mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
         station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox$mean <- rowMeans(nox[,2:10])
knitr::kable(head(nox,3))
time station1 station2 station3 station5 station7 station8 station9 station10 station11 mean
2020-01-01 82.36372 45.79606 40.62136 58.09139 25.14944 12.33513 8.669477 1.704961 35.21159 34.43812
2020-01-02 56.73409 40.99054 55.56007 45.45797 26.65660 13.26133 10.280462 2.418991 39.59478 32.32831
2020-01-03 42.54235 49.56395 86.81541 65.22078 36.50002 12.76792 14.621379 2.824170 39.55709 38.93479
nox_19 <- data.frame("station1"=s1_19.d$NOx,"station3"=s3_19.d$NOx ,"station5"=s5_19.d$NOx ,
                     "station7"=s7_19.d$NOx,"station8"=s8_19.d$NOx,
                     "station10"=s10_19.d$NOx,"station11"=s11_19.d$NOx)
nox_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
nox_19 <- full_join(nox_19,s2_19.d[,c("time","NOx")],by="time")
nox_19 <- full_join(nox_19,s9_19.d[,c("time","NOx")],by="time")

nox_19 <- nox_19 %>% rename(station2=NOx.x, station9=NOx.y) %>%
  select(time,station1,station2,station3,station5,station7,station8,
         station9,station10,station11) %>%
  mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
         station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox_19$mean <- rowMeans(nox_19[,2:10])
knitr::kable(head(nox_19,3))
time station1 station2 station3 station5 station7 station8 station9 station10 station11 mean
2019-01-01 41.97831 37.66930 74.46432 46.32760 16.02855 7.482461 7.172156 41.97831 22.16688 32.80754
2019-01-02 87.88999 80.63773 49.68460 52.49455 18.10256 11.683488 6.102470 87.88999 39.24132 48.19186
2019-01-03 103.52024 76.38051 43.31829 54.20211 19.50543 12.245074 10.726350 103.52024 58.94068 53.59544

Creating the dataframe for pm10 with monthly columns

nox.m <- data.frame(s1.m$NOx,s2.m$NOx,s3.m$NOx,
                    s5.m$NOx,s7.m$NOx,s8.m$NOx,
                    s9.m$NOx,s10.m$NOx,s11.m$NOx)
nox.m$Mean <- rowMeans(nox.m)
row.names(nox.m)<-c("January","February","March","April","May","June")
colnames(nox.m)<-c(stations,"Mean")
knitr::kable(nox.m, digits = 2)
Nicosia_Traf Nicosia_Resi Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf Mean
January 91.52 52.55 65.59 59.05 30.98 13.16 12.87 2.92 42.54 41.24
February 63.56 42.07 57.72 44.19 24.97 15.39 14.25 3.67 30.46 32.92
March 33.44 21.28 34.36 29.16 15.23 13.05 10.73 3.22 16.98 19.72
April 15.85 8.54 13.42 11.89 6.16 7.63 8.12 2.38 9.91 9.32
May 25.49 12.75 23.82 20.43 16.50 9.47 10.97 2.28 14.27 15.11
June 23.35 14.33 27.15 20.36 10.06 15.31 18.14 1.83 15.18 16.19
colnames(nox) <- c("time",stations,"mean")
knitr::kable(do.call(cbind, lapply(nox[2:11], summary)))
Nicosia_Traf Nicosia_Resi Limassol_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back Paralimni_Traf mean
Min. 4.381602 1.701600 4.374026 3.925617 1.096979 3.354127 4.492203 0.8867552 4.751423 4.325509
1st Qu. 18.919483 9.213398 16.921153 14.426882 6.056446 8.673014 8.477460 2.0534770 10.675313 12.570360
Median 28.274242 16.293681 30.563370 22.375224 15.017890 12.043879 10.922486 2.4228941 15.605522 17.711148
Mean 42.217921 25.329135 36.976008 30.844837 17.386851 12.335134 12.482777 2.7217090 21.551826 22.427355
3rd Qu. 54.810168 39.055830 52.007920 44.548574 25.405847 15.280685 15.588738 3.0809064 24.351356 31.687493
Max. 158.532276 100.709014 139.149394 126.046744 75.349972 27.850411 28.655146 8.9129858 113.762156 68.516004

data preperation

nox.t <- as.data.frame(t(as.matrix(nox)))
# changing row and column names
colnames(nox.t) <- nox$time
nox.bar <- nox.t[2:10,]
nox.bar$row.names <- rownames(nox.bar)
# changing the data from wide format to a single column of data
nox.bar<-melt(nox.bar,id=c("row.names"))
nox.bar$value <- as.numeric(nox.bar$value)
nox.bar$variable <- ymd(nox.bar$variable)
nox.bar$row.names <- as.factor(nox.bar$row.names)

Boxplot for each station

ggplot(data = nox.bar, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="NOx boxplot for all stations", x="Stations",y="NOx") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

NOx time series

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. Some peaks are a result of dust episodes. We can see that COVID19 situation affect the values of NOx.

ggplot(nox, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Month",title="January to June 2020 Time series for NOx") + 
  scale_color_manual(name = "statistics", values = c(Lockdown="red",EU_limit_value="magenta",
                           Flight_ban="orange",WHO_air_quality_guideline="green")) +
  theme_classic() + 
  annotate("text",x=pm10$time[140],y = 37,label="Dust Episode", size =4, fontface="bold", color="brown" )

nox$meansel <- rowMeans(nox[,c(2,4,6,7)])
nox_19$meansel <- rowMeans(nox_19[,c(2,4,6,7)])
nox$index <- yday(nox$time)
nox_19$index <- yday(nox_19$time)
ggplot() + geom_line(data=nox, aes(x=index,y=meansel,color="2020"),size=1)+
  geom_line(data=nox_19, aes(x=index,y=meansel, color="2019"), size=1) + 
  geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

ggplot() + geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

Time Series plot for every station

ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + 
  geom_area(aes(fill = row.names, colour=row.names),alpha = 0.3) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(fill="Station") + labs(colour="Station")

ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(colour=row.names), alpha=0.6, size=1) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(colour="Station")

NOx daily difference

After COVID19 situation we can see that the daily difference is low and NOx seems to be more stable.

nox.ts <- ts(nox[11], start=c(2020,1) , frequency = 365)
nox.diff <- diff(nox.ts)
nox.diff <- data.frame("time"=nox$time,"diff"=c(0,nox.diff)) 
ggplot(nox.diff, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("NOx difference") +
  ggtitle("NOx daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot for NOx monthly values for each station

nox.mt <- as.data.frame(t(as.matrix(nox.m)))
heatmaply(nox.mt, grid_color = "white", dendrogram = "none", main = "NOx per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(nox.mt,2))

Barplot of NOx for every month and visual contribution of each station

nox.bar2 <- nox.mt[1:9,]
nox.bar2$row.names <- rownames(nox.bar2) # adding column with row names
# changing the data from wide format to a single column of data
nox.bar2<-melt(nox.bar2,id=c("row.names"))
# creating the plot
ggplot(nox.bar2,aes(x=variable,y=value/9,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "NOx per month and station contribution", y = "NOx value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

Performing a Hierarchical Cluster analysis, calculating the distance between two points using euclidean distance and complete method to calculate the distance between clusters. Ayia Marina Xylianou has her own cluster, the red cluster includes the traffic stations of the three big cities of Cyprus and the blue cluster includes the traffic station of smallers cities like Paralimni and Paphos, the industrial stations and the Nicosia residential station. We can assume red cluster has the highest NOx values, green cluster has the lowest NOx values and blue cluster has NOx values between green and red.

nox.t2 <- as.data.frame(t(as.matrix(nox[,2:10])))
nox.t2$station <- stations
nox.scaled <- as.data.frame(scale(nox.t2[,1:182]))
dist_nox <- dist(nox.scaled, method = 'euclidean') # calculating the distance
hclust_nox <- hclust(dist_nox, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_nox,labels=nox.t2$station)
rect.hclust(hclust_nox ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule for NOx

fviz_nbclust(nox.scaled, kmeans, method = "wss", k.max=8)

PCA

nox_pca <- prcomp(nox.t2[,-183], center = TRUE, scale. = TRUE)
summary(nox_pca)
## Importance of components:
##                            PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     11.1551 4.3793 3.68206 2.70529 2.47200 2.20758 1.98466
## Proportion of Variance  0.6837 0.1054 0.07449 0.04021 0.03358 0.02678 0.02164
## Cumulative Proportion   0.6837 0.7891 0.86359 0.90380 0.93737 0.96415 0.98579
##                            PC8       PC9
## Standard deviation     1.60800 2.751e-15
## Proportion of Variance 0.01421 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 79% of the total variance

sub_grp2 <- cutree(hclust_nox, k = 3)
fviz_cluster(list(data = nox.scaled, cluster = sub_grp2))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 86% of the total variance

pca3d(nox_pca, group=sub_grp2, radius = 4, show.labels=T)
## [1] 1.5335850 0.5048702 0.4734467
NOx PCA 3d

NOx PCA 3d

C6H6

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
c6h6 <- data.frame("station1"=s1.d$C6H6,"station5"=s5.d$C6H6,"station9"=s9.d$C6H6,"station10"=s10.d$C6H6)
# Creating a time column to help us join previous dataframe with station 7 and 8.
c6h6$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
c6h6 <- full_join(c6h6,s7.d[,c("time","C6H6")],by="time")
# Full join pm10 and station8 by time column
c6h6 <- full_join(c6h6,s8.d[,c("time","C6H6")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
c6h6 <- c6h6 %>% rename(station7=C6H6.x, station8=C6H6.y) %>%
  select(time,station1,station5,station7,station8,station9,station10) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
c6h6$mean <- rowMeans(c6h6[,2:7])
knitr::kable(head(c6h6,3)) # printing the first 3 columns
time station1 station5 station7 station8 station9 station10 mean
2020-01-01 2.3974533 2.0189181 1.216708 0.2946909 0.5019321 0.7258804 1.1925971
2020-01-02 1.4069889 0.9467244 1.261858 0.5025892 0.3706992 0.6880867 0.8628245
2020-01-03 0.8622788 1.1730584 1.267356 0.4103396 0.2715304 0.4655657 0.7416880
# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
c6h6_19 <- data.frame("station1"=s1_19.d$C6H6,"station5"=s5_19.d$C6H6,
                   "station7"=s7_19.d$C6H6,"station8"=s8_19.d$C6H6,"station10"=s10_19.d$C6H6)
# Creating a time column to help us join previous dataframe with station 7 and 8.
c6h6_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
c6h6_19 <- full_join(c6h6_19,s9_19.d[,c("time","C6H6")],by="time")
# Full join pm10 and station8 by time column

# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
c6h6_19 <- c6h6_19 %>% rename(station9=C6H6) %>%
  select(time,station1,station5,station7,station8,station9,station10) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
c6h6_19$mean <- rowMeans(c6h6_19[,2:7])
knitr::kable(head(c6h6_19,3)) # printing the first 3 columns
time station1 station5 station7 station8 station9 station10 mean
2019-01-01 0.8574786 1.907207 0.9028013 0.8523375 0.2651717 0.8574786 0.9404124
2019-01-02 1.4710263 1.654617 0.6863346 0.7166400 0.1449858 1.4710263 1.0241050
2019-01-03 1.5480072 1.412580 0.8689784 0.6715878 0.0338229 1.5480072 1.0138307

Creating thedataframe for C6H6 with monthly columns

# adding the values to dataframe
c6h6.m <- data.frame(s1.m$C6H6,s5.m$C6H6,s7.m$C6H6,
                     s8.m$C6H6,s9.m$C6H6,s10.m$C6H6)
# Creating a column with mean values for every month
c6h6.m$mean <- rowMeans(c6h6.m)
# Adding row and column names
rownames(c6h6.m) <- c("January","February","March","April","May","June") 
colnames(c6h6.m) <- c(stations[c(1,4:8)],"mean")
knitr::kable(c6h6.m, digits = 4) #printing the table
Nicosia_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back mean
January 1.7935 2.0161 1.5628 0.5110 0.3328 0.6582 1.1457
February 1.5373 1.1909 1.0157 0.1949 0.6308 0.6037 0.8622
March 0.8519 0.6539 0.4443 0.4412 0.7596 0.4543 0.6009
April 0.4002 0.3344 0.1802 0.2885 0.2698 0.3335 0.3011
May 0.4054 0.4391 0.2529 0.1717 0.0853 0.2006 0.2592
June 0.3129 0.4197 0.1785 0.1581 0.0479 0.1933 0.2184

Summary statistics of PM10 for each station

colnames(c6h6) <- c("time",stations[c(1,4:8)],"mean")
knitr::kable(do.call(cbind, lapply(c6h6[2:8], summary)))
Nicosia_Traf Larnaca_Traf Paphos_Traf Zygi_Indu Mari_Indu Ayia_Marina_Xylianou_Back mean
Min. 0.1441559 0.1363796 0.0545053 0.0000000 0.0045999 0.0251507 0.1360316
1st Qu. 0.3300891 0.3396551 0.1872086 0.1363064 0.1031937 0.2294919 0.2477120
Median 0.6190251 0.5113127 0.3523731 0.2865139 0.2722792 0.3869274 0.4228677
Mean 0.8820508 0.8434774 0.6204580 0.2946909 0.3532593 0.4076478 0.5669307
3rd Qu. 1.2920771 1.0519909 0.9430208 0.4188988 0.6641393 0.5345103 0.8204481
Max. 2.9314320 5.7590451 2.4143210 0.8420553 0.8128323 1.2147028 2.0201268

Data preperation

c6h6.t <- as.data.frame(t(as.matrix(c6h6))) # transpose the pm10 matrix
# changing row and column names
rownames(c6h6.t) <- c("time",stations[c(1,4:8)],"mean")
colnames(c6h6.t) <- c6h6$time
c6h6.bar2 <- c6h6.t[2:7,]
c6h6.bar2$row.names <- rownames(c6h6.bar2)
# changing the data from wide format to a single column of data
c6h6.bar2<-melt(c6h6.bar2,id=c("row.names"))
# Changing the type of class
c6h6.bar2$value <- as.numeric(c6h6.bar2$value)
c6h6.bar2$variable <- ymd(c6h6.bar2$variable)
c6h6.bar2$row.names <- as.factor(c6h6.bar2$row.names)

Boxplot for each station

ggplot(data = c6h6.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="C6H6 boxplot for all stations", x="Stations",y="C6H6") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

C6H6 time series.

c6h6$meansel <-  rowMeans(c6h6[,c(2,4,5)])
ggplot(c6h6, aes(x=time,y=meansel)) + geom_line(size=1, alpha=0.7) + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(c6h6$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(c6h6$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  labs(y="C6H6 value", x="Month",title="January to June 2020 Time series for C6H6") + 
  scale_color_manual(name = "Line notation", values = c(Lockdown="red",Flight_ban="orange")) +
  theme_classic() +
  annotate("text",x=c6h6$time[140],y = 1,label="Dust Episode", size =4, fontface="bold", color="brown" ) + geom_line(aes(x=time,y=mean), color="red", size=1, alpha=0.7)

c6h6_19$meansel <-  rowMeans(c6h6_19[,c(2,4,5)])
c6h6$index <- yday(c6h6$time)
c6h6_19$index <- yday(c6h6_19$time)
ggplot() + geom_line(data=c6h6, aes(x=index,y=meansel,color="2020"),size=1)+
           geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
           geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
           geom_line(data=c6h6_19, aes(x=index,y=meansel,color="2019"),size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c("2020"="red",
                           "2019"="blue",
                           "2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

ggplot() + geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020"), size=1,level=0.95) +
           geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c("2019"="yellow","2020"="brown")) +
  scale_linetype_manual(name = "Lockdown date", values = c("Lockdown" = "dashed")) + theme_classic()

Time series plot for every station

ggplot(c6h6.bar2, aes(x=variable,y=as.numeric(value))) + geom_area(aes(color = row.names, fill = row.names),
    alpha = 0.7) + theme_bw() +
    labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
    labs(fill='Station') + labs(color="Station") 

sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(c6h6.bar2[c6h6.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
    alpha = 0.5, size=1) + theme_classic() +
    labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
    labs(color="Station") 

C6H6 daily difference

c6h6.ts <- ts(c6h6[8], start=c(2020,1) , frequency = 365)
c6h6.diff <- diff(c6h6.ts)
c6h6.dif <- data.frame("time"=c6h6$time,"diff"=c(0,c6h6.diff)) 
ggplot(c6h6.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("C6H6 difference") +
  ggtitle("C6H6 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values

c6h6.mt <- as.data.frame(t(as.matrix(c6h6.m))) ##transpose of month matrix
rownames(c6h6.mt) <- c(stations[c(1,4:8)],"mean") #adding row names
heatmaply(c6h6.mt, grid_color = "white", dendrogram = "none", main = "C6H6 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(c6h6.mt,2))

Barplot of C6H6 for every month and visual contribution of each station

c6h6.bar <- c6h6.mt[1:6,] # row filtering
c6h6.bar$row.names <- rownames(c6h6.bar) # adding column with row names
# changing the data from wide format to a single column of data
c6h6.bar<-melt(c6h6.bar,id=c("row.names"))
# creating the plot
ggplot(c6h6.bar,aes(x=variable,y=value/6,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "C6H6 per month and station contribution", y = "C6H6 value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

c6h6.t2 <- as.data.frame(t(as.matrix(c6h6[,2:7])))
row.names(c6h6.t2)<- stations[c(1,4:8)]
c6h6.t2$station <- stations[c(1,4:8)]
c6h6.scaled <- as.data.frame(scale(c6h6.t2[,1:182])) # scaling
dist_c6h6 <- dist(c6h6.scaled, method = 'euclidean') # calculating the distance
hclust_c6h6 <- hclust(dist_c6h6, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_c6h6,labels=c6h6.t2$station)
rect.hclust(hclust_c6h6 ,  k = 3,border = 2:6) # adding the 3 cluster boxes

fviz_nbclust(c6h6.scaled, kmeans, method = "wss", k.max=5)

PCA for C6H6

c6h6_pca <- prcomp(c6h6.t2[,-183], center = TRUE, scale. = TRUE)
summary(c6h6_pca)
## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5       PC6
## Standard deviation     9.691 6.0408 5.3560 3.70041 3.03511 2.156e-15
## Proportion of Variance 0.516 0.2005 0.1576 0.07524 0.05061 0.000e+00
## Cumulative Proportion  0.516 0.7165 0.8741 0.94939 1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 71.5% of the total variance

sub_grp <- cutree(hclust_c6h6, k = 3)
fviz_cluster(list(data = c6h6.scaled, cluster = sub_grp))

PCA 3d plot for C6H6

This 3d plot for the first three principal components, accounting for 87% of the total variance

pca3d(c6h6_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 0.9164679 0.7704207 0.5425491
PCA 3d

PCA 3d

Loading the covid database

covid <- read.csv("owid-covid-data.csv")
covid <- covid %>% filter(location=="Cyprus") # filtering the Cyprus data
covid$date <- ymd(covid$date) # changing the date class

Joining the covid and pm10 dataset

pm10$time <- ymd(pm10$time) # changing the time class
# left join the two dataset using time and date column
pm10.cov <- merge(pm10, covid, by.x="time",by.y="date", all.x =T )
# removing columns with almost zero variance
pm10.cov <- pm10.cov %>% select(-nearZeroVar(pm10.cov)) 
# filtering the columns we need
pm10.cov2 <- pm10.cov %>% select(c(1,9,12:15))
# filling NA with 0 for the dates before COVID19
pm10.cov2[,3:6] <- sapply(pm10.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix

# creating a function to setup the geom_smooth graph of ggpair plot
my_fn <- function(data, mapping, method="loess", ...){
  p <- ggplot(data = data, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=method, color="black", ...)
  p
}

ggpairs(pm10.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

creating the corrplot

We can see that there is no clear correlation between PM10 and COVID19. The correlation values are close to zero so there is almost zero impact of coronavirus to values of PM10.

cor.pm10 <- cor(pm10.cov2[,2:6])
corrplot.mixed(cor.pm10)

Join the nox and covid database

nox$time <- ymd(nox$time)
# left join between nox and covid dataset
nox.cov <- merge(nox, covid, by.x="time",by.y="date", all.x =T )
nox.cov <- nox.cov %>% select(-nearZeroVar(nox.cov)) 
nox.cov2 <- nox.cov %>% select(c(1,11,14:17))
nox.cov2[,3:6] <- sapply(nox.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix between mean value of NOx and COVID19 values

ggpairs(nox.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

We can see that we have a negative correlation between NOx value and total cases of coronavirus. That negative correlation means that when total cases increase the NOx mean value descreases. So there is COVID19 impact in the NOx values.

cor.nox <- cor(nox.cov2[,2:6])
corrplot.mixed(cor.nox)

Join the C6H6 and covid database

c6h6$time <- ymd(c6h6$time)
# left join between nox and covid dataset
c6h6.cov <- merge(c6h6, covid, by.x="time",by.y="date", all.x =T )
c6h6.cov <- c6h6.cov %>% select(-nearZeroVar(c6h6.cov)) 
c6h6.cov2 <- c6h6.cov %>% select(c(1,8,11:14))
c6h6.cov2[,3:6] <- sapply(c6h6.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix between mean value of C6H6 and COVID19 values

ggpairs(c6h6.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

cor.c6h6 <- cor(c6h6.cov2[,2:6])
corrplot.mixed(cor.c6h6)