Cyprus Air Pollution January-June 2020 Analysis

Loading the libraries we gonna use

library(dplyr)
library(tidyjson)
library(rjson)
library(lubridate)
library(GGally)
library(ggplot2)
library(xts)
library(zoo)
library(data.table)
library(heatmaply)
library(reshape2)
library(factoextra)
library(pca3d)
library(forecast)
library(caret)
library(corrplot)

Loading the datasets, one for each month

jan <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-01-01 00:00/2020-01-31 23:59")
feb <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-02-01 00:00/2020-02-31 23:59")
mar <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-03-01 00:00/2020-03-31 23:59")
apr <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-04-01 00:00/2020-04-31 23:59")
may <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-05-01 00:00/2020-05-31 23:59")
jun <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2020-06-01 00:00/2020-06-31 23:59")

Loading the datasets, one for each month for 2019

jan19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-01-01 00:00/2019-01-31 23:59")
feb19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-02-01 00:00/2019-02-31 23:59")
mar19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-03-01 00:00/2019-03-31 23:59")
apr19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-04-01 00:00/2019-04-31 23:59")
may19 <- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-05-01 00:00/2019-05-31 23:59")
jun19<- fromJSON(file="https://www.airquality.dli.mlsi.gov.cy/all_stations_data_range/2019-06-01 00:00/2019-06-31 23:59")

Extracting the values for 2020

We gonna extract the values from our dataset. We gonna create tables for every station for all six months.

We gonna use the tidyjson library and spread_all function to spread the values and create a matrix in which

every row is a hourly measures and details for all types of pollution. Then we gonna save the matrix as .csv file.

Station 1

# creating the list of values for station 1 for all six months
station1 <- c(jan$data$station_1$values,feb$data$station_1$values,mar$data$station_1$values,
              apr$data$station_1$values,may$data$station_1$values,jun$data$station_1$values)
station1 <- station1 %>% spread_all() # spreading the list to one data.frame
station1$..JSON <- NULL # deleting the last column
write.csv(station1,"station1.csv",row.names = F) # saving the data.frame

Station 2

station2 <- c(jan$data$station_2$values,feb$data$station_2$values,mar$data$station_2$values,
              apr$data$station_2$values,may$data$station_2$values,jun$data$station_2$values)
station2 <- station2 %>% spread_all()
station2$..JSON <- NULL
write.csv(station2,"station2.csv", row.names = F)

Station 3

station3 <- c(jan$data$station_3$values,feb$data$station_3$values,mar$data$station_3$values,
              apr$data$station_3$values,may$data$station_3$values,jun$data$station_3$values)
station3 <- station3 %>% spread_all()
station3$..JSON <- NULL
write.csv(station3,"station3.csv", row.names = F)

Station 5

station5 <- c(jan$data$station_5$values,feb$data$station_5$values,mar$data$station_5$values,
              apr$data$station_5$values,may$data$station_5$values,jun$data$station_5$values)
station5 <- station5 %>% spread_all()
station5$..JSON <- NULL
write.csv(station5,"station5.csv", row.names = F)

Station 7

station7 <- c(jan$data$station_7$values,feb$data$station_7$values,mar$data$station_7$values,
              apr$data$station_7$values,may$data$station_7$values,jun$data$station_7$values)
station7 <- station7 %>% spread_all()
station7$..JSON <- NULL
write.csv(station7,"station7.csv", row.names = F)

Station 8

station8 <- c(jan$data$station_8$values,feb$data$station_8$values,mar$data$station_8$values,
              apr$data$station_8$values,may$data$station_8$values,jun$data$station_8$values)
station8 <- station8 %>% spread_all()
station8$..JSON <- NULL
write.csv(station8,"station8.csv", row.names = F)

Station 9

station9 <- c(jan$data$station_9$values,feb$data$station_9$values,mar$data$station_9$values,
              apr$data$station_9$values,may$data$station_9$values,jun$data$station_9$values)
station9 <- station9 %>% spread_all()
station9$..JSON <- NULL
write.csv(station9,"station9.csv", row.names = F)

Station 10

station10 <- c(jan$data$station_10$values,feb$data$station_10$values,mar$data$station_10$values,
               apr$data$station_10$values,may$data$station_10$values,jun$data$station_10$values)
station10 <- station10 %>% spread_all()
station10$..JSON <- NULL
write.csv(station10,"station10.csv", row.names = F)

Station 11

station11 <- c(jan$data$station_11$values,feb$data$station_11$values,mar$data$station_11$values,
               apr$data$station_11$values,may$data$station_11$values,jun$data$station_11$values)
station11 <- station11 %>% spread_all()
station11$..JSON <- NULL
write.csv(station11,"station11.csv", row.names = F)

Extracting the values for 2019

Station 1 2019

station1_19 <- c(jan19$data$station_1$values,feb19$data$station_1$values,mar19$data$station_1$values,
                 apr19$data$station_1$values,may19$data$station_1$values,jun19$data$station_1$values)
station1_19 <- station1_19 %>% spread_all()
station1_19$..JSON <- NULL
write.csv(station1_19,"station1_19.csv",row.names = F)

Station 2 2019

station2_19 <- c(jan19$data$station_2$values,feb19$data$station_2$values,mar19$data$station_2$values,
                 apr19$data$station_2$values,may19$data$station_2$values,jun19$data$station_2$values)
station2_19 <- station2_19 %>% spread_all()
station2_19$..JSON <- NULL
write.csv(station2_19,"station2_19.csv", row.names = F)

Station 3 2019

station3_19 <- c(jan19$data$station_3$values,feb19$data$station_3$values,mar19$data$station_3$values,
                 apr19$data$station_3$values,may19$data$station_3$values,jun19$data$station_3$values)
station3_19 <- station3_19 %>% spread_all()
station3_19$..JSON <- NULL
write.csv(station3_19,"station3_19.csv", row.names = F)

Station 5 2019

station5_19 <- c(jan19$data$station_5$values,feb19$data$station_5$values,mar19$data$station_5$values,
                 apr19$data$station_5$values,may19$data$station_5$values,jun19$data$station_5$values)
station5_19 <- station5_19 %>% spread_all()
station5_19$..JSON <- NULL
write.csv(station5_19,"station5_19.csv", row.names = F)

Station 7 2019

station7_19 <- c(jan19$data$station_7$values,feb19$data$station_7$values,mar19$data$station_7$values,
                 apr19$data$station_7$values,may19$data$station_7$values,jun19$data$station_7$values)
station7_19 <- station7_19 %>% spread_all()
station7_19$..JSON <- NULL
write.csv(station7_19,"station7_19.csv", row.names = F)

Station 8 2019

station8_19 <- c(jan19$data$station_8$values,feb19$data$station_8$values,mar19$data$station_8$values,
                 apr19$data$station_8$values,may19$data$station_8$values,jun19$data$station_8$values)
station8_19 <- station8_19 %>% spread_all()
station8_19$..JSON <- NULL
write.csv(station8_19,"station8_19.csv", row.names = F)

Station 9 2019

station9_19 <- c(jan19$data$station_9$values,feb19$data$station_9$values,mar19$data$station_9$values,
                 apr19$data$station_9$values,may19$data$station_9$values,jun19$data$station_9$values)
station9_19 <- station9_19 %>% spread_all()
station9_19$..JSON <- NULL
write.csv(station9_19,"station9_19.csv", row.names = F)

Station 10 2019

station10_19 <- c(jan19$data$station_10$values,feb19$data$station_10$values,mar19$data$station_10$values,
                  apr19$data$station_10$values,may19$data$station_10$values,jun19$data$station_10$values)
station10_19 <- station10_19 %>% spread_all()
station10_19$..JSON <- NULL
write.csv(station10_19,"station10_19.csv", row.names = F)

Station 11 2019

station11_19 <- c(jan19$data$station_11$values,feb19$data$station_11$values,mar19$data$station_11$values,
                  apr19$data$station_11$values,may19$data$station_11$values,jun19$data$station_11$values)
station11_19 <- station11_19 %>% spread_all()
station11_19$..JSON <- NULL
write.csv(station11_19,"station11_19.csv", row.names = F)

creating a vector with stations names

stations <- c("Nicosia_Traf","Nicosia_Resi","Limassol_Traf","Larnaca_Traf","Paphos_Traf",
              "Zygi_Indu","Mari_Indu","Ayia_Marina_Xylianou_Back","Paralimni_Traf")

Data preprocessing

We gonna load the datasets for every station, filter the columns and keep the values for every pollutant type. We’ll change the class of time to POSIXct, check for NAs and fill them using na.approx function

Station 1

s1 <- read.csv("station1.csv") # Loading the .csv file we previous create
s1 <- s1 %>% select(c(seq(2,38,4))) # Filtering the columns with pollutant values
names(s1) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25") #adding the column names
s1$time <- ymd_hms(s1$time) # changing the type of time
sapply(s1, function(x) sum(is.na(x))) # checking for NA

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0   99  675   35  104  104  375  103  104  675

# Filling NA with na.approx function
s1[,2:10] <- sapply(s1[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = T), x))
sapply(s1, function(x) sum(is.na(x))) # zero NA

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0    0    0    0    0    0    0    0    0    0

knitr::kable(head(s1,3)) # printing the first 3 rows

time	SO2	PM10	O3	NO2	NOx	CO	C6H6	NO	PM25
2020-01-01 00:00:00	4.213296	113.18510	5.422678	41.66263	64.42079	1045.5090	5.304624	14.83504	89.80502
2020-01-01 01:00:00	3.986767	49.54911	7.720096	41.01508	90.58797	977.6353	2.203252	32.31437	35.75103
2020-01-01 02:00:00	4.940188	64.78768	4.493460	50.22812	119.87520	1263.8940	3.149906	45.39985	35.52024

Station 2

s2 <- read.csv("station2.csv")
s2 <- s2 %>% select(c(seq(2,26,4)))
names(s2) <- c("time","SO2","O3","NO2","NOx","CO","NO")
s2$time <- ymd_hms(s2$time)
sapply(s2, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO   NO 
##    0  179   44  100   84   89   50

s2[,2:7] <- sapply(s2[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 3

s3 <- read.csv("station3.csv")
s3 <- s3 %>% select(c(seq(2,30,4)))
names(s3) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3$time <- ymd_hms(s3$time)
sapply(s3, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO   NO 
##    0  684  133  124  228  228 1194  254

s3[,2:8] <- sapply(s3[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 5

s5 <- read.csv("station5.csv")
s5 <- s5 %>% select(c(seq(2,38,4)))
names(s5) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5$time <- ymd_hms(s5$time)
sapply(s5, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  199  210   97   64   64   98  185   95  221

s5[,2:10] <- sapply(s5[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 7

s7 <- read.csv("station7.csv")
s7 <- s7 %>% select(c(seq(2,38,4)))
names(s7) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s7$time <- ymd_hms(s7$time)
sapply(s7, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  656  406  121  165  164  320  539  206  479

s7[,2:10] <- sapply(s7[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 8

s8 <- read.csv("station8.csv")
s8 <- s8 %>% select(c(seq(2,34,4)))
names(s8) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8$time <- ymd_hms(s8$time)
sapply(s8, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx C6H6   NO PM25 
##    0  202  139   76   83   83 1027   91  142

s8[,2:9] <- sapply(s8[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 9

s9 <- read.csv("station9.csv")
s9 <- s9 %>% select(c(seq(2,30,4)))
names(s9) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9$time <- ymd_hms(s9$time)
sapply(s9, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO C6H6   NO 
##    0  180   44  114  114  626 1943  235

s9[,2:8] <- sapply(s9[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 10

s10 <- read.csv("station10.csv")
s10 <- s10 %>% select(c(seq(2,38,4)))
names(s10) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","PM25","NO")
s10$time <- ymd_hms(s10$time)
sapply(s10, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO C6H6 PM25   NO 
##    0  291 1144  108  180  180 1195  288 1167  429

s10[,2:10] <- sapply(s10[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 11

s11 <- read.csv("station11.csv")
s11 <- s11 %>% select(c(seq(2,34,4)))
names(s11) <- c("time","SO2","O3","NO2","NOx","CO","NO","PM10","PM25")
s11$time <- ymd_hms(s11$time)
sapply(s11, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO   NO PM10 PM25 
##    0  262  113  124  124   94  275  864  864

s11[,2:9] <- sapply(s11[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 1 2019

s1_19 <- read.csv("station1_19.csv")
s1_19 <- s1_19 %>% select(c(seq(2,38,4)))
names(s1_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s1_19$time <- ymd_hms(s1_19$time)
sapply(s1_19, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  156  654   63  113  113   84  125  115  654

s1_19[,2:10] <- sapply(s1_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 2 2019

s2_19 <- read.csv("station2_19.csv")
s2_19 <- s2_19 %>% select(c(seq(2,26,4)))
names(s2_19) <- c("time","SO2","CO","NO2","NOx","NO","O3")
s2_19$time <- ymd_hms(s2_19$time)
sapply(s2_19, function(x) sum(is.na(x)))

## time  SO2   CO  NO2  NOx   NO   O3 
##    0  820   85   94   94   96  115

s2_19[,2:7] <- sapply(s2_19[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 3 2019

s3_19 <- read.csv("station3_19.csv")
s3_19 <- s3_19 %>% select(c(seq(2,30,4)))
names(s3_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3_19$time <- ymd_hms(s3_19$time)
sapply(s3_19, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO   NO 
##    0  505   66  225  116  116  490  116

s3_19[,2:8] <- sapply(s3_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 5 2019

s5_19 <- read.csv("station5_19.csv")
s5_19 <- s5_19 %>% select(c(seq(2,38,4)))
names(s5_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5_19$time <- ymd_hms(s5_19$time)
sapply(s5_19, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO C6H6   NO PM25 
##    0  102  276  123   69   69   73   66   75  276

s5_19[,2:10] <- sapply(s5_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 7 2019

s7_19 <- read.csv("station7_19.csv")
s7_19 <- s7_19 %>% select(c(seq(2,38,4)))
names(s7_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s7_19$time <- ymd_hms(s7_19$time)
sapply(s7_19, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO C6H6   NO PM10 PM25 
##    0  410   26   70   65   80  354   73 1695 3680

s7_19[,2:10] <- sapply(s7_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 8 2019

s8_19 <- read.csv("station8_19.csv")
s8_19 <- s8_19 %>% select(c(seq(2,34,4)))
names(s8_19) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8_19$time <- ymd_hms(s8_19$time)
sapply(s8_19, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx C6H6   NO PM25 
##    0  547  153  135  183   78  212   78  153

s8_19[,2:9] <- sapply(s8_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

#### Station 9 2019

s9_19 <- read.csv("station9_19.csv")
s9_19<- s9_19 %>% select(c(seq(2,30,4)))
names(s9_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9_19$time <- ymd_hms(s9_19$time)
sapply(s9_19, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO C6H6   NO 
##    0  352  503  221  221  297 1052  225

s9_19[,2:8] <- sapply(s9_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 10 2019

s10_19 <- read.csv("station10_19.csv")
s10_19 <- s10_19 %>% select(c(seq(2,38,4)))
names(s10_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s10_19$time <- ymd_hms(s10_19$time)
sapply(s10_19, function(x) sum(is.na(x)))

## time  SO2   O3  NO2  NOx   CO C6H6   NO PM10 PM25 
##    0  922   88  209  209  321  276  406 1032 1030

s10_19[,2:10] <- sapply(s10_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Station 11 2019

s11_19 <- read.csv("station11_19.csv")
s11_19 <- s11_19 %>% select(c(seq(2,34,4)))
names(s11_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO","PM25")
s11_19$time <- ymd_hms(s11_19$time)
sapply(s11_19, function(x) sum(is.na(x)))

## time  SO2 PM10   O3  NO2  NOx   CO   NO PM25 
##    0  208  126   21   64   64   75   64  126

s11_19[,2:9] <- sapply(s11_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))

Changing the time

Next step is gonna be to change the time. Our dataset is in hours so we gonna create two new datasets for every station in which we gonna store mean value for each day and for each month. First we’ll transform the datasets to xts type and then create .d and .m datasets for days and months.

Station 1

s1.xts <- xts(s1, order.by = s1$time) # creating the xts type of s1 dataset
s1.d <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "days"), mean) #finding the mean for every day
s1.m <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "months"), mean) #finding the mean for every month
s1.d <- as.data.frame(s1.d) # changing for xts to data.frame type
s1.m <- as.data.frame(s1.m) # changing for xts to data.frame type

Station 1 2019

s1_19.xts <- xts(s1_19, order.by = s1_19$time) # creating the xts type of s1 dataset
s1_19.d <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "days"), mean) #finding the mean for every day
s1_19.m <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "months"), mean) #finding the mean for every month
s1_19.d <- as.data.frame(s1_19.d) # changing for xts to data.frame type
s1_19.m <- as.data.frame(s1_19.m) # changing for xts to data.frame type

Station 2

s2.xts <- xts(s2, order.by = s2$time)
s2.d <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "days"), mean)
s2.m <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "months"), mean)
s2.d <- as.data.frame(s2.d)
s2.m <- as.array.default(s2.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2.d$time <- as.POSIXct(row.names(s2.d),format = "%Y-%m-%d")

Station 2 2019

s2_19.xts <- xts(s2_19, order.by = s2_19$time)
s2_19.d <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "days"), mean)
s2_19.m <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "months"), mean)
s2_19.d <- as.data.frame(s2_19.d)
s2_19.m <- as.array.default(s2_19.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2_19.d$time <- as.POSIXct(row.names(s2_19.d),format = "%Y-%m-%d")

Station 3

s3.xts <- xts(s3, order.by = s3$time)
s3.d <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "days"), mean)
s3.m <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "months"), mean)
s3.d <- as.data.frame(s3.d)
s3.m <- as.data.frame(s3.m)

Station 3 2019

s3_19.xts <- xts(s3_19, order.by = s3_19$time)
s3_19.d <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "days"), mean)
s3_19.m <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "months"), mean)
s3_19.d <- as.data.frame(s3_19.d)
s3_19.m <- as.data.frame(s3_19.m)

Station 5

s5.xts <- xts(s5, order.by = s5$time)
s5.d <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "days"), mean)
s5.m <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "months"), mean)
s5.d <- as.data.frame(s5.d)
s5.m <- as.data.frame(s5.m)

Station 5 2019

s5_19.xts <- xts(s5_19, order.by = s5_19$time)
s5_19.d <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "days"), mean)
s5_19.m <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "months"), mean)
s5_19.d <- as.data.frame(s5_19.d)
s5_19.m <- as.data.frame(s5_19.m)

Station 7

s7.xts <- xts(s7, order.by = s7$time)
s7.d <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "days"), mean)
s7.m <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "months"), mean)
s7.d <- as.data.frame(s7.d)
s7.m <- as.data.frame(s7.m)
s7.d$time <- as.POSIXct(row.names(s7.d),format = "%Y-%m-%d")

Station 7 2019

s7_19.xts <- xts(s7_19, order.by = s7_19$time)
s7_19.d <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "days"), mean)
s7_19.m <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "months"), mean)
s7_19.d <- as.data.frame(s7_19.d)
s7_19.m <- as.data.frame(s7_19.m)
s7_19.d$time <- as.POSIXct(row.names(s7_19.d),format = "%Y-%m-%d")

Station 8

s8.xts <- xts(s8, order.by = s8$time)
s8.d <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "days"), mean)
s8.m <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "months"), mean)
s8.d <- as.data.frame(s8.d)
s8.m <- as.data.frame(s8.m)
s8.d$time <- as.POSIXct(row.names(s8.d),format = "%Y-%m-%d")

Station 8 2019

s8_19.xts <- xts(s8_19, order.by = s8_19$time)
s8_19.d <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "days"), mean)
s8_19.m <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "months"), mean)
s8_19.d <- as.data.frame(s8_19.d)
s8_19.m <- as.data.frame(s8_19.m)
s8_19.d$time <- as.POSIXct(row.names(s8_19.d),format = "%Y-%m-%d")

Station 9

s9.xts <- xts(s9, order.by = s9$time)
s9.d <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "days"), mean)
s9.m <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "months"), mean)
s9.d <- as.data.frame(s9.d)
s9.m <- as.data.frame(s9.m)

Station 9 2019

s9_19.xts <- xts(s9_19, order.by = s9_19$time)
s9_19.d <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "days"), mean)
s9_19.m <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "months"), mean)
s9_19.d <- as.data.frame(s9_19.d)
s9_19.m <- as.data.frame(s9_19.m)
s9_19.d$time <- as.POSIXct(row.names(s9_19.d),format = "%Y-%m-%d")

Station 10

s10.xts <- xts(s10, order.by = s10$time)
s10.d <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "days"), mean)
s10.m <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "months"), mean)
s10.d <- as.data.frame(s10.d)
s10.m <- as.data.frame(s10.m)

Station 10 2019

s10_19.xts <- xts(s10_19, order.by = s10_19$time)
s10_19.d <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "days"), mean)
s10_19.m <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "months"), mean)
s10_19.d <- as.data.frame(s1_19.d)
s10_19.m <- as.data.frame(s10_19.m)

Station 11

s11.xts <- xts(s11, order.by = s11$time)
s11.d <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "days"), mean)
s11.m <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "months"), mean)
s11.d <- as.data.frame(s11.d)
s11.m <- as.data.frame(s11.m)

Station 11 2019

s11_19.xts <- xts(s11_19, order.by = s11_19$time)
s11_19.d <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "days"), mean)
s11_19.m <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "months"), mean)
s11_19.d <- as.data.frame(s11_19.d)
s11_19.m <- as.data.frame(s11_19.m)

deleting the datasets we dont gonna use

rm(s1,s1.xts,s10, s10.xts, s11, s11.xts,s2,s2.xts,s3, s3.xts, s5, s5.xts, 
   s7, s7.xts, s8, s8.xts,s9, s9.xts, s10, s10.xts, s11, s11.xts, s1_19,s1_19.xts,
   s10_19,s10_19.xts,s11_19,s11_19.xts,s2_19,s2_19.xts,s3_19,s3_19.xts,s5_19,
   s5_19.xts,s7_19,s7_19.xts,s8_19,s8_19.xts,s9_19,s9_19.xts)

PM10 Analysis

Creating the dataframe for pm10 with day columns

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10 <- data.frame("station1"=s1.d$PM10,"station3"=s3.d$PM10,"station5"=s5.d$PM10,
                   "station10"=s10.d$PM10,"station11"=s11.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
pm10 <- full_join(pm10,s7.d[,c("time","PM10")],by="time")
# Full join pm10 and station8 by time column
pm10 <- full_join(pm10,s8.d[,c("time","PM10")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10 <- pm10 %>% rename(station7=PM10.x, station8=PM10.y) %>%
  select(time,station1,station3,station5,station7,station8,station10,station11) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
pm10$mean <- rowMeans(pm10[,2:8])
knitr::kable(head(pm10,3)) # printing the first 3 columns

time	station1	station3	station5	station7	station8	station10	station11	mean
2020-01-01	42.89521	18.62904	34.34300	20.03441	26.316686	3.927083	26.23751	24.62613
2020-01-02	20.59653	10.59705	23.11284	17.06729	10.038982	3.721084	30.37896	16.50182
2020-01-03	10.83952	11.74851	13.33836	16.47911	6.818125	3.705833	30.29477	13.31775
### PM10 2019

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10_19 <- data.frame("station1"=s1_19.d$PM10,"station3"=s3_19.d$PM10,"station5"=s5_19.d$PM10,
                  "station7"=s7_19.d$PM10,"station8"=s8_19.d$PM10,"station10"=s10_19.d$PM10,
                  "station11"=s11_19.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10_19 <- pm10_19 %>% select(time,station1,station3,station5,station7,station8,station10,station11)
# Adding a mean column with mean value of PM10 for every day
pm10_19$mean <- rowMeans(pm10_19[,2:8])
knitr::kable(head(pm10_19,3)) # printing the first 3 columns

time	station1	station3	station5	station7	station8	station10	station11	mean
2019-01-01	17.65954	25.25458	27.18125	24.36375	10.75981	17.65954	24.47333	21.05026
2019-01-02	30.83935	18.45208	27.98375	27.76417	20.74846	30.83935	26.08583	26.10186
2019-01-03	26.35819	15.31958	24.06979	20.94458	13.82719	26.35819	21.59292	21.21006

Creating the dataframe for pm10 with monthly columns

# adding the values to dataframe
pm10.m <- data.frame(s1.m$PM10,s3.m$PM10,s5.m$PM10,s7.m$PM10,
                     s8.m$PM10,s10.m$PM10,s11.m$PM10)
# Creating a column with mean values for every month
pm10.m$mean <- rowMeans(pm10.m)
# Adding row and column names
rownames(pm10.m) <- c("January","February","March","April","May","June") 
colnames(pm10.m) <- c(stations[c(1,3:6,8:9)],"mean")
knitr::kable(pm10.m, digits = 4) #printing the table

	Nicosia_Traf	Limassol_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Ayia_Marina_Xylianou_Back	Paralimni_Traf	mean
January	36.3907	25.0343	30.1276	25.8627	16.7807	7.6372	28.1558	24.2841
February	30.6833	27.2613	28.6184	26.1846	21.0343	11.2538	28.1471	24.7404
March	30.3524	27.4642	33.2767	26.4542	25.0162	20.2982	29.5287	27.4844
April	25.1905	23.7127	27.7513	21.3241	25.6438	19.1190	25.2619	24.0005
May	21.0555	35.3394	36.1480	35.4592	36.0006	25.1626	32.7273	31.6990
June	25.9583	26.7145	34.0229	23.4378	32.5035	18.4995	31.6976	27.5477

Summary statistics of PM10 for each station

colnames(pm10) <- c("time",stations[c(1,3:6,8:9)],"mean")
knitr::kable(do.call(cbind, lapply(pm10[2:9], summary)))

	Nicosia_Traf	Limassol_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Ayia_Marina_Xylianou_Back	Paralimni_Traf	mean
Min.	10.83952	10.00244	11.00115	9.065349	4.911398	3.240060	12.13500	12.47557
1st Qu.	20.87687	19.09575	22.17966	18.909459	17.613646	9.727647	21.06490	20.33870
Median	24.97054	25.34259	28.60686	24.131823	22.768948	14.381458	26.49583	24.25839
Mean	28.29714	27.61801	31.69190	26.171577	26.316686	17.084898	29.26734	26.63536
3rd Qu.	35.07800	32.26869	36.29274	29.643809	30.876087	19.562292	33.07865	29.54169
Max.	61.74035	94.58720	96.06582	89.734498	120.352083	82.874167	82.96542	79.25038

Data preperation

pm10.t <- as.data.frame(t(as.matrix(pm10))) # transpose the pm10 matrix
# changing row and column names
rownames(pm10.t) <- c("time",stations[c(1,3:6,8:9)],"mean")
colnames(pm10.t) <- pm10$time
pm10.bar2 <- pm10.t[2:8,]
pm10.bar2$row.names <- rownames(pm10.bar2)
# changing the data from wide format to a single column of data
pm10.bar2<-melt(pm10.bar2,id=c("row.names"))
# Changing the type of class
pm10.bar2$value <- as.numeric(pm10.bar2$value)
pm10.bar2$variable <- ymd(pm10.bar2$variable)
pm10.bar2$row.names <- as.factor(pm10.bar2$row.names)

Boxplot for each station

ggplot(data = pm10.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

PM10 time series.

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. We can see there are some peaks as a result of dust episodes. We can see that COVID19 situation did not affect the values of PM10.

ggplot() + geom_line(data=pm10, aes(x=time,y=mean)) + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  labs(y="PM10 value", x="Month",title="January to June 2020 Time series for PM10") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notation", values = c(Lockdown="red",EU_limit_value="magenta",
                           WHO_air_quality_guideline="green",Flight_ban="orange")) +
  theme_classic() +
  annotate("text",x=pm10$time[140],y = 80,label="Dust Episode", size =4, fontface="bold", color="brown" )

pm10$meansel <- rowMeans(pm10[,c(2,3,5,6)])
pm10_19$meansel <- rowMeans(pm10_19[,c(2,3,5,6)])
pm10$index <- yday(pm10$time)
pm10_19$index <- yday(pm10_19$time)
ggplot() + geom_line(data=pm10, aes(x=index,y=meansel,color="2020"), size=1)+
  geom_line(data=pm10_19, aes(x=index,y=meansel, color="2019"), size=1) + 
  geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic() + labs(color="Legend text")

ggplot() +  
  geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") + 
  geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic() + labs(color="Legend text")

Time Series plot for every station

sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) +
  geom_area(aes(color = row.names, fill = row.names),alpha = 0.7) + theme_bw() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(fill='Station') + labs(color="Station")

ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
    alpha = 0.6, size=1) + theme_classic() +
    labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
    labs(color="Station")

PM10 daily difference

COVID19 situation doesn’t change the stability of PM10 and the differences seems to be almost the same before and after lockdown. The highest differences can be spotted the first 2 weeks after lockdown.

pm10.ts <- ts(pm10[9], start=c(2020,1) , frequency = 365)
pm10.diff <- diff(pm10.ts)
pm10.dif <- data.frame("time"=pm10$time,"diff"=c(0,pm10.diff)) 
ggplot(pm10.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("PM10 difference") +
  ggtitle("PM10 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values

pm10.mt <- as.data.frame(t(as.matrix(pm10.m))) ##transpose of month matrix
rownames(pm10.mt) <- c(stations[c(1,3:6,8:9)],"mean") #adding row names
heatmaply(pm10.mt, grid_color = "white", dendrogram = "none", main = "PM10 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(pm10.mt,2))

Barplot of PM10 for every month and visual contribution of each station

pm10.bar <- pm10.mt[1:7,] # row filtering
pm10.bar$row.names <- rownames(pm10.bar) # adding column with row names
# changing the data from wide format to a single column of data
pm10.bar<-melt(pm10.bar,id=c("row.names"))
# creating the plot
ggplot(pm10.bar,aes(x=variable,y=value/7,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "PM10 per month and station contribution", y = "PM10 value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

I choose to create three clusters. The clusters seems to be very reasonal. The green cluster includes four of five traffic stations. The red includes only Ayia_Marina_Xyliaroy station, the only Rural Station of our dataset and the blue includes Paphos traffic station and Zygi industrial station.

pm10.t2 <- as.data.frame(t(as.matrix(pm10[,2:8])))
row.names(pm10.t2)<- stations[c(1,3:6,8:9)]
pm10.t2$station <- stations[c(1,3:6,8:9)]
pm10.scaled <- as.data.frame(scale(pm10.t2[,1:182])) # scaling
dist_pm10 <- dist(pm10.scaled, method = 'euclidean') # calculating the distance
hclust_pm10 <- hclust(dist_pm10, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_pm10,labels=pm10.t2$station)
rect.hclust(hclust_pm10 ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule.

We can see that a good choice would be to choose k=2

fviz_nbclust(pm10.scaled, kmeans, method = "wss", k.max=6)

PCA

pm10_pca <- prcomp(pm10.t2[,-183], center = TRUE, scale. = TRUE)
summary(pm10_pca)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6       PC7
## Standard deviation     8.9216 5.8958 5.1523 3.95726 3.69668 3.43124 3.983e-15
## Proportion of Variance 0.4373 0.1910 0.1459 0.08604 0.07508 0.06469 0.000e+00
## Cumulative Proportion  0.4373 0.6283 0.7742 0.86023 0.93531 1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 64% of the total variance

sub_grp <- cutree(hclust_pm10, k = 3)
fviz_cluster(list(data = pm10.scaled, cluster = sub_grp))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 77% of the total variance

pca3d(pm10_pca, group=sub_grp, radius = 4, show.labels=T)

## [1] 1.4701407 0.7154049 0.6374044
## Creating new device

PCA 3d

NOx Analysis

Creating the dataframe for NOx with day columns

nox <- data.frame("station1"=s1.d$NOx,"station3"=s3.d$NOx ,"station5"=s5.d$NOx , 
                   "station9"=s9.d$NOx,"station10"=s10.d$NOx,"station11"=s11.d$NOx)
nox$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
nox <- full_join(nox,s2.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s7.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s8.d[,c("time","NOx")],by="time")

nox <- nox %>% rename(station2=NOx.x, station7=NOx.y, station8=NOx) %>%
  select(time,station1,station2,station3,station5,station7,station8,
         station9,station10,station11) %>%
  mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
         station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox$mean <- rowMeans(nox[,2:10])
knitr::kable(head(nox,3))

time	station1	station2	station3	station5	station7	station8	station9	station10	station11	mean
2020-01-01	82.36372	45.79606	40.62136	58.09139	25.14944	12.33513	8.669477	1.704961	35.21159	34.43812
2020-01-02	56.73409	40.99054	55.56007	45.45797	26.65660	13.26133	10.280462	2.418991	39.59478	32.32831
2020-01-03	42.54235	49.56395	86.81541	65.22078	36.50002	12.76792	14.621379	2.824170	39.55709	38.93479

nox_19 <- data.frame("station1"=s1_19.d$NOx,"station3"=s3_19.d$NOx ,"station5"=s5_19.d$NOx ,
                     "station7"=s7_19.d$NOx,"station8"=s8_19.d$NOx,
                     "station10"=s10_19.d$NOx,"station11"=s11_19.d$NOx)
nox_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
nox_19 <- full_join(nox_19,s2_19.d[,c("time","NOx")],by="time")
nox_19 <- full_join(nox_19,s9_19.d[,c("time","NOx")],by="time")

nox_19 <- nox_19 %>% rename(station2=NOx.x, station9=NOx.y) %>%
  select(time,station1,station2,station3,station5,station7,station8,
         station9,station10,station11) %>%
  mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
         station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox_19$mean <- rowMeans(nox_19[,2:10])
knitr::kable(head(nox_19,3))

time	station1	station2	station3	station5	station7	station8	station9	station10	station11	mean
2019-01-01	41.97831	37.66930	74.46432	46.32760	16.02855	7.482461	7.172156	41.97831	22.16688	32.80754
2019-01-02	87.88999	80.63773	49.68460	52.49455	18.10256	11.683488	6.102470	87.88999	39.24132	48.19186
2019-01-03	103.52024	76.38051	43.31829	54.20211	19.50543	12.245074	10.726350	103.52024	58.94068	53.59544

Creating the dataframe for pm10 with monthly columns

nox.m <- data.frame(s1.m$NOx,s2.m$NOx,s3.m$NOx,
                    s5.m$NOx,s7.m$NOx,s8.m$NOx,
                    s9.m$NOx,s10.m$NOx,s11.m$NOx)
nox.m$Mean <- rowMeans(nox.m)
row.names(nox.m)<-c("January","February","March","April","May","June")
colnames(nox.m)<-c(stations,"Mean")
knitr::kable(nox.m, digits = 2)

	Nicosia_Traf	Nicosia_Resi	Limassol_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Mari_Indu	Ayia_Marina_Xylianou_Back	Paralimni_Traf	Mean
January	91.52	52.55	65.59	59.05	30.98	13.16	12.87	2.92	42.54	41.24
February	63.56	42.07	57.72	44.19	24.97	15.39	14.25	3.67	30.46	32.92
March	33.44	21.28	34.36	29.16	15.23	13.05	10.73	3.22	16.98	19.72
April	15.85	8.54	13.42	11.89	6.16	7.63	8.12	2.38	9.91	9.32
May	25.49	12.75	23.82	20.43	16.50	9.47	10.97	2.28	14.27	15.11
June	23.35	14.33	27.15	20.36	10.06	15.31	18.14	1.83	15.18	16.19

colnames(nox) <- c("time",stations,"mean")
knitr::kable(do.call(cbind, lapply(nox[2:11], summary)))

	Nicosia_Traf	Nicosia_Resi	Limassol_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Mari_Indu	Ayia_Marina_Xylianou_Back	Paralimni_Traf	mean
Min.	4.381602	1.701600	4.374026	3.925617	1.096979	3.354127	4.492203	0.8867552	4.751423	4.325509
1st Qu.	18.919483	9.213398	16.921153	14.426882	6.056446	8.673014	8.477460	2.0534770	10.675313	12.570360
Median	28.274242	16.293681	30.563370	22.375224	15.017890	12.043879	10.922486	2.4228941	15.605522	17.711148
Mean	42.217921	25.329135	36.976008	30.844837	17.386851	12.335134	12.482777	2.7217090	21.551826	22.427355
3rd Qu.	54.810168	39.055830	52.007920	44.548574	25.405847	15.280685	15.588738	3.0809064	24.351356	31.687493
Max.	158.532276	100.709014	139.149394	126.046744	75.349972	27.850411	28.655146	8.9129858	113.762156	68.516004

data preperation

nox.t <- as.data.frame(t(as.matrix(nox)))
# changing row and column names
colnames(nox.t) <- nox$time
nox.bar <- nox.t[2:10,]
nox.bar$row.names <- rownames(nox.bar)
# changing the data from wide format to a single column of data
nox.bar<-melt(nox.bar,id=c("row.names"))
nox.bar$value <- as.numeric(nox.bar$value)
nox.bar$variable <- ymd(nox.bar$variable)
nox.bar$row.names <- as.factor(nox.bar$row.names)

Boxplot for each station

ggplot(data = nox.bar, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="NOx boxplot for all stations", x="Stations",y="NOx") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

NOx time series

The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. Some peaks are a result of dust episodes. We can see that COVID19 situation affect the values of NOx.

ggplot(nox, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Month",title="January to June 2020 Time series for NOx") + 
  scale_color_manual(name = "statistics", values = c(Lockdown="red",EU_limit_value="magenta",
                           Flight_ban="orange",WHO_air_quality_guideline="green")) +
  theme_classic() + 
  annotate("text",x=pm10$time[140],y = 37,label="Dust Episode", size =4, fontface="bold", color="brown" )

nox$meansel <- rowMeans(nox[,c(2,4,6,7)])
nox_19$meansel <- rowMeans(nox_19[,c(2,4,6,7)])
nox$index <- yday(nox$time)
nox_19$index <- yday(nox_19$time)
ggplot() + geom_line(data=nox, aes(x=index,y=meansel,color="2020"),size=1)+
  geom_line(data=nox_19, aes(x=index,y=meansel, color="2019"), size=1) + 
  geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

ggplot() + geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
  geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
  geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) + 
  labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
                           WHO_air_quality_guideline="green","2020"="red",
                           "2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

Time Series plot for every station

ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + 
  geom_area(aes(fill = row.names, colour=row.names),alpha = 0.3) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(fill="Station") + labs(colour="Station")

ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(colour=row.names), alpha=0.6, size=1) + theme_bw() +
    labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
    labs(colour="Station")

NOx daily difference

After COVID19 situation we can see that the daily difference is low and NOx seems to be more stable.

nox.ts <- ts(nox[11], start=c(2020,1) , frequency = 365)
nox.diff <- diff(nox.ts)
nox.diff <- data.frame("time"=nox$time,"diff"=c(0,nox.diff)) 
ggplot(nox.diff, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("NOx difference") +
  ggtitle("NOx daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot for NOx monthly values for each station

nox.mt <- as.data.frame(t(as.matrix(nox.m)))
heatmaply(nox.mt, grid_color = "white", dendrogram = "none", main = "NOx per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(nox.mt,2))

Barplot of NOx for every month and visual contribution of each station

nox.bar2 <- nox.mt[1:9,]
nox.bar2$row.names <- rownames(nox.bar2) # adding column with row names
# changing the data from wide format to a single column of data
nox.bar2<-melt(nox.bar2,id=c("row.names"))
# creating the plot
ggplot(nox.bar2,aes(x=variable,y=value/9,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "NOx per month and station contribution", y = "NOx value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

Performing a Hierarchical Cluster analysis, calculating the distance between two points using euclidean distance and complete method to calculate the distance between clusters. Ayia Marina Xylianou has her own cluster, the red cluster includes the traffic stations of the three big cities of Cyprus and the blue cluster includes the traffic station of smallers cities like Paralimni and Paphos, the industrial stations and the Nicosia residential station. We can assume red cluster has the highest NOx values, green cluster has the lowest NOx values and blue cluster has NOx values between green and red.

nox.t2 <- as.data.frame(t(as.matrix(nox[,2:10])))
nox.t2$station <- stations
nox.scaled <- as.data.frame(scale(nox.t2[,1:182]))
dist_nox <- dist(nox.scaled, method = 'euclidean') # calculating the distance
hclust_nox <- hclust(dist_nox, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_nox,labels=nox.t2$station)
rect.hclust(hclust_nox ,  k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule for NOx

fviz_nbclust(nox.scaled, kmeans, method = "wss", k.max=8)

PCA

nox_pca <- prcomp(nox.t2[,-183], center = TRUE, scale. = TRUE)
summary(nox_pca)

## Importance of components:
##                            PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     11.1551 4.3793 3.68206 2.70529 2.47200 2.20758 1.98466
## Proportion of Variance  0.6837 0.1054 0.07449 0.04021 0.03358 0.02678 0.02164
## Cumulative Proportion   0.6837 0.7891 0.86359 0.90380 0.93737 0.96415 0.98579
##                            PC8       PC9
## Standard deviation     1.60800 2.751e-15
## Proportion of Variance 0.01421 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 79% of the total variance

sub_grp2 <- cutree(hclust_nox, k = 3)
fviz_cluster(list(data = nox.scaled, cluster = sub_grp2))

PCA 3d plot

This 3d plot for the first three principal components, accounting for 86% of the total variance

pca3d(nox_pca, group=sub_grp2, radius = 4, show.labels=T)

## [1] 1.5335850 0.5048702 0.4734467

NOx PCA 3d

C6H6

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
c6h6 <- data.frame("station1"=s1.d$C6H6,"station5"=s5.d$C6H6,"station9"=s9.d$C6H6,"station10"=s10.d$C6H6)
# Creating a time column to help us join previous dataframe with station 7 and 8.
c6h6$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
c6h6 <- full_join(c6h6,s7.d[,c("time","C6H6")],by="time")
# Full join pm10 and station8 by time column
c6h6 <- full_join(c6h6,s8.d[,c("time","C6H6")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
c6h6 <- c6h6 %>% rename(station7=C6H6.x, station8=C6H6.y) %>%
  select(time,station1,station5,station7,station8,station9,station10) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
c6h6$mean <- rowMeans(c6h6[,2:7])
knitr::kable(head(c6h6,3)) # printing the first 3 columns

time	station1	station5	station7	station8	station9	station10	mean
2020-01-01	2.3974533	2.0189181	1.216708	0.2946909	0.5019321	0.7258804	1.1925971
2020-01-02	1.4069889	0.9467244	1.261858	0.5025892	0.3706992	0.6880867	0.8628245
2020-01-03	0.8622788	1.1730584	1.267356	0.4103396	0.2715304	0.4655657	0.7416880

# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
c6h6_19 <- data.frame("station1"=s1_19.d$C6H6,"station5"=s5_19.d$C6H6,
                   "station7"=s7_19.d$C6H6,"station8"=s8_19.d$C6H6,"station10"=s10_19.d$C6H6)
# Creating a time column to help us join previous dataframe with station 7 and 8.
c6h6_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
c6h6_19 <- full_join(c6h6_19,s9_19.d[,c("time","C6H6")],by="time")
# Full join pm10 and station8 by time column

# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
c6h6_19 <- c6h6_19 %>% rename(station9=C6H6) %>%
  select(time,station1,station5,station7,station8,station9,station10) %>%
  mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
         station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8)) 
# Adding a mean column with mean value of PM10 for every day
c6h6_19$mean <- rowMeans(c6h6_19[,2:7])
knitr::kable(head(c6h6_19,3)) # printing the first 3 columns

time	station1	station5	station7	station8	station9	station10	mean
2019-01-01	0.8574786	1.907207	0.9028013	0.8523375	0.2651717	0.8574786	0.9404124
2019-01-02	1.4710263	1.654617	0.6863346	0.7166400	0.1449858	1.4710263	1.0241050
2019-01-03	1.5480072	1.412580	0.8689784	0.6715878	0.0338229	1.5480072	1.0138307

Creating thedataframe for C6H6 with monthly columns

# adding the values to dataframe
c6h6.m <- data.frame(s1.m$C6H6,s5.m$C6H6,s7.m$C6H6,
                     s8.m$C6H6,s9.m$C6H6,s10.m$C6H6)
# Creating a column with mean values for every month
c6h6.m$mean <- rowMeans(c6h6.m)
# Adding row and column names
rownames(c6h6.m) <- c("January","February","March","April","May","June") 
colnames(c6h6.m) <- c(stations[c(1,4:8)],"mean")
knitr::kable(c6h6.m, digits = 4) #printing the table

	Nicosia_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Mari_Indu	Ayia_Marina_Xylianou_Back	mean
January	1.7935	2.0161	1.5628	0.5110	0.3328	0.6582	1.1457
February	1.5373	1.1909	1.0157	0.1949	0.6308	0.6037	0.8622
March	0.8519	0.6539	0.4443	0.4412	0.7596	0.4543	0.6009
April	0.4002	0.3344	0.1802	0.2885	0.2698	0.3335	0.3011
May	0.4054	0.4391	0.2529	0.1717	0.0853	0.2006	0.2592
June	0.3129	0.4197	0.1785	0.1581	0.0479	0.1933	0.2184

Summary statistics of PM10 for each station

colnames(c6h6) <- c("time",stations[c(1,4:8)],"mean")
knitr::kable(do.call(cbind, lapply(c6h6[2:8], summary)))

	Nicosia_Traf	Larnaca_Traf	Paphos_Traf	Zygi_Indu	Mari_Indu	Ayia_Marina_Xylianou_Back	mean
Min.	0.1441559	0.1363796	0.0545053	0.0000000	0.0045999	0.0251507	0.1360316
1st Qu.	0.3300891	0.3396551	0.1872086	0.1363064	0.1031937	0.2294919	0.2477120
Median	0.6190251	0.5113127	0.3523731	0.2865139	0.2722792	0.3869274	0.4228677
Mean	0.8820508	0.8434774	0.6204580	0.2946909	0.3532593	0.4076478	0.5669307
3rd Qu.	1.2920771	1.0519909	0.9430208	0.4188988	0.6641393	0.5345103	0.8204481
Max.	2.9314320	5.7590451	2.4143210	0.8420553	0.8128323	1.2147028	2.0201268

Data preperation

c6h6.t <- as.data.frame(t(as.matrix(c6h6))) # transpose the pm10 matrix
# changing row and column names
rownames(c6h6.t) <- c("time",stations[c(1,4:8)],"mean")
colnames(c6h6.t) <- c6h6$time
c6h6.bar2 <- c6h6.t[2:7,]
c6h6.bar2$row.names <- rownames(c6h6.bar2)
# changing the data from wide format to a single column of data
c6h6.bar2<-melt(c6h6.bar2,id=c("row.names"))
# Changing the type of class
c6h6.bar2$value <- as.numeric(c6h6.bar2$value)
c6h6.bar2$variable <- ymd(c6h6.bar2$variable)
c6h6.bar2$row.names <- as.factor(c6h6.bar2$row.names)

Boxplot for each station

ggplot(data = c6h6.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
  theme(legend.position = "none") + labs(title="C6H6 boxplot for all stations", x="Stations",y="C6H6") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

C6H6 time series.

c6h6$meansel <-  rowMeans(c6h6[,c(2,4,5)])
ggplot(c6h6, aes(x=time,y=meansel)) + geom_line(size=1, alpha=0.7) + geom_smooth() +
  geom_vline(aes(xintercept=as.numeric(c6h6$time[84]),colour="Lockdown"),linetype=2, size=1) +
  geom_vline(aes(xintercept=as.numeric(c6h6$time[81]),colour="Flight_ban"),linetype=2, size=1) +
  labs(y="C6H6 value", x="Month",title="January to June 2020 Time series for C6H6") + 
  scale_color_manual(name = "Line notation", values = c(Lockdown="red",Flight_ban="orange")) +
  theme_classic() +
  annotate("text",x=c6h6$time[140],y = 1,label="Dust Episode", size =4, fontface="bold", color="brown" ) + geom_line(aes(x=time,y=mean), color="red", size=1, alpha=0.7)

c6h6_19$meansel <-  rowMeans(c6h6_19[,c(2,4,5)])
c6h6$index <- yday(c6h6$time)
c6h6_19$index <- yday(c6h6_19$time)
ggplot() + geom_line(data=c6h6, aes(x=index,y=meansel,color="2020"),size=1)+
           geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
           geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
           geom_line(data=c6h6_19, aes(x=index,y=meansel,color="2019"),size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c("2020"="red",
                           "2019"="blue",
                           "2020 smooth"="brown","2019 smooth"="yellow")) +
  scale_linetype_manual(name = "Lockdown date",
                           values = c("Lockdown" = "dashed")) + 
  theme_classic()

ggplot() + geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020"), size=1,level=0.95) +
           geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019"), size=1) +
  geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
  labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") + 
  scale_color_manual(name = "Line notations", values = c("2019"="yellow","2020"="brown")) +
  scale_linetype_manual(name = "Lockdown date", values = c("Lockdown" = "dashed")) + theme_classic()

Time series plot for every station

ggplot(c6h6.bar2, aes(x=variable,y=as.numeric(value))) + geom_area(aes(color = row.names, fill = row.names),
    alpha = 0.7) + theme_bw() +
    labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
    labs(fill='Station') + labs(color="Station")

sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(c6h6.bar2[c6h6.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
    alpha = 0.5, size=1) + theme_classic() +
    labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
    labs(color="Station")

C6H6 daily difference

c6h6.ts <- ts(c6h6[8], start=c(2020,1) , frequency = 365)
c6h6.diff <- diff(c6h6.ts)
c6h6.dif <- data.frame("time"=c6h6$time,"diff"=c(0,c6h6.diff)) 
ggplot(c6h6.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("C6H6 difference") +
  ggtitle("C6H6 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values

c6h6.mt <- as.data.frame(t(as.matrix(c6h6.m))) ##transpose of month matrix
rownames(c6h6.mt) <- c(stations[c(1,4:8)],"mean") #adding row names
heatmaply(c6h6.mt, grid_color = "white", dendrogram = "none", main = "C6H6 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(c6h6.mt,2))

Barplot of C6H6 for every month and visual contribution of each station

c6h6.bar <- c6h6.mt[1:6,] # row filtering
c6h6.bar$row.names <- rownames(c6h6.bar) # adding column with row names
# changing the data from wide format to a single column of data
c6h6.bar<-melt(c6h6.bar,id=c("row.names"))
# creating the plot
ggplot(c6h6.bar,aes(x=variable,y=value/6,fill=row.names))+geom_bar(position="stack", stat="identity") +
  labs(title = "C6H6 per month and station contribution", y = "C6H6 value", x = "Months") +
  labs(fill = "Stations")

Hierarchical Analysis

c6h6.t2 <- as.data.frame(t(as.matrix(c6h6[,2:7])))
row.names(c6h6.t2)<- stations[c(1,4:8)]
c6h6.t2$station <- stations[c(1,4:8)]
c6h6.scaled <- as.data.frame(scale(c6h6.t2[,1:182])) # scaling
dist_c6h6 <- dist(c6h6.scaled, method = 'euclidean') # calculating the distance
hclust_c6h6 <- hclust(dist_c6h6, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_c6h6,labels=c6h6.t2$station)
rect.hclust(hclust_c6h6 ,  k = 3,border = 2:6) # adding the 3 cluster boxes

fviz_nbclust(c6h6.scaled, kmeans, method = "wss", k.max=5)

PCA for C6H6

c6h6_pca <- prcomp(c6h6.t2[,-183], center = TRUE, scale. = TRUE)
summary(c6h6_pca)

## Importance of components:
##                          PC1    PC2    PC3     PC4     PC5       PC6
## Standard deviation     9.691 6.0408 5.3560 3.70041 3.03511 2.156e-15
## Proportion of Variance 0.516 0.2005 0.1576 0.07524 0.05061 0.000e+00
## Cumulative Proportion  0.516 0.7165 0.8741 0.94939 1.00000 1.000e+00

Cluster plot with PCA

This biplot for the first two principal components, accounting for 71.5% of the total variance

sub_grp <- cutree(hclust_c6h6, k = 3)
fviz_cluster(list(data = c6h6.scaled, cluster = sub_grp))

PCA 3d plot for C6H6

This 3d plot for the first three principal components, accounting for 87% of the total variance

pca3d(c6h6_pca, group=sub_grp, radius = 4, show.labels=T)

## [1] 0.9164679 0.7704207 0.5425491

PCA 3d

Loading the covid database

covid <- read.csv("owid-covid-data.csv")
covid <- covid %>% filter(location=="Cyprus") # filtering the Cyprus data
covid$date <- ymd(covid$date) # changing the date class

Joining the covid and pm10 dataset

pm10$time <- ymd(pm10$time) # changing the time class
# left join the two dataset using time and date column
pm10.cov <- merge(pm10, covid, by.x="time",by.y="date", all.x =T )
# removing columns with almost zero variance
pm10.cov <- pm10.cov %>% select(-nearZeroVar(pm10.cov)) 
# filtering the columns we need
pm10.cov2 <- pm10.cov %>% select(c(1,9,12:15))
# filling NA with 0 for the dates before COVID19
pm10.cov2[,3:6] <- sapply(pm10.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix

# creating a function to setup the geom_smooth graph of ggpair plot
my_fn <- function(data, mapping, method="loess", ...){
  p <- ggplot(data = data, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=method, color="black", ...)
  p
}

ggpairs(pm10.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

creating the corrplot

We can see that there is no clear correlation between PM10 and COVID19. The correlation values are close to zero so there is almost zero impact of coronavirus to values of PM10.

cor.pm10 <- cor(pm10.cov2[,2:6])
corrplot.mixed(cor.pm10)

Join the nox and covid database

nox$time <- ymd(nox$time)
# left join between nox and covid dataset
nox.cov <- merge(nox, covid, by.x="time",by.y="date", all.x =T )
nox.cov <- nox.cov %>% select(-nearZeroVar(nox.cov)) 
nox.cov2 <- nox.cov %>% select(c(1,11,14:17))
nox.cov2[,3:6] <- sapply(nox.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix between mean value of NOx and COVID19 values

ggpairs(nox.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

We can see that we have a negative correlation between NOx value and total cases of coronavirus. That negative correlation means that when total cases increase the NOx mean value descreases. So there is COVID19 impact in the NOx values.

cor.nox <- cor(nox.cov2[,2:6])
corrplot.mixed(cor.nox)

Join the C6H6 and covid database

c6h6$time <- ymd(c6h6$time)
# left join between nox and covid dataset
c6h6.cov <- merge(c6h6, covid, by.x="time",by.y="date", all.x =T )
c6h6.cov <- c6h6.cov %>% select(-nearZeroVar(c6h6.cov)) 
c6h6.cov2 <- c6h6.cov %>% select(c(1,8,11:14))
c6h6.cov2[,3:6] <- sapply(c6h6.cov2[,3:6], function(x) ifelse(is.na(x),0,x))

Correlation matrix between mean value of C6H6 and COVID19 values

ggpairs(c6h6.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn), 
        upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

cor.c6h6 <- cor(c6h6.cov2[,2:6])
corrplot.mixed(cor.c6h6)

Cyprus Air Pollution January-June 2020 Analysis

Stavros Oikonomou

2/8/2020

Loading the libraries we gonna use

Loading the datasets, one for each month

Loading the datasets, one for each month for 2019

Extracting the values for 2020

We gonna extract the values from our dataset. We gonna create tables for every station for all six months.

We gonna use the tidyjson library and spread_all function to spread the values and create a matrix in which

every row is a hourly measures and details for all types of pollution. Then we gonna save the matrix as .csv file.

Station 1

Station 2

Station 3

Station 5

Station 7

Station 8

Station 9

Station 10

Station 11

Extracting the values for 2019

Station 1 2019

Station 2 2019

Station 3 2019

Station 5 2019

Station 7 2019

Station 8 2019

Station 9 2019

Station 10 2019

Station 11 2019

creating a vector with stations names

Data preprocessing

We gonna load the datasets for every station, filter the columns and keep the values for every pollutant type. We’ll change the class of time to POSIXct, check for NAs and fill them using na.approx function

Station 1

Station 2

Station 3

Station 5

Station 7

Station 8

Station 9

Station 10

Station 11

Station 1 2019

Station 2 2019

Station 3 2019

Station 5 2019

Station 7 2019

Station 8 2019

Station 10 2019

Station 11 2019

Changing the time

Next step is gonna be to change the time. Our dataset is in hours so we gonna create two new datasets for every station in which we gonna store mean value for each day and for each month. First we’ll transform the datasets to xts type and then create .d and .m datasets for days and months.

Station 1

Station 1 2019

Station 2

Station 2 2019

Station 3

Station 3 2019

Station 5

Station 5 2019

Station 7

Station 7 2019

Station 8

Station 8 2019

Station 9

Station 9 2019

Station 10

Station 10 2019

Station 11

Station 11 2019

deleting the datasets we dont gonna use

PM10 Analysis

Creating the dataframe for pm10 with day columns

Creating the dataframe for pm10 with monthly columns

Summary statistics of PM10 for each station

Data preperation

Boxplot for each station

PM10 time series.

Time Series plot for every station

PM10 daily difference

COVID19 situation doesn’t change the stability of PM10 and the differences seems to be almost the same before and after lockdown. The highest differences can be spotted the first 2 weeks after lockdown.