Data preprocessing
We will load the dataset for every station, keep only the columns with the pollutant values, rename them, convert the time column to POSIXct, check for NAs and fill them using the na.approx function.
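As a quick illustration of what na.approx (from the zoo package) does, it fills internal NAs by linear interpolation between the neighbouring observed values; a minimal sketch with made-up numbers:
library(zoo)
x <- c(2, NA, NA, 8, 10) # toy vector with two internal NAs
na.approx(x) # returns 2 4 6 8 10 - the NAs are interpolated between 2 and 8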
Station 1
s1 <- read.csv("station1.csv") # Loading the .csv file we previously created
s1 <- s1 %>% select(c(seq(2,38,4))) # Filtering the columns with pollutant values
names(s1) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25") #adding the column names
s1$time <- ymd_hms(s1$time) # changing the type of time
sapply(s1, function(x) sum(is.na(x))) # checking for NA
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 99 675 35 104 104 375 103 104 675
# Filling NA with na.approx function
s1[,2:10] <- sapply(s1[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = T), x))
sapply(s1, function(x) sum(is.na(x))) # zero NA
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 0 0 0 0 0 0 0 0 0
knitr::kable(head(s1,3)) # printing the first 3 rows
| time | SO2 | PM10 | O3 | NO2 | NOx | CO | C6H6 | NO | PM25 |
|---|---|---|---|---|---|---|---|---|---|
| 2020-01-01 00:00:00 | 4.213296 | 113.18510 | 5.422678 | 41.66263 | 64.42079 | 1045.5090 | 5.304624 | 14.83504 | 89.80502 |
| 2020-01-01 01:00:00 | 3.986767 | 49.54911 | 7.720096 | 41.01508 | 90.58797 | 977.6353 | 2.203252 | 32.31437 | 35.75103 |
| 2020-01-01 02:00:00 | 4.940188 | 64.78768 | 4.493460 | 50.22812 | 119.87520 | 1263.8940 | 3.149906 | 45.39985 | 35.52024 |
Station 2
s2 <- read.csv("station2.csv")
s2 <- s2 %>% select(c(seq(2,26,4)))
names(s2) <- c("time","SO2","O3","NO2","NOx","CO","NO")
s2$time <- ymd_hms(s2$time)
sapply(s2, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO NO
## 0 179 44 100 84 89 50
s2[,2:7] <- sapply(s2[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 3
s3 <- read.csv("station3.csv")
s3 <- s3 %>% select(c(seq(2,30,4)))
names(s3) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3$time <- ymd_hms(s3$time)
sapply(s3, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO NO
## 0 684 133 124 228 228 1194 254
s3[,2:8] <- sapply(s3[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 5
s5 <- read.csv("station5.csv")
s5 <- s5 %>% select(c(seq(2,38,4)))
names(s5) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5$time <- ymd_hms(s5$time)
sapply(s5, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 199 210 97 64 64 98 185 95 221
s5[,2:10] <- sapply(s5[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 7
s7 <- read.csv("station7.csv")
s7 <- s7 %>% select(c(seq(2,38,4)))
names(s7) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s7$time <- ymd_hms(s7$time)
sapply(s7, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 656 406 121 165 164 320 539 206 479
s7[,2:10] <- sapply(s7[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 8
s8 <- read.csv("station8.csv")
s8 <- s8 %>% select(c(seq(2,34,4)))
names(s8) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8$time <- ymd_hms(s8$time)
sapply(s8, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx C6H6 NO PM25
## 0 202 139 76 83 83 1027 91 142
s8[,2:9] <- sapply(s8[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 9
s9 <- read.csv("station9.csv")
s9 <- s9 %>% select(c(seq(2,30,4)))
names(s9) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9$time <- ymd_hms(s9$time)
sapply(s9, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO C6H6 NO
## 0 180 44 114 114 626 1943 235
s9[,2:8] <- sapply(s9[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 10
s10 <- read.csv("station10.csv")
s10 <- s10 %>% select(c(seq(2,38,4)))
names(s10) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","PM25","NO")
s10$time <- ymd_hms(s10$time)
sapply(s10, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 PM25 NO
## 0 291 1144 108 180 180 1195 288 1167 429
s10[,2:10] <- sapply(s10[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 11
s11 <- read.csv("station11.csv")
s11 <- s11 %>% select(c(seq(2,34,4)))
names(s11) <- c("time","SO2","O3","NO2","NOx","CO","NO","PM10","PM25")
s11$time <- ymd_hms(s11$time)
sapply(s11, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO NO PM10 PM25
## 0 262 113 124 124 94 275 864 864
s11[,2:9] <- sapply(s11[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 1 2019
s1_19 <- read.csv("station1_19.csv")
s1_19 <- s1_19 %>% select(c(seq(2,38,4)))
names(s1_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s1_19$time <- ymd_hms(s1_19$time)
sapply(s1_19, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 156 654 63 113 113 84 125 115 654
s1_19[,2:10] <- sapply(s1_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 2 2019
s2_19 <- read.csv("station2_19.csv")
s2_19 <- s2_19 %>% select(c(seq(2,26,4)))
names(s2_19) <- c("time","SO2","CO","NO2","NOx","NO","O3")
s2_19$time <- ymd_hms(s2_19$time)
sapply(s2_19, function(x) sum(is.na(x)))
## time SO2 CO NO2 NOx NO O3
## 0 820 85 94 94 96 115
s2_19[,2:7] <- sapply(s2_19[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 3 2019
s3_19 <- read.csv("station3_19.csv")
s3_19 <- s3_19 %>% select(c(seq(2,30,4)))
names(s3_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3_19$time <- ymd_hms(s3_19$time)
sapply(s3_19, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO NO
## 0 505 66 225 116 116 490 116
s3_19[,2:8] <- sapply(s3_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 5 2019
s5_19 <- read.csv("station5_19.csv")
s5_19 <- s5_19 %>% select(c(seq(2,38,4)))
names(s5_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5_19$time <- ymd_hms(s5_19$time)
sapply(s5_19, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 102 276 123 69 69 73 66 75 276
s5_19[,2:10] <- sapply(s5_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 7 2019
s7_19 <- read.csv("station7_19.csv")
s7_19 <- s7_19 %>% select(c(seq(2,38,4)))
names(s7_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s7_19$time <- ymd_hms(s7_19$time)
sapply(s7_19, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO C6H6 NO PM10 PM25
## 0 410 26 70 65 80 354 73 1695 3680
s7_19[,2:10] <- sapply(s7_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 8 2019
s8_19 <- read.csv("station8_19.csv")
s8_19 <- s8_19 %>% select(c(seq(2,34,4)))
names(s8_19) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8_19$time <- ymd_hms(s8_19$time)
sapply(s8_19, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx C6H6 NO PM25
## 0 547 153 135 183 78 212 78 153
s8_19[,2:9] <- sapply(s8_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 9 2019
s9_19 <- read.csv("station9_19.csv")
s9_19<- s9_19 %>% select(c(seq(2,30,4)))
names(s9_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9_19$time <- ymd_hms(s9_19$time)
sapply(s9_19, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO C6H6 NO
## 0 352 503 221 221 297 1052 225
s9_19[,2:8] <- sapply(s9_19[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 10 2019
s10_19 <- read.csv("station10_19.csv")
s10_19 <- s10_19 %>% select(c(seq(2,38,4)))
names(s10_19) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO","PM10","PM25")
s10_19$time <- ymd_hms(s10_19$time)
sapply(s10_19, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO C6H6 NO PM10 PM25
## 0 922 88 209 209 321 276 406 1032 1030
s10_19[,2:10] <- sapply(s10_19[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 11 2019
s11_19 <- read.csv("station11_19.csv")
s11_19 <- s11_19 %>% select(c(seq(2,34,4)))
names(s11_19) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO","PM25")
s11_19$time <- ymd_hms(s11_19$time)
sapply(s11_19, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO NO PM25
## 0 208 126 21 64 64 75 64 126
s11_19[,2:9] <- sapply(s11_19[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Changing the time resolution
The next step is to change the time resolution. Our data are hourly, so for every station we will create two new datasets storing the mean value for each day and for each month. First we transform the datasets to xts objects and then create the .d and .m datasets for days and months.
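For readers unfamiliar with xts, endpoints() returns the row indices where each period (here a day or a month) ends, and period.apply() applies a function over each of those intervals; a minimal sketch with toy hourly data:
library(xts)
times <- seq(as.POSIXct("2020-01-01 00:00"), by = "hour", length.out = 48) # two days of hourly stamps
toy <- xts(1:48, order.by = times)
endpoints(toy, "days") # 0 24 48 - the last row of each day
period.apply(toy, endpoints(toy, "days"), mean) # one mean per day: 12.5 and 36.5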
Station 1
s1.xts <- xts(s1, order.by = s1$time) # creating the xts type of s1 dataset
s1.d <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "days"), mean) #finding the mean for every day
s1.m <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "months"), mean) #finding the mean for every month
s1.d <- as.data.frame(s1.d) # changing from xts to data.frame
s1.m <- as.data.frame(s1.m) # changing from xts to data.frame
Station 1 2019
s1_19.xts <- xts(s1_19, order.by = s1_19$time) # creating the xts version of the s1_19 dataset
s1_19.d <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "days"), mean) #finding the mean for every day
s1_19.m <- period.apply(s1_19.xts[,2:10] , endpoints(s1_19.xts, "months"), mean) #finding the mean for every month
s1_19.d <- as.data.frame(s1_19.d) # changing from xts to data.frame
s1_19.m <- as.data.frame(s1_19.m) # changing from xts to data.frame
Station 2
s2.xts <- xts(s2, order.by = s2$time)
s2.d <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "days"), mean)
s2.m <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "months"), mean)
s2.d <- as.data.frame(s2.d)
s2.m <- as.data.frame(s2.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2.d$time <- as.POSIXct(row.names(s2.d),format = "%Y-%m-%d")
Station 2 2019
s2_19.xts <- xts(s2_19, order.by = s2_19$time)
s2_19.d <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "days"), mean)
s2_19.m <- period.apply(s2_19.xts[,2:7] , endpoints(s2_19.xts, "months"), mean)
s2_19.d <- as.data.frame(s2_19.d)
s2_19.m <- as.data.frame(s2_19.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2_19.d$time <- as.POSIXct(row.names(s2_19.d),format = "%Y-%m-%d")
Station 3
s3.xts <- xts(s3, order.by = s3$time)
s3.d <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "days"), mean)
s3.m <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "months"), mean)
s3.d <- as.data.frame(s3.d)
s3.m <- as.data.frame(s3.m)
Station 3 2019
s3_19.xts <- xts(s3_19, order.by = s3_19$time)
s3_19.d <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "days"), mean)
s3_19.m <- period.apply(s3_19.xts[,2:8] , endpoints(s3_19.xts, "months"), mean)
s3_19.d <- as.data.frame(s3_19.d)
s3_19.m <- as.data.frame(s3_19.m)
Station 5
s5.xts <- xts(s5, order.by = s5$time)
s5.d <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "days"), mean)
s5.m <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "months"), mean)
s5.d <- as.data.frame(s5.d)
s5.m <- as.data.frame(s5.m)
Station 5 2019
s5_19.xts <- xts(s5_19, order.by = s5_19$time)
s5_19.d <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "days"), mean)
s5_19.m <- period.apply(s5_19.xts[,2:10] , endpoints(s5_19.xts, "months"), mean)
s5_19.d <- as.data.frame(s5_19.d)
s5_19.m <- as.data.frame(s5_19.m)
Station 7
s7.xts <- xts(s7, order.by = s7$time)
s7.d <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "days"), mean)
s7.m <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "months"), mean)
s7.d <- as.data.frame(s7.d)
s7.m <- as.data.frame(s7.m)
s7.d$time <- as.POSIXct(row.names(s7.d),format = "%Y-%m-%d")
Station 7 2019
s7_19.xts <- xts(s7_19, order.by = s7_19$time)
s7_19.d <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "days"), mean)
s7_19.m <- period.apply(s7_19.xts[,2:10] , endpoints(s7_19.xts, "months"), mean)
s7_19.d <- as.data.frame(s7_19.d)
s7_19.m <- as.data.frame(s7_19.m)
s7_19.d$time <- as.POSIXct(row.names(s7_19.d),format = "%Y-%m-%d")
Station 8
s8.xts <- xts(s8, order.by = s8$time)
s8.d <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "days"), mean)
s8.m <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "months"), mean)
s8.d <- as.data.frame(s8.d)
s8.m <- as.data.frame(s8.m)
s8.d$time <- as.POSIXct(row.names(s8.d),format = "%Y-%m-%d")
Station 8 2019
s8_19.xts <- xts(s8_19, order.by = s8_19$time)
s8_19.d <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "days"), mean)
s8_19.m <- period.apply(s8_19.xts[,2:9] , endpoints(s8_19.xts, "months"), mean)
s8_19.d <- as.data.frame(s8_19.d)
s8_19.m <- as.data.frame(s8_19.m)
s8_19.d$time <- as.POSIXct(row.names(s8_19.d),format = "%Y-%m-%d")
Station 9
s9.xts <- xts(s9, order.by = s9$time)
s9.d <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "days"), mean)
s9.m <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "months"), mean)
s9.d <- as.data.frame(s9.d)
s9.m <- as.data.frame(s9.m)
Station 9 2019
s9_19.xts <- xts(s9_19, order.by = s9_19$time)
s9_19.d <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "days"), mean)
s9_19.m <- period.apply(s9_19.xts[,2:8] , endpoints(s9_19.xts, "months"), mean)
s9_19.d <- as.data.frame(s9_19.d)
s9_19.m <- as.data.frame(s9_19.m)
s9_19.d$time <- as.POSIXct(row.names(s9_19.d),format = "%Y-%m-%d")
Station 10
s10.xts <- xts(s10, order.by = s10$time)
s10.d <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "days"), mean)
s10.m <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "months"), mean)
s10.d <- as.data.frame(s10.d)
s10.m <- as.data.frame(s10.m)
Station 10 2019
s10_19.xts <- xts(s10_19, order.by = s10_19$time)
s10_19.d <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "days"), mean)
s10_19.m <- period.apply(s10_19.xts[,2:10] , endpoints(s10_19.xts, "months"), mean)
s10_19.d <- as.data.frame(s10_19.d)
s10_19.m <- as.data.frame(s10_19.m)
Station 11
s11.xts <- xts(s11, order.by = s11$time)
s11.d <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "days"), mean)
s11.m <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "months"), mean)
s11.d <- as.data.frame(s11.d)
s11.m <- as.data.frame(s11.m)
Station 11 2019
s11_19.xts <- xts(s11_19, order.by = s11_19$time)
s11_19.d <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "days"), mean)
s11_19.m <- period.apply(s11_19.xts[,2:9] , endpoints(s11_19.xts, "months"), mean)
s11_19.d <- as.data.frame(s11_19.d)
s11_19.m <- as.data.frame(s11_19.m)
Deleting the datasets we will no longer use
rm(s1, s1.xts, s2, s2.xts, s3, s3.xts, s5, s5.xts, s7, s7.xts, s8, s8.xts,
s9, s9.xts, s10, s10.xts, s11, s11.xts, s1_19, s1_19.xts, s2_19, s2_19.xts,
s3_19, s3_19.xts, s5_19, s5_19.xts, s7_19, s7_19.xts, s8_19, s8_19.xts,
s9_19, s9_19.xts, s10_19, s10_19.xts, s11_19, s11_19.xts)
PM10 Analysis
Creating the dataframe for PM10 with daily values
# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10 <- data.frame("station1"=s1.d$PM10,"station3"=s3.d$PM10,"station5"=s5.d$PM10,
"station10"=s10.d$PM10,"station11"=s11.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
pm10 <- full_join(pm10,s7.d[,c("time","PM10")],by="time")
# Full join pm10 and station8 by time column
pm10 <- full_join(pm10,s8.d[,c("time","PM10")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10 <- pm10 %>% rename(station7=PM10.x, station8=PM10.y) %>%
select(time,station1,station3,station5,station7,station8,station10,station11) %>%
mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
# Adding a mean column with mean value of PM10 for every day
pm10$mean <- rowMeans(pm10[,2:8])
knitr::kable(head(pm10,3)) # printing the first 3 rows
| time | station1 | station3 | station5 | station7 | station8 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|
| 2020-01-01 | 42.89521 | 18.62904 | 34.34300 | 20.03441 | 26.316686 | 3.927083 | 26.23751 | 24.62613 |
| 2020-01-02 | 20.59653 | 10.59705 | 23.11284 | 17.06729 | 10.038982 | 3.721084 | 30.37896 | 16.50182 |
| 2020-01-03 | 10.83952 | 11.74851 | 13.33836 | 16.47911 | 6.818125 | 3.705833 | 30.29477 | 13.31775 |
PM10 2019
# Creating a data.frame with all pm10 measurement values for 2019
pm10_19 <- data.frame("station1"=s1_19.d$PM10,"station3"=s3_19.d$PM10,"station5"=s5_19.d$PM10,
"station7"=s7_19.d$PM10,"station8"=s8_19.d$PM10,"station10"=s10_19.d$PM10,
"station11"=s11_19.d$PM10)
# Creating a time column from the row names of station 1
pm10_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Changing the column order
pm10_19 <- pm10_19 %>% select(time,station1,station3,station5,station7,station8,station10,station11)
# Adding a mean column with mean value of PM10 for every day
pm10_19$mean <- rowMeans(pm10_19[,2:8])
knitr::kable(head(pm10_19,3)) # printing the first 3 rows
| time | station1 | station3 | station5 | station7 | station8 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|
| 2019-01-01 | 17.65954 | 25.25458 | 27.18125 | 24.36375 | 10.75981 | 17.65954 | 24.47333 | 21.05026 |
| 2019-01-02 | 30.83935 | 18.45208 | 27.98375 | 27.76417 | 20.74846 | 30.83935 | 26.08583 | 26.10186 |
| 2019-01-03 | 26.35819 | 15.31958 | 24.06979 | 20.94458 | 13.82719 | 26.35819 | 21.59292 | 21.21006 |
Creating the dataframe for PM10 with monthly values
# adding the values to dataframe
pm10.m <- data.frame(s1.m$PM10,s3.m$PM10,s5.m$PM10,s7.m$PM10,
s8.m$PM10,s10.m$PM10,s11.m$PM10)
# Creating a column with mean values for every month
pm10.m$mean <- rowMeans(pm10.m)
# Adding row and column names
rownames(pm10.m) <- c("January","February","March","April","May","June")
colnames(pm10.m) <- c(stations[c(1,3:6,8:9)],"mean")
knitr::kable(pm10.m, digits = 4) #printing the table
|  | station1 | station3 | station5 | station7 | station8 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|
| January | 36.3907 | 25.0343 | 30.1276 | 25.8627 | 16.7807 | 7.6372 | 28.1558 | 24.2841 |
| February | 30.6833 | 27.2613 | 28.6184 | 26.1846 | 21.0343 | 11.2538 | 28.1471 | 24.7404 |
| March | 30.3524 | 27.4642 | 33.2767 | 26.4542 | 25.0162 | 20.2982 | 29.5287 | 27.4844 |
| April | 25.1905 | 23.7127 | 27.7513 | 21.3241 | 25.6438 | 19.1190 | 25.2619 | 24.0005 |
| May | 21.0555 | 35.3394 | 36.1480 | 35.4592 | 36.0006 | 25.1626 | 32.7273 | 31.6990 |
| June | 25.9583 | 26.7145 | 34.0229 | 23.4378 | 32.5035 | 18.4995 | 31.6976 | 27.5477 |
Summary statistics of PM10 for each station
colnames(pm10) <- c("time",stations[c(1,3:6,8:9)],"mean")
knitr::kable(do.call(cbind, lapply(pm10[2:9], summary)))
|  | station1 | station3 | station5 | station7 | station8 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|
| Min. | 10.83952 | 10.00244 | 11.00115 | 9.065349 | 4.911398 | 3.240060 | 12.13500 | 12.47557 |
| 1st Qu. | 20.87687 | 19.09575 | 22.17966 | 18.909459 | 17.613646 | 9.727647 | 21.06490 | 20.33870 |
| Median | 24.97054 | 25.34259 | 28.60686 | 24.131823 | 22.768948 | 14.381458 | 26.49583 | 24.25839 |
| Mean | 28.29714 | 27.61801 | 31.69190 | 26.171577 | 26.316686 | 17.084898 | 29.26734 | 26.63536 |
| 3rd Qu. | 35.07800 | 32.26869 | 36.29274 | 29.643809 | 30.876087 | 19.562292 | 33.07865 | 29.54169 |
| Max. | 61.74035 | 94.58720 | 96.06582 | 89.734498 | 120.352083 | 82.874167 | 82.96542 | 79.25038 |
Data preparation
pm10.t <- as.data.frame(t(as.matrix(pm10))) # transpose the pm10 matrix
# changing row and column names
rownames(pm10.t) <- c("time",stations[c(1,3:6,8:9)],"mean")
colnames(pm10.t) <- pm10$time
pm10.bar2 <- pm10.t[2:8,]
pm10.bar2$row.names <- rownames(pm10.bar2)
# changing the data from wide format to a single column of data
pm10.bar2<-melt(pm10.bar2,id=c("row.names"))
# Changing the type of class
pm10.bar2$value <- as.numeric(pm10.bar2$value)
pm10.bar2$variable <- ymd(pm10.bar2$variable)
pm10.bar2$row.names <- as.factor(pm10.bar2$row.names)
Boxplot for each station
ggplot(data = pm10.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))

PM10 time series.
The green line is the WHO air quality guideline and the pink line is the established EU limit value. The dashed orange line marks the date Cyprus banned flights and the dashed red line marks the start of the lockdown. Some peaks are the result of dust episodes. We can see that the COVID-19 situation did not noticeably affect PM10 values.
ggplot() + geom_line(data=pm10, aes(x=time,y=mean)) + geom_smooth() +
geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
labs(y="PM10 value", x="Month",title="January to June 2020 Time series for PM10") +
geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
scale_color_manual(name = "Line notation", values = c(Lockdown="red",EU_limit_value="magenta",
WHO_air_quality_guideline="green",Flight_ban="orange")) +
theme_classic() +
annotate("text",x=pm10$time[140],y = 80,label="Dust Episode", size =4, fontface="bold", color="brown" )

pm10$meansel <- rowMeans(pm10[,c(2,3,5,6)])
pm10_19$meansel <- rowMeans(pm10_19[,c(2,3,5,6)])
pm10$index <- yday(pm10$time)
pm10_19$index <- yday(pm10_19$time)
ggplot() + geom_line(data=pm10, aes(x=index,y=meansel,color="2020"), size=1)+
geom_line(data=pm10_19, aes(x=index,y=meansel, color="2019"), size=1) +
geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") +
geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
WHO_air_quality_guideline="green","2020"="red",
"2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
scale_linetype_manual(name = "Lockdown date",
values = c("Lockdown" = "dashed")) +
theme_classic() + labs(color="Legend text")

ggplot() +
geom_smooth(data=pm10, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
geom_smooth(data=pm10_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$index[84]),linetype="Lockdown"),colour="red", size=1) +
labs(y="PM10 value", x="Days",title="PM10 value comparison for first half of years 2019 and 2020") +
geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
WHO_air_quality_guideline="green","2020"="red",
"2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
scale_linetype_manual(name = "Lockdown date",
values = c("Lockdown" = "dashed")) +
theme_classic() + labs(color="Legend text")

Time Series plot for every station
sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) +
geom_area(aes(color = row.names, fill = row.names),alpha = 0.7) + theme_bw() +
labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
labs(fill='Station') + labs(color="Station")

ggplot(pm10.bar2[pm10.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
alpha = 0.6, size=1) + theme_classic() +
labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
labs(color="Station")

PM10 daily difference
The COVID-19 situation did not change the stability of PM10, and the daily differences look almost the same before and after the lockdown. The largest differences appear in the first two weeks after the lockdown.
pm10.ts <- ts(pm10[9], start=c(2020,1) , frequency = 365)
pm10.diff <- diff(pm10.ts)
pm10.dif <- data.frame("time"=pm10$time,"diff"=c(0,pm10.diff))
ggplot(pm10.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("PM10 difference") +
ggtitle("PM10 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values
pm10.mt <- as.data.frame(t(as.matrix(pm10.m))) ##transpose of month matrix
rownames(pm10.mt) <- c(stations[c(1,3:6,8:9)],"mean") #adding row names
heatmaply(pm10.mt, grid_color = "white", dendrogram = "none", main = "PM10 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(pm10.mt,2))
Barplot of PM10 for every month and visual contribution of each station
pm10.bar <- pm10.mt[1:7,] # row filtering
pm10.bar$row.names <- rownames(pm10.bar) # adding column with row names
# changing the data from wide format to a single column of data
pm10.bar<-melt(pm10.bar,id=c("row.names"))
# creating the plot
ggplot(pm10.bar,aes(x=variable,y=value/7,fill=row.names))+geom_bar(position="stack", stat="identity") +
labs(title = "PM10 per month and station contribution", y = "PM10 value", x = "Months") +
labs(fill = "Stations")

Hierarchical Analysis
I chose to create three clusters, and they look very reasonable. The green cluster includes four of the five traffic stations, the red one contains only the Ayia_Marina_Xyliaroy station, the only rural station in our dataset, and the blue one includes the Paphos traffic station and the Zygi industrial station.
pm10.t2 <- as.data.frame(t(as.matrix(pm10[,2:8])))
row.names(pm10.t2)<- stations[c(1,3:6,8:9)]
pm10.t2$station <- stations[c(1,3:6,8:9)]
pm10.scaled <- as.data.frame(scale(pm10.t2[,1:182])) # scaling
dist_pm10 <- dist(pm10.scaled, method = 'euclidean') # calculating the distance
hclust_pm10 <- hclust(dist_pm10, method = 'complete') # creating the HC
# plotting the dendrogram
plot(hclust_pm10,labels=pm10.t2$station)
rect.hclust(hclust_pm10 , k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule.
We can see that a good choice would be k = 2.
fviz_nbclust(pm10.scaled, kmeans, method = "wss", k.max=6)

PCA
pm10_pca <- prcomp(pm10.t2[,-183], center = TRUE, scale. = TRUE)
summary(pm10_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 8.9216 5.8958 5.1523 3.95726 3.69668 3.43124 3.983e-15
## Proportion of Variance 0.4373 0.1910 0.1459 0.08604 0.07508 0.06469 0.000e+00
## Cumulative Proportion 0.4373 0.6283 0.7742 0.86023 0.93531 1.00000 1.000e+00
Cluster plot with PCA
This biplot shows the first two principal components, which account for 64% of the total variance.
sub_grp <- cutree(hclust_pm10, k = 3)
fviz_cluster(list(data = pm10.scaled, cluster = sub_grp))

PCA 3d plot
This 3D plot shows the first three principal components, which account for 77% of the total variance.
pca3d(pm10_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 1.4701407 0.7154049 0.6374044
## Creating new device
NOx Analysis
Creating the dataframe for NOx with daily values
nox <- data.frame("station1"=s1.d$NOx,"station3"=s3.d$NOx ,"station5"=s5.d$NOx ,
"station9"=s9.d$NOx,"station10"=s10.d$NOx,"station11"=s11.d$NOx)
nox$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
nox <- full_join(nox,s2.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s7.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s8.d[,c("time","NOx")],by="time")
nox <- nox %>% rename(station2=NOx.x, station7=NOx.y, station8=NOx) %>%
select(time,station1,station2,station3,station5,station7,station8,
station9,station10,station11) %>%
mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox$mean <- rowMeans(nox[,2:10])
knitr::kable(head(nox,3))
| time | station1 | station2 | station3 | station5 | station7 | station8 | station9 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|---|---|
| 2020-01-01 | 82.36372 | 45.79606 | 40.62136 | 58.09139 | 25.14944 | 12.33513 | 8.669477 | 1.704961 | 35.21159 | 34.43812 |
| 2020-01-02 | 56.73409 | 40.99054 | 55.56007 | 45.45797 | 26.65660 | 13.26133 | 10.280462 | 2.418991 | 39.59478 | 32.32831 |
| 2020-01-03 | 42.54235 | 49.56395 | 86.81541 | 65.22078 | 36.50002 | 12.76792 | 14.621379 | 2.824170 | 39.55709 | 38.93479 |
nox_19 <- data.frame("station1"=s1_19.d$NOx,"station3"=s3_19.d$NOx ,"station5"=s5_19.d$NOx ,
"station7"=s7_19.d$NOx,"station8"=s8_19.d$NOx,
"station10"=s10_19.d$NOx,"station11"=s11_19.d$NOx)
nox_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
nox_19 <- full_join(nox_19,s2_19.d[,c("time","NOx")],by="time")
nox_19 <- full_join(nox_19,s9_19.d[,c("time","NOx")],by="time")
nox_19 <- nox_19 %>% rename(station2=NOx.x, station9=NOx.y) %>%
select(time,station1,station2,station3,station5,station7,station8,
station9,station10,station11) %>%
mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox_19$mean <- rowMeans(nox_19[,2:10])
knitr::kable(head(nox_19,3))
| time | station1 | station2 | station3 | station5 | station7 | station8 | station9 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|---|---|
| 2019-01-01 | 41.97831 | 37.66930 | 74.46432 | 46.32760 | 16.02855 | 7.482461 | 7.172156 | 41.97831 | 22.16688 | 32.80754 |
| 2019-01-02 | 87.88999 | 80.63773 | 49.68460 | 52.49455 | 18.10256 | 11.683488 | 6.102470 | 87.88999 | 39.24132 | 48.19186 |
| 2019-01-03 | 103.52024 | 76.38051 | 43.31829 | 54.20211 | 19.50543 | 12.245074 | 10.726350 | 103.52024 | 58.94068 | 53.59544 |
Creating the dataframe for NOx with monthly values
nox.m <- data.frame(s1.m$NOx,s2.m$NOx,s3.m$NOx,
s5.m$NOx,s7.m$NOx,s8.m$NOx,
s9.m$NOx,s10.m$NOx,s11.m$NOx)
nox.m$Mean <- rowMeans(nox.m)
row.names(nox.m)<-c("January","February","March","April","May","June")
colnames(nox.m)<-c(stations,"Mean")
knitr::kable(nox.m, digits = 2)
|  | station1 | station2 | station3 | station5 | station7 | station8 | station9 | station10 | station11 | Mean |
|---|---|---|---|---|---|---|---|---|---|---|
| January | 91.52 | 52.55 | 65.59 | 59.05 | 30.98 | 13.16 | 12.87 | 2.92 | 42.54 | 41.24 |
| February | 63.56 | 42.07 | 57.72 | 44.19 | 24.97 | 15.39 | 14.25 | 3.67 | 30.46 | 32.92 |
| March | 33.44 | 21.28 | 34.36 | 29.16 | 15.23 | 13.05 | 10.73 | 3.22 | 16.98 | 19.72 |
| April | 15.85 | 8.54 | 13.42 | 11.89 | 6.16 | 7.63 | 8.12 | 2.38 | 9.91 | 9.32 |
| May | 25.49 | 12.75 | 23.82 | 20.43 | 16.50 | 9.47 | 10.97 | 2.28 | 14.27 | 15.11 |
| June | 23.35 | 14.33 | 27.15 | 20.36 | 10.06 | 15.31 | 18.14 | 1.83 | 15.18 | 16.19 |
colnames(nox) <- c("time",stations,"mean")
knitr::kable(do.call(cbind, lapply(nox[2:11], summary)))
|  | station1 | station2 | station3 | station5 | station7 | station8 | station9 | station10 | station11 | mean |
|---|---|---|---|---|---|---|---|---|---|---|
| Min. | 4.381602 | 1.701600 | 4.374026 | 3.925617 | 1.096979 | 3.354127 | 4.492203 | 0.8867552 | 4.751423 | 4.325509 |
| 1st Qu. | 18.919483 | 9.213398 | 16.921153 | 14.426882 | 6.056446 | 8.673014 | 8.477460 | 2.0534770 | 10.675313 | 12.570360 |
| Median | 28.274242 | 16.293681 | 30.563370 | 22.375224 | 15.017890 | 12.043879 | 10.922486 | 2.4228941 | 15.605522 | 17.711148 |
| Mean | 42.217921 | 25.329135 | 36.976008 | 30.844837 | 17.386851 | 12.335134 | 12.482777 | 2.7217090 | 21.551826 | 22.427355 |
| 3rd Qu. | 54.810168 | 39.055830 | 52.007920 | 44.548574 | 25.405847 | 15.280685 | 15.588738 | 3.0809064 | 24.351356 | 31.687493 |
| Max. | 158.532276 | 100.709014 | 139.149394 | 126.046744 | 75.349972 | 27.850411 | 28.655146 | 8.9129858 | 113.762156 | 68.516004 |
Data preparation
nox.t <- as.data.frame(t(as.matrix(nox)))
# changing row and column names
colnames(nox.t) <- nox$time
nox.bar <- nox.t[2:10,]
nox.bar$row.names <- rownames(nox.bar)
# changing the data from wide format to a single column of data
nox.bar<-melt(nox.bar,id=c("row.names"))
nox.bar$value <- as.numeric(nox.bar$value)
nox.bar$variable <- ymd(nox.bar$variable)
nox.bar$row.names <- as.factor(nox.bar$row.names)
Boxplot for each station
ggplot(data = nox.bar, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
theme(legend.position = "none") + labs(title="NOx boxplot for all stations", x="Stations",y="NOx") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))

NOx time series
The green line is the WHO air quality guideline and the pink line is the established EU limit value. The dashed orange line marks the date Cyprus banned flights and the dashed red line marks the start of the lockdown. Some peaks are the result of dust episodes. We can see that the COVID-19 situation did affect NOx values.
ggplot(nox, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) +
labs(y="NOx value", x="Month",title="January to June 2020 Time series for NOx") +
scale_color_manual(name = "statistics", values = c(Lockdown="red",EU_limit_value="magenta",
Flight_ban="orange",WHO_air_quality_guideline="green")) +
theme_classic() +
annotate("text",x=pm10$time[140],y = 37,label="Dust Episode", size =4, fontface="bold", color="brown" )

nox$meansel <- rowMeans(nox[,c(2,4,6,7)])
nox_19$meansel <- rowMeans(nox_19[,c(2,4,6,7)])
nox$index <- yday(nox$time)
nox_19$index <- yday(nox_19$time)
ggplot() + geom_line(data=nox, aes(x=index,y=meansel,color="2020"),size=1)+
geom_line(data=nox_19, aes(x=index,y=meansel, color="2019"), size=1) +
geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) +
labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") +
scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
WHO_air_quality_guideline="green","2020"="red",
"2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
scale_linetype_manual(name = "Lockdown date",
values = c("Lockdown" = "dashed")) +
theme_classic()

ggplot() + geom_smooth(data=nox, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
geom_smooth(data=nox_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) +
labs(y="NOx value", x="Days",title="NOx value comparison for first half of years 2019 and 2020") +
scale_color_manual(name = "Line notations", values = c(EU_limit_value="brown",
WHO_air_quality_guideline="green","2020"="red",
"2019"="blue","2020 ACL"="black","2020 smooth"="brown","2019 smooth"="yellow")) +
scale_linetype_manual(name = "Lockdown date",
values = c("Lockdown" = "dashed")) +
theme_classic()

Time Series plot for every station
ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) +
geom_area(aes(fill = row.names, colour=row.names),alpha = 0.3) + theme_bw() +
labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
labs(fill="Station") + labs(colour="Station")

ggplot(nox.bar[nox.bar$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(colour=row.names), alpha=0.6, size=1) + theme_bw() +
labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
labs(colour="Station")

NOx daily difference
After the start of the COVID-19 situation the daily differences are low and NOx appears to be more stable.
nox.ts <- ts(nox[11], start=c(2020,1) , frequency = 365)
nox.diff <- diff(nox.ts)
nox.diff <- data.frame("time"=nox$time,"diff"=c(0,nox.diff))
ggplot(nox.diff, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("NOx difference") +
ggtitle("NOx daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot for NOx monthly values for each station
nox.mt <- as.data.frame(t(as.matrix(nox.m)))
heatmaply(nox.mt, grid_color = "white", dendrogram = "none", main = "NOx per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(nox.mt,2))
Barplot of NOx for every month and visual contribution of each station
nox.bar2 <- nox.mt[1:9,]
nox.bar2$row.names <- rownames(nox.bar2) # adding column with row names
# changing the data from wide format to a single column of data
nox.bar2<-melt(nox.bar2,id=c("row.names"))
# creating the plot
ggplot(nox.bar2,aes(x=variable,y=value/9,fill=row.names))+geom_bar(position="stack", stat="identity") +
labs(title = "NOx per month and station contribution", y = "NOx value", x = "Months") +
labs(fill = "Stations")

Elbow rule for NOx
fviz_nbclust(nox.scaled, kmeans, method = "wss", k.max=8)

PCA
nox_pca <- prcomp(nox.t2[,-183], center = TRUE, scale. = TRUE)
summary(nox_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 11.1551 4.3793 3.68206 2.70529 2.47200 2.20758 1.98466
## Proportion of Variance 0.6837 0.1054 0.07449 0.04021 0.03358 0.02678 0.02164
## Cumulative Proportion 0.6837 0.7891 0.86359 0.90380 0.93737 0.96415 0.98579
## PC8 PC9
## Standard deviation 1.60800 2.751e-15
## Proportion of Variance 0.01421 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
Cluster plot with PCA
This biplot shows the first two principal components, which account for 79% of the total variance.
sub_grp2 <- cutree(hclust_nox, k = 3)
fviz_cluster(list(data = nox.scaled, cluster = sub_grp2))

PCA 3d plot
This 3D plot shows the first three principal components, which account for 86% of the total variance.
pca3d(nox_pca, group=sub_grp2, radius = 4, show.labels=T)
## [1] 1.5335850 0.5048702 0.4734467
C6H6 Analysis
# Creating a data.frame with all C6H6 measurement values. Stations 1, 5, 9 and 10 have no missing days
c6h6 <- data.frame("station1"=s1.d$C6H6,"station5"=s5.d$C6H6,"station9"=s9.d$C6H6,"station10"=s10.d$C6H6)
# Creating a time column to help us join the previous dataframe with stations 7 and 8
c6h6$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join c6h6 and station 7 by time column
c6h6 <- full_join(c6h6,s7.d[,c("time","C6H6")],by="time")
# Full join c6h6 and station 8 by time column
c6h6 <- full_join(c6h6,s8.d[,c("time","C6H6")],by="time")
# Changing the column names, changing the order and filling NAs at stations 7 and 8 with mean values
c6h6 <- c6h6 %>% rename(station7=C6H6.x, station8=C6H6.y) %>%
select(time,station1,station5,station7,station8,station9,station10) %>%
mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
# Adding a mean column with the mean value of C6H6 for every day
c6h6$mean <- rowMeans(c6h6[,2:7])
knitr::kable(head(c6h6,3)) # printing the first 3 rows
| time | station1 | station5 | station7 | station8 | station9 | station10 | mean |
|---|---|---|---|---|---|---|---|
| 2020-01-01 | 2.3974533 | 2.0189181 | 1.216708 | 0.2946909 | 0.5019321 | 0.7258804 | 1.1925971 |
| 2020-01-02 | 1.4069889 | 0.9467244 | 1.261858 | 0.5025892 | 0.3706992 | 0.6880867 | 0.8628245 |
| 2020-01-03 | 0.8622788 | 1.1730584 | 1.267356 | 0.4103396 | 0.2715304 | 0.4655657 | 0.7416880 |
# Creating a data.frame with all C6H6 measurement values for 2019
c6h6_19 <- data.frame("station1"=s1_19.d$C6H6,"station5"=s5_19.d$C6H6,
"station7"=s7_19.d$C6H6,"station8"=s8_19.d$C6H6,"station10"=s10_19.d$C6H6)
# Creating a time column to help us join the previous dataframe with station 9
c6h6_19$time <- as.POSIXct(row.names(s1_19.d),format = "%Y-%m-%d")
# Full join c6h6_19 and station 9 by time column
c6h6_19 <- full_join(c6h6_19,s9_19.d[,c("time","C6H6")],by="time")
# Changing the column names, changing the order and filling NAs at stations 7 and 8 with mean values
c6h6_19 <- c6h6_19 %>% rename(station9=C6H6) %>%
select(time,station1,station5,station7,station8,station9,station10) %>%
mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
# Adding a mean column with the mean value of C6H6 for every day
c6h6_19$mean <- rowMeans(c6h6_19[,2:7])
knitr::kable(head(c6h6_19,3)) # printing the first 3 rows
| time | station1 | station5 | station7 | station8 | station9 | station10 | mean |
|---|---|---|---|---|---|---|---|
| 2019-01-01 | 0.8574786 | 1.907207 | 0.9028013 | 0.8523375 | 0.2651717 | 0.8574786 | 0.9404124 |
| 2019-01-02 | 1.4710263 | 1.654617 | 0.6863346 | 0.7166400 | 0.1449858 | 1.4710263 | 1.0241050 |
| 2019-01-03 | 1.5480072 | 1.412580 | 0.8689784 | 0.6715878 | 0.0338229 | 1.5480072 | 1.0138307 |
Creating the dataframe for C6H6 with monthly values
# adding the values to dataframe
c6h6.m <- data.frame(s1.m$C6H6,s5.m$C6H6,s7.m$C6H6,
s8.m$C6H6,s9.m$C6H6,s10.m$C6H6)
# Creating a column with mean values for every month
c6h6.m$mean <- rowMeans(c6h6.m)
# Adding row and column names
rownames(c6h6.m) <- c("January","February","March","April","May","June")
colnames(c6h6.m) <- c(stations[c(1,4:8)],"mean")
knitr::kable(c6h6.m, digits = 4) #printing the table
|  | station1 | station5 | station7 | station8 | station9 | station10 | mean |
|---|---|---|---|---|---|---|---|
| January | 1.7935 | 2.0161 | 1.5628 | 0.5110 | 0.3328 | 0.6582 | 1.1457 |
| February | 1.5373 | 1.1909 | 1.0157 | 0.1949 | 0.6308 | 0.6037 | 0.8622 |
| March | 0.8519 | 0.6539 | 0.4443 | 0.4412 | 0.7596 | 0.4543 | 0.6009 |
| April | 0.4002 | 0.3344 | 0.1802 | 0.2885 | 0.2698 | 0.3335 | 0.3011 |
| May | 0.4054 | 0.4391 | 0.2529 | 0.1717 | 0.0853 | 0.2006 | 0.2592 |
| June | 0.3129 | 0.4197 | 0.1785 | 0.1581 | 0.0479 | 0.1933 | 0.2184 |
Summary statistics of C6H6 for each station
colnames(c6h6) <- c("time",stations[c(1,4:8)],"mean")
knitr::kable(do.call(cbind, lapply(c6h6[2:8], summary)))
|  | station1 | station5 | station7 | station8 | station9 | station10 | mean |
|---|---|---|---|---|---|---|---|
| Min. | 0.1441559 | 0.1363796 | 0.0545053 | 0.0000000 | 0.0045999 | 0.0251507 | 0.1360316 |
| 1st Qu. | 0.3300891 | 0.3396551 | 0.1872086 | 0.1363064 | 0.1031937 | 0.2294919 | 0.2477120 |
| Median | 0.6190251 | 0.5113127 | 0.3523731 | 0.2865139 | 0.2722792 | 0.3869274 | 0.4228677 |
| Mean | 0.8820508 | 0.8434774 | 0.6204580 | 0.2946909 | 0.3532593 | 0.4076478 | 0.5669307 |
| 3rd Qu. | 1.2920771 | 1.0519909 | 0.9430208 | 0.4188988 | 0.6641393 | 0.5345103 | 0.8204481 |
| Max. | 2.9314320 | 5.7590451 | 2.4143210 | 0.8420553 | 0.8128323 | 1.2147028 | 2.0201268 |
Data preparation
c6h6.t <- as.data.frame(t(as.matrix(c6h6))) # transpose the c6h6 matrix
# changing row and column names
rownames(c6h6.t) <- c("time",stations[c(1,4:8)],"mean")
colnames(c6h6.t) <- c6h6$time
c6h6.bar2 <- c6h6.t[2:7,]
c6h6.bar2$row.names <- rownames(c6h6.bar2)
# changing the data from wide format to a single column of data
c6h6.bar2<-melt(c6h6.bar2,id=c("row.names"))
# Changing the type of class
c6h6.bar2$value <- as.numeric(c6h6.bar2$value)
c6h6.bar2$variable <- ymd(c6h6.bar2$variable)
c6h6.bar2$row.names <- as.factor(c6h6.bar2$row.names)
Boxplot for each station
ggplot(data = c6h6.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
theme(legend.position = "none") + labs(title="C6H6 boxplot for all stations", x="Stations",y="C6H6") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))

C6H6 time series.
c6h6$meansel <- rowMeans(c6h6[,c(2,4,5)])
ggplot(c6h6, aes(x=time,y=meansel)) + geom_line(size=1, alpha=0.7) + geom_smooth() +
geom_vline(aes(xintercept=as.numeric(c6h6$time[84]),colour="Lockdown"),linetype=2, size=1) +
geom_vline(aes(xintercept=as.numeric(c6h6$time[81]),colour="Flight_ban"),linetype=2, size=1) +
labs(y="C6H6 value", x="Month",title="January to June 2020 Time series for C6H6") +
scale_color_manual(name = "Line notation", values = c(Lockdown="red",Flight_ban="orange")) +
theme_classic() +
annotate("text",x=c6h6$time[140],y = 1,label="Dust Episode", size =4, fontface="bold", color="brown" ) + geom_line(aes(x=time,y=mean), color="red", size=1, alpha=0.7)

c6h6_19$meansel <- rowMeans(c6h6_19[,c(2,4,5)])
c6h6$index <- yday(c6h6$time)
c6h6_19$index <- yday(c6h6_19$time)
ggplot() + geom_line(data=c6h6, aes(x=index,y=meansel,color="2020"),size=1)+
geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020 smooth"), size=1) +
geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019 smooth"), size=1) +
geom_line(data=c6h6_19, aes(x=index,y=meansel,color="2019"),size=1) +
geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") +
scale_color_manual(name = "Line notations", values = c("2020"="red",
"2019"="blue",
"2020 smooth"="brown","2019 smooth"="yellow")) +
scale_linetype_manual(name = "Lockdown date",
values = c("Lockdown" = "dashed")) +
theme_classic()

ggplot() + geom_smooth(data=c6h6, aes(x=index,y=meansel,color="2020"), size=1,level=0.95) +
geom_smooth(data=c6h6_19, aes(x=index,y=meansel,color="2019"), size=1) +
geom_vline(aes(xintercept=as.numeric(nox$index[84]),linetype="Lockdown"),colour="red", size=1) +
labs(y="C6H6 value", x="Days",title="C6H6 value comparison for first half of years 2019 and 2020") +
scale_color_manual(name = "Line notations", values = c("2019"="yellow","2020"="brown")) +
scale_linetype_manual(name = "Lockdown date", values = c("Lockdown" = "dashed")) + theme_classic()

Time series plot for every station
ggplot(c6h6.bar2, aes(x=variable,y=as.numeric(value))) + geom_area(aes(color = row.names, fill = row.names),
alpha = 0.7) + theme_bw() +
labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
labs(fill='Station') + labs(color="Station")

sn4 <- c("Nicosia_Traf", "Larnaca_Traf","Paphos_Traf","Zygi_Indu")
ggplot(c6h6.bar2[c6h6.bar2$row.names %in% sn4,], aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
alpha = 0.5, size=1) + theme_classic() +
labs(y="C6H6 value", x="Month",title="January to June 2020 C6H6 Time series for every station") +
labs(color="Station")

C6H6 daily difference
c6h6.ts <- ts(c6h6[8], start=c(2020,1) , frequency = 365)
c6h6.diff <- diff(c6h6.ts)
c6h6.dif <- data.frame("time"=c6h6$time,"diff"=c(0,c6h6.diff))
ggplot(c6h6.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("C6H6 difference") +
ggtitle("C6H6 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values
c6h6.mt <- as.data.frame(t(as.matrix(c6h6.m))) ##transpose of month matrix
rownames(c6h6.mt) <- c(stations[c(1,4:8)],"mean") #adding row names
heatmaply(c6h6.mt, grid_color = "white", dendrogram = "none", main = "C6H6 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(c6h6.mt,2))
Barplot of C6H6 for every month and visual contribution of each station
c6h6.bar <- c6h6.mt[1:6,] # row filtering
c6h6.bar$row.names <- rownames(c6h6.bar) # adding column with row names
# changing the data from wide format to a single column of data
c6h6.bar<-melt(c6h6.bar,id=c("row.names"))
# creating the plot
ggplot(c6h6.bar,aes(x=variable,y=value/6,fill=row.names))+geom_bar(position="stack", stat="identity") +
labs(title = "C6H6 per month and station contribution", y = "C6H6 value", x = "Months") +
labs(fill = "Stations")

Hierarchical Analysis
c6h6.t2 <- as.data.frame(t(as.matrix(c6h6[,2:7])))
row.names(c6h6.t2)<- stations[c(1,4:8)]
c6h6.t2$station <- stations[c(1,4:8)]
c6h6.scaled <- as.data.frame(scale(c6h6.t2[,1:182])) # scaling
dist_c6h6 <- dist(c6h6.scaled, method = 'euclidean') # calculating the distance
hclust_c6h6 <- hclust(dist_c6h6, method = 'complete') # creating the HC
# plotting the dendrogram
plot(hclust_c6h6,labels=c6h6.t2$station)
rect.hclust(hclust_c6h6 , k = 3,border = 2:6) # adding the 3 cluster boxes

fviz_nbclust(c6h6.scaled, kmeans, method = "wss", k.max=5)

PCA for C6H6
c6h6_pca <- prcomp(c6h6.t2[,-183], center = TRUE, scale. = TRUE)
summary(c6h6_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 9.691 6.0408 5.3560 3.70041 3.03511 2.156e-15
## Proportion of Variance 0.516 0.2005 0.1576 0.07524 0.05061 0.000e+00
## Cumulative Proportion 0.516 0.7165 0.8741 0.94939 1.00000 1.000e+00
Cluster plot with PCA
This biplot shows the first two principal components, which account for 71.5% of the total variance.
sub_grp <- cutree(hclust_c6h6, k = 3)
fviz_cluster(list(data = c6h6.scaled, cluster = sub_grp))

PCA 3d plot for C6H6
This 3D plot shows the first three principal components, which account for 87% of the total variance.
pca3d(c6h6_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 0.9164679 0.7704207 0.5425491
Loading the COVID-19 dataset
covid <- read.csv("owid-covid-data.csv")
covid <- covid %>% filter(location=="Cyprus") # filtering the Cyprus data
covid$date <- ymd(covid$date) # changing the date class
Joining the COVID-19 and PM10 datasets
pm10$time <- ymd(pm10$time) # changing the time class
# left join the two dataset using time and date column
pm10.cov <- merge(pm10, covid, by.x="time",by.y="date", all.x =T )
# removing columns with almost zero variance
pm10.cov <- pm10.cov %>% select(-nearZeroVar(pm10.cov))
# filtering the columns we need
pm10.cov2 <- pm10.cov %>% select(c(1,9,12:15))
# filling NA with 0 for the dates before COVID19
pm10.cov2[,3:6] <- sapply(pm10.cov2[,3:6], function(x) ifelse(is.na(x),0,x))
Correlation matrix
# creating a function to setup the geom_smooth graph of ggpair plot
my_fn <- function(data, mapping, method="loess", ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method=method, color="black", ...)
p
}
ggpairs(pm10.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn),
upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

Creating the corrplot
We can see that there is no clear correlation between PM10 and COVID-19. The correlation values are close to zero, so the coronavirus had almost no impact on PM10 values.
cor.pm10 <- cor(pm10.cov2[,2:6])
corrplot.mixed(cor.pm10)

Joining the NOx and COVID-19 datasets
nox$time <- ymd(nox$time)
# left join between nox and covid dataset
nox.cov <- merge(nox, covid, by.x="time",by.y="date", all.x =T )
nox.cov <- nox.cov %>% select(-nearZeroVar(nox.cov))
nox.cov2 <- nox.cov %>% select(c(1,11,14:17))
nox.cov2[,3:6] <- sapply(nox.cov2[,3:6], function(x) ifelse(is.na(x),0,x))
Correlation matrix between mean value of NOx and COVID19 values
ggpairs(nox.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn),
upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

We can see a negative correlation between the mean NOx value and the total coronavirus cases: as total cases increase, the mean NOx value decreases. So COVID-19 did have an impact on NOx values.
cor.nox <- cor(nox.cov2[,2:6])
corrplot.mixed(cor.nox)

Joining the C6H6 and COVID-19 datasets
c6h6$time <- ymd(c6h6$time)
# left join between nox and covid dataset
c6h6.cov <- merge(c6h6, covid, by.x="time",by.y="date", all.x =T )
c6h6.cov <- c6h6.cov %>% select(-nearZeroVar(c6h6.cov))
c6h6.cov2 <- c6h6.cov %>% select(c(1,8,11:14))
c6h6.cov2[,3:6] <- sapply(c6h6.cov2[,3:6], function(x) ifelse(is.na(x),0,x))
Correlation matrix between mean value of C6H6 and COVID19 values
ggpairs(c6h6.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn),
upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

cor.c6h6 <- cor(c6h6.cov2[,2:6])
corrplot.mixed(cor.c6h6)
