library(readxl)
library(leaflet)
directory = "C:/Users/raulg/Desktop/Assignment3" # Cambiar esto por la carpeta donde esten los excel
mataro = read_excel(paste (directory, "HistoricMesuresMataro_Auto.xlsx", sep = "/")) # Read Mataro data
svm = read_excel(paste (directory, "HistoricMesuresSantVicens_Auto.xls", sep = "/")) # Read Sant Vicens de Montalt data
eixample = read_excel(paste (directory, "HistoricMesures_Eixample_Auto.xls", sep = "/")) # Read Eixample data
For this part we chose to use the data coming from Mataro station
mataro <- data.frame("Date" = mataro$Date,
"PM10" = mataro$PM10)
svm <- data.frame("Date" = svm$Date,
"PM10" = svm$PM10)
eixample <- data.frame("Date" = eixample$Date,
"PM10" = eixample$PM10)
mataro <- subset(mataro, !is.na(PM10)) ## Cleanup useless rows
svm <- subset(svm, !is.na(PM10)) ## Cleanup useless rows
eixample <- subset(eixample, !is.na(PM10)) ## Cleanup useless rows
mataro <- subset(mataro, PM10!= "Sense dades") ## Some more cleanup to avoid weird results
svm <- subset(svm, PM10!= "Sense dades") ## Some more cleanup to avoid weird results
eixample <- subset(eixample, PM10!= "Sense dades") ## Some more cleanup to avoid weird results
svm$Date<- as.POSIXct(svm$Date, format="%d/%m/%Y %H:%M:%S", tz="CET")
svm$Day <- as.factor(weekdays(svm$Date))
svm$Time <- format(svm$Date, "%T")
eixample$Date<- as.POSIXct(eixample$Date, format="%d/%m/%Y %H:%M:%S", tz="CET")
eixample$Day <- as.factor(weekdays(eixample$Date))
eixample$Time <- format(eixample$Date, "%T")
eixample$PM10 <- as.integer(eixample$PM10)
mataro$Date<- as.POSIXct(mataro$Date, format="%d/%m/%Y %H:%M:%S", tz="CET")
mataro$Day <- as.factor(weekdays(mataro$Date))
mataro$Time <- format(mataro$Date, "%T")
mataroWeekend <- subset(mataro, Day == "sábado" | Day == "domingo")
mataroWeekend <- data.frame("Date" = mataroWeekend$Date,
"PM10" = mataroWeekend$PM10)
mataroWeekdays <- subset(mataro, Day == "lunes" | Day == "martes"| Day == "miércoles"| Day == "jueves"| Day == "viernes")
mataroWeekdays <- data.frame("Date" = mataroWeekdays$Date,
"PM10" = mataroWeekdays$PM10)
t.test(as.numeric(mataroWeekdays$PM10))
##
## One Sample t-test
##
## data: as.numeric(mataroWeekdays$PM10)
## t = 133.04, df = 6115, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 20.19520 20.79924
## sample estimates:
## mean of x
## 20.49722
t.test(as.numeric(mataroWeekend$PM10))
##
## One Sample t-test
##
## data: as.numeric(mataroWeekend$PM10)
## t = 106.99, df = 2495, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 16.69116 17.31444
## sample estimates:
## mean of x
## 17.0028
mean(mataroWeekdays[[2]])
## [1] 20.49722
mean(mataroWeekend[[2]])
## [1] 17.0028
plot(mataroWeekdays)
plot(mataroWeekend)
As we can see the means are close but on weekdays it’s higher, going above the OMS recommended limits.
# We need to calculate the mean to compare it, so:
mean(mataro[[2]]) # Mean value
## [1] 19.48444
The annual mean is 19.5 so it’s close to the recommended 20 but not above it.
underthirty <- subset(mataro, PM10 < 30)
underfifty <- subset(mataro, PM10 < 50)
nrow(underthirty) # Amount of days under 30
## [1] 7598
nrow(underfifty) # Amount of days under 50
## [1] 8504
Results were: A total of 7598 of days under 30 and 8504 under 50. Please consider this is after cleaning up the dataframe of useless data.
sd(mataro[[2]]) # Sigma value
## [1] 11.12968
The approximation of sigma is 11.13
We do believe data isn’t part of a gaussian distribution.
For this part we chose to use the data coming from Mataro, Sant Vicens and Eixample stations.
zerotosix <- subset(mataro, Time > "00:00:00" & Time < "06:00:00")
zerotosix <- data.frame("Date" = zerotosix$Date,
"PM10" = zerotosix$PM10)
sixtoten <- subset(mataro, Time > "06:00:00" & Time < "10:00:00")
sixtoten <- data.frame("Date" = sixtoten$Date,
"PM10" = sixtoten$PM10)
fifteentotwenty <- subset(mataro, Time > "15:00:00" & Time < "20:00:00")
fifteentotwenty <- data.frame("Date" = fifteentotwenty$Date,
"PM10" = fifteentotwenty$PM10)
The worst hours are from 10 to 12 and from 19 to 21 as seen in the barplot below (section of 3 hours mean bars). This is probably due to people going to work or returning from it.
mean(zerotosix[[2]])
## [1] 14.96496
mean(sixtoten[[2]])
## [1] 20.78825
plot(zerotosix)
plot(sixtoten)
The concentration means are different by a margin but the concentrations shape is quite similar.
mean(sixtoten[[2]])
## [1] 20.78825
mean(fifteentotwenty[[2]])
## [1] 20.39293
plot(zerotosix)
plot(fifteentotwenty)
The concentrations are very similar.
zerotothree <- subset(mataro, Time > "01:00:00" & Time < "03:00:00")
zerotothree <- data.frame("Date" = zerotothree$Date,
"PM10" = zerotothree$PM10)
to3 <- mean(zerotothree[["PM10"]])
fourtosix <- subset(mataro, Time > "04:00:00" & Time < "06:00:00")
fourtosix <- data.frame("Date" = fourtosix$Date,
"PM10" = fourtosix$PM10)
to6 <- mean(fourtosix[["PM10"]])
seventonine <- subset(mataro, Time > "07:00:00" & Time < "09:00:00")
seventonine <- data.frame("Date" = seventonine$Date,
"PM10" = seventonine$PM10)
to9 <- mean(seventonine[["PM10"]])
tentotwelve <- subset(mataro, Time > "10:00:00" & Time < "12:00:00")
tentotwelve <- data.frame("Date" = tentotwelve$Date,
"PM10" = tentotwelve$PM10)
to12 <- mean(tentotwelve[["PM10"]])
thirteentofifteen <- subset(mataro, Time > "13:00:00" & Time < "15:00:00")
thirteentofifteen <- data.frame("Date" = thirteentofifteen$Date,
"PM10" = thirteentofifteen$PM10)
to15 <- mean(thirteentofifteen[["PM10"]])
sixteentoeighteen <- subset(mataro, Time > "16:00:00" & Time < "18:00:00")
sixteentoeighteen <- data.frame("Date" = sixteentoeighteen$Date,
"PM10" = sixteentoeighteen$PM10)
to18 <- mean(sixteentoeighteen[["PM10"]])
nineteentotwentyone <- subset(mataro, Time > "19:00:00" & Time < "21:00:00")
nineteentotwentyone <- data.frame("Date" = nineteentotwentyone$Date,
"PM10" = nineteentotwentyone$PM10)
to21 <- mean(nineteentotwentyone[["PM10"]])
twentytwotozero <- subset(mataro, Time > "22:00:00" & Time < "24:00:00")
twentytwotozero <- data.frame("Date" = twentytwotozero$Date,
"PM10" = twentytwotozero$PM10)
to0 <- mean(twentytwotozero[["PM10"]])
hmeans <- c(to3,to6,to9,to12,to15,to18,to21,to0)
barplot(height = hmeans, main="PM10 by hours", names.arg=c("01-03", "04-06", "07-09", "10-12", "13-15", "16- 18","19-21", "22-00"), xlab = "Hours", ylab = "PM10")
plot(zerotothree)
plot(fourtosix)
plot(seventonine)
plot(tentotwelve)
plot(thirteentofifteen)
plot(sixteentoeighteen)
plot(nineteentotwentyone)
plot(twentytwotozero)
zerotosix <- subset(svm, Time > "00:00:00" & Time < "06:00:00")
zerotosix <- data.frame("Date" = zerotosix$Date,
"PM10" = zerotosix$PM10)
sixtoten <- subset(svm, Time > "06:00:00" & Time < "10:00:00")
sixtoten <- data.frame("Date" = sixtoten$Date,
"PM10" = sixtoten$PM10)
fifteentotwenty <- subset(svm, Time > "15:00:00" & Time < "20:00:00")
fifteentotwenty <- data.frame("Date" = fifteentotwenty$Date,
"PM10" = fifteentotwenty$PM10)
The worst hours are from 7 to 9 and from 10 to 12 as seen in the barplot below (section of 3 hours mean bars).
mean(zerotosix[[2]])
## [1] 24.06964
mean(sixtoten[[2]])
## [1] 38.00464
plot(zerotosix)
plot(sixtoten)
The concentrations means are different but the shape is somewhat similar as seen in the plots above.
mean(sixtoten[[2]])
## [1] 38.00464
mean(fifteentotwenty[[2]])
## [1] 27.30174
plot(zerotosix)
plot(fifteentotwenty)
The concentrations means are different for quite a margin but the shapes are similar.
zerotothree <- subset(svm, Time > "01:00:00" & Time < "03:00:00")
zerotothree <- data.frame("Date" = zerotothree$Date,
"PM10" = zerotothree$PM10)
to3 <- mean(zerotothree[["PM10"]])
fourtosix <- subset(svm, Time > "04:00:00" & Time < "06:00:00")
fourtosix <- data.frame("Date" = fourtosix$Date,
"PM10" = fourtosix$PM10)
to6 <- mean(fourtosix[["PM10"]])
seventonine <- subset(svm, Time > "07:00:00" & Time < "09:00:00")
seventonine <- data.frame("Date" = seventonine$Date,
"PM10" = seventonine$PM10)
to9 <- mean(seventonine[["PM10"]])
tentotwelve <- subset(svm, Time > "10:00:00" & Time < "12:00:00")
tentotwelve <- data.frame("Date" = tentotwelve$Date,
"PM10" = tentotwelve$PM10)
to12 <- mean(tentotwelve[["PM10"]])
thirteentofifteen <- subset(svm, Time > "13:00:00" & Time < "15:00:00")
thirteentofifteen <- data.frame("Date" = thirteentofifteen$Date,
"PM10" = thirteentofifteen$PM10)
to15 <- mean(thirteentofifteen[["PM10"]])
sixteentoeighteen <- subset(svm, Time > "16:00:00" & Time < "18:00:00")
sixteentoeighteen <- data.frame("Date" = sixteentoeighteen$Date,
"PM10" = sixteentoeighteen$PM10)
to18 <- mean(sixteentoeighteen[["PM10"]])
nineteentotwentyone <- subset(svm, Time > "19:00:00" & Time < "21:00:00")
nineteentotwentyone <- data.frame("Date" = nineteentotwentyone$Date,
"PM10" = nineteentotwentyone$PM10)
to21 <- mean(nineteentotwentyone[["PM10"]])
twentytwotozero <- subset(svm, Time > "22:00:00" & Time < "24:00:00")
twentytwotozero <- data.frame("Date" = twentytwotozero$Date,
"PM10" = twentytwotozero$PM10)
to0 <- mean(twentytwotozero[["PM10"]])
hmeans <- c(to3,to6,to9,to12,to15,to18,to21,to0)
barplot(height = hmeans, main="PM10 by hours", names.arg=c("01-03", "04-06", "07-09", "10-12", "13-15", "16- 18","19-21", "22-00"), xlab = "Hours", ylab = "PM10")
plot(zerotothree)
plot(fourtosix)
plot(seventonine)
plot(tentotwelve)
plot(thirteentofifteen)
plot(sixteentoeighteen)
plot(nineteentotwentyone)
plot(twentytwotozero)
zerotosix <- subset(eixample, Time > "00:00:00" & Time < "06:00:00")
zerotosix <- data.frame("Date" = zerotosix$Date,
"PM10" = zerotosix$PM10)
sixtoten <- subset(eixample, Time > "06:00:00" & Time < "10:00:00")
sixtoten <- data.frame("Date" = sixtoten$Date,
"PM10" = sixtoten$PM10)
fifteentotwenty <- subset(eixample, Time > "15:00:00" & Time < "20:00:00")
fifteentotwenty <- data.frame("Date" = fifteentotwenty$Date,
"PM10" = fifteentotwenty$PM10)
The worst hours are from 10 to 12 and from 16 to 18 as seen in the barplot below (section of 3 hours mean bars).
mean(zerotosix[[2]])
## [1] 41.8076
mean(sixtoten[[2]])
## [1] 48.13129
plot(zerotosix)
plot(sixtoten)
The concentrations means are very close but the shapes are very different.
mean(sixtoten[[2]])
## [1] 48.13129
mean(fifteentotwenty[[2]])
## [1] 48.1215
plot(zerotosix)
plot(fifteentotwenty)
The concentrations means are almost the same but the shapes are very different.
zerotothree <- subset(eixample, Time > "01:00:00" & Time < "03:00:00")
zerotothree <- data.frame("Date" = zerotothree$Date,
"PM10" = zerotothree$PM10)
to3 <- mean(zerotothree[["PM10"]])
fourtosix <- subset(eixample, Time > "04:00:00" & Time < "06:00:00")
fourtosix <- data.frame("Date" = fourtosix$Date,
"PM10" = fourtosix$PM10)
to6 <- mean(fourtosix[["PM10"]])
seventonine <- subset(eixample, Time > "07:00:00" & Time < "09:00:00")
seventonine <- data.frame("Date" = seventonine$Date,
"PM10" = seventonine$PM10)
to9 <- mean(seventonine[["PM10"]])
tentotwelve <- subset(eixample, Time > "10:00:00" & Time < "12:00:00")
tentotwelve <- data.frame("Date" = tentotwelve$Date,
"PM10" = tentotwelve$PM10)
to12 <- mean(tentotwelve[["PM10"]])
thirteentofifteen <- subset(eixample, Time > "13:00:00" & Time < "15:00:00")
thirteentofifteen <- data.frame("Date" = thirteentofifteen$Date,
"PM10" = thirteentofifteen$PM10)
to15 <- mean(thirteentofifteen[["PM10"]])
sixteentoeighteen <- subset(eixample, Time > "16:00:00" & Time < "18:00:00")
sixteentoeighteen <- data.frame("Date" = sixteentoeighteen$Date,
"PM10" = sixteentoeighteen$PM10)
to18 <- mean(sixteentoeighteen[["PM10"]])
nineteentotwentyone <- subset(eixample, Time > "19:00:00" & Time < "21:00:00")
nineteentotwentyone <- data.frame("Date" = nineteentotwentyone$Date,
"PM10" = nineteentotwentyone$PM10)
to21 <- mean(nineteentotwentyone[["PM10"]])
twentytwotozero <- subset(eixample, Time > "22:00:00" & Time < "24:00:00")
twentytwotozero <- data.frame("Date" = twentytwotozero$Date,
"PM10" = twentytwotozero$PM10)
to0 <- mean(twentytwotozero[["PM10"]])
hmeans <- c(to3,to6,to9,to12,to15,to18,to21,to0)
barplot(height = hmeans, main="PM10 by hours", names.arg=c("01-03", "04-06", "07-09", "10-12", "13-15", "16- 18","19-21", "22-00"), xlab = "Hours", ylab = "PM10")
plot(zerotothree)
plot(fourtosix)
plot(seventonine)
plot(tentotwelve)
plot(thirteentofifteen)
plot(sixteentoeighteen)
plot(nineteentotwentyone)
plot(twentytwotozero)
For this part we chose to use Mataro, Sant Vicens, Sant Just, Eixample and Hospitalet to get the air quality data.
The first relation we’ve seen is that the stations closer to a big city (Barcelona, in this case) have higher PM10 concentrations, while the further you go from Barcelona it gets lower. This is probably caused due to industrial activity and more car density being sources of PM10. This would explain why the stations close or inside Barcelona have WAY higher concentrations, while Mataro has lower ones.
We couldn’t find suitable meteorogical data close to the stations we’re working with in time. We found what we think is a valid option to get data (http://www.meteo.cat/wpweb/serveis/peticions-de-dades/peticio-dinformes-meteorologics/) but it can take up to a month to receive it, therefore we didn’t manage to get suitable data in time to see if temperature, humidity or wind have effect on PM10 concentrations.
Since we couldn’t find suitable meteorogical data we’ll only show the air quality stations.
#Visualize all stations in a single map
data <- read.csv("stations.csv", stringsAsFactors=FALSE)
m <- leaflet(data) %>% addTiles()
m %>% addMarkers(~lng, ~lat, popup=data$name)
We’ve been able to acquire data from 2015 of all the stations stated above, let’s load it and take a quick look.
hospitalet2015 = read_excel(paste (directory, "Historic2015Hospitalet_Auto.xls", sep = "/"))
just2015 = read_excel(paste (directory, "Historic2015SantJust_Auto.xls", sep = "/"))
eixample2015 = read_excel(paste (directory, "Historic2015Eixample_Auto.xls", sep = "/"))
svm2015 = read_excel(paste (directory, "Historic2015SantVicens_Auto.xls", sep = "/"))
mataro2015 = read_excel(paste (directory, "Historic2015Mataro_Auto.xls", sep = "/"))
mataro2015 <- data.frame("Date" = mataro2015$Date,
"PM10" = mataro2015$PM10)
svm2015 <- data.frame("Date" = svm2015$Date,
"PM10" = svm2015$PM10)
eixample2015 <- data.frame("Date" = eixample$Date,
"PM10" = eixample$PM10)
just2015 <- data.frame("Date" = just2015$Date,
"PM10" = just2015$PM10)
hospitalet2015 <- data.frame("Date" = hospitalet2015$Date,
"PM10" = hospitalet2015$PM10)
mataro2015 <- subset(mataro2015, !is.na(PM10))
svm2015 <- subset(svm2015, !is.na(PM10))
eixample2015 <- subset(eixample2015, !is.na(PM10))
just2015 <- subset(just2015, !is.na(PM10))
hospitalet2015 <- subset(hospitalet2015, !is.na(PM10))
mataro2015 <- subset(mataro2015, PM10!= "Sense dades")
svm2015 <- subset(svm2015, PM10!= "Sense dades")
eixample2015 <- subset(eixample2015, PM10!= "Sense dades")
just2015 <- subset(just2015, PM10!= "Sense dades")
hospitalet2015 <- subset(hospitalet2015, PM10!= "Sense dades")
mataro2015$PM10 <- as.integer(mataro2015$PM10)
svm2015$PM10 <- as.integer(svm2015$PM10)
eixample2015$PM10 <- as.integer(eixample2015$PM10)
just2015$PM10 <- as.integer(just2015$PM10)
hospitalet2015$PM10 <- as.integer(hospitalet2015$PM10)
Now that we’ve loaded and made some treatments to the data to avoid problems, let’s take a look at some basic tests.
head(mataro2015)
## Date PM10
## 1 01/01/2015 00:00:00 29
## 2 01/01/2015 01:00:00 25
## 3 01/01/2015 02:00:00 22
## 4 01/01/2015 03:00:00 18
## 5 01/01/2015 04:00:00 18
## 6 01/01/2015 05:00:00 16
head(svm2015)
## Date PM10
## 1 01/01/2015 00:00:00 38
## 2 01/01/2015 01:00:00 39
## 3 01/01/2015 02:00:00 46
## 4 01/01/2015 03:00:00 40
## 5 01/01/2015 04:00:00 36
## 6 01/01/2015 05:00:00 36
head(just2015)
## Date PM10
## 1 01/01/2015 00:00:00 32
## 2 01/01/2015 01:00:00 36
## 3 01/01/2015 02:00:00 45
## 4 01/01/2015 03:00:00 42
## 5 01/01/2015 04:00:00 35
## 6 01/01/2015 05:00:00 30
head(hospitalet2015)
## Date PM10
## 1 01/01/2015 00:00:00 69
## 2 01/01/2015 01:00:00 80
## 3 01/01/2015 02:00:00 69
## 4 01/01/2015 03:00:00 52
## 5 01/01/2015 04:00:00 42
## 6 01/01/2015 05:00:00 49
head(eixample2015)
## Date PM10
## 1 2016-01-01 00:00:00 51
## 2 2016-01-01 01:00:00 52
## 3 2016-01-01 02:00:00 44
## 4 2016-01-01 03:00:00 43
## 5 2016-01-01 04:00:00 51
## 6 2016-01-01 05:00:00 43
t.test(as.numeric(mataro2015$PM10))
##
## One Sample t-test
##
## data: as.numeric(mataro2015$PM10)
## t = 182, df = 8624, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.30944 21.77346
## sample estimates:
## mean of x
## 21.54145
t.test(as.numeric(svm2015$PM10))
##
## One Sample t-test
##
## data: as.numeric(svm2015$PM10)
## t = 180, df = 8418, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 33.44822 34.18475
## sample estimates:
## mean of x
## 33.81649
t.test(as.numeric(just2015$PM10))
##
## One Sample t-test
##
## data: as.numeric(just2015$PM10)
## t = 140.82, df = 5276, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 40.14163 41.27508
## sample estimates:
## mean of x
## 40.70836
t.test(as.numeric(hospitalet2015$PM10))
##
## One Sample t-test
##
## data: as.numeric(hospitalet2015$PM10)
## t = 193.08, df = 8354, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 52.72646 53.80807
## sample estimates:
## mean of x
## 53.26727
t.test(as.numeric(eixample2015$PM10))
##
## One Sample t-test
##
## data: as.numeric(eixample2015$PM10)
## t = 189.99, df = 8075, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 45.37919 46.32537
## sample estimates:
## mean of x
## 45.85228
As we can see all of the stations above had PM10 concentrations higher than recommended on 2015, but we know for a fact that atleast Mataro has improved that because we analyzed the 2016 data and found that, if only by a little margin, their concentrations weren’t above the recommended in 2016 (we can see improvement over 1 year period).