In this excersice, we have provided with hourly pullution data, and we will forecast the pollution values (PM10 in this case) for next 24 hours.
library(ggplot2) # For visualization
library(lubridate) # For date functions
library(forecast) # For time series forecasting
library(tseries) # Time series functions
library(kableExtra) # Table formatting
library(dplyr)
# Load the .csv file
indata = read.csv("./data/Air Data.csv", stringsAsFactors = F)
Let’s have closer look at the data which we have loaded.
# Dimensions of data
dim(indata)
## [1] 240 12
sprintf("The loaded data has %d rows and %d columns", nrow(indata), ncol(indata))
## [1] "The loaded data has 240 rows and 12 columns"
# Structure of data
str(indata)
## 'data.frame': 240 obs. of 12 variables:
## $ time : chr "2019-07-12T17:48:42.558Z" "2019-07-12T18:50:25.415Z" "2019-07-12T19:52:08.272Z" "2019-07-12T20:53:51.129Z" ...
## $ AirSensorData.mean_BATTERY_12V : num 13.4 13.3 13.3 13.3 13.3 ...
## $ AirSensorData.mean_BATTERY_4V2 : num 4.16 4.16 4.15 4.14 4.14 ...
## $ AirSensorData.mean_HUMIDITY : num 42.9 47.1 48.8 55.5 58 ...
## $ AirSensorData.mean_ME3_CO : num 0.312 0.515 0.517 0.6 0.499 ...
## $ AirSensorData.mean_ME3_NO2 : num 0.0582 0.0422 0.0431 0.0327 0.0348 ...
## $ AirSensorData.mean_ME3_O3 : num 0.0493 0.0346 0.0409 0.0261 0.0317 ...
## $ AirSensorData.mean_ME3_SO2 : num 0.0184 0.0221 0.0218 0.0268 0.0252 ...
## $ AirSensorData.mean_PRESSURE : num 941 943 942 943 943 ...
## $ AirSensorData.mean_SENSOR_DUST_PM10: num 9.73 9.6 14.94 11.5 13.06 ...
## $ AirSensorData.mean_SENSOR_DUST_PM2 : num 8.9 8.61 13.43 10.35 11.79 ...
## $ AirSensorData.mean_TEMP : num 20.6 19.6 18.5 17.3 17 ...
Seeing at data what we have
kable(head(indata, n = 14)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),font_size = 10) %>%
scroll_box(width = "900px", height = "500px")
| time | AirSensorData.mean_BATTERY_12V | AirSensorData.mean_BATTERY_4V2 | AirSensorData.mean_HUMIDITY | AirSensorData.mean_ME3_CO | AirSensorData.mean_ME3_NO2 | AirSensorData.mean_ME3_O3 | AirSensorData.mean_ME3_SO2 | AirSensorData.mean_PRESSURE | AirSensorData.mean_SENSOR_DUST_PM10 | AirSensorData.mean_SENSOR_DUST_PM2 | AirSensorData.mean_TEMP |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2019-07-12T17:48:42.558Z | 13.43567 | 4.159111 | 42.90100 | 0.3118889 | 0.0582222 | 0.0493333 | 0.0184444 | 941.2557 | 9.728333 | 8.896778 | 20.64111 |
| 2019-07-12T18:50:25.415Z | 13.27733 | 4.156778 | 47.12756 | 0.5154444 | 0.0422222 | 0.0345556 | 0.0221111 | 942.8657 | 9.601222 | 8.609111 | 19.56889 |
| 2019-07-12T19:52:08.272Z | 13.32330 | 4.148200 | 48.79950 | 0.5170000 | 0.0431000 | 0.0409000 | 0.0218000 | 942.3227 | 14.942900 | 13.430900 | 18.51800 |
| 2019-07-12T20:53:51.129Z | 13.25356 | 4.143111 | 55.54456 | 0.6002222 | 0.0326667 | 0.0261111 | 0.0267778 | 942.9999 | 11.499667 | 10.353333 | 17.31444 |
| 2019-07-12T21:55:33.986Z | 13.30680 | 4.141400 | 57.99210 | 0.4993000 | 0.0348000 | 0.0317000 | 0.0252000 | 942.5493 | 13.056700 | 11.792700 | 17.01200 |
| 2019-07-12T22:57:16.843Z | 13.30100 | 4.141800 | 57.39350 | 0.4429000 | 0.0330000 | 0.0306000 | 0.0220000 | 942.4301 | 11.098600 | 10.202500 | 17.08400 |
| 2019-07-12T23:58:59.700Z | 13.24033 | 4.156556 | 59.21289 | 0.1807778 | 0.0264444 | 0.0205556 | 0.0276667 | 942.9283 | 10.594667 | 9.713556 | 17.11667 |
| 2019-07-13T01:00:42.557Z | 13.29530 | 4.146100 | 61.80340 | 0.4399000 | 0.0252000 | 0.0232000 | 0.0261000 | 941.9967 | 10.094900 | 9.104400 | 16.71000 |
| 2019-07-13T02:02:25.414Z | 13.39022 | 4.152000 | 63.47200 | 0.3326667 | 0.0251111 | 0.0207778 | 0.0301111 | 941.0621 | 9.574667 | 8.725778 | 15.59444 |
| 2019-07-13T03:04:08.271Z | 13.21378 | 4.140000 | 69.59033 | 0.4737778 | 0.0242222 | 0.0200000 | 0.0240000 | 942.5628 | 12.881889 | 11.626222 | 14.71556 |
| 2019-07-13T04:05:51.128Z | 13.28860 | 4.174200 | 65.96890 | 0.3258000 | 0.0328000 | 0.0225000 | 0.0254000 | 941.9358 | 13.576500 | 12.329100 | 16.27200 |
| 2019-07-13T05:07:33.985Z | 13.37010 | 4.140300 | 60.60120 | 0.2230000 | 0.0293000 | 0.0213000 | 0.0286000 | 942.1323 | 13.211700 | 12.011900 | 18.12300 |
| 2019-07-13T06:09:16.842Z | 13.57000 | 4.150889 | 50.20256 | 0.1711111 | 0.0347778 | 0.0282222 | 0.0210000 | 942.5449 | 11.377556 | 10.385556 | 20.90444 |
| 2019-07-13T07:10:59.699Z | 13.71880 | 4.155100 | 43.47040 | 0.2402000 | 0.0465000 | 0.0355000 | 0.0251000 | 941.5604 | 11.067200 | 10.024200 | 22.71100 |
Seeing at the structure of the data, we can say that we have timestamp in our data which tells at what time the value of particular type of pollution was recorded. And the rest columns are the values of different type of pollutions.
Here we are interested only in the prediction of PM10 forecasting i.e. AirSensorData.mean_SENSOR_DUST_PM10 column.
Since we are only interested in the forecasting of PM10 values , we can drop the rest of the pollution types and keep time and PM10 details in our dataset.
# Keeping PPM20 and time column only
indata = indata[,c(1,10)]
Since we dont have all the information for 2017-07-12, hence we can the data of 2017-07-12 and continue from 2017-07-13 till 2017-07-22.
indata = indata[-c(1:7),]
# Renaming the column names
colnames(indata) = c("time","PM10")
# Show data
kable(head(indata, n = 10)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"))
| time | PM10 | |
|---|---|---|
| 8 | 2019-07-13T01:00:42.557Z | 10.094900 |
| 9 | 2019-07-13T02:02:25.414Z | 9.574667 |
| 10 | 2019-07-13T03:04:08.271Z | 12.881889 |
| 11 | 2019-07-13T04:05:51.128Z | 13.576500 |
| 12 | 2019-07-13T05:07:33.985Z | 13.211700 |
| 13 | 2019-07-13T06:09:16.842Z | 11.377556 |
| 14 | 2019-07-13T07:10:59.699Z | 11.067200 |
| 15 | 2019-07-13T08:12:42.556Z | 10.838400 |
| 16 | 2019-07-13T09:14:25.413Z | 11.031222 |
| 17 | 2019-07-13T10:16:08.270Z | 11.326889 |
# Seperating
indata$Day = day(indata$time)
indata$Hour = as.integer(substr(indata$time,12,13))
# Adding missing and removing extra records
indata = indata[-which(indata$Day == 14 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 15 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 17 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 18 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 20 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 21 & indata$Hour == 00),]
indata = rbind(indata,data.frame(time = "2019-07-14T12:00:00.000Z",
PM10 = mean(indata[which(indata$Day == 14),"PM10"]),
Day = 14, Hour = 12, stringsAsFactors = F))
indata = rbind(indata,data.frame(time = "2019-07-17T12:00:00.000Z",
PM10 = mean(indata[which(indata$Day == 17),"PM10"]),
Day = 17, Hour = 12, stringsAsFactors = F))
indata = rbind(indata,data.frame(time = "2019-07-20T12:00:00.000Z",
PM10 = mean(indata[which(indata$Day == 20),"PM10"]),
Day = 20, Hour = 12, stringsAsFactors = F))
# Sorting data
indata = indata[order(indata$Day,indata$Hour),]
# Converting data in to time series
ts_PM10 = ts(indata[,2], frequency = 23)
# Plotting time series
plot(ts_PM10)
# Decomposing time series susing stl
decomp_ts = stl(ts_PM10, s.window = "p")
plot(decomp_ts)
# Plotting Differenced time series
plot(diff(ts_PM10, 1))
# ADF Test
adf.test(diff(ts_PM10, 1))
## Warning in adf.test(diff(ts_PM10, 1)): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(ts_PM10, 1)
## Dickey-Fuller = -5.8842, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
ggsubseriesplot(diff(ts_PM10,1)) +
ggtitle("Subseries Plot")
model = auto.arima(ts_PM10,d = 1,seasonal = TRUE, stepwise = TRUE, ic = "aic")
frcst = forecast(model,h = 23)
plot(frcst)
# Plotting residuals
plot(model$residuals)
# Plotting real v/s fitted values
ts.plot(model$x, model$fitted, col=1:2,gpars = list(xlab = "Hours",ylab = "PM10",main = "Real v/s Fitted values"))