report.utf8.md

Air Pollution Hourly Time series Forecast

Author: Danish Mumtaz

Date : 09-Aug-2019

1.0 Background

In this excersice, we have provided with hourly pullution data, and we will forecast the pollution values (PM10 in this case) for next 24 hours.

2.0 Libraries Used

library(ggplot2)      # For visualization
library(lubridate)    # For date functions
library(forecast)     # For time series forecasting
library(tseries)      # Time series functions
library(kableExtra)   # Table formatting
library(dplyr)

3.0 Load Data

# Load the .csv file
indata = read.csv("./data/Air Data.csv", stringsAsFactors = F)

Let’s have closer look at the data which we have loaded.

# Dimensions of data
dim(indata)

## [1] 240  12

sprintf("The loaded data has %d rows and %d columns", nrow(indata), ncol(indata))

## [1] "The loaded data has 240 rows and 12 columns"

# Structure of data
str(indata)

## 'data.frame':    240 obs. of  12 variables:
##  $ time                               : chr  "2019-07-12T17:48:42.558Z" "2019-07-12T18:50:25.415Z" "2019-07-12T19:52:08.272Z" "2019-07-12T20:53:51.129Z" ...
##  $ AirSensorData.mean_BATTERY_12V     : num  13.4 13.3 13.3 13.3 13.3 ...
##  $ AirSensorData.mean_BATTERY_4V2     : num  4.16 4.16 4.15 4.14 4.14 ...
##  $ AirSensorData.mean_HUMIDITY        : num  42.9 47.1 48.8 55.5 58 ...
##  $ AirSensorData.mean_ME3_CO          : num  0.312 0.515 0.517 0.6 0.499 ...
##  $ AirSensorData.mean_ME3_NO2         : num  0.0582 0.0422 0.0431 0.0327 0.0348 ...
##  $ AirSensorData.mean_ME3_O3          : num  0.0493 0.0346 0.0409 0.0261 0.0317 ...
##  $ AirSensorData.mean_ME3_SO2         : num  0.0184 0.0221 0.0218 0.0268 0.0252 ...
##  $ AirSensorData.mean_PRESSURE        : num  941 943 942 943 943 ...
##  $ AirSensorData.mean_SENSOR_DUST_PM10: num  9.73 9.6 14.94 11.5 13.06 ...
##  $ AirSensorData.mean_SENSOR_DUST_PM2 : num  8.9 8.61 13.43 10.35 11.79 ...
##  $ AirSensorData.mean_TEMP            : num  20.6 19.6 18.5 17.3 17 ...

Seeing at data what we have

kable(head(indata, n = 14)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"),font_size = 10) %>%
  scroll_box(width = "900px", height = "500px")

time	AirSensorData.mean_BATTERY_12V	AirSensorData.mean_BATTERY_4V2	AirSensorData.mean_HUMIDITY	AirSensorData.mean_ME3_CO	AirSensorData.mean_ME3_NO2	AirSensorData.mean_ME3_O3	AirSensorData.mean_ME3_SO2	AirSensorData.mean_PRESSURE	AirSensorData.mean_SENSOR_DUST_PM10	AirSensorData.mean_SENSOR_DUST_PM2	AirSensorData.mean_TEMP
2019-07-12T17:48:42.558Z	13.43567	4.159111	42.90100	0.3118889	0.0582222	0.0493333	0.0184444	941.2557	9.728333	8.896778	20.64111
2019-07-12T18:50:25.415Z	13.27733	4.156778	47.12756	0.5154444	0.0422222	0.0345556	0.0221111	942.8657	9.601222	8.609111	19.56889
2019-07-12T19:52:08.272Z	13.32330	4.148200	48.79950	0.5170000	0.0431000	0.0409000	0.0218000	942.3227	14.942900	13.430900	18.51800
2019-07-12T20:53:51.129Z	13.25356	4.143111	55.54456	0.6002222	0.0326667	0.0261111	0.0267778	942.9999	11.499667	10.353333	17.31444
2019-07-12T21:55:33.986Z	13.30680	4.141400	57.99210	0.4993000	0.0348000	0.0317000	0.0252000	942.5493	13.056700	11.792700	17.01200
2019-07-12T22:57:16.843Z	13.30100	4.141800	57.39350	0.4429000	0.0330000	0.0306000	0.0220000	942.4301	11.098600	10.202500	17.08400
2019-07-12T23:58:59.700Z	13.24033	4.156556	59.21289	0.1807778	0.0264444	0.0205556	0.0276667	942.9283	10.594667	9.713556	17.11667
2019-07-13T01:00:42.557Z	13.29530	4.146100	61.80340	0.4399000	0.0252000	0.0232000	0.0261000	941.9967	10.094900	9.104400	16.71000
2019-07-13T02:02:25.414Z	13.39022	4.152000	63.47200	0.3326667	0.0251111	0.0207778	0.0301111	941.0621	9.574667	8.725778	15.59444
2019-07-13T03:04:08.271Z	13.21378	4.140000	69.59033	0.4737778	0.0242222	0.0200000	0.0240000	942.5628	12.881889	11.626222	14.71556
2019-07-13T04:05:51.128Z	13.28860	4.174200	65.96890	0.3258000	0.0328000	0.0225000	0.0254000	941.9358	13.576500	12.329100	16.27200
2019-07-13T05:07:33.985Z	13.37010	4.140300	60.60120	0.2230000	0.0293000	0.0213000	0.0286000	942.1323	13.211700	12.011900	18.12300
2019-07-13T06:09:16.842Z	13.57000	4.150889	50.20256	0.1711111	0.0347778	0.0282222	0.0210000	942.5449	11.377556	10.385556	20.90444
2019-07-13T07:10:59.699Z	13.71880	4.155100	43.47040	0.2402000	0.0465000	0.0355000	0.0251000	941.5604	11.067200	10.024200	22.71100

Seeing at the structure of the data, we can say that we have timestamp in our data which tells at what time the value of particular type of pollution was recorded. And the rest columns are the values of different type of pollutions.

Here we are interested only in the prediction of PM10 forecasting i.e. AirSensorData.mean_SENSOR_DUST_PM10 column.

4.0 Data Pre-Processing

Since we are only interested in the forecasting of PM10 values , we can drop the rest of the pollution types and keep time and PM10 details in our dataset.

# Keeping PPM20 and time column only
indata = indata[,c(1,10)]

Since we dont have all the information for 2017-07-12, hence we can the data of 2017-07-12 and continue from 2017-07-13 till 2017-07-22.

indata = indata[-c(1:7),]

# Renaming the column names
colnames(indata) = c("time","PM10")

# Show data
kable(head(indata, n = 10)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed","responsive"))

	time	PM10
8	2019-07-13T01:00:42.557Z	10.094900
9	2019-07-13T02:02:25.414Z	9.574667
10	2019-07-13T03:04:08.271Z	12.881889
11	2019-07-13T04:05:51.128Z	13.576500
12	2019-07-13T05:07:33.985Z	13.211700
13	2019-07-13T06:09:16.842Z	11.377556
14	2019-07-13T07:10:59.699Z	11.067200
15	2019-07-13T08:12:42.556Z	10.838400
16	2019-07-13T09:14:25.413Z	11.031222
17	2019-07-13T10:16:08.270Z	11.326889

# Seperating 
indata$Day = day(indata$time)
indata$Hour = as.integer(substr(indata$time,12,13))

# Adding missing and removing extra records

indata = indata[-which(indata$Day == 14 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 15 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 17 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 18 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 20 & indata$Hour == 00),]
indata = indata[-which(indata$Day == 21 & indata$Hour == 00),]


indata = rbind(indata,data.frame(time = "2019-07-14T12:00:00.000Z",
                                 PM10 = mean(indata[which(indata$Day == 14),"PM10"]),
                                 Day = 14, Hour = 12, stringsAsFactors = F))

indata = rbind(indata,data.frame(time = "2019-07-17T12:00:00.000Z",
                                 PM10 = mean(indata[which(indata$Day == 17),"PM10"]),
                                 Day = 17, Hour = 12, stringsAsFactors = F))

indata = rbind(indata,data.frame(time = "2019-07-20T12:00:00.000Z",
                                 PM10 = mean(indata[which(indata$Day == 20),"PM10"]),
                                 Day = 20, Hour = 12, stringsAsFactors = F))

# Sorting data 
indata = indata[order(indata$Day,indata$Hour),]


# Converting data in to time series
ts_PM10 = ts(indata[,2], frequency = 23)

# Plotting time series
plot(ts_PM10)

# Decomposing time series susing stl
decomp_ts = stl(ts_PM10, s.window = "p")
plot(decomp_ts)

# Plotting Differenced time series
plot(diff(ts_PM10, 1))

# ADF Test
adf.test(diff(ts_PM10, 1))

## Warning in adf.test(diff(ts_PM10, 1)): p-value smaller than printed p-value

## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(ts_PM10, 1)
## Dickey-Fuller = -5.8842, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary

ggsubseriesplot(diff(ts_PM10,1)) + 
  ggtitle("Subseries Plot")

model = auto.arima(ts_PM10,d = 1,seasonal = TRUE, stepwise = TRUE, ic = "aic")
frcst = forecast(model,h = 23)

plot(frcst)

# Plotting residuals
plot(model$residuals)

# Plotting real v/s fitted values
ts.plot(model$x, model$fitted, col=1:2,gpars = list(xlab = "Hours",ylab = "PM10",main = "Real v/s Fitted values"))