#Time Series and Forecasting  ____ 
#1) Select an interesting dataset of your
#choice that will allow you to permit time series forecasting ___ 
#loading Bejing air polluntion dataset and storing in the data2 variable.
data2 <- read.csv("PRSA.csv")
View(data2)
#2) Clean and transform the data into a more convenient form, if needed. ____ 
any(is.na(data2))
## [1] TRUE
summary(data2)
##        No             year          month             day       
##  Min.   :    1   Min.   :2013   Min.   : 1.000   Min.   : 1.00  
##  1st Qu.: 8767   1st Qu.:2014   1st Qu.: 4.000   1st Qu.: 8.00  
##  Median :17532   Median :2015   Median : 7.000   Median :16.00  
##  Mean   :17532   Mean   :2015   Mean   : 6.523   Mean   :15.73  
##  3rd Qu.:26298   3rd Qu.:2016   3rd Qu.:10.000   3rd Qu.:23.00  
##  Max.   :35064   Max.   :2017   Max.   :12.000   Max.   :31.00  
##                                                                 
##       hour           PM2.5             PM10            SO2          
##  Min.   : 0.00   Min.   :  3.00   Min.   :  2.0   Min.   :  0.2856  
##  1st Qu.: 5.75   1st Qu.: 22.00   1st Qu.: 38.0   1st Qu.:  3.0000  
##  Median :11.50   Median : 58.00   Median : 87.0   Median :  9.0000  
##  Mean   :11.50   Mean   : 82.77   Mean   :110.1   Mean   : 17.3759  
##  3rd Qu.:17.25   3rd Qu.:114.00   3rd Qu.:155.0   3rd Qu.: 21.0000  
##  Max.   :23.00   Max.   :898.00   Max.   :984.0   Max.   :341.0000  
##                  NA's   :925      NA's   :718     NA's   :935       
##       NO2               CO              O3                TEMP       
##  Min.   :  2.00   Min.   :  100   Min.   :  0.2142   Min.   :-16.80  
##  1st Qu.: 30.00   1st Qu.:  500   1st Qu.:  8.0000   1st Qu.:  3.10  
##  Median : 53.00   Median :  900   Median : 42.0000   Median : 14.50  
##  Mean   : 59.31   Mean   : 1263   Mean   : 56.3534   Mean   : 13.58  
##  3rd Qu.: 82.00   3rd Qu.: 1500   3rd Qu.: 82.0000   3rd Qu.: 23.30  
##  Max.   :290.00   Max.   :10000   Max.   :423.0000   Max.   : 40.50  
##  NA's   :1023     NA's   :1776    NA's   :1719       NA's   :20      
##       PRES             DEWP              RAIN                wd       
##  Min.   : 985.9   Min.   :-35.300   Min.   : 0.00000   NE     : 5140  
##  1st Qu.:1003.3   1st Qu.: -8.100   1st Qu.: 0.00000   ENE    : 3950  
##  Median :1011.4   Median :  3.800   Median : 0.00000   SW     : 3359  
##  Mean   :1011.8   Mean   :  3.123   Mean   : 0.06742   E      : 2608  
##  3rd Qu.:1020.1   3rd Qu.: 15.600   3rd Qu.: 0.00000   NNE    : 2445  
##  Max.   :1042.0   Max.   : 28.500   Max.   :72.50000   (Other):17481  
##  NA's   :20       NA's   :20        NA's   :20         NA's   :   81  
##       WSPM                station     
##  Min.   : 0.000   Aotizhongxin:35064  
##  1st Qu.: 0.900                       
##  Median : 1.400                       
##  Mean   : 1.708                       
##  3rd Qu.: 2.200                       
##  Max.   :11.200                       
##  NA's   :14
newdata2 <- na.omit(data2)
#3) Describe the problem statement as it relates to 
#using time series analysis approach for prediction 
#Which year has the maximum CO2 level in the environment?

#4) Perform exploratory time series analysis by producing a time plot 
#of the data.  Describe the time plot. ____ 
#5) To better assess the trends, remove any seasonal effects by 
#aggregation and describe the plot obtained. ____ 
newdata2.month <- ts(newdata2$CO, start = c(2013,3), end = c(2017,2), frequency =12)
newdata2.annual <- aggregate(newdata2.month, FUN = mean)
#After taking frequency manthly we can that CO2 level has been increased from 2014 till 2017
#After removing seasonal effects,we can see in the year 2015 there is a huge inclined in CO, 
#from the annual plot.
plot(newdata2.month, xlab="year", ylab = "CO2")

plot(newdata2.annual, xlab="year", ylab = "CO2")

#6) Obtain boxplots summarizing the observed values and explain your results ____ 
#We can see that the highest median is higher than the other months, we can also see that
#March, April, May, June and July has the lower median in CO level.
boxplot(newdata2.month~ cycle(newdata2.month))

#7) Decompose the series into a components trend, seasonal effect and residuals.
#Plot the decomposed series.Produce a plot of the trend with a superimposed seasonal effect. ____ 
newdata2.decom <- decompose(newdata2.month,type = "multiplicative")
plot(newdata2.decom)

seasonal <- newdata2.decom$seasonal
trend <- newdata2.decom$trend
ts.plot(cbind(seasonal,trend * seasonal),lty=1:2)

#8) Plot the correlogram of the residuals from step 7.  
#Describe the plot and explain any “significant” correlations at significant lags. ____
#There is no significant correlations at significant lags as we can see from the plot.
plot(ts(newdata2.decom$random))

acf(newdata2.decom$random, na.action = na.pass)

#9) If patterns change over a time period, use appropriate methods (moving average, 
#exponential smoothing, Holt-Winters method)to obtain an appropriately fitted model for 
#forecasting. ____ 
#Holt-Winters method generalizes the simple exponential smoothing
#equation and takes into account seasonal effects and trends, hence we are using this methods
#as we have both trend and seasonal effects.
newdata2.HW <- HoltWinters(newdata2.month, seasonal = "multiplicative")
newdata2.HW
## Holt-Winters exponential smoothing with trend and multiplicative seasonal component.
## 
## Call:
## HoltWinters(x = newdata2.month, seasonal = "multiplicative")
## 
## Smoothing parameters:
##  alpha: 0.3915301
##  beta : 0.05345697
##  gamma: 0.9280682
## 
## Coefficients:
##             [,1]
## a   1301.4001063
## b     30.8431014
## s1     1.0450765
## s2     0.7749670
## s3     0.7745522
## s4     0.7881087
## s5     0.9775579
## s6     1.0662227
## s7     0.9933308
## s8     1.1190524
## s9     1.1775109
## s10    1.2172574
## s11    1.2183904
## s12    1.3013351
plot(newdata2.HW)

#10)    Using the fitted model, forecast values for a time period of your choice. 
#Assess the validity of these forecasts.  ____ 
# can see from the plot that its is showing increase in the CO level, due to more pollution.
newdata2.pred <- predict(newdata2.HW,n.ahead = 4*12)
newdata2.pred
##           Jan      Feb      Mar      Apr      May      Jun      Jul
## 2017                   1392.296 1056.347 1079.671 1122.876 1422.949
## 2018 1998.982 2175.204 1779.097 1343.176 1366.346 1414.568 1784.760
## 2019 2449.929 2656.851 2165.898 1630.004 1653.021 1706.261 2146.571
## 2020 2900.876 3138.497 2552.699 1916.833 1939.696 1997.953 2508.382
## 2021 3351.824 3620.144                                             
##           Aug      Sep      Oct      Nov      Dec
## 2017 1584.896 1507.183 1732.455 1859.276 1959.579
## 2018 1979.523 1874.831 2146.636 2295.093 2410.107
## 2019 2374.151 2242.480 2560.817 2730.910 2860.635
## 2020 2768.778 2610.129 2974.997 3166.727 3311.163
## 2021
ts.plot(newdata2.month,newdata2.pred,lty=1:2)

#11)    What would your recommendations be based on the forecasts observed? ____ 
#This forecast showing increase in CO level.The HoltWinters methods shows increase in trends and fitted almost.
#which will lead to increase in the global warming from this we can understand that 
#the use of sustainable practice is so much important in order to protect our climate.