Loading packages

library(dplyr); library(psych);library(pgirmess); library(table1); library(explore); library(GGally); library(gridExtra); library(leaps);library(BMA); library(ggplot2);library(ggfortify);library(sciplot); library(fpp3)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'pgirmess'
## The following object is masked from 'package:psych':
## 
##     shannon
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
## 
## Attaching package: 'explore'
## The following object is masked from 'package:psych':
## 
##     describe
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:explore':
## 
##     rescale01
## The following object is masked from 'package:dplyr':
## 
##     nasa
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: survival
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.5-2)
## -- Attaching packages ---------------------------------------------- fpp3 0.2 --
## v tibble      3.0.1     v tsibbledata 0.1.0
## v tidyr       1.0.3     v feasts      0.1.3
## v lubridate   1.7.8     v fable       0.2.0
## v tsibble     0.8.6
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x ggplot2::%+%()       masks psych::%+%()
## x ggplot2::alpha()     masks psych::alpha()
## x gridExtra::combine() masks dplyr::combine()
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::id()        masks dplyr::id()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x fabletools::report() masks explore::report()

Loading data due to analysis

setwd("E:/cuong_dataR/test")
dat = read.csv("dataxlbtestR.csv", header = T, na.strings = "0")
dat = na.omit(dat)
attach(dat)
names(dat)
##  [1] "date"          "year"          "month"         "seasonal"     
##  [5] "rwntu"         "fecl3"         "rw.vol"        "feedsludge"   
##  [9] "drysludge"     "polymer"       "concentr.poly" "unit.poly"    
## [13] "poly.gm3"

Define variables

“date” : data update date (yyyy/mm/dd)

“year” : data updadte year (2017, 2018, 2019)

“month” : data update month (1,2,3,…,10,11,12)

“seasonal” : data update season (Dry, Rain)

“rwntu” : Turbidity of raw water (NTU)

“fecl3” : ferric chloride using for water treatment (g/m3)

“feedsludge” : sludge flow inlet (m3/day)

“drysludge” : dry sludge volume outlet (Kg/day)

“polymer” : polymer cation volume using for STS (Kg/day)

“concentr.poly” : concentration of polymer cation (%)

“unit.poly” : polymer cation quantity per a m3 feedsludge (Kg/m3)

“poly.gm3” : the unit of “polymer” with a gram quantity of polymer cation per a m3 raw water

Set up tsisbble for data and plot monthly changes of Feed sludge and Polymer at STS

dat$date = as.character(dat$date)

dat <- dat %>% mutate(Time = yearmonth(date)) %>% as_tsibble(index = Time)

dat %>% 
  gather("var", "value", rwntu, feedsludge, polymer) %>%
  ggplot(aes(x = Time, y = value)) + geom_line() +
  facet_grid(vars(var), scales = "free_y") + xlab("Year") +
  ylab(NULL) + 
  ggtitle("Monthly changes of turdibility, feed sludge and polymer at STS")

Descriptive analysis

p = ggplot(dat, aes(x=as.factor(month), y=poly.gm3, colour=factor(year)))
p + geom_boxplot(aes(colour=factor(year))) + geom_jitter(alpha=0.05) + theme_bw() + theme_classic()

temp = dat[ , c("year","month", "seasonal", "rwntu", "feedsludge", "poly.gm3")]
ggpairs(temp)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table1(~ rwntu + feedsludge + poly.gm3|year, dadta = dat)
## Warning in table1.formula(~rwntu + feedsludge + poly.gm3 | year, dadta = dat):
## Terms to the right of '|' in formula 'x' define table columns and are expected
## to be factors with meaningful labels.
2017
(N=321)
2018
(N=347)
2019
(N=327)
Overall
(N=995)
rwntu
Mean (SD) 44.1 (31.3) 44.4 (27.4) 57.4 (45.2) 48.6 (35.8)
Median [Min, Max] 32.8 [8.20, 175] 34.5 [11.5, 183] 38.2 [6.39, 217] 35.8 [6.39, 217]
feedsludge
Mean (SD) 325 (173) 297 (136) 332 (174) 317 (162)
Median [Min, Max] 301 [0, 868] 253 [105, 696] 263 [72.0, 794] 272 [0, 868]
poly.gm3
Mean (SD) 0.156 (0.0851) 0.160 (0.105) 0.195 (0.136) 0.170 (0.112)
Median [Min, Max] 0.140 [0, 1.00] 0.130 [0, 0.540] 0.180 [0, 0.800] 0.160 [0, 1.00]
table1(~ rwntu + feedsludge + poly.gm3|seasonal, dadta = dat)
Dry
(N=502)
Rain
(N=493)
Overall
(N=995)
rwntu
Mean (SD) 27.0 (11.9) 70.5 (38.7) 48.6 (35.8)
Median [Min, Max] 24.9 [6.39, 84.5] 62.0 [12.8, 217] 35.8 [6.39, 217]
feedsludge
Mean (SD) 215 (79.6) 422 (158) 317 (162)
Median [Min, Max] 212 [0, 673] 419 [72.0, 868] 272 [0, 868]
poly.gm3
Mean (SD) 0.121 (0.0860) 0.220 (0.114) 0.170 (0.112)
Median [Min, Max] 0.100 [0, 1.00] 0.200 [0, 0.550] 0.160 [0, 1.00]
table1(~ rwntu + feedsludge + poly.gm3|month, dadta = dat)
## Warning in table1.formula(~rwntu + feedsludge + poly.gm3 | month, dadta = dat):
## Terms to the right of '|' in formula 'x' define table columns and are expected
## to be factors with meaningful labels.
## Warning in .table1.internal(x = x, labels = labels, groupspan = groupspan, :
## Table has 13 columns. Are you sure this is what you want?
1
(N=87)
2
(N=75)
3
(N=91)
4
(N=84)
5
(N=87)
6
(N=83)
7
(N=90)
8
(N=76)
9
(N=78)
10
(N=79)
11
(N=82)
12
(N=83)
Overall
(N=995)
rwntu
Mean (SD) 25.7 (9.13) 21.9 (7.85) 21.7 (8.77) 22.7 (6.00) 32.0 (11.1) 51.0 (18.4) 72.4 (21.3) 98.2 (43.8) 96.7 (40.9) 78.8 (37.2) 42.2 (16.0) 28.3 (6.09) 48.6 (35.8)
Median [Min, Max] 24.0 [9.35, 46.9] 21.1 [10.6, 45.6] 21.1 [6.39, 41.0] 21.0 [12.0, 37.2] 29.9 [12.8, 71.0] 49.0 [21.9, 120] 69.6 [38.3, 150] 82.1 [45.7, 217] 86.8 [42.1, 215] 68.4 [35.2, 202] 39.6 [16.2, 84.5] 28.0 [16.5, 43.0] 35.8 [6.39, 217]
feedsludge
Mean (SD) 230 (63.4) 217 (72.4) 213 (52.9) 233 (84.4) 262 (98.4) 380 (126) 453 (110) 520 (137) 443 (124) 490 (193) 183 (103) 213 (86.1) 317 (162)
Median [Min, Max] 228 [63.0, 540] 201 [89.0, 404] 205 [105, 389] 213 [108, 673] 243 [108, 532] 351 [142, 646] 457 [220, 868] 545 [165, 809] 454 [72.0, 697] 503 [92.0, 794] 205 [0, 313] 199 [0, 556] 272 [0, 868]
poly.gm3
Mean (SD) 0.103 (0.0586) 0.113 (0.104) 0.0942 (0.0573) 0.0969 (0.0569) 0.114 (0.0801) 0.192 (0.101) 0.243 (0.0833) 0.292 (0.131) 0.251 (0.0950) 0.239 (0.0999) 0.180 (0.121) 0.144 (0.0695) 0.170 (0.112)
Median [Min, Max] 0.100 [0, 0.300] 0.100 [0, 0.800] 0.0900 [0, 0.240] 0.100 [0, 0.220] 0.100 [0, 0.380] 0.170 [0, 0.420] 0.220 [0.110, 0.470] 0.280 [0.0800, 0.550] 0.240 [0, 0.480] 0.200 [0.0600, 0.460] 0.170 [0, 1.00] 0.130 [0, 0.330] 0.160 [0, 1.00]

Building Regression model (TSLM)

fit1.polymer <- dat %>%
  model(
    tslm = TSLM(poly.gm3 ~ feedsludge)
  )
report(fit1.polymer)
## Series: poly.gm3 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24739 -0.04910 -0.01322  0.04180  0.89313 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.618e-02  5.897e-03    4.44    1e-05 ***
## feedsludge  4.533e-04  1.655e-05   27.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08459 on 993 degrees of freedom
## Multiple R-squared: 0.4304,  Adjusted R-squared: 0.4298
## F-statistic: 750.3 on 1 and 993 DF, p-value: < 2.22e-16
glance(fit1.polymer) %>% select(adj_r_squared, CV, AIC, AICc, BIC)
## # A tibble: 1 x 5
##   adj_r_squared      CV    AIC   AICc    BIC
##           <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1         0.430 0.00718 -4911. -4911. -4896.
fit2.polymer <- dat %>%
  model(
    tslm = TSLM(poly.gm3 ~ rwntu)
  )
report(fit2.polymer)
## Series: poly.gm3 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.248146 -0.053838 -0.009589  0.044678  0.845935 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.080308   0.004829   16.63   <2e-16 ***
## rwntu       0.001847   0.000080   23.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09041 on 993 degrees of freedom
## Multiple R-squared: 0.3493,  Adjusted R-squared: 0.3487
## F-statistic: 533.1 on 1 and 993 DF, p-value: < 2.22e-16
glance(fit2.polymer) %>% select(adj_r_squared, CV, AIC, AICc, BIC)
## # A tibble: 1 x 5
##   adj_r_squared      CV    AIC   AICc    BIC
##           <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1         0.349 0.00819 -4779. -4779. -4764.
fit.polymer <- dat %>%
  model(
    tslm = TSLM(poly.gm3 ~ feedsludge + rwntu)
  )
report(fit.polymer)
## Series: poly.gm3 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.237806 -0.047000 -0.009198  0.043096  0.883139 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.261e-02  5.644e-03   4.007 6.61e-05 ***
## feedsludge  3.250e-04  2.049e-05  15.861  < 2e-16 ***
## rwntu       9.113e-04  9.269e-05   9.832  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08079 on 992 degrees of freedom
## Multiple R-squared: 0.481,   Adjusted R-squared: 0.4799
## F-statistic: 459.6 on 2 and 992 DF, p-value: < 2.22e-16
glance(fit.polymer) %>% select(adj_r_squared, CV, AIC, AICc, BIC)
## # A tibble: 1 x 5
##   adj_r_squared      CV    AIC   AICc    BIC
##           <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1         0.480 0.00656 -5002. -5002. -4982.

Commentary :

Choose “fit.polymer” model due to the least AICc value.

Equation :

poly.gm3 = 0.02 + 0.0003 * feedsludge + 0.0009 * rwntu

Describe different Fitted (Predicted values) and Data (Actual values) of “fit.polymer” model

augment(fit.polymer) %>%
  ggplot(aes(x=Time)) + 
  geom_line(aes(y=poly.gm3, colour = "Data")) + 
  geom_line(aes(y=.fitted, colour = "Fitted")) + 
  xlab("Year") + ylab(NULL) + 
  ggtitle("Polymer changes for time series at STS") + 
  guides(colour = guide_legend(title = NULL))

augment(fit.polymer) %>%
  ggplot(aes(x=poly.gm3, y=.fitted,
             colour = factor(year(Time)))) + 
  geom_point() + 
  ylab("Fitted (Predicted values)") + 
  xlab("Data (Actual values)") + 
  ggtitle("Annual polymer for time series observing at STS") +   scale_colour_brewer(palette = "Dark2", name = "Year") + 
  geom_abline(intercept = 0, slope = 1)

Forecasting with regression models in the 24 months

fit_feedsludge <- dat %>%
  model(TSLM(feedsludge ~ trend() + season()))
report(fit_feedsludge)
## Series: feedsludge 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -399.894  -62.255   -3.279   63.257  442.390 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    228.1113    12.6896  17.976   <2e-16 ***
## trend()          0.1674     0.3589   0.466   0.6411    
## season()year2  -13.6542    17.2559  -0.791   0.4290    
## season()year3  -17.7537    16.4307  -1.081   0.2802    
## season()year4    1.8297    16.7836   0.109   0.9132    
## season()year5   30.5142    16.6750   1.830   0.0676 .  
## season()year6  148.9243    16.8836   8.821   <2e-16 ***
## season()year7  221.3753    16.5928  13.342   <2e-16 ***
## season()year8  288.8968    17.3412  16.660   <2e-16 ***
## season()year9  211.2451    17.3475  12.177   <2e-16 ***
## season()year10 258.0927    17.2962  14.922   <2e-16 ***
## season()year11 -48.9240    17.1823  -2.847   0.0045 ** 
## season()year12 -18.6457    17.1719  -1.086   0.2778    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.5 on 982 degrees of freedom
## Multiple R-squared: 0.5493,  Adjusted R-squared: 0.5438
## F-statistic: 99.73 on 12 and 982 DF, p-value: < 2.22e-16
fc_feedsludge1 <- forecast(fit_feedsludge)
fc_feedsludge1 %>% autoplot(dat) + 
  ggtitle("Forecasts of Feed sludge using regression") +
  xlab("Year") + ylab("m3")

fourier_feedsludge <- dat %>%
  model(TSLM(feedsludge ~ trend() + fourier(K=2)))
report(fourier_feedsludge)
## Series: feedsludge 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -416.14  -65.11   -9.70   58.37  442.54 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          320.03759    7.98916  40.059   <2e-16 ***
## trend()               -0.02715    0.38075  -0.071   0.9432    
## fourier(K = 2)C1_12 -130.51859    5.27249 -24.755   <2e-16 ***
## fourier(K = 2)S1_12  -76.13941    5.44989 -13.971   <2e-16 ***
## fourier(K = 2)C2_12   13.33410    5.24971   2.540   0.0112 *  
## fourier(K = 2)S2_12   51.35025    5.33158   9.631   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 117.3 on 989 degrees of freedom
## Multiple R-squared: 0.4791,  Adjusted R-squared: 0.4765
## F-statistic: 181.9 on 5 and 989 DF, p-value: < 2.22e-16
fc_feedsludge2 <- forecast(fourier_feedsludge)
fc_feedsludge2 %>% autoplot(dat) + 
  ggtitle("Forecasts of Feed sludge using regression") +
  xlab("Year") + ylab("m3")

fit_rwntu <- dat %>%
  model(TSLM(rwntu ~ trend() + season()))
report(fit_rwntu)
## Series: rwntu 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.282 -10.592  -1.893   7.145 116.793 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18.52461    2.55566   7.248 8.53e-13 ***
## trend()         0.53528    0.07228   7.406 2.80e-13 ***
## season()year2  -3.95627    3.47531  -1.138 0.255234    
## season()year3  -4.82740    3.30910  -1.459 0.144933    
## season()year4  -4.52723    3.38018  -1.339 0.180768    
## season()year5   4.01594    3.35832   1.196 0.232056    
## season()year6  22.87473    3.40032   6.727 2.93e-11 ***
## season()year7  43.60297    3.34176  13.048  < 2e-16 ***
## season()year8  69.11361    3.49249  19.789  < 2e-16 ***
## season()year9  66.41700    3.49374  19.010  < 2e-16 ***
## season()year10 48.48333    3.48341  13.918  < 2e-16 ***
## season()year11 11.52860    3.46049   3.331 0.000896 ***
## season()year12 -2.65420    3.45840  -0.767 0.442988    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.06 on 982 degrees of freedom
## Multiple R-squared: 0.626,   Adjusted R-squared: 0.6214
## F-statistic:   137 on 12 and 982 DF, p-value: < 2.22e-16
fc_rwntu1 <- forecast(fit_rwntu)
fc_rwntu1 %>% autoplot(dat) + 
  ggtitle("Forecasts of Turdibility using regression") +
  xlab("Year") + ylab("NTU")

fourier_rwntu <- dat %>%
  model(TSLM(rwntu ~ trend() + fourier(K=2)))
report(fourier_rwntu)
## Series: rwntu 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.923 -11.506  -1.964   7.774 121.263 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          39.81557    1.51169  26.338  < 2e-16 ***
## trend()               0.50906    0.07204   7.066 3.01e-12 ***
## fourier(K = 2)C1_12 -26.72269    0.99765 -26.786  < 2e-16 ***
## fourier(K = 2)S1_12 -24.22844    1.03121 -23.495  < 2e-16 ***
## fourier(K = 2)C2_12   0.61489    0.99334   0.619    0.536    
## fourier(K = 2)S2_12  13.02600    1.00883  12.912  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.2 on 989 degrees of freedom
## Multiple R-squared: 0.6185,  Adjusted R-squared: 0.6165
## F-statistic: 320.6 on 5 and 989 DF, p-value: < 2.22e-16
fc_rwntu2 <- forecast(fourier_rwntu)
fc_rwntu2 %>% autoplot(dat) + 
  ggtitle("Forecasts of Turdibility using regression") +
  xlab("Year") + ylab("NTU")

fit_poly <- dat %>%
  model(TSLM(poly.gm3 ~ trend() + season()))
report(fit_poly)
## Series: poly.gm3 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24939 -0.05213 -0.01332  0.04623  0.83896 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     0.080783   0.010290   7.850 1.08e-14 ***
## trend()         0.001630   0.000291   5.600 2.79e-08 ***
## season()year2   0.009723   0.013994   0.695   0.4874    
## season()year3  -0.011053   0.013324  -0.830   0.4070    
## season()year4  -0.010419   0.013610  -0.766   0.4441    
## season()year5   0.004871   0.013522   0.360   0.7188    
## season()year6   0.081569   0.013692   5.958 3.56e-09 ***
## season()year7   0.131035   0.013456   9.738  < 2e-16 ***
## season()year8   0.179111   0.014063  12.737  < 2e-16 ***
## season()year9   0.134386   0.014068   9.553  < 2e-16 ***
## season()year10  0.122477   0.014026   8.732  < 2e-16 ***
## season()year11  0.062332   0.013934   4.473 8.60e-06 ***
## season()year12  0.025017   0.013926   1.796   0.0727 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08881 on 982 degrees of freedom
## Multiple R-squared: 0.3792,  Adjusted R-squared: 0.3716
## F-statistic: 49.98 on 12 and 982 DF, p-value: < 2.22e-16
fc_poly1 <- forecast(fit_poly)
fc_poly1 %>% autoplot(dat) + 
  ggtitle("Forecasts of polymer using regression") +
  xlab("Year") + ylab("g/m3")

fourier_poly <- dat %>%
  model(TSLM(poly.gm3 ~ trend() + fourier(K=2)))
report(fourier_poly)
## Series: poly.gm3 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27078 -0.05252 -0.01274  0.04607  0.84199 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.141327   0.006063  23.309  < 2e-16 ***
## trend()              0.001630   0.000289   5.641 2.21e-08 ***
## fourier(K = 2)C1_12 -0.061887   0.004001 -15.466  < 2e-16 ***
## fourier(K = 2)S1_12 -0.060981   0.004136 -14.744  < 2e-16 ***
## fourier(K = 2)C2_12  0.011647   0.003984   2.923  0.00354 ** 
## fourier(K = 2)S2_12  0.019969   0.004046   4.935 9.39e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08903 on 989 degrees of freedom
## Multiple R-squared: 0.3716,  Adjusted R-squared: 0.3685
## F-statistic:   117 on 5 and 989 DF, p-value: < 2.22e-16
fc_poly2 <- forecast(fourier_poly)
fc_poly2 %>% autoplot(dat) + 
  ggtitle("Forecasts of polymer using regression") +
  xlab("Year") + ylab("g/m3")

Residuals Analysis for TSLM

Choose “fit.polymer” model to analyse

ACF plot of residuals

fit.polymer %>% gg_tsresiduals()

augment(fit.polymer) %>%
  features(.resid, ljung_box, dog = 5, lag = 18)
## # A tibble: 1 x 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 tslm     1041.         0
df <- left_join(dat, residuals(fit.polymer), by = "Time")

p1 <- ggplot(df, aes(x=feedsludge, y=.resid)) + 
  geom_point() + ylab("Residuals")

p2 <- ggplot(df, aes(x=rwntu, y=.resid)) + 
  geom_point() + ylab("Residuals")

p3 <- ggplot(df, aes(x=poly.gm3, y=.resid)) + 
  geom_point() + ylab("Residuals")

grid.arrange(p1, p2, p3, ncol = 3)

augment(fit.polymer) %>%
  ggplot(aes(x=.fitted, y=.resid)) + 
  geom_point() + 
  labs(x = "Fitted", y = "Residuals")

Building ARIMA models

ARIMA model with polymer ~ feedsludge (fit1)

fit1 <- dat %>%
  model(ARIMA(poly.gm3 ~ feedsludge))
report(fit1)
## Series: poly.gm3 
## Model: LM w/ ARIMA(0,1,1) errors 
## 
## Coefficients:
##           ma1  feedsludge
##       -0.7945       3e-04
## s.e.   0.0220       0e+00
## 
## sigma^2 estimated as 0.004958:  log likelihood=1227.56
## AIC=-2449.12   AICc=-2449.09   BIC=-2434.41

ARIMA model with polymer ~ turdibility (fit2)

fit2 <- dat %>%
  model(ARIMA(poly.gm3 ~ rwntu))
report(fit2)
## Series: poly.gm3 
## Model: LM w/ ARIMA(4,1,1) errors 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4      ma1  rwntu
##       0.1763  0.1436  0.1318  0.1129  -0.9450  6e-04
## s.e.  0.0377  0.0363  0.0353  0.0345   0.0198  1e-04
## 
## sigma^2 estimated as 0.005799:  log likelihood=1151.6
## AIC=-2289.2   AICc=-2289.09   BIC=-2254.89

ARIMA model with polymer ~ feed sludge + turdibility (fit)

fit <- dat %>%
  model(ARIMA(poly.gm3 ~ feedsludge + rwntu))
report(fit)
## Series: poly.gm3 
## Model: LM w/ ARIMA(0,1,1) errors 
## 
## Coefficients:
##           ma1  feedsludge  rwntu
##       -0.7988       3e-04  3e-04
## s.e.   0.0221       0e+00  1e-04
## 
## sigma^2 estimated as 0.004933:  log likelihood=1230.54
## AIC=-2453.08   AICc=-2453.04   BIC=-2433.48

Commentary :

ARIMA of “fit” is better than “fit1” and “fit2” (there is the least AICc value).

Equations :

polymer = 3e^-4 * feedsludge + 3e^-4*rwntu + eta(t)

eta(t) = eps(t) - 0.8 * eps(t-1)

eps(t) ~ NID(0, 0.005).

Recovering estimates of both eta(t) and eps(t) series using the residuals() function for “fit1”, “fit2” and “fit” models

bind_rows(
  "Regression Errors" = residuals(fit1, type = "regression"),
  "ARIMA Errors" = residuals(fit1, type = "innovation"),
  .id = "type"
) %>%
  ggplot(aes(x=Time, y=.resid)) + 
  geom_line() + 
  facet_grid(vars(type), scales = "free_y") +
  xlab("Year") + ylab(NULL)

bind_rows(
  "Regression Errors" = residuals(fit2, type = "regression"),
  "ARIMA Errors" = residuals(fit2, type = "innovation"),
  .id = "type"
) %>%
  ggplot(aes(x=Time, y=.resid)) + 
  geom_line() + 
  facet_grid(vars(type), scales = "free_y") +
  xlab("Year") + ylab(NULL)

bind_rows(
  "Regression Errors" = residuals(fit, type = "regression"),
  "ARIMA Errors" = residuals(fit, type = "innovation"),
  .id = "type"
) %>%
  ggplot(aes(x=Time, y=.resid)) + 
  geom_line() + 
  facet_grid(vars(type), scales = "free_y") +
  xlab("Year") + ylab(NULL)

fit1 %>% gg_tsresiduals()

fit2 %>% gg_tsresiduals()

fit %>% gg_tsresiduals()

augment(fit1) %>%
  features(.resid, ljung_box, dof = 5, lag = 18)
## # A tibble: 1 x 3
##   .model                       lb_stat lb_pvalue
##   <chr>                          <dbl>     <dbl>
## 1 ARIMA(poly.gm3 ~ feedsludge)    37.4  0.000358
augment(fit2) %>%
  features(.resid, ljung_box, dof = 5, lag = 18)
## # A tibble: 1 x 3
##   .model                  lb_stat lb_pvalue
##   <chr>                     <dbl>     <dbl>
## 1 ARIMA(poly.gm3 ~ rwntu)    22.2    0.0528
augment(fit) %>%
  features(.resid, ljung_box, dof = 5, lag = 18)
## # A tibble: 1 x 3
##   .model                               lb_stat lb_pvalue
##   <chr>                                  <dbl>     <dbl>
## 1 ARIMA(poly.gm3 ~ feedsludge + rwntu)    38.9  0.000206

Forecasting ARIMA model in the next 24 months

poly_future <- new_data(dat, 24) %>%
  mutate(
    feedsludge = mean(dat$feedsludge),
    rwntu = mean(dat$rwntu),
    poly.gm3 = mean(dat$poly.gm3)
    )
forecast(fit, new_data = poly_future) %>%
  autoplot(dat) + 
  xlab("Year") + ylab("Average change")

CONCLUSION

Comparing AICc values of TSLM and ARIMA which were chosen, the regression model had AICc lowest.

So, the useful model for “optimize polymer” in sludge treatment at STS :

poly.gm3 = 0.02 + 0.003 * feedsludge + 0.0009 * rwntu

THE END.