# importacao ---- 

dados <- read.csv2("dados_Imputed.csv")

# manipulacao dos dados ---- 

dados_prev <- dados %>% 
  filter(str_detect(Data, c("2012" ,"2013"))) %>% 
  select(Data, E1_SO2)

dados_prev$Data <- as.Date(dados_prev$Data)

dados_prev <- data.frame(media_dia = tapply(dados_prev$E1_SO2, dados_prev$Data, mean))

dados_prev <- rownames_to_column(dados_prev, var = "data")

dados_prev$data <- as.Date(dados_prev$data)

dados_prev <- dados_prev %>% 
  filter(data >= "2012-07-01")

dados <- dados_prev %>% 
  filter(data < "2013-11-01")

# grafico ----

ggplot(dados, aes(x = data, y = media_dia)) + 
  geom_line(color = "aquamarine4") + 
  geom_hline(yintercept = mean(dados$media_dia))+ 
  xlab("Data") + 
  ylab("SO2")

dados$media_boxcox <- BoxCox(dados$media_dia, 
                             BoxCox.lambda(dados$media_dia))
dados_prev$media_boxcox <- BoxCox(dados_prev$media_dia,
                                  BoxCox.lambda(dados_prev$media_dia))

ggplot(dados, aes(x = data, y = media_boxcox)) + geom_line(color = "aquamarine4") + 
  geom_hline(yintercept = mean(dados$media_boxcox)) +  
  xlab("Data") +
  ylab("SO2 Transformada")

ggplot(dados) + geom_boxplot(aes(x = format(as.Date(dados$data), "%Y-%m"), 
                                 y = media_boxcox), 
                             fill = "aquamarine4") + 
  xlab("Data") + 
  ylab("S02 Transformada")
## Warning: Use of `dados$data` is discouraged. Use `data` instead.

# Teste de normalidade ---- 

jarque.bera.test(dados$media_boxcox)
## 
##  Jarque Bera Test
## 
## data:  dados$media_boxcox
## X-squared = 16.19, df = 2, p-value = 0.000305
# Periodograma para boxcox ---- 

periodograma <- itsmr::periodogram(x = dados$media_boxcox)

# Periodo (sazonalidade) ---- 

freq <- which.max(periodograma)/nrow(dados)
per <- 1/freq

# regressao seno cosseno ajuste por periodo ----

dadoscos <- cos(2*pi*freq*(1:nrow(dados)))
dadossin <- sin(2*pi*freq*(1:nrow(dados)))

fit <- lm(dados$media_boxcox ~ dadoscos + dadossin)
summary(fit)
## 
## Call:
## lm(formula = dados$media_boxcox ~ dadoscos + dadossin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59036 -0.11513  0.00252  0.13689  0.55412 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.816066   0.008881  204.50   <2e-16 ***
## dadoscos    -0.129256   0.012559  -10.29   <2e-16 ***
## dadossin    -0.006031   0.012559   -0.48    0.631    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1962 on 485 degrees of freedom
## Multiple R-squared:  0.1796, Adjusted R-squared:  0.1762 
## F-statistic: 53.08 on 2 and 485 DF,  p-value: < 2.2e-16
fit <- lm(dados$media_boxcox ~ dadoscos)
summary(fit)
## 
## Call:
## lm(formula = dados$media_boxcox ~ dadoscos)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59499 -0.11491  0.00195  0.14224  0.55875 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.816066   0.008874   204.7   <2e-16 ***
## dadoscos    -0.129256   0.012549   -10.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.196 on 486 degrees of freedom
## Multiple R-squared:  0.1792, Adjusted R-squared:  0.1775 
## F-statistic: 106.1 on 1 and 486 DF,  p-value: < 2.2e-16
# plots do modelo ----

par(mfrow=c(1,1))
plot.ts(dados$media_boxcox,xlab="Tempo",ylab="SO2 Transformada")
lines(fit$fitted.values, col='red')

par(mfrow=c(1,2))
acf(fit$residuals)
pacf(fit$residuals)

par(mfrow=c(1,1))
plot.ts(x = fit$residuals, ylab="Resíduos",xlab="Tempo")

par(mfrow=c(1,1))
itsmr::periodogram(x = fit$residuals)

# teste para estacionariedade e variabilidade ---- 

adf.test(dados$media_dia)
## Warning in adf.test(dados$media_dia): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  dados$media_dia
## Dickey-Fuller = -4.4778, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
adf.test(dados$media_boxcox)
## Warning in adf.test(dados$media_boxcox): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  dados$media_boxcox
## Dickey-Fuller = -4.0177, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
adf.test(fit$residuals)
## Warning in adf.test(fit$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  fit$residuals
## Dickey-Fuller = -5.298, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# EACF ----

TSA::eacf(fit$residuals,ar.max=8, ma.max=8)
## AR/MA
##   0 1 2 3 4 5 6 7 8
## 0 x x x x x x x x x
## 1 x x x o o o o o o
## 2 x x o o o o o o o
## 3 x x x o o o o o o
## 4 x x x x o o o o o
## 5 o x x x o o o o o
## 6 o x x x x x o o o
## 7 x x x o o x x o o
## 8 x x x x o x x o o
# modelo completo ---- 

Xreg <- model.matrix(~ dadoscos)
fit_RegARMA <- arima(x = dados$media_boxcox, order = c(1,0,1),
                   xreg = Xreg, include.mean = FALSE)
summary(fit_RegARMA)
## 
## Call:
## arima(x = dados$media_boxcox, order = c(1, 0, 1), xreg = Xreg, include.mean = FALSE)
## 
## Coefficients:
##          ar1      ma1  (Intercept)  dadoscos
##       0.8787  -0.6641       1.8120   -0.1375
## s.e.  0.0490   0.0791       0.0223    0.0311
## 
## sigma^2 estimated as 0.03202:  log likelihood = 147.12,  aic = -286.23
## 
## Training set error measures:
## Warning in trainingaccuracy(object, test, d, D): test elements must be within
## sample
##               ME RMSE MAE MPE MAPE
## Training set NaN  NaN NaN NaN  NaN
lmtest::coeftest(fit_RegARMA)
## 
## z test of coefficients:
## 
##              Estimate Std. Error z value  Pr(>|z|)    
## ar1          0.878734   0.048961 17.9477 < 2.2e-16 ***
## ma1         -0.664121   0.079076 -8.3985 < 2.2e-16 ***
## (Intercept)  1.811956   0.022263 81.3903 < 2.2e-16 ***
## dadoscos    -0.137501   0.031102 -4.4209 9.829e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adjusted <- fit$fitted.values+(fit$residuals-fit_RegARMA$residuals)

par(mfrow=c(1,2));acf(fit_RegARMA$residuals);pacf(fit_RegARMA$residuals)

par(mfrow=c(1,1));plot.ts(dados$media_boxcox,ylab="S02 Tranformada",xlab="Tempo");lines(adjusted, col='red')

periodograma<-itsmr::periodogram(x = fit_RegARMA$residuals)

hist(fit_RegARMA$residuals)

jarque.bera.test(fit_RegARMA$residuals)
## 
##  Jarque Bera Test
## 
## data:  fit_RegARMA$residuals
## X-squared = 5.0695, df = 2, p-value = 0.07928
# modelo forecast para previsao ---- 

fit_RegARMA <- Arima(y = dados$media_boxcox, order = c(1,0,1),
                   xreg = Xreg, include.mean = FALSE)

# teste para explicacao de autocorrelacao ate o lag k ----

Box.test(x = fit_RegARMA$residuals, lag=25, fitdf = 4, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fit_RegARMA$residuals
## X-squared = 23.932, df = 21, p-value = 0.2964
# Pontos de quebra ----

breaked <- breakpoints(fit_RegARMA$residuals ~ 1)
summary(breaked) # BIC mínimo = 0
## 
##   Optimal (m+1)-segment partition: 
## 
## Call:
## breakpoints.formula(formula = fit_RegARMA$residuals ~ 1)
## 
## Breakpoints at observation number:
##                           
## m = 1                  381
## m = 2   78             381
## m = 3   78     280     381
## m = 4   78 170 280     381
## m = 5   78 170 244 318 397
## 
## Corresponding to breakdates:
##                           
## m = 1                  381
## m = 2   78             381
## m = 3   78     280     381
## m = 4   78 170 280     381
## m = 5   78 170 244 318 397
## 
## Fit:
##                                                    
## m   0       1       2       3       4       5      
## RSS   15.62   15.49   15.41   15.32   15.30   15.29
## BIC -282.18 -274.06 -264.08 -254.65 -243.04 -230.74
plot(breaked)

# previsao ---- 

n <- nrow(dados)
h <- nrow(dados_prev) - nrow(dados)

dadoscos.new <- cos(2*pi*freq*(1:h+n))
dadossin.new <- sin(2*pi*freq*(1:h+n))

Xreg.new <- model.matrix( ~ dadoscos,
                          data = data.frame(dadoscos = dadoscos.new))

prev <- forecast(object = fit_RegARMA, h=h, xreg = Xreg.new)

prev.mean.x <- InvBoxCox(prev$mean, BoxCox.lambda(x = dados$media_dia))
prev.lower.x <- InvBoxCox(prev$lower[,2], BoxCox.lambda(x = dados$media_dia))
prev.upper.x <- InvBoxCox(prev$upper[,2], BoxCox.lambda(x = dados$media_dia))
m <- min(prev.lower.x, dados_prev$media_dia)
M <- max(prev.upper.x, dados_prev$media_dia)
plot.ts(dados_prev$media_dia[seq(n, n+h)], ylim=c(m, M),xlab="Tempo",ylab="S02 Tranformada")
lines(1:h, prev.mean.x, col='red', lwd=1.5)
lines(1:h, prev.lower.x, col='red', lwd=1.5, lty=2)
lines(1:h, prev.upper.x, col='red', lwd=1.5, lty=2)