## Load in data
atm_data <- read_xlsx("ATM624Data2.xlsx")
## spread df int multiple columns,
spread_df <- atm_data %>%
spread(.,ATM,Cash)
## apply describe over ATM's machines
dfs <- lapply(spread_df[,c("ATM1","ATM2","ATM3","ATM4")],function(x){psych::describe(x)})
kable(dfs,digits =2,'markdown',booktabs =T)| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 362 | 83.89 | 36.66 | 91 | 86.65 | 25.2 | 1 | 180 | 179 | -0.71 | 0.19 | 1.93 |
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 363 | 62.58 | 38.9 | 67 | 62.24 | 48.93 | 0 | 147 | 147 | -0.03 | -1.09 | 2.04 |
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 365 | 0.72 | 7.94 | 0 | 0 | 0 | 0 | 96 | 96 | 10.93 | 118.38 | 0.42 |
| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 365 | 474.04 | 650.94 | 403.84 | 416.68 | 425.8 | 1.56 | 10919.76 | 10918.2 | 11.39 | 178.91 | 34.07 |
table(atm_data$ATM)##
## ATM1 ATM2 ATM3 ATM4
## 365 365 365 365
## scatterplot data of ATM'S
ggplot(atm_data) + geom_point(aes(x=DATE, y=Cash))+
facet_wrap(~ ATM,scales = "free")## Warning: Removed 5 rows containing missing values (geom_point).
the matrix plots below are to be read ATM1-top left, ATM2 top right, ATM3 bottom left, and ATM4 is bottom right
## $ATM1
##
## $ATM2
##
## $ATM3
##
## $ATM4
# ##print missing vlaues
# spread_df[!complete.cases(spread_df),]
# tempData <- mice(spread_df,m=5,maxit=50,meth='pmm',seed=500)
# summary(tempData)
# completedData <- complete(tempData,1)
#
#
# write.csv(completedData, file = "imputed.csv")High Level Summary of ATM prediction
x <- getURL("https://raw.githubusercontent.com/justinherman42/data624/master/imputed.csv")
completedData <- read.csv(text = x)
##check on imputation
sum(is.na(completedData))## [1] 0
##grab atm1
atm1 <- completedData[ ,c('ATM1','DATE') ]
atm1_ts <- ts(atm1$ATM1,frequency=7)
##plot decomps and lag plots
plot_grid(autoplot(atm1_ts),ggAcf(atm1_ts,lag=365))gglagplot(atm1_ts,lag=13)#gglagplot(atm1_ts,lag=4)
additive_atm1 <- atm1_ts %>% decompose(type="additive")
plot(additive_atm1) stl_decomposition <- atm1_ts %>%
stl(t.window=13, s.window="periodic", robust=TRUE)
#plot(stl_decomposition)##make split
myts.train <- atm1[1:336,]
test_results <- atm1[336:365,]$ATM1
ts_test_results <- ts(test_results)
myts.train <- ts(myts.train$ATM1,frequency=7)
naive_fit <- naive(myts.train)
autoplot(naive_fit,h=29)frequency(myts.train)## [1] 7
fit <- forecast::ets(myts.train, allow.multiplicative.trend=TRUE)
fc <- forecast(fit,h=29)
autoplot(fc)+
autolayer(ts(atm1$ATM1,frequency = 7),series="actual")fcast <- stlf(myts.train,method='naive',h=29)
autoplot(fcast)+
autolayer(ts(atm1$ATM1,frequency = 7),series="actual")random_walk_rmse <- rmse(ts(atm1$ATM1,frequency = 7),fcast$mean)
ets_ana_rmse <- rmse(ts(atm1$ATM1,frequency = 7),fc$mean)
naive_fit_rmse <- rmse(ts(atm1$ATM1,frequency = 7),naive_fit$mean)
## train split results
cbind(ets_ana_rmse,random_walk_rmse,naive_fit_rmse)## ets_ana_rmse random_walk_rmse naive_fit_rmse
## [1,] 12.05313 18.88769 30.4713
## load data from github
x <- getURL("https://raw.githubusercontent.com/justinherman42/data624/master/project%201/cv_scores_atm1.csv")
y <- as.data.frame(read.csv(text = x))
kable(y)| X | naive_fit_cv_rmse | rwf_fit_rmse | arima_fit_rmse | ets_fit_rmse |
|---|---|---|---|---|
| 1 | 50.39462 | 50.39462 | 25.37661 | 24.95437 |
## CV error checks
## Cross validation below took long to run, therefore i saved results to github and load in dataframe
# naive_fit_cv <- forecast::tsCV(ts(completedData$ATM1,frequency = 7),naives, h=1)
# rwf_fit <- forecast::tsCV(ts(completedData$ATM1,frequency = 7),rwf, drift=TRUE, h=1)
# ets_fit <- forecast::tsCV(ts(completedData$ATM1,frequency = 7),fets, h=1)
# arima_fit <- forecast::tsCV(ts(completedData$ATM1,frequency = 7),farima, h=1)
# naive_fit_cv_rmse <- sqrt(mean(rwf_fit^2, na.rm=TRUE))
# rwf_fit_rmse <- sqrt(mean(rwf_fit^2, na.rm=TRUE))
# ets_fit_rmse <- sqrt(mean(ets_fit^2, na.rm=TRUE))
# arima_fit_rmse <- sqrt(mean(arima_fit^2, na.rm=TRUE))
#write.csv(cv_scores,file = "cv_scores_atm1.csv")
#cv_scores <- as.data.frame(cbind(naive_fit_cv_rmse,rwf_fit_rmse,arima_fit_rmse,ets_fit_rmse))
## CV functions
farima <- function(x, h) {
forecast(auto.arima(x,allowdrift=TRUE), h=h)
}
fets <- function(x, h) {
forecast(forecast::ets(x), h = h)
}
naives <- function(x, h) {
forecast(forecast::naive(x), h = h)
}
## actual models
auto_arima_model <- farima(ts(completedData$ATM1,frequency = 7),365/7)
auto_arima_model$model## Series: x
## ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1854 -0.5798 -0.1051
## s.e. 0.0547 0.0507 0.0496
##
## sigma^2 estimated as 554.6: log likelihood=-1639.44
## AIC=3286.87 AICc=3286.98 BIC=3302.39
checkresiduals(auto_arima_model)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,2)[7]
## Q* = 15.395, df = 11, p-value = 0.1651
##
## Model df: 3. Total lags used: 14
ets_model <- fets(ts(completedData$ATM1,frequency = 7),365/7)
ets_model$model## ETS(A,N,A)
##
## Call:
## forecast::ets(y = x)
##
## Smoothing parameters:
## alpha = 0.0182
## gamma = 0.3322
##
## Initial states:
## l = 78.2155
## s = -55.2411 2.9244 11.3521 7.2315 11.2749 12.7148
## 9.7435
##
## sigma: 24.0278
##
## AIC AICc BIC
## 4485.175 4485.797 4524.174
checkresiduals(ets_model)##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 29.034, df = 5, p-value = 2.283e-05
##
## Model df: 9. Total lags used: 14
autoplot(forecast::forecast(auto_arima_model,h=31))may <- forecast::forecast(auto_arima_model,h=31)
kable(may$mean)| x |
|---|
| 87.212759 |
| 104.090548 |
| 73.136168 |
| 8.066955 |
| 99.996116 |
| 80.866183 |
| 86.679746 |
| 87.983125 |
| 102.763405 |
| 73.383796 |
| 8.634997 |
| 100.649617 |
| 80.393494 |
| 86.942325 |
| 88.051637 |
| 102.763405 |
| 73.383796 |
| 8.634997 |
| 100.649617 |
| 80.393494 |
| 86.942325 |
| 88.051637 |
| 102.763405 |
| 73.383796 |
| 8.634997 |
| 100.649617 |
| 80.393494 |
| 86.942325 |
| 88.051637 |
| 102.763405 |
| 73.383796 |
#write.csv(may$mean,file = "ATM1_predictions.csv")## [1] 7
## ets_ana_rmse random_walk_rmse naive_fit_rmse
## [1,] 19.15283 24.53681 49.75741
| X | naive_fit_cv_rmse | rwf_fit_rmse | arima_fit_rmse | ets_fit_rmse |
|---|---|---|---|---|
| 1 | 54.83285 | 54.83285 | 29.40424 | 27.07259 |
## ETS(A,N,A)
##
## Call:
## forecast::ets(y = x)
##
## Smoothing parameters:
## alpha = 1e-04
## gamma = 0.3642
##
## Initial states:
## l = 72.433
## s = -51.8832 -26.1088 17.0946 12.8074 12.719 8.96
## 26.411
##
## sigma: 25.058
##
## AIC AICc BIC
## 4515.820 4516.442 4554.819
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 80.929, df = 43.143, p-value = 0.0004333
##
## Model df: 9. Total lags used: 52.1428571428571
## Series: x
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4260 -0.9129 0.4644 0.8015 -0.7495
## s.e. 0.0547 0.0416 0.0850 0.0567 0.0387
##
## sigma^2 estimated as 586.3: log likelihood=-1648.74
## AIC=3309.47 AICc=3309.71 BIC=3332.76
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 9.182, df = 9, p-value = 0.4207
##
## Model df: 5. Total lags used: 14
| x |
|---|
| 67.280206 |
| 69.665971 |
| 15.547293 |
| 1.356739 |
| 96.220008 |
| 92.423392 |
| 52.669333 |
| 67.280206 |
| 69.665971 |
| 15.547293 |
| 1.356739 |
| 96.220008 |
| 92.423392 |
| 52.669333 |
| 67.280206 |
| 69.665971 |
| 15.547293 |
| 1.356739 |
| 96.220008 |
| 92.423392 |
| 52.669333 |
| 67.280206 |
| 69.665971 |
| 15.547293 |
| 1.356739 |
| 96.220008 |
| 92.423392 |
| 52.669333 |
| 67.280206 |
| 69.665971 |
| 15.547293 |
atm3 <- completedData[ ,c('ATM3','DATE') ]
atm3_ts <- ts(atm3$ATM3,frequency=7)
##plot decomps and lag plots
plot_grid(autoplot(atm3_ts),ggAcf(atm3_ts,lag=365))gglagplot(atm3_ts,lag=13)gglagplot(atm3_ts,lag=4)## [1] 7
## Series: myts.train
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 480.8732
## s.e. 36.6534
##
## sigma^2 estimated as 452755: log likelihood=-2664.14
## AIC=5332.29 AICc=5332.32 BIC=5339.92
## ets_ana_rmse random_walk_rmse naive_fit_rmse arima_fit_rmse
## [1,] 412.0649 375.2636 309.7526 337.5212
## Turn max into na and than impute
max_index <- as.integer(which.max(atm4$ATM4))
sum(is.na(atm4))## [1] 0
atm4[285,"ATM4"] <- 591.5083
#tempData <- mice(atm4,m=5,maxit=50,meth='pmm',seed=500)
#tempData$imp$ATM4$`5`
#completedData1 <- complete(tempData,5)## rerun models
#atm4 <- completedData1[ ,c('ATM4','DATE') ]
atm4_ts <- ts(atm4$ATM4,frequency=7)
##make split
myts.train <- atm4[1:336,]
test_results <- atm4[336:365,]$ATM4
ts_test_results <- ts(test_results)
myts.train <- ts(myts.train$ATM4,frequency=7)
naive_fit <- naive(myts.train)
autoplot(naive_fit,h=29)frequency(myts.train)## [1] 7
fit <- ets(myts.train)
fc <- forecast(fit,h=29)
autoplot(fc)+
autolayer(ts(atm4$ATM4,frequency = 7),series="actual")fcast <- stlf(myts.train,method='naive',h=29)
autoplot(fcast)+
autolayer(ts(atm4$ATM4,frequency = 7),series="actual")auto_arima_model <- farima(myts.train,1)
my_arima <- auto.arima(myts.train)
random_walk_rmse <- rmse(ts(atm4$ATM4,frequency = 7),fcast$mean)
ets_ana_rmse <- rmse(ts(atm4$ATM4,frequency = 7),fc$mean)
naive_fit_rmse <- rmse(ts(atm4$ATM4,frequency = 7),naive_fit$mean)
arima_fit_rmse <- rmse(ts(atm4$ATM4,frequency = 7),auto_arima_model$mean)
cbind(ets_ana_rmse,random_walk_rmse,naive_fit_rmse,arima_fit_rmse)## ets_ana_rmse random_walk_rmse naive_fit_rmse arima_fit_rmse
## [1,] 310.7799 260.5841 309.7526 362.2817
checkresiduals(fcast)## Warning in checkresiduals(fcast): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + Random walk
## Q* = 131.15, df = 14, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 14
checkresiduals(my_arima)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 12.255, df = 9, p-value = 0.1993
##
## Model df: 5. Total lags used: 14
| X | naive_fit_cv_rmse | rwf_fit_rmse | arima_fit_rmse | ets_fit_rmse |
|---|---|---|---|---|
| 1 | 477.4819 | 481.7455 | 362.3827 | 343.8724 |
## CV functions
farima <- function(x, h) {
forecast(auto.arima(x,allowdrift=TRUE), h=h)
}
fets <- function(x, h) {
forecast(forecast::ets(x), h = h)
}
naives <- function(x, h) {
forecast(forecast::naive(x), h = h)
}
## actual models
auto_arima_model <- farima(ts(atm4$ATM4,frequency = 7),1)
auto_arima_model$model## Series: x
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 sar1 mean
## 0.0854 0.0134 0.0877 0.1843 445.1183
## s.e. 0.0531 0.0582 0.0517 0.0526 25.9995
##
## sigma^2 estimated as 119443: log likelihood=-2649.07
## AIC=5310.13 AICc=5310.37 BIC=5333.53
checkresiduals(auto_arima_model)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 15.4, df = 9, p-value = 0.08052
##
## Model df: 5. Total lags used: 14
ets_model <- fets(ts(atm4$ATM4,frequency = 7),1)
ets_model$model## ETS(A,N,A)
##
## Call:
## forecast::ets(y = x)
##
## Smoothing parameters:
## alpha = 1e-04
## gamma = 1e-04
##
## Initial states:
## l = 445.9429
## s = -270.1123 -34.6411 19.7065 40.4391 87.4913 39.2365
## 117.88
##
## sigma: 332.5713
##
## AIC AICc BIC
## 6403.353 6403.975 6442.352
checkresiduals(ets_model,lag=365/7)##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 53.712, df = 43.143, p-value = 0.1299
##
## Model df: 9. Total lags used: 52.1428571428571
autoplot(forecast(ets_model$fitted,h=31)) # autolayer(forecast(auto_arima_model$fitted,h=31)
may_atm4 <- forecast(ets_model$fitted,h=31)
may_atm4$mean## Time Series:
## Start = c(53, 2)
## End = c(57, 4)
## Frequency = 7
## [1] 485.2171 533.4299 486.3401 465.6330 411.2900 175.7619 563.8406
## [8] 485.1946 533.4097 486.3220 465.6168 411.2754 175.7488 563.8289
## [15] 485.1840 533.4003 486.3135 465.6092 411.2686 175.7427 563.8234
## [22] 485.1791 533.3959 486.3096 465.6057 411.2654 175.7399 563.8208
## [29] 485.1768 533.3938 486.3077
#write.csv(may$mean,file = "ATM1_predictions.csv")
my_predictions <- as.data.frame(cbind(may$mean,may_atm2$mean,may_atm4$mean))
colnames(my_predictions) <- c("ATM1","ATM2","ATM4")
#write.csv(my_predictions,file = "ATMpredictions.csv")residential_power <- read_xlsx("F:/reformat stuff/GitHub/data_624/ResidentialCustomerForecastLoad-624.xlsx")
residential_power <- as.data.frame(residential_power)
## index 151 is min, July 2010
outlier <- which.min(residential_power$KWH)
outlier_value <- residential_power$`YYYY-MMM`[151]
# describe kwh usage
kable(describe(residential_power$KWH))| vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 191 | 6502475 | 1447571 | 6283324 | 6439475 | 1543074 | 770523 | 10655730 | 9885207 | 0.1683404 | 0.4547269 | 104742.6 |
#tsoutliers(residential_power$KWH)
ggplot(residential_power) + geom_point(aes(x=`YYYY-MMM`, y=KWH))+
ggtitle("PRE-IMPUTATION")## Warning: Removed 1 rows containing missing values (geom_point).
## index 129 is NA
na_values <- which(is.na(residential_power$KWH))
## fix and convert time to datetime column
residential_power$Date <- as.yearmon(residential_power$`YYYY-MMM`, "%Y-%b")
residential_power$Date <- as.Date.yearmon(residential_power$Date)
## Credit to https://2series.github.io/post/residential_energy/residential-energy-usage/
kWh <- xts::xts(residential_power$KWH, order.by=residential_power$Date)
par(mfrow=c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, 0.5))
original_timeplot <- plot(kWh, main="kWh")
plot(kWh, main="kWh")
# display frequency at which values in a vector occur
hist(kWh, xlab="", main="",col="red")## impute data set
## I SET THE MIN OUTLIER VALUE TO NA HERE AND IMPUTE IT AS WELL
residential_power$KWH[which.min(residential_power$KWH)] <- NA
residential_power$KWH <- ts(residential_power$KWH,frequency=12,start=1998)
residential_power$KWH <- tsclean(residential_power$KWH,replace.missing = T)
#sum(is.na(residential_power$KWH))
## scatterplot data of KWH POST IMPUTATIONS
kWh <- xts::xts(residential_power$KWH, order.by=residential_power$Date)
par(mfrow=c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, 0.5),main="After Impute")## Warning in par(mfrow = c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, :
## "main" is not a graphical parameter
plot(kWh, main=" after imputation")
hist(kWh, xlab="", main=" after imputation",col="red")##build box cox transformed series
lambda <- BoxCox.lambda(residential_power$KWH )
lambda_kWh <- as.data.frame(cbind(BoxCox(residential_power$KWH,lambda),residential_power$Date))
colnames(lambda_kWh) <- c("KWH","DATE")
lambda_kWh$DATE <- as.Date(lambda_kWh$DATE)
lambda_kWh$KWH <- ts(lambda_kWh$KWH,frequency = 12,start=1998)
lambda_kWh_xts <- xts::xts(lambda_kWh$KWH, order.by=lambda_kWh$DATE)
##PLOT LAMBDA TRANSFORMED TS VERSUS ORIGINAL
par(mfrow=c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, 0.5))
plot(lambda_kWh_xts, main="LAMBDA_kWh")
plot(residential_power$KWH, main="kWh")#frequency(lambda_kWh$KWH)
stl_decomposition <- lambda_kWh$KWH %>%
stl(t.window=13, s.window="periodic", robust=TRUE)
plot(stl_decomposition)##
#ndiffs(lambda_kWh$KWH)
nsdiffs(residential_power$KWH)## [1] 1
diff_lambda_trans <- diff(lambda_kWh$KWH)
a <- ggAcf(lambda_kWh$KWH)
b <- ggPacf(lambda_kWh$KWH)
c <- ggAcf(diff_lambda_trans)
d <- ggPacf(diff_lambda_trans)
plot_grid(a,b,c,d)a <- ggAcf(residential_power$KWH)
b <- ggPacf(residential_power$KWH)
c <- ggAcf(diff(residential_power$KWH))
d <- ggPacf(diff(residential_power$KWH))
plot_grid(a,b,c,d)## attempt deseasonalize and log transform
deseasnalized_logged <- diff(log(residential_power$KWH),12)
plot(deseasnalized_logged)a <- ggAcf(deseasnalized_logged)
b <- ggPacf(deseasnalized_logged)
plot_grid(a,b)“In the plots of the seasonally differenced data, there are spikes in the PACF at lags 12 and 24….. This may be suggestive of a seasonal AR(2) term.”
## run models and plot
Arima_with_transform <- Arima(deseasnalized_logged, order=c(1,0,0),seasonal=c(2,0,0))
checkresiduals(Arima_with_transform)##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(2,0,0)[12] with non-zero mean
## Q* = 26.294, df = 20, p-value = 0.1563
##
## Model df: 4. Total lags used: 24
plot(forecast(Arima_with_transform,h=12)) Arima_without_transform <- Arima(residential_power$KWH, order=c(3,0,1),seasonal=c(2,1,0),include.drift=TRUE)
checkresiduals(Arima_without_transform)##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,1)(2,1,0)[12] with drift
## Q* = 11.527, df = 17, p-value = 0.8279
##
## Model df: 7. Total lags used: 24
plot(forecast(Arima_without_transform,h=12)) x <- getURL("https://raw.githubusercontent.com/justinherman42/data624/master/project%201/cross_val_error.csv")
y <- as.data.frame(read.csv(text = x))
kable(y)| X | rbind.ets_error..arima_error2. |
|---|---|
| ets_error | 669293335859 |
| arima_error2 | 387878912641 |
fit <- ets(residential_power$KWH)
fc <- forecast(fit,h=12)
# autoplot(fc)+
# autolayer(residential_power$KWH,series="actual")+
# autolayer(residential_power$KWH,series="actual")
#
# # fcast <- stlf(residential_power$KWH,method='naive',h=12)
# # autoplot(fcast)+
# # autolayer(residential_power$KWH,series="actual")
# #
#
# ## fucntions for tscv
# far2 <- function(x, h){forecast(Arima(x, order=c(3,0,1),seasonal=c(2,1,0),include.drift=TRUE), h=h)}
# fets <- function(x, h) {
# forecast(ets(x), h = h)
# }power_forecast <- forecast(Arima_without_transform,h=12)
power_forecast$mean## Jan Feb Mar Apr May Jun Jul Aug
## 2014 9432442 8570542 6615197 6005401 5917519 8187445 9529206 9992318
## Sep Oct Nov Dec
## 2014 8546942 5856320 6153350 7732248
#write.csv(power_forecast$mean,file = "PartB_Power_predictions.csv")## Load in data
pipe1<- read_xlsx("F:/reformat stuff/GitHub/data_624/Waterflow_Pipe1.xlsx")
pipe2<- read_xlsx("F:/reformat stuff/GitHub/data_624/Waterflow_Pipe2.xlsx")
waterflow <- rbind(pipe1, pipe2)
## aggregate by hour and return mean for pipe 1
h<-as.POSIXct(strptime(pipe1$`Date Time`,format="%Y-%m-%d %H"))
pipe1_clean<-aggregate(pipe1$WaterFlow,by=list(h),mean)
h<-as.POSIXct(strptime(waterflow$`Date Time`,format="%Y-%m-%d %H"))
pipes_clean<-aggregate(waterflow$WaterFlow,by=list(h),mean)
str(as.data.frame(pipes_clean))## 'data.frame': 1001 obs. of 2 variables:
## $ Group.1: POSIXct, format: "2015-10-23 00:00:00" "2015-10-23 01:00:00" ...
## $ x : num 26.1 18.8 24.5 25.6 22.4 ...
## build xts for display
xts_pipe1 <- xts::xts(pipe1_clean$x, order.by=pipe1_clean$Group.1)
attr(xts_pipe1, 'frequency') <- 24
## display xts hist and autoplot
par(mfrow=c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, 0.5))
original_timeplot <- plot(xts_pipe1, main="pipe1")
plot(xts_pipe1, main="kWh")
# display frequency at which values in a vector occur
hist(xts_pipe1, xlab="", main="",col="red")## build acf/pacf plots for pipe1 and diff(pipe1)
a <- ggAcf(xts_pipe1,lag=200)
b <- ggPacf(xts_pipe1,lag=200)
## Example of overfdifferencing below
c <- ggAcf(diff(xts_pipe1),lag=10)
d <- ggPacf(diff(xts_pipe1),lag=200)
plot_grid(a,b,c,d)x <- getURL("https://raw.githubusercontent.com/justinherman42/data624/master/project%201/PartC_pipe1_CVscores.csv")
y <- as.data.frame(read.csv(text = x))
kable(y)| X | naive_fit_cv_rmse | rwf_fit_rmse | arima_fit_rmse | ets_fit_rmse |
|---|---|---|---|---|
| 1 | 5.919252 | 6.010851 | 4.405855 | 4.422798 |
eventdata1 <- msts(xts_pipe1, seasonal.periods = c(1,2,3,4))
frequency(xts_pipe1)## [1] 24
arima_model <- auto.arima(xts_pipe1)
ets_model <- ets(xts_pipe1)
stl_mdoel <- stlf(eventdata1)
#tbats_model <- tbats(eventdata1)
checkresiduals(arima_model)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 37.178, df = 46.2, p-value = 0.8256
##
## Model df: 1. Total lags used: 47.2
checkresiduals(ets_model)##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 37.178, df = 45.2, p-value = 0.7962
##
## Model df: 2. Total lags used: 47.2
checkresiduals(stl_mdoel)## Warning in checkresiduals(stl_mdoel): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 2.9318, df = 6, p-value = 0.8174
##
## Model df: 2. Total lags used: 8
#checkresiduals(tbats_model)
## CV functions
farima <- function(x, h) {
forecast(auto.arima(x,allowdrift=TRUE), h=h)
}
fets <- function(x, h) {
forecast(forecast::ets(x), h = h)
}
naives <- function(x, h) {
forecast(forecast::naive(x), h = h)
}
stlfmod <- function(x, h) {
forecast(forecast::stl_mdoel(x), h = h)
}
## actual models and residuals
auto_arima_model <- farima(xts_pipe1,24*7)
ets_model <- fets(xts_pipe1,1)
naives_model <- naives(xts_pipe1,1)
## get cv socres and error
# naive_fit_cv <- forecast::tsCV(xts_pipe1,naives,h=1)
# rwf_fit <- forecast::tsCV(xts_pipe1, rwf, drift=TRUE, h=1)
# ets_fit <- forecast::tsCV(xts_pipe1, fets, h=1)
# arima_fit <- forecast::tsCV(xts_pipe1,farima, h=1)
# naive_fit_cv_rmse <- sqrt(mean(naive_fit_cv^2, na.rm=TRUE))
# rwf_fit_rmse <- sqrt(mean(rwf_fit^2, na.rm=TRUE))
# ets_fit_rmse <- sqrt(mean(ets_fit^2, na.rm=TRUE))
# arima_fit_rmse <- sqrt(mean(arima_fit^2, na.rm=TRUE))
# cv_scores <- as.data.frame(cbind(naive_fit_cv_rmse,rwf_fit_rmse,arima_fit_rmse,ets_fit_rmse))
## make predicitons
#write.csv(cv_scores,file = "PartC_pipe1_CVscores.csv")
plot(forecast(auto_arima_model,h=24*7))xts_pipe2 <- xts::xts(pipe2$WaterFlow, order.by=pipe2$`Date Time`)
attr(xts_pipe2, 'frequency') <- 24
par(mfrow=c(2, 1), mar = c(3, 5, 0, 0), oma = c(0, 0, 0.5, 0.5))
original_timeplot <- plot(xts_pipe1, main="pipe1")
plot(xts_pipe2, main="Pipe 2")
# display frequency at which values in a vector occur
hist(xts_pipe2, xlab="", main="",col="red")a <- ggAcf(xts_pipe2,lag=200)
b <- ggPacf(xts_pipe2,lag=200)
c <- ggAcf(diff(xts_pipe2),lag=200)
d <- ggPacf(diff(xts_pipe2),lag=200)
plot_grid(a,b,c,d)| X | naive_fit_cv_rmse | rwf_fit_rmse | arima_fit_rmse | ets_fit_rmse |
|---|---|---|---|---|
| 1 | 22.95769 | 23.03454 | 16.2082 | 16.17178 |
## [1] 24
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0)(0,0,1)[24] with non-zero mean
## Q* = 55.158, df = 46, p-value = 0.1669
##
## Model df: 2. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 62.381, df = 46, p-value = 0.05406
##
## Model df: 2. Total lags used: 48
## Warning in checkresiduals(stl_mdoel): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(M,N,N)
## Q* = 6.0377, df = 6, p-value = 0.419
##
## Model df: 2. Total lags used: 8