# Switch to the project folder, read the project data set (the first row of
# the CSV holds the column names) and print a summary of every column.
setwd("/Volumes/Passport_Mac/RU_SPRING_2020/Forecasting&BigData/Project-1")
data <- read.csv(file = "project1_data.csv", header = TRUE)
summary(object = data)
##       Year            UR            MedCare     
##  Min.   :2009   Min.   : 3.500   Min.   :369.8  
##  1st Qu.:2011   1st Qu.: 4.400   1st Qu.:403.4  
##  Median :2014   Median : 6.100   Median :435.8  
##  Mean   :2014   Mean   : 6.475   Mean   :437.7  
##  3rd Qu.:2017   3rd Qu.: 8.500   3rd Qu.:474.4  
##  Max.   :2020   Max.   :10.000   Max.   :512.1
# Turn both data columns into monthly series (12 observations per year)
# whose first observation is January 2009, then summarise UR.
MedCare <- ts(data$MedCare, start = c(2009, 1), frequency = 12)
UR <- ts(data$UR, start = c(2009, 1), frequency = 12)
summary(UR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.500   4.400   6.100   6.475   8.500  10.000
# Exploratory plots: the series itself and its autocorrelation function.
plot(UR, type = "o")

ggAcf(UR)

# Benchmark forecasting methods ----
# Every call below forecasts a single step ahead (h = 1), i.e. Feb 2020.

# Average (mean) method
plot(meanf(UR, h = 1))

# Naive method
UR1 <- naive(UR, h = 1)
plot(UR1)

# Random walk with drift
UR2 <- rwf(UR, h = 1, drift = TRUE)
plot(UR2)

# Seasonal naive method
UR3 <- snaive(UR, h = 1)
plot(UR3)
# Print the one-step-ahead (Feb 2020) point forecast of each method together
# with its 80% and 95% interval bounds.
# Mean method:
meanf(UR, h=1)
##          Point Forecast    Lo 80   Hi 80    Lo 95    Hi 95
## Feb 2020       6.475188 3.666106 9.28427 2.161019 10.78936
# Naive method:
UR1
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2020            3.6 3.393115 3.806885 3.283597 3.916403
# Random walk with drift:
UR2
##          Point Forecast    Lo 80    Hi 80    Lo 95   Hi 95
## Feb 2020       3.568182 3.364583 3.771781 3.256804 3.87956
# Seasonal naive method:
UR3
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2020            3.8 2.833924 4.766076 2.322514 5.277486
#residuals from each model
# Mean method diagnostics: fit the model once so that the Ljung-Box test and
# checkresiduals() both examine the residuals of the same fitted object.
UR_mean <- meanf(UR, h = 1)
resmeanf <- residuals(UR_mean)
Box.test(resmeanf, lag = 24, fitdf = 0, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  resmeanf
## X-squared = 2148.2, df = 24, p-value < 2.2e-16
# Pass the fitted forecast object rather than the raw series: given a plain
# ts, checkresiduals() treats the data themselves as residuals and warns that
# it cannot find the model's degrees of freedom, so no test is reported.
checkresiduals(UR_mean)

# Naive method diagnostics: Ljung-Box test on the residuals over 24 lags
# (no fitted parameters), followed by the standard residual plots.
resnaive <- residuals(UR1)
Box.test(x = resnaive, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resnaive
## X-squared = 25.905, df = 24, p-value = 0.358
checkresiduals(UR1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 25.905, df = 24, p-value = 0.358
## 
## Model df: 0.   Total lags used: 24
# Drift method diagnostics: Ljung-Box test on the residuals over 24 lags,
# followed by the standard residual plots.
resdrift <- residuals(UR2)
Box.test(x = resdrift, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resdrift
## X-squared = 25.905, df = 24, p-value = 0.358
checkresiduals(UR2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Random walk with drift
## Q* = 25.905, df = 23, p-value = 0.3054
## 
## Model df: 1.   Total lags used: 24
# Seasonal naive method diagnostics: Ljung-Box test on the residuals over
# 24 lags, followed by the standard residual plots.
resnaives <- residuals(UR3)
Box.test(x = resnaives, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resnaives
## X-squared = 267.55, df = 24, p-value < 2.2e-16
checkresiduals(UR3)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 267.55, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

It is clear that this series (UR) shows a trend. The slow decrease in the ACF as the lags increase is due to this trend. Point forecasts (Feb 2020) with 95% intervals — Mean method: 6.475 (2.161019, 10.78936); Naive method: 3.6 (3.283597, 3.916403); Drift method: 3.568 (3.256804, 3.87956); Seasonal naive method: 3.8 (2.322514, 5.277486).

Residuals Box test (Ljung-Box, lag = 24) — Mean method: p-value < 2.2e-16; Naive method: p-value = 0.358; Drift method: p-value = 0.358; Seasonal naive method: p-value < 2.2e-16.

Conclusion: Both the mean and seasonal naive methods produce strongly correlated residuals and therefore cannot be trusted for reliable forecasts. The naive method and the drift method each produce more weakly correlated residuals, as shown by the ACF plots for each method. I would pick the random walk with drift method for this time series, as its residuals appear to be the closest to a normal distribution.

# Switch to the project folder again, re-read the project data set (the first
# row of the CSV holds the column names) and print a summary of every column.
setwd("/Volumes/Passport_Mac/RU_SPRING_2020/Forecasting&BigData/Project-1")
data <- read.csv(file = "project1_data.csv", header = TRUE)
summary(object = data)
##       Year            UR            MedCare     
##  Min.   :2009   Min.   : 3.500   Min.   :369.8  
##  1st Qu.:2011   1st Qu.: 4.400   1st Qu.:403.4  
##  Median :2014   Median : 6.100   Median :435.8  
##  Mean   :2014   Mean   : 6.475   Mean   :437.7  
##  3rd Qu.:2017   3rd Qu.: 8.500   3rd Qu.:474.4  
##  Max.   :2020   Max.   :10.000   Max.   :512.1
# Monthly series (12 observations per year) whose first observation is
# January 2009, followed by its five-number summary and mean.
MedCare <- ts(data$MedCare, start = c(2009, 1), frequency = 12)
summary(MedCare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   369.8   403.4   435.8   437.7   474.4   512.1
# Exploratory plots: the series itself and its autocorrelation function.
plot(MedCare, type = "o")

ggAcf(MedCare)

# Benchmark forecasting methods ----
# Every call below forecasts a single step ahead (h = 1), i.e. Feb 2020.

# Average (mean) method
plot(meanf(MedCare, h = 1))

# Naive method
MedCare1 <- naive(MedCare, h = 1)
plot(MedCare1)

# Random walk with drift
MedCare2 <- rwf(MedCare, h = 1, drift = TRUE)
plot(MedCare2)

# Seasonal naive method
MedCare3 <- snaive(MedCare, h = 1)
plot(MedCare3)
# Print the one-step-ahead (Feb 2020) point forecast of each method together
# with its 80% and 95% interval bounds.
# Mean method:
meanf(MedCare, h=1)
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2020       437.7027 386.8392 488.5662 359.5868 515.8186
# Naive method:
MedCare1
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2020        512.149 510.1745 514.1235 509.1293 515.1687
# Random walk with drift:
MedCare2
##          Point Forecast    Lo 80   Hi 80    Lo 95    Hi 95
## Feb 2020       513.2272 511.8113 514.643 511.0619 515.3925
# Seasonal naive method:
MedCare3
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Feb 2020        491.227 474.8184 507.6356 466.1323 516.3217
#residuals from each model
# Mean method diagnostics: fit the model once so that the Ljung-Box test and
# checkresiduals() both examine the residuals of the same fitted object.
MedCare_mean <- meanf(MedCare, h = 1)
resmeanf <- residuals(MedCare_mean)
Box.test(resmeanf, lag = 24, fitdf = 0, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  resmeanf
## X-squared = 1876.4, df = 24, p-value < 2.2e-16
# Pass the fitted forecast object rather than the raw series: given a plain
# ts, checkresiduals() treats the data themselves as residuals and warns that
# it cannot find the model's degrees of freedom, so no test is reported.
checkresiduals(MedCare_mean)

# Naive method diagnostics: Ljung-Box test on the residuals over 24 lags
# (no fitted parameters), followed by the standard residual plots.
resnaive <- residuals(MedCare1)
Box.test(x = resnaive, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resnaive
## X-squared = 49.571, df = 24, p-value = 0.001605
checkresiduals(MedCare1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 49.571, df = 24, p-value = 0.001605
## 
## Model df: 0.   Total lags used: 24
# Drift method diagnostics: Ljung-Box test on the residuals over 24 lags,
# followed by the standard residual plots.
resdrift <- residuals(MedCare2)
Box.test(x = resdrift, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resdrift
## X-squared = 49.571, df = 24, p-value = 0.001605
checkresiduals(MedCare2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Random walk with drift
## Q* = 49.571, df = 23, p-value = 0.001048
## 
## Model df: 1.   Total lags used: 24
# Seasonal naive method diagnostics: Ljung-Box test on the residuals over
# 24 lags, followed by the standard residual plots.
resnaives <- residuals(MedCare3)
Box.test(x = resnaives, lag = 24, type = "Lj", fitdf = 0)
## 
##  Box-Ljung test
## 
## data:  resnaives
## X-squared = 468.82, df = 24, p-value < 2.2e-16
checkresiduals(MedCare3)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 468.82, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

It is clear this series (MedCare) shows a trend. The slow decrease in ACF as the lags increase is due to this trend.

Point forecasts (Feb 2020) with 95% intervals — Mean method: 437.7027 (359.5868, 515.8186); Naive method: 512.149 (509.1293, 515.1687); Drift method: 513.2272 (511.0619, 515.3925); Seasonal naive method: 491.227 (466.1323, 516.3217).

Residuals testing (Ljung-Box, lag = 24) — Mean method: p-value < 2.2e-16; Naive method: p-value = 0.001605; Drift method: p-value = 0.001605; Seasonal naive method: p-value < 2.2e-16.

Conclusion-

Both the mean and seasonal naive methods produce strongly correlated residuals and therefore cannot be trusted for reliable forecasts. The naive method and the drift method each produce more weakly correlated residuals, as shown by the ACF plots for each method, although the Ljung-Box test still detects some remaining autocorrelation (p-value = 0.001605). I would pick the random walk with drift method for this time series, as its residuals appear to be the closest to a normal distribution.