final scipt

## Warning: package 'fpp2' was built under R version 3.5.3

## Warning: package 'forecast' was built under R version 3.5.3

## Warning: package 'fma' was built under R version 3.5.3

## Warning: package 'expsmooth' was built under R version 3.5.3

## Warning: package 'corrplot' was built under R version 3.5.3

## Warning: package 'zoo' was built under R version 3.5.3

data visualization

train.sj[, 4:ncol(train.sj)] %>% 
  cor(use = "pairwise.complete.obs") %>% 
  corrplot(tl.cex = 0.5, 
           method = "circle")

train.iq[, 4:ncol(train.iq)] %>% 
  cor(use = "pairwise.complete.obs") %>% 
  corrplot(tl.cex = 0.5, 
           method = "circle")

choose three indicators from all of the indicators

autoplot(ts(train.sj$total_cases, start = c(1990,18), frequency = 52))

autoplot(ts(train.iq$total_cases, start = c(2000,26), frequency = 52))

the historical data is not whitenoise, may need transformation

# prepare for modeling and forecast
train.sj.new <- train.sj[,c("total_cases",
                            "reanalysis_specific_humidity_g_per_kg",
                            "reanalysis_precip_amt_kg_per_m2",
                            "reanalysis_dew_point_temp_k")]
train.iq.new <- train.iq[,c("total_cases",
                            "reanalysis_specific_humidity_g_per_kg",
                            "reanalysis_precip_amt_kg_per_m2",
                            "reanalysis_dew_point_temp_k")]

new.names <- c("cases", "humidity","precipitation","temperature")
colnames(train.sj.new) <- new.names
colnames(train.iq.new) <- new.names

test.feature.sj <- subset(test.feature, city == "sj")
test.feature.iq <- subset(test.feature, city == "iq")
test.sj.new <- test.feature.sj[,c("reanalysis_specific_humidity_g_per_kg",
                                   "reanalysis_precip_amt_kg_per_m2",
                                   "reanalysis_dew_point_temp_k")]
test.iq.new <- test.feature.iq[,c("reanalysis_specific_humidity_g_per_kg",
                                   "reanalysis_precip_amt_kg_per_m2",
                                   "reanalysis_dew_point_temp_k")]
test.new.names <- c("humidity","precipitation","temperature")
colnames(test.sj.new)  <- test.new.names
colnames(test.iq.new)  <- test.new.names
# deal with NA values
colSums(sapply(train.sj.new, is.na))

##         cases      humidity precipitation   temperature 
##             0             6             6             6

colSums(sapply(train.iq.new, is.na))

##         cases      humidity precipitation   temperature 
##             0             4             4             4

colSums(sapply(test.sj.new, is.na))

##      humidity precipitation   temperature 
##             2             2             2

colSums(sapply(test.iq.new, is.na))

##      humidity precipitation   temperature 
##             0             0             0

train.sj.new <- na.locf(train.sj.new)
train.iq.new <- na.locf(train.iq.new)
test.sj.new <- na.locf(test.sj.new)
# ts form
ts.train.sj.new <- ts(train.sj.new, start=c(1990,18),frequency = 52)
ts.train.iq.new <- ts(train.iq.new, start=c(2000,26),frequency = 52)

model

#model1 - tslm
sj_fit1 <- tslm(cases ~ humidity + precipitation + temperature, 
                data = ts.train.sj.new)
summary(sj_fit1)

## 
## Call:
## tslm(formula = cases ~ humidity + precipitation + temperature, 
##     data = ts.train.sj.new)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54.61 -23.67 -11.19   3.70 424.94 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    1.130e+04  5.418e+03   2.086   0.0372 *
## humidity       4.737e+01  1.961e+01   2.415   0.0159 *
## precipitation  5.137e-02  4.934e-02   1.041   0.2981  
## temperature   -4.085e+01  1.946e+01  -2.099   0.0360 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.21 on 932 degrees of freedom
## Multiple R-squared:  0.04811,    Adjusted R-squared:  0.04504 
## F-statistic:  15.7 on 3 and 932 DF,  p-value: 5.742e-10

autoplot(ts.train.sj.new[,"cases"]) +
  autolayer(sj_fit1$fitted)

sj.fcast_fit1 <- forecast(sj_fit1, newdata = test.sj.new)
autoplot(sj.fcast_fit1)

accuracy(sj.fcast_fit1)

##                        ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set 5.177944e-15 50.10345 27.07694 -Inf  Inf 0.7359268 0.8460648

iq_fit1 <- tslm(cases ~ humidity + precipitation + temperature, 
                data = ts.train.iq.new)
summary(iq_fit1)

## 
## Call:
## tslm(formula = cases ~ humidity + precipitation + temperature, 
##     data = ts.train.iq.new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.874  -5.226  -2.863   1.333 106.619 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   2753.70749 1361.82683   2.022   0.0437 *
## humidity        11.76303    4.81483   2.443   0.0149 *
## precipitation   -0.01356    0.01126  -1.204   0.2291  
## temperature     -9.97141    4.88607  -2.041   0.0418 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.45 on 516 degrees of freedom
## Multiple R-squared:  0.0639, Adjusted R-squared:  0.05846 
## F-statistic: 11.74 on 3 and 516 DF,  p-value: 1.88e-07

autoplot(ts.train.iq.new[,"cases"]) +
  autolayer(iq_fit1$fitted)

iq.fcast_fit1 <- forecast(iq_fit1, newdata = test.iq.new)
autoplot(iq.fcast_fit1)

accuracy(iq.fcast_fit1)

##                         ME    RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set -5.059788e-16 10.4058 6.274696 -Inf  Inf 0.6652827 0.3946892

# model2 - Arima errors
sj_fit2 <- auto.arima(ts.train.sj.new[,"cases"],
                 xreg = ts.train.sj.new[,"humidity"])
summary(sj_fit2)

## Series: ts.train.sj.new[, "cases"] 
## Regression with ARIMA(3,1,2)(0,0,1)[52] errors 
## 
## Coefficients:
##           ar1      ar2      ar3     ma1     ma2     sma1   xreg
##       -0.8602  -1.0974  -0.4001  0.4645  0.8210  -0.0687  3.840
## s.e.   0.0586   0.0393   0.0335  0.0586  0.0422   0.0319  0.713
## 
## sigma^2 estimated as 612.7:  log likelihood=-4323.96
## AIC=8663.92   AICc=8664.08   BIC=8702.64
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.004373336 24.64724 12.46461 -Inf  Inf 0.3387769 -0.03176884

checkresiduals(sj_fit2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,1,2)(0,0,1)[52] errors
## Q* = 202.56, df = 97, p-value = 1.965e-09
## 
## Model df: 7.   Total lags used: 104

autoplot(ts.train.sj.new[,"cases"]) +
  autolayer(sj_fit2$fitted)

sj.fcast_fit2 <- forecast(sj_fit2, 
                          xreg = test.sj.new[,"humidity"])
autoplot(sj.fcast_fit2)

accuracy(sj.fcast_fit2)

##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.004373336 24.64724 12.46461 -Inf  Inf 0.3387769 -0.03176884

iq_fit2 <- auto.arima(ts.train.iq.new[,"cases"],
                      xreg = ts.train.iq.new[,"humidity"])
summary(iq_fit2)

## Series: ts.train.iq.new[, "cases"] 
## Regression with ARIMA(2,0,1)(0,0,2)[52] errors 
## 
## Coefficients:
##          ar1     ar2      ma1     sma1    sma2  intercept    xreg
##       0.5053  0.2896  -0.3107  -0.0065  0.0943    -5.1111  0.7383
## s.e.  0.1013  0.0634   0.1054   0.0462  0.0507     6.3822  0.3645
## 
## sigma^2 estimated as 76.39:  log likelihood=-1862.4
## AIC=3740.8   AICc=3741.08   BIC=3774.83
## 
## Training set error measures:
##                      ME     RMSE     MAE MPE MAPE      MASE        ACF1
## Training set 0.05237957 8.681177 4.86068 NaN  Inf 0.5153599 0.001998071

checkresiduals(iq_fit2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(2,0,1)(0,0,2)[52] errors
## Q* = 77.752, df = 97, p-value = 0.9246
## 
## Model df: 7.   Total lags used: 104

autoplot(ts.train.iq.new[,"cases"]) +
  autolayer(iq_fit2$fitted)

iq.fcast_fit2 <- forecast(iq_fit2, 
                          xreg = test.iq.new[,"humidity"])
autoplot(iq.fcast_fit2)

accuracy(iq.fcast_fit2)

##                      ME     RMSE     MAE MPE MAPE      MASE        ACF1
## Training set 0.05237957 8.681177 4.86068 NaN  Inf 0.5153599 0.001998071

# model 3 - neural network

sj_fit3 <- nnetar(ts.train.sj.new[,"cases"], 
                  xreg = ts.train.sj.new[,"humidity"])
summary(sj_fit3)

##           Length Class        Mode     
## x         936    ts           numeric  
## m           1    -none-       numeric  
## p           1    -none-       numeric  
## P           1    -none-       numeric  
## scalex      2    -none-       list     
## scalexreg   2    -none-       list     
## size        1    -none-       numeric  
## xreg      936    -none-       numeric  
## subset    936    -none-       numeric  
## model      20    nnetarmodels list     
## nnetargs    0    -none-       list     
## fitted    936    ts           numeric  
## residuals 936    ts           numeric  
## lags       21    -none-       numeric  
## series      1    -none-       character
## method      1    -none-       character
## call        3    -none-       call

autoplot(ts.train.sj.new[,"cases"]) +
  autolayer(sj_fit3$fitted)

## Warning: Removed 52 rows containing missing values (geom_path).

sj.fcast_fit3 <- forecast(sj_fit3, xreg=test.sj.new[,1])
autoplot(sj.fcast_fit3)

accuracy(sj.fcast_fit3)

##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.1073929 8.532394 5.586333 -Inf  Inf 0.1518315 0.04115448

iq_fit3 <- nnetar(ts.train.iq.new[,"cases"],
                  xreg = ts.train.iq.new[,"humidity"] )
summary(iq_fit3)

##           Length Class        Mode     
## x         520    ts           numeric  
## m           1    -none-       numeric  
## p           1    -none-       numeric  
## P           1    -none-       numeric  
## scalex      2    -none-       list     
## scalexreg   2    -none-       list     
## size        1    -none-       numeric  
## xreg      520    -none-       numeric  
## subset    520    -none-       numeric  
## model      20    nnetarmodels list     
## nnetargs    0    -none-       list     
## fitted    520    ts           numeric  
## residuals 520    ts           numeric  
## lags       14    -none-       numeric  
## series      1    -none-       character
## method      1    -none-       character
## call        3    -none-       call

autoplot(ts.train.iq.new[,"cases"]) +
  autolayer(iq_fit3$fitted)

## Warning: Removed 52 rows containing missing values (geom_path).

iq.fcast_fit3 <- forecast(iq_fit3, xreg = test.iq.new[,"humidity"])
autoplot(iq.fcast_fit3)

accuracy(iq.fcast_fit3)

##                       ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set -0.05163778 3.197393 2.303775 NaN  Inf 0.2442607 0.0007720475

feature <- rbind(train.feature, test.feature)
feature.sj <- subset(feature, city=="sj")
feature.sj <- na.locf(feature.sj)
colSums(sapply(feature.sj, is.na))

##                                  city 
##                                     0 
##                                  year 
##                                     0 
##                            weekofyear 
##                                     0 
##                       week_start_date 
##                                     0 
##                               ndvi_ne 
##                                     0 
##                               ndvi_nw 
##                                     0 
##                               ndvi_se 
##                                     0 
##                               ndvi_sw 
##                                     0 
##                  precipitation_amt_mm 
##                                     0 
##                 reanalysis_air_temp_k 
##                                     0 
##                 reanalysis_avg_temp_k 
##                                     0 
##           reanalysis_dew_point_temp_k 
##                                     0 
##             reanalysis_max_air_temp_k 
##                                     0 
##             reanalysis_min_air_temp_k 
##                                     0 
##       reanalysis_precip_amt_kg_per_m2 
##                                     0 
##  reanalysis_relative_humidity_percent 
##                                     0 
##          reanalysis_sat_precip_amt_mm 
##                                     0 
## reanalysis_specific_humidity_g_per_kg 
##                                     0 
##                     reanalysis_tdtr_k 
##                                     0 
##                    station_avg_temp_c 
##                                     0 
##               station_diur_temp_rng_c 
##                                     0 
##                    station_max_temp_c 
##                                     0 
##                    station_min_temp_c 
##                                     0 
##                     station_precip_mm 
##                                     0

feature.iq <- subset(feature, city=="iq")
feature.iq <- na.locf(feature.iq)
autoplot(ts(log(feature.sj$reanalysis_specific_humidity_g_per_kg), 
            start=c(1990,18),frequency = 52))

autoplot(ts(feature.iq$reanalysis_specific_humidity_g_per_kg, 
            start=c(2000,26),frequency = 52))

ts.sj.cases <- ts(train.sj$total_cases, 
                  start=c(1990,18),frequency = 52)
ts.sj.humi <- ts(feature.sj$reanalysis_specific_humidity_g_per_kg, 
                 start=c(1990,18),frequency = 52)
ts.iq.cases <- ts(train.iq$total_cases, start=c(2000,26),frequency = 52)
ts.iq.humi <- ts(feature.iq$reanalysis_specific_humidity_g_per_kg, 
                 start=c(2000,26),frequency = 52)
sj.df.humi <- diff(ts.sj.humi)
train_sj.df.humi=sj.df.humi[1:936]
test_sj.df.humi=sj.df.humi[936:1195]

f1 <- auto.arima(ts.sj.cases, xreg = train_sj.df.humi)
summary(f1)

## Series: ts.sj.cases 
## Regression with ARIMA(2,1,5)(0,0,2)[52] errors 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2      ma3     ma4     ma5     sma1
##       -0.7131  -0.8425  0.2742  0.5514  -0.3240  0.1549  0.1410  -0.0645
## s.e.   0.0415   0.0449  0.0508  0.0479   0.0379  0.0361  0.0371   0.0351
##         sma2     xreg
##       0.0692  -0.1143
## s.e.  0.0330   0.7926
## 
## sigma^2 estimated as 618.1:  log likelihood=-4326.9
## AIC=8675.8   AICc=8676.09   BIC=8729.05
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.001647022 24.71486 12.09538 -Inf  Inf 0.3287415 0.005220595

Box.test(ts.sj.cases)

## 
##  Box-Pierce test
## 
## data:  ts.sj.cases
## X-squared = 681.47, df = 1, p-value < 2.2e-16

library(tseries)

## Warning: package 'tseries' was built under R version 3.5.3

lambda_sj.cases <- BoxCox.lambda(ts.sj.cases)
Box.test(BoxCox(ts.sj.cases, lambda = lambda_sj.cases))

## 
##  Box-Pierce test
## 
## data:  BoxCox(ts.sj.cases, lambda = lambda_sj.cases)
## X-squared = 0.016433, df = 1, p-value = 0.898

kpss.test(BoxCox(ts.sj.cases, lambda = lambda_sj.cases))

## Warning in kpss.test(BoxCox(ts.sj.cases, lambda = lambda_sj.cases)): p-
## value greater than printed p-value

## 
##  KPSS Test for Level Stationarity
## 
## data:  BoxCox(ts.sj.cases, lambda = lambda_sj.cases)
## KPSS Level = 0.1353, Truncation lag parameter = 6, p-value = 0.1

autoplot(diff((ts.train.iq.new[,"humidity"])))

f2 <- auto.arima(ts.train.sj.new[,"cases"],
                      xreg = ts.train.sj.new[,"humidity"],
                      lambda = lambda_sj.cases)
summary(f2)

## Series: ts.train.sj.new[, "cases"] 
## Regression with ARIMA(3,0,2) errors 
## Box Cox transformation: lambda= 4.102259e-05 
## 
## Coefficients:
##           ar1     ar2     ar3     ma1      ma2     xreg
##       -0.4386  0.4445  0.3623  0.3946  -0.2322  -5.7972
## s.e.   0.0802  0.0751  0.0313  0.0854   0.0833   5.2959
## 
## sigma^2 estimated as 2184647:  log likelihood=-8156.71
## AIC=16327.42   AICc=16327.54   BIC=16361.31
## 
## Training set error measures:
##                ME RMSE MAE  MPE MAPE MASE ACF1
## Training set -Inf  Inf Inf -Inf  Inf  Inf  NaN