setwd("C:/Users/etital/Desktop")
fludata <- read.csv("ILINet.csv")
fludata <- fludata[,-c(3:8)]
fludata <- fludata[-which(fludata$WEEK==53),]
fludata$ILI.percent <- as.numeric(as.vector(fludata$ILI.percent))
## Warning: NAs introduced by coercion
flu.ts <- ts(as.numeric(fludata$ILI.percent),frequency = 52,start = c(1997,40),end=c(2016,52))
season <- numeric()
max_seasons <- numeric()
for(i in 1:1001){
season <- c(season,fludata$ILI.percent[i])
if(fludata$WEEK[i] == 39){
max_seasons <- c(max_seasons,max(season,na.rm = T))
season <- numeric()
}
}
max_seasons
## [1] 4.910070 5.077597 5.646977 4.186807 3.163820 3.166620 7.064955
## [8] 4.751971 3.052776 3.270195 5.422575 4.603616 7.618892 4.435337
## [15] 2.294100 6.025980 4.387237 6.114967 3.585145
set.seed(1)
km.out <- kmeans (max_seasons,4, nstart =50)
place_max <- which(flu.ts%in%max_seasons)
plot(flu.ts,ylab="ILI.Percent",main="Clusters by peak season,with K=4")
points(time(flu.ts)[place_max],max_seasons,col=(km.out$cluster +1),pch=20,cex=2)
set.seed(1)
km.out <- kmeans (max_seasons,3, nstart =50)
place_max <- which(flu.ts%in%max_seasons)
plot(flu.ts,ylab="ILI.Percent",main="Clusters by peak season,with K=3")
points(time(flu.ts)[place_max],max_seasons,col=(km.out$cluster +1),pch=20,cex=2)
mydata <- max_seasons
wss <- numeric()
for (i in 1:15) wss[i] <- sum(kmeans(mydata,
centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares",main="Elbow method - peak season clusters")
all.area <- numeric()
season_start <- which(fludata$WEEK==40 |fludata$WEEK==39)[seq(1,37,by=2)]
season_end <- which(fludata$WEEK==40 |fludata$WEEK==39)[seq(2,38,by=2)]
for(i in 1:length(season_end)){
all.area <- c(all.area,sum(flu.ts[season_start[i]:season_end[i]],na.rm = T)+sum(abs(diff(flu.ts[season_start[i]:season_end[i]]))/2,na.rm = T))
}
all.area
## [1] 66.43434 68.07663 59.92658 66.19052 56.67827 68.07641 88.00119
## [8] 89.02303 80.72987 80.14763 97.43725 119.42979 114.08713 96.74932
## [15] 76.83413 107.65133 92.49522 104.06709 92.54987
set.seed(1)
km.out.all.area <- kmeans (all.area,4, nstart =50)
plot(flu.ts,ylab="ILI.Percent",main="Clusters by area,with K=4")
for(i in 1:length(season_end)){
polygon(c(time(flu.ts)[season_start[i]],time(flu.ts)[season_start[i]:season_end[i]],time(flu.ts)[season_end[i]]),c(0,fludata$ILI.percent[season_start[i]:season_end[i]],0),col=(km.out.all.area$cluster[i]+1))
}
set.seed(1)
km.out.all.area <- kmeans (all.area,3, nstart =50)
plot(flu.ts,ylab="ILI.Percent",main="Clusters by area,with K=3")
for(i in 1:length(season_end)){
polygon(c(time(flu.ts)[season_start[i]],time(flu.ts)[season_start[i]:season_end[i]],time(flu.ts)[season_end[i]]),c(0,fludata$ILI.percent[season_start[i]:season_end[i]],0),col=(km.out.all.area$cluster[i]+1))
}
mydata <- all.area
wss <- numeric()
for (i in 1:15) wss[i] <- sum(kmeans(mydata,
centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares",main="Elbow method for area clusters")
My main struggle was which k to choose. I think that with k=3 there is more information left unexplained by the clusters, with k=4 i think there is not much left to explain by the clusters. The elbow method helped me to decide that 4 would be the preferable number of clusters.
The train data will be the seasons from 2002 to 2013. The test data will be the seasons from 2013 to 2016.
flu52nona <- fludata[261:1001,] #removing the years of NA's
flu52.ts <- ts(as.numeric(flu52nona$ILI.percent),frequency = 52,start = c(2002,40),end=c(2016,52))
plot(flu52.ts,ylab="ILI.percent")
train <- flu52nona[1:572,] #end (2013,39)
test <- flu52nona[573:741,] #start (2013,40)
rmse <- function(true,predicted){return(sqrt((sum((true-predicted)^2)/length(true))))}
rmspe <- function(true,predicted){return(sqrt((sum(((true-predicted)/true)^2))/length(true))*100)}
mape <- function(true,predicted){return((max((abs(true-predicted))/true))*100)}
ht <- function(true,predicted){return((sum(sign(true[2:length(true)]-true[1:(length(true)-1)])==sign(predicted[2:length(predicted)]-predicted[1:(length(predicted)-1)])))*100/(length(true)-1))}
baseline <- numeric()
for(i in 0:(nrow(test)-4)){
baseline <- c(baseline,mean(flu52nona$ILI.percent[which(flu52nona$WEEK[1:(572+i)]==test$WEEK[4+i])]))
}
plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent")
lines(ts(baseline,start=c(2013,43),end=c(2016,52),frequency = 52), col="blue", lty=2,lwd=2)
I will now find the DF of the smoothing splines that has the lowest rmse on the test data. This model will be chosen to forecast 4 weeks ahead. To find the best DF the process is moving forward 1 step ahead each.
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.14-4
collected_rmse <- numeric()
for(j in 1:10){
pred.values.gam <- numeric()
for(i in 0:165){
fit <- gam(ILI.percent~s(WEEK,df=j)+s(YEAR,df=2),data=flu52nona[1:(572+i),])#gam(ILI.percent~s(YEAR,j)+s(WEEK,j),data=flu52nona[1:(572+i),])
pred.values.gam <- c(pred.values.gam,predict(fit,newdata = test[4+i,]))
}
collected_rmse <- c(collected_rmse,rmse(test$ILI.percent[-c(1:3)],pred.values.gam))
}
DF <- which.min(collected_rmse)
pred.values.gam <- numeric()
for(i in 0:(nrow(test)-4)){
fit <- gam(ILI.percent~s(WEEK,df = DF)+s(YEAR,df = 2),data=flu52nona[1:(572+i),])#gam(ILI.percent~s(YEAR,df = DF)+s(WEEK,df = DF),data=flu52nona[1:(572+i),])
pred.values.gam <- c(pred.values.gam,predict(fit,newdata = test[4+i,]))
}
plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent",main="GAM model")
lines(ts(pred.values.gam,start=c(2013,43),end=c(2016,52),frequency = 52), col="red", lty=2,lwd=2)
I am choosing the best span by the same way was used in the previous method. the range that is checked here is after i checked a very large range from 0.1 to 1. due to a very long computation time,presented here the shorter version of the process.
collected_rmse <- numeric()
for(j in seq(0.71,0.73,by=0.01)){
pred.values.local <- numeric()
for(i in 0:(nrow(test)-4)){
fit <- loess(ILI.percent~WEEK,span = j,control = loess.control(surface = "direct"),data=flu52nona[1:(572+i),])
pred.values.local <- c(pred.values.local,predict(fit,newdata = test[4+i,]))
}
collected_rmse <- c(collected_rmse,rmse(test$ILI.percent[-c(1:3)],pred.values.local))
}
sp <- seq(0.71,0.73,by=0.01)[which.min(collected_rmse)]
pred.values.local <- numeric()
for(i in 0:(nrow(test)-4)){
fit <- loess(ILI.percent~WEEK,span=sp,control = loess.control(surface = "direct"),data=flu52nona[1:(572+i),])
pred.values.local <- c(pred.values.local,predict(fit,newdata = test[4+i,]))
}
plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent",main="Local regression model")
lines(ts(pred.values.local,start=c(2013,43),end=c(2016,52),frequency = 52), col="red", lty=2,lwd=2)
I have noticed that adding the variable YEAR to the model made the predictions accuracy worse. that’s why I used only 1 variable.
A stationary time series is one whose properties do not depend on the time at which the series is observed. First, we need to check if the time series is stationary or not. in order to determine if the time series is stationary i will use the Augmented Dickey-Fuller (ADF) test.
library(forecast)
library(tseries)
adf.test(flu52.ts, alternative = "stationary")
## Warning in adf.test(flu52.ts, alternative = "stationary"): p-value smaller
## than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: flu52.ts
## Dickey-Fuller = -6.5191, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
p-value is 0.01, therefore we can assume the data is stationary.
The auto.arima function chooses the best combination of the parameters p,d,q. p is an autoregression parameter, d is a difference parameter (if the time series is not stationary) and q is a moving average parameter. Although i did an ADF test, the auto.arima uses repeated KPSS tests in order to determine the d parameter. I would expect that the d parameter will be 0 according to the test above.
In order to reduce variance i set the lambda to be 0 (log transformation).
train.ts <-ts(as.numeric(flu52nona$ILI.percent),frequency = 52,start=c(2002,40),end = c(2013,39))
test.ts <-ts(as.numeric(flu52nona$ILI.percent),frequency = 52,start = c(2013,40),end=c(2016,52))
plot(log(train.ts))
train.model <- auto.arima(train.ts,lambda = 0)
train.model
## Series: train.ts
## ARIMA(2,0,2)(1,0,0)[52] with non-zero mean
## Box Cox transformation: lambda= 0
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 mean
## 1.7722 -0.8021 -0.6905 0.0932 0.3234 0.3437
## s.e. 0.0631 0.0591 0.0729 0.0524 0.0425 0.1004
##
## sigma^2 estimated as 0.01629: log likelihood=364.6
## AIC=-715.2 AICc=-715.01 BIC=-684.76
Checking the residuals of the model.
Acf(residuals(train.model))
The ACF plot of the residuals from the ARIMA model shows almost all correlations within the threshold limits indicating that the residuals are behaving like white noise. I am assuming that the correlations outside the threshold limits are due to chance, as the threshold is a 95% confidence interval it is expected that 5% of the correlations might be out of the threshold limits.
pred.values.arima <- data.frame()
for(j in c(2013,2014,2015,2016)){
if(j==2013){
for(i in 0:13){
newfit <- Arima(window(flu52.ts,end=c(j,39+i)), model=train.model)
pred.values.arima <- rbind(pred.values.arima,as.data.frame(forecast(newfit,h= 4))[4,])
}
}
if(j!=2013){
for(k in 1:52){
newfit <- Arima(window(flu52.ts,end=c(j,k)), model=train.model)
pred.values.arima <- rbind(pred.values.arima,as.data.frame(forecast(newfit,h= 4))[4,])
}
}
}
plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent",main="Arima model")
lines(ts(pred.values.arima[-c(167:170),1],start=c(2013,43),end=c(2016,52),frequency = 52), col="green", lty=2,lwd=2)
It is important to understand that each time the model moved 1 step ahead it didn’t re-estimate the model’s coefficients as the previous models did. This is because it is beyond the capability of my computer. I think that the model would be much more accurate if it was possible to re-estimate the coefficients each time it moved forward. Additionally, the Arima model predicts one step ahead at a time, so the 4th week prediction is based on the predictions of the previous weeks.
cor.baseline <- cor(test$ILI.percent[-c(1:3)],baseline)
cor.gam <- cor(test$ILI.percent[-c(1:3)],as.vector(pred.values.gam))
cor.local.reg <- cor(test$ILI.percent[-c(1:3)],as.vector(pred.values.local))
cor.arima <- cor(test$ILI.percent[-c(1:3)],pred.values.arima[-c(167:170),1])
rmse.baseline <- rmse(test$ILI.percent[-c(1:3)],baseline)
rmse.gam <- rmse(test$ILI.percent[-c(1:3)],as.vector(pred.values.gam))
rmse.local.reg <- rmse(test$ILI.percent[-c(1:3)],as.vector(pred.values.local))
rmse.arima <- rmse(test$ILI.percent[-c(1:3)],pred.values.arima[-c(167:170),1])
rmspe.gam <- rmspe(test$ILI.percent[-c(1:3)],as.vector(pred.values.gam))
rmspe.baseline <- rmspe(test$ILI.percent[-c(1:3)],baseline)
rmspe.local.reg <- rmspe(test$ILI.percent[-c(1:3)],as.vector(pred.values.local))
rmspe.arima <- rmspe(test$ILI.percent[-c(1:3)],pred.values.arima[-c(167:170),1])
mape.gam <- mape(test$ILI.percent[-c(1:3)],as.vector(pred.values.gam))
mape.baseline <- mape(test$ILI.percent[-c(1:3)],baseline)
mape.local.reg <- mape(test$ILI.percent[-c(1:3)],as.vector(pred.values.local))
mape.arima <- mape(test$ILI.percent[-c(1:3)],pred.values.arima[-c(167:170),1])
ht.gam <- ht(test$ILI.percent[-c(1:3)],as.vector(pred.values.gam))
ht.baseline <- ht(test$ILI.percent[-c(1:3)],baseline)
ht.local.reg <- ht(test$ILI.percent[-c(1:3)],as.vector(pred.values.local))
ht.arima <- ht(test$ILI.percent[-c(1:3)],pred.values.arima[-c(167:170),1])
accuracy.measures.table <- data.frame(c("Baseline","Local regression","GAM with splines","Arima"),c(cor.baseline,cor.local.reg,cor.gam,cor.arima),c(rmse.baseline,rmse.local.reg,rmse.gam,rmse.arima),c(rmspe.baseline,rmspe.local.reg,rmspe.gam,rmspe.arima),c(mape.baseline,mape.local.reg,mape.gam,mape.arima),c(ht.baseline,ht.local.reg,ht.gam,ht.arima))
colnames(accuracy.measures.table) <-c("Method","Pearson","RMSE","RMSPE","MAPE","HT")
accuracy.measures.table
## Method Pearson RMSE RMSPE MAPE HT
## 1 Baseline 0.8306082 0.5223886 18.75786 50.38119 70.90909
## 2 Local regression 0.8530220 0.5007663 17.53816 55.15203 80.00000
## 3 GAM with splines 0.8389832 0.5117104 20.44687 49.57888 77.57576
## 4 Arima 0.8642133 0.4857432 16.23125 44.00672 65.45455