Influenza project

Clustering

Clustering by peak season

setwd("C:/Users/etital/Desktop")
fludata <- read.csv("ILINet.csv")
fludata <- fludata[, -c(3:8)]                    # drop unused columns
fludata <- fludata[-which(fludata$WEEK == 53), ] # drop week 53 so every season has 52 weeks
fludata$ILI.percent <- as.numeric(as.vector(fludata$ILI.percent))  # non-numeric entries become NA
## Warning: NAs introduced by coercion
flu.ts <- ts(as.numeric(fludata$ILI.percent), frequency = 52,
             start = c(1997, 40), end = c(2016, 52))


# Collect the peak (maximum ILI percentage) of each season,
# where a season runs from week 40 through week 39 of the next year
season <- numeric()
max_seasons <- numeric()
for (i in 1:1001) {
  season <- c(season, fludata$ILI.percent[i])
  if (fludata$WEEK[i] == 39) {  # week 39 closes a season
    max_seasons <- c(max_seasons, max(season, na.rm = TRUE))
    season <- numeric()
  }
}
max_seasons
##  [1] 4.910070 5.077597 5.646977 4.186807 3.163820 3.166620 7.064955
##  [8] 4.751971 3.052776 3.270195 5.422575 4.603616 7.618892 4.435337
## [15] 2.294100 6.025980 4.387237 6.114967 3.585145
set.seed(1)
km.out <- kmeans(max_seasons, centers = 4, nstart = 50)
place_max <- which(flu.ts %in% max_seasons)  # positions of the seasonal peaks in the series
plot(flu.ts, ylab = "ILI.Percent", main = "Clusters by peak season, with K=4")
points(time(flu.ts)[place_max], max_seasons, col = (km.out$cluster + 1), pch = 20, cex = 2)

set.seed(1)
km.out <- kmeans(max_seasons, centers = 3, nstart = 50)
place_max <- which(flu.ts %in% max_seasons)
plot(flu.ts, ylab = "ILI.Percent", main = "Clusters by peak season, with K=3")
points(time(flu.ts)[place_max], max_seasons, col = (km.out$cluster + 1), pch = 20, cex = 2)

mydata <- max_seasons
wss <- numeric()
for (i in 1:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", main = "Elbow method - peak season clusters")

Clustering by area

# Approximate the area under the ILI curve of each season:
# the sum of the weekly values plus half the total absolute week-to-week change
all.area <- numeric()
season_start <- which(fludata$WEEK == 40 | fludata$WEEK == 39)[seq(1, 37, by = 2)]
season_end   <- which(fludata$WEEK == 40 | fludata$WEEK == 39)[seq(2, 38, by = 2)]
for (i in 1:length(season_end)) {
  s <- flu.ts[season_start[i]:season_end[i]]
  all.area <- c(all.area, sum(s, na.rm = TRUE) + sum(abs(diff(s)) / 2, na.rm = TRUE))
}

all.area
##  [1]  66.43434  68.07663  59.92658  66.19052  56.67827  68.07641  88.00119
##  [8]  89.02303  80.72987  80.14763  97.43725 119.42979 114.08713  96.74932
## [15]  76.83413 107.65133  92.49522 104.06709  92.54987
set.seed(1)
km.out.all.area <- kmeans(all.area, centers = 4, nstart = 50)
plot(flu.ts, ylab = "ILI.Percent", main = "Clusters by area, with K=4")
for (i in 1:length(season_end)) {  # shade each season in its cluster's colour
  idx <- season_start[i]:season_end[i]
  polygon(c(time(flu.ts)[season_start[i]], time(flu.ts)[idx], time(flu.ts)[season_end[i]]),
          c(0, fludata$ILI.percent[idx], 0), col = (km.out.all.area$cluster[i] + 1))
}

set.seed(1)
km.out.all.area <- kmeans(all.area, centers = 3, nstart = 50)
plot(flu.ts, ylab = "ILI.Percent", main = "Clusters by area, with K=3")
for (i in 1:length(season_end)) {
  idx <- season_start[i]:season_end[i]
  polygon(c(time(flu.ts)[season_start[i]], time(flu.ts)[idx], time(flu.ts)[season_end[i]]),
          c(0, fludata$ILI.percent[idx], 0), col = (km.out.all.area$cluster[i] + 1))
}

mydata <- all.area
wss <- numeric()
for (i in 1:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", main = "Elbow method for area clusters")

My main struggle was which k to choose. With k=3 I think too much information is left unexplained by the clusters, while with k=4 there is not much left for the clusters to explain. The elbow method helped me decide that 4 is the preferable number of clusters.
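
To make the elbow more concrete, one can quantify how much each additional cluster reduces the within-group sum of squares. This is an illustrative check, not part of the original analysis:

set.seed(1)
# WSS for k = 1..8 clusters of the seasonal peaks
wss.peak <- sapply(1:8, function(k) kmeans(max_seasons, centers = k, nstart = 50)$tot.withinss)
# Fractional drop in WSS when moving from k to k+1 clusters;
# the elbow is where these drops become small
round(-diff(wss.peak) / head(wss.peak, -1), 2)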

Forecasting

The training data will be the seasons from 2002 to 2013; the test data will be the seasons from 2013 to 2016.

flu52nona <- fludata[261:1001, ]  # removing the early years that contain NAs
flu52.ts <- ts(as.numeric(flu52nona$ILI.percent), frequency = 52,
               start = c(2002, 40), end = c(2016, 52))
plot(flu52.ts, ylab = "ILI.percent")

train <- flu52nona[1:572, ]    # ends at (2013, 39)
test  <- flu52nona[573:741, ]  # starts at (2013, 40)

Accuracy measures

RMSE

rmse <- function(true, predicted) sqrt(sum((true - predicted)^2) / length(true))

RMSPE

rmspe <- function(true, predicted) sqrt(sum(((true - predicted) / true)^2) / length(true)) * 100

MAPE

Note that this implementation returns the maximum, not the mean, of the absolute percentage errors, so it measures the worst single prediction.

mape <- function(true, predicted) max(abs(true - predicted) / true) * 100

HT

HT is the hit rate: the percentage of week-to-week moves whose direction (up or down) the prediction got right.

ht <- function(true, predicted) sum(sign(diff(true)) == sign(diff(predicted))) * 100 / (length(true) - 1)
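
As a quick sanity check, here is what the four measures return on a small made-up example (toy numbers, for illustration only):

true <- c(1, 2, 4, 3, 2)
pred <- c(1.2, 1.8, 3.5, 3.2, 2.1)
rmse(true, pred)   # error in the original units
rmspe(true, pred)  # error as a percentage
mape(true, pred)   # worst-case absolute percentage error
ht(true, pred)     # 100: every up/down move was predicted correctly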

Baseline predictor

The baseline forecast for a given test week is the mean ILI percentage observed in that calendar week over all the data available at the forecast origin, four weeks earlier, matching the 4-weeks-ahead setup of the other models.

baseline <- numeric()
for (i in 0:(nrow(test) - 4)) {
  # mean of the target calendar week over all data available 4 weeks earlier
  baseline <- c(baseline, mean(flu52nona$ILI.percent[which(flu52nona$WEEK[1:(572 + i)] == test$WEEK[4 + i])]))
}
plot(window(flu52.ts, start = c(2013, 43)), ylab = "ILI.percent")
lines(ts(baseline, start = c(2013, 43), end = c(2016, 52), frequency = 52),
      col = "blue", lty = 2, lwd = 2)

Forecast by GAM with smoothing splines

I will now find the degrees of freedom (DF) of the smoothing spline that gives the lowest RMSE on the test data; that model will be chosen to forecast 4 weeks ahead. To find the best DF, the fit rolls forward one step at a time, re-estimating the model at each step.

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.14-4
collected_rmse <- numeric()
for (j in 1:10) {  # candidate DF values for the WEEK spline
  pred.values.gam <- numeric()
  for (i in 0:165) {  # 165 = nrow(test) - 4; roll forward one week at a time
    fit <- gam(ILI.percent ~ s(WEEK, df = j) + s(YEAR, df = 2), data = flu52nona[1:(572 + i), ])
    pred.values.gam <- c(pred.values.gam, predict(fit, newdata = test[4 + i, ]))
  }
  collected_rmse <- c(collected_rmse, rmse(test$ILI.percent[-c(1:3)], pred.values.gam))
}

DF <- which.min(collected_rmse)  # DF with the lowest rolling RMSE
pred.values.gam <- numeric()
for (i in 0:(nrow(test) - 4)) {
  fit <- gam(ILI.percent ~ s(WEEK, df = DF) + s(YEAR, df = 2), data = flu52nona[1:(572 + i), ])
  pred.values.gam <- c(pred.values.gam, predict(fit, newdata = test[4 + i, ]))
}

plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent",main="GAM model")
lines(ts(pred.values.gam,start=c(2013,43),end=c(2016,52),frequency = 52), col="red", lty=2,lwd=2)

Forecast by local regression

I choose the best span in the same way as in the previous method. The range checked here (0.71 to 0.73) was narrowed down after first scanning a much wider range, from 0.1 to 1; because that full scan takes a very long time to compute, only the shorter version of the process is presented here.

collected_rmse <- numeric()
for (j in seq(0.71, 0.73, by = 0.01)) {  # candidate span values
  pred.values.local <- numeric()
  for (i in 0:(nrow(test) - 4)) {
    fit <- loess(ILI.percent ~ WEEK, span = j, control = loess.control(surface = "direct"),
                 data = flu52nona[1:(572 + i), ])
    pred.values.local <- c(pred.values.local, predict(fit, newdata = test[4 + i, ]))
  }
  collected_rmse <- c(collected_rmse, rmse(test$ILI.percent[-c(1:3)], pred.values.local))
}

sp <- seq(0.71, 0.73, by = 0.01)[which.min(collected_rmse)]  # best span
pred.values.local <- numeric()
for (i in 0:(nrow(test) - 4)) {
  fit <- loess(ILI.percent ~ WEEK, span = sp, control = loess.control(surface = "direct"),
               data = flu52nona[1:(572 + i), ])
  pred.values.local <- c(pred.values.local, predict(fit, newdata = test[4 + i, ]))
}

plot(window(flu52.ts,start=c(2013,43)),ylab="ILI.percent",main="Local regression model")
lines(ts(pred.values.local,start=c(2013,43),end=c(2016,52),frequency = 52), col="red", lty=2,lwd=2)

I noticed that adding the YEAR variable to the model made the prediction accuracy worse, which is why I used only one predictor.
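
A quick, non-rolling way to see this effect is to fit both specifications once on the training data and compare their test errors. This is only a rough illustration under the chosen span, not the rolling evaluation used above:

fit.week <- loess(ILI.percent ~ WEEK, span = sp,
                  control = loess.control(surface = "direct"), data = train)
fit.both <- loess(ILI.percent ~ WEEK + YEAR, span = sp,
                  control = loess.control(surface = "direct"), data = train)
rmse(test$ILI.percent, predict(fit.week, newdata = test))  # WEEK only
rmse(test$ILI.percent, predict(fit.both, newdata = test))  # WEEK + YEAR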

Forecast by ARIMA model

A stationary time series is one whose properties do not depend on the time at which the series is observed. First, we need to check whether the time series is stationary; to do so I will use the Augmented Dickey-Fuller (ADF) test.

library(forecast)
library(tseries)
adf.test(flu52.ts, alternative = "stationary")
## Warning in adf.test(flu52.ts, alternative = "stationary"): p-value smaller
## than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  flu52.ts
## Dickey-Fuller = -6.5191, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary

The p-value is at most 0.01 (the warning notes it is smaller than the printed value), so we reject the unit-root null hypothesis and can treat the series as stationary.

The auto.arima function chooses the best combination of the parameters p, d and q: p is the autoregressive order, d is the order of differencing (used when the series is not stationary) and q is the moving-average order. Although I ran an ADF test, auto.arima uses repeated KPSS tests to determine d; given the test above I would expect d to be 0.
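
The same KPSS-based decision can be checked directly with the forecast package's ndiffs() function; a minimal check, not part of the original analysis:

library(forecast)
ndiffs(flu52.ts, test = "kpss")  # expected to return 0, agreeing with the ADF result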

To stabilize the variance I set lambda to 0, which corresponds to a log transformation: the model is fitted on the log scale and the forecasts are back-transformed automatically.
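
A one-line check of that equivalence (illustrative; it assumes strictly positive values, which holds for this series):

all.equal(as.numeric(BoxCox(flu52.ts, lambda = 0)),
          as.numeric(log(flu52.ts)))  # TRUE: lambda = 0 is exactly the natural log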

train.ts <- ts(as.numeric(flu52nona$ILI.percent), frequency = 52, start = c(2002, 40), end = c(2013, 39))
test.ts  <- ts(as.numeric(flu52nona$ILI.percent), frequency = 52, start = c(2013, 40), end = c(2016, 52))
plot(log(train.ts))

train.model <- auto.arima(train.ts,lambda = 0)
train.model
## Series: train.ts 
## ARIMA(2,0,2)(1,0,0)[52] with non-zero mean 
## Box Cox transformation: lambda= 0 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    sar1    mean
##       1.7722  -0.8021  -0.6905  0.0932  0.3234  0.3437
## s.e.  0.0631   0.0591   0.0729  0.0524  0.0425  0.1004
## 
## sigma^2 estimated as 0.01629:  log likelihood=364.6
## AIC=-715.2   AICc=-715.01   BIC=-684.76

Checking the residuals of the model.

Acf(residuals(train.model))

The ACF plot of the residuals from the ARIMA model shows almost all autocorrelations within the threshold limits, indicating that the residuals behave like white noise. I assume the few correlations outside the limits are due to chance: the bounds form a 95% confidence interval, so about 5% of the lags are expected to fall outside them even for white noise.
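
A complementary check is a portmanteau test on the residuals; a minimal sketch using the base Ljung-Box test (not part of the original analysis; fitdf = 5 counts the estimated ar1, ar2, ma1, ma2 and sar1 coefficients):

Box.test(residuals(train.model), lag = 104, type = "Ljung-Box", fitdf = 5)
# lag = 104 covers two seasonal cycles; a large p-value supports white noise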

ARIMA prediction

pred.values.arima <- data.frame()
for (j in c(2013, 2014, 2015, 2016)) {
  if (j == 2013) {
    # the first test season starts at week 40 of 2013
    for (i in 0:13) {
      newfit <- Arima(window(flu52.ts, end = c(j, 39 + i)), model = train.model)  # reuse coefficients
      pred.values.arima <- rbind(pred.values.arima, as.data.frame(forecast(newfit, h = 4))[4, ])  # keep the 4-weeks-ahead forecast
    }
  } else {
    for (k in 1:52) {
      newfit <- Arima(window(flu52.ts, end = c(j, k)), model = train.model)
      pred.values.arima <- rbind(pred.values.arima, as.data.frame(forecast(newfit, h = 4))[4, ])
    }
  }
}

plot(window(flu52.ts, start = c(2013, 43)), ylab = "ILI.percent", main = "ARIMA model")
lines(ts(pred.values.arima[-c(167:170), 1],  # drop the forecasts that extend past the test period
         start = c(2013, 43), end = c(2016, 52), frequency = 52), col = "green", lty = 2, lwd = 2)

It is important to understand that each time the model moved one step ahead it did not re-estimate the model's coefficients, as the previous methods did; that is beyond the capability of my computer. I think the model would be considerably more accurate if the coefficients could be re-estimated at every step. Additionally, the ARIMA model predicts one step ahead at a time, so the 4-weeks-ahead prediction is built on the predictions for the intervening weeks.
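
The mechanism that keeps the loop fast is the model argument of Arima(): passing a previously fitted model applies its coefficients to the new data without re-estimation. A minimal sketch of the two options (the re-estimating call is shown for comparison only and was not run in this analysis):

# fast: reuse the coefficients estimated on the training period
reused <- Arima(window(flu52.ts, end = c(2014, 10)), model = train.model)
forecast(reused, h = 4)  # iterated one-step-ahead forecasts, 4 weeks out

# slow: re-estimate a model of the same order on the extended data
# refit <- Arima(window(flu52.ts, end = c(2014, 10)),
#                order = c(2, 0, 2), seasonal = c(1, 0, 0),
#                include.mean = TRUE, lambda = 0)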

Accuracy measures table

# Pearson correlations
cor.baseline  <- cor(test$ILI.percent[-c(1:3)], baseline)
cor.gam       <- cor(test$ILI.percent[-c(1:3)], as.vector(pred.values.gam))
cor.local.reg <- cor(test$ILI.percent[-c(1:3)], as.vector(pred.values.local))
cor.arima     <- cor(test$ILI.percent[-c(1:3)], pred.values.arima[-c(167:170), 1])

# RMSE
rmse.baseline  <- rmse(test$ILI.percent[-c(1:3)], baseline)
rmse.gam       <- rmse(test$ILI.percent[-c(1:3)], as.vector(pred.values.gam))
rmse.local.reg <- rmse(test$ILI.percent[-c(1:3)], as.vector(pred.values.local))
rmse.arima     <- rmse(test$ILI.percent[-c(1:3)], pred.values.arima[-c(167:170), 1])

# RMSPE
rmspe.baseline  <- rmspe(test$ILI.percent[-c(1:3)], baseline)
rmspe.gam       <- rmspe(test$ILI.percent[-c(1:3)], as.vector(pred.values.gam))
rmspe.local.reg <- rmspe(test$ILI.percent[-c(1:3)], as.vector(pred.values.local))
rmspe.arima     <- rmspe(test$ILI.percent[-c(1:3)], pred.values.arima[-c(167:170), 1])

# MAPE (maximum absolute percentage error, see note above)
mape.baseline  <- mape(test$ILI.percent[-c(1:3)], baseline)
mape.gam       <- mape(test$ILI.percent[-c(1:3)], as.vector(pred.values.gam))
mape.local.reg <- mape(test$ILI.percent[-c(1:3)], as.vector(pred.values.local))
mape.arima     <- mape(test$ILI.percent[-c(1:3)], pred.values.arima[-c(167:170), 1])

# Hit rate of predicted direction
ht.baseline  <- ht(test$ILI.percent[-c(1:3)], baseline)
ht.gam       <- ht(test$ILI.percent[-c(1:3)], as.vector(pred.values.gam))
ht.local.reg <- ht(test$ILI.percent[-c(1:3)], as.vector(pred.values.local))
ht.arima     <- ht(test$ILI.percent[-c(1:3)], pred.values.arima[-c(167:170), 1])

accuracy.measures.table <- data.frame(
  Method  = c("Baseline", "Local regression", "GAM with splines", "Arima"),
  Pearson = c(cor.baseline, cor.local.reg, cor.gam, cor.arima),
  RMSE    = c(rmse.baseline, rmse.local.reg, rmse.gam, rmse.arima),
  RMSPE   = c(rmspe.baseline, rmspe.local.reg, rmspe.gam, rmspe.arima),
  MAPE    = c(mape.baseline, mape.local.reg, mape.gam, mape.arima),
  HT      = c(ht.baseline, ht.local.reg, ht.gam, ht.arima))

accuracy.measures.table
##             Method   Pearson      RMSE    RMSPE     MAPE       HT
## 1         Baseline 0.8306082 0.5223886 18.75786 50.38119 70.90909
## 2 Local regression 0.8530220 0.5007663 17.53816 55.15203 80.00000
## 3 GAM with splines 0.8389832 0.5117104 20.44687 49.57888 77.57576
## 4            Arima 0.8642133 0.4857432 16.23125 44.00672 65.45455