setwd('/home/daria/Courses/R/Edx/Linear Regression')
flu <- read.csv('FluTrain.csv')
# find the highest percentage of ILI-related physician visits
flu[which.max(flu$ILI),]
## Week ILI Queries
## 303 2009-10-18 - 2009-10-24 7.618892 1
# find the highest percentage of ILI-related query fraction
flu[which.max(flu$Queries),]
## Week ILI Queries
## 303 2009-10-18 - 2009-10-24 7.618892 1
# plot hist
hist(flu$ILI)

plot(flu$Queries, log(flu$ILI))
FluTrend1 <- lm(log(ILI) ~ Queries, data = flu)
summary(FluTrend1)
##
## Call:
## lm(formula = log(ILI) ~ Queries, data = flu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76003 -0.19696 -0.01657 0.18685 1.06450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.49934 0.03041 -16.42 <2e-16 ***
## Queries 2.96129 0.09312 31.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2995 on 415 degrees of freedom
## Multiple R-squared: 0.709, Adjusted R-squared: 0.7083
## F-statistic: 1011 on 1 and 415 DF, p-value: < 2.2e-16
# == to R-squared
(cor(log(flu$ILI), flu$Queries))^2
## [1] 0.7090201
# load test data
fluTest <- read.csv('FluTest.csv')
# prediction using exp to convert from log back to normal
PredTest1 = exp(predict(FluTrend1, newdata=fluTest))
PredTest1
## 1 2 3 4 5 6 7 8
## 3.520332 2.662689 2.673181 2.510160 2.451624 2.694289 2.780402 2.673181
## 9 10 11 12 13 14 15 16
## 2.375693 2.357081 2.187378 1.796914 1.789861 1.714084 1.789861 1.615894
## 17 18 19 20 21 22 23 24
## 1.529333 1.499555 1.380689 1.487806 1.375269 1.317045 1.359139 1.276254
## 25 26 27 28 29 30 31 32
## 1.317045 1.231872 1.276254 1.332675 1.256335 1.276254 1.266255 1.281283
## 33 34 35 36 37 38 39 40
## 1.348491 1.447408 1.523330 1.628654 1.755010 2.094771 2.195997 2.357081
## 41 42 43 44 45 46 47 48
## 2.422870 2.338614 2.329436 2.539950 2.442001 2.858006 2.758619 3.690445
## 49 50 51 52
## 4.439679 4.898351 6.250888 6.591252
which(fluTest$Week == "2012-03-11 - 2012-03-17")
## [1] 11
PredTest1[11]
## 11
## 2.187378
# calculate relative error
relative_error <- (fluTest$ILI[11] - PredTest1[11]) / fluTest$ILI[11]
# calculate Root Mean Square Error (RMSE)
SSE <- sum((PredTest1-fluTest$ILI)^2)
RMSE <- sqrt(SSE / nrow(fluTest))
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric

flu$ILILag2 = lag(zoo(flu$ILI), -2, na.pad=TRUE)
plot( log(flu$ILILag2), log(flu$ILI) )

FluTrend2 <- lm(log(ILI) ~ Queries + log(ILILag2), data = flu)
summary(FluTrend2)
##
## Call:
## lm(formula = log(ILI) ~ Queries + log(ILILag2), data = flu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52209 -0.11082 -0.01819 0.08143 0.76785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.24064 0.01953 -12.32 <2e-16 ***
## Queries 1.25578 0.07910 15.88 <2e-16 ***
## log(ILILag2) 0.65569 0.02251 29.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1703 on 412 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.9063, Adjusted R-squared: 0.9059
## F-statistic: 1993 on 2 and 412 DF, p-value: < 2.2e-16
ILILag2_test = lag(zoo(fluTest$ILI), -2, na.pad=TRUE)
fluTest$ILILag2 = coredata(ILILag2_test)
# fill in missing values
fluTest$ILILag2[1] <- flu$ILI[416]
fluTest$ILILag2[2] <- flu$ILI[417]
PredTest2 = exp(predict(FluTrend2, newdata=fluTest))
PredTest2
## 1 2 3 4 5 6 7 8
## 2.482236 2.411829 2.140941 1.907817 1.971504 2.081855 2.254461 2.217595
## 9 10 11 12 13 14 15 16
## 2.223407 2.275949 2.199254 2.028643 2.143046 1.874078 1.789706 1.670672
## 17 18 19 20 21 22 23 24
## 1.515187 1.425732 1.315357 1.405183 1.325919 1.275099 1.299165 1.279013
## 25 26 27 28 29 30 31 32
## 1.205075 1.120431 1.117144 1.153245 1.102756 1.025998 1.003550 1.018898
## 33 34 35 36 37 38 39 40
## 1.068151 1.073276 1.085258 1.220592 1.274901 1.437860 1.517127 1.587165
## 41 42 43 44 45 46 47 48
## 1.638432 1.562795 1.671433 1.720690 1.736069 1.966300 2.042260 2.424141
## 49 50 51 52
## 3.160283 3.220680 4.322513 5.006438
# calculate Root Mean Square Error (RMSE)
SSE <- sum((PredTest2-fluTest$ILI)^2)
RMSE <- sqrt(SSE / nrow(fluTest))
RMSE
## [1] 0.2942029