setwd('/home/daria/Courses/R/Edx/Linear Regression')
flu <- read.csv('FluTrain.csv')

# find the highest percentage of ILI-related physician visits
flu[which.max(flu$ILI),]
##                        Week      ILI Queries
## 303 2009-10-18 - 2009-10-24 7.618892       1
# find  the highest percentage of ILI-related query fraction
flu[which.max(flu$Queries),]
##                        Week      ILI Queries
## 303 2009-10-18 - 2009-10-24 7.618892       1
# plot hist
hist(flu$ILI)

plot(flu$Queries, log(flu$ILI))

FluTrend1 <- lm(log(ILI) ~ Queries, data = flu)
summary(FluTrend1)
## 
## Call:
## lm(formula = log(ILI) ~ Queries, data = flu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76003 -0.19696 -0.01657  0.18685  1.06450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.49934    0.03041  -16.42   <2e-16 ***
## Queries      2.96129    0.09312   31.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2995 on 415 degrees of freedom
## Multiple R-squared:  0.709,  Adjusted R-squared:  0.7083 
## F-statistic:  1011 on 1 and 415 DF,  p-value: < 2.2e-16
# == to R-squared
(cor(log(flu$ILI), flu$Queries))^2
## [1] 0.7090201
# load test data
fluTest <- read.csv('FluTest.csv')

# prediction using exp to convert from log back to normal
PredTest1 = exp(predict(FluTrend1, newdata=fluTest))
PredTest1
##        1        2        3        4        5        6        7        8 
## 3.520332 2.662689 2.673181 2.510160 2.451624 2.694289 2.780402 2.673181 
##        9       10       11       12       13       14       15       16 
## 2.375693 2.357081 2.187378 1.796914 1.789861 1.714084 1.789861 1.615894 
##       17       18       19       20       21       22       23       24 
## 1.529333 1.499555 1.380689 1.487806 1.375269 1.317045 1.359139 1.276254 
##       25       26       27       28       29       30       31       32 
## 1.317045 1.231872 1.276254 1.332675 1.256335 1.276254 1.266255 1.281283 
##       33       34       35       36       37       38       39       40 
## 1.348491 1.447408 1.523330 1.628654 1.755010 2.094771 2.195997 2.357081 
##       41       42       43       44       45       46       47       48 
## 2.422870 2.338614 2.329436 2.539950 2.442001 2.858006 2.758619 3.690445 
##       49       50       51       52 
## 4.439679 4.898351 6.250888 6.591252
which(fluTest$Week == "2012-03-11 - 2012-03-17")
## [1] 11
PredTest1[11]
##       11 
## 2.187378
# calculate relative error
relative_error <- (fluTest$ILI[11] - PredTest1[11]) / fluTest$ILI[11]


# calculate Root Mean Square Error (RMSE)
SSE <- sum((PredTest1-fluTest$ILI)^2)
RMSE <- sqrt(SSE / nrow(fluTest))


library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

flu$ILILag2 = lag(zoo(flu$ILI), -2, na.pad=TRUE)

plot( log(flu$ILILag2), log(flu$ILI) )

FluTrend2 <- lm(log(ILI) ~ Queries + log(ILILag2), data = flu)
summary(FluTrend2)
## 
## Call:
## lm(formula = log(ILI) ~ Queries + log(ILILag2), data = flu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52209 -0.11082 -0.01819  0.08143  0.76785 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.24064    0.01953  -12.32   <2e-16 ***
## Queries       1.25578    0.07910   15.88   <2e-16 ***
## log(ILILag2)  0.65569    0.02251   29.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1703 on 412 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.9063, Adjusted R-squared:  0.9059 
## F-statistic:  1993 on 2 and 412 DF,  p-value: < 2.2e-16
ILILag2_test = lag(zoo(fluTest$ILI), -2, na.pad=TRUE)
fluTest$ILILag2 = coredata(ILILag2_test)


# fill in missing values
fluTest$ILILag2[1] <- flu$ILI[416]
fluTest$ILILag2[2] <- flu$ILI[417]

PredTest2 = exp(predict(FluTrend2, newdata=fluTest))
PredTest2
##        1        2        3        4        5        6        7        8 
## 2.482236 2.411829 2.140941 1.907817 1.971504 2.081855 2.254461 2.217595 
##        9       10       11       12       13       14       15       16 
## 2.223407 2.275949 2.199254 2.028643 2.143046 1.874078 1.789706 1.670672 
##       17       18       19       20       21       22       23       24 
## 1.515187 1.425732 1.315357 1.405183 1.325919 1.275099 1.299165 1.279013 
##       25       26       27       28       29       30       31       32 
## 1.205075 1.120431 1.117144 1.153245 1.102756 1.025998 1.003550 1.018898 
##       33       34       35       36       37       38       39       40 
## 1.068151 1.073276 1.085258 1.220592 1.274901 1.437860 1.517127 1.587165 
##       41       42       43       44       45       46       47       48 
## 1.638432 1.562795 1.671433 1.720690 1.736069 1.966300 2.042260 2.424141 
##       49       50       51       52 
## 3.160283 3.220680 4.322513 5.006438
# calculate Root Mean Square Error (RMSE)
SSE <- sum((PredTest2-fluTest$ILI)^2)
RMSE <- sqrt(SSE / nrow(fluTest))
RMSE
## [1] 0.2942029