In this lesson students will learn how to…
During the COVID-19 pandemic, several researchers published work on estimating and forecasting the number of cases. Due to the non-linear nature of these data, many of the researchers chose to fit polynomial regression models.
Read more here:
library(tidyverse)
covid<-read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/complete.csv")
str(covid)
## 'data.frame': 4692 obs. of 10 variables:
## $ Date : chr "2020-01-30" "2020-01-31" "2020-02-01" "2020-02-02" ...
## $ Name.of.State...UT : chr "Kerala" "Kerala" "Kerala" "Kerala" ...
## $ Latitude : num 10.9 10.9 10.9 10.9 10.9 ...
## $ Longitude : num 76.3 76.3 76.3 76.3 76.3 ...
## $ Total.Confirmed.cases : num 1 1 2 3 3 3 3 3 3 3 ...
## $ Death : chr "0" "0" "0" "0" ...
## $ Cured.Discharged.Migrated: num 0 0 0 0 0 0 0 0 0 0 ...
## $ New.cases : int 0 0 1 1 0 0 0 0 0 0 ...
## $ New.deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ New.recovered : int 0 0 0 0 0 0 0 0 0 0 ...
These data are pretty raw: daily counts of COVID-19 cases for each state/union territory in India (for states with affected residents) from 1/30/2020 to 8/30/2020.
We are going to need to wrangle them:
library(magrittr)
## NEEDS TO BE A DATE OBJECT
covid$Date<-as.Date(covid$Date)
## CREATE A VECTOR OF UNIQUE DATES
dateRange<-unique(covid$Date)
## INITIALIZE EMPTY VECTOR FOR CASES
cases<-c()
for(i in 1:length(dateRange)){
  ## CALCULATE NUMBER OF NEW CASES ACROSS ALL STATES (NOTE THE +1 ADDED TO EACH DAILY TOTAL)
  covidThisDate<-covid%>%
    filter(Date==dateRange[i])%$%
    sum(New.cases)+1
  cases<-c(cases, covidThisDate)
}
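As an aside, the same daily totals can be computed without an explicit loop by grouping on Date. This is a sketch of an equivalent dplyr approach (dailyTotals is an illustrative name; the + 1 mirrors the adjustment in the loop above):
## EQUIVALENT AGGREGATION WITHOUT A LOOP (SKETCH)
dailyTotals<-covid%>%
  group_by(Date)%>%
  summarise(Daily_Cases=sum(New.cases)+1)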
## Create a new data frame
newCaseDate<-data.frame(Date=dateRange,
                        Daily_Cases=cases,
                        ## CUMULATIVE SUM
                        Total_Cases=cumsum(cases))%>%
  mutate(Start=as.Date("2020-01-30"))%>%
  ## NUMBER OF DAYS SINCE START 1/30/2020
  mutate(Days=as.numeric(Date-Start))
## Plot for Total Cases
ggplot(newCaseDate, aes(x=Days, y=Total_Cases))+
geom_point()+
ggtitle("Total Case Over Time")
## Plot for New Cases in a day
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
ggtitle("New Cases Daily")
Polynomial regression is used to fit a curved relationship between the feature \(X\) and the outcome \(Y\).
\[Y=\beta_0+\beta_1 X+\beta_2 X^2+\beta_3 X^3+\cdots+\beta_p X^p+\varepsilon\]
Polynomial regression has been applied in many settings, including the kind of epidemic case forecasting shown above.
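In R, polynomial terms can either be written out with I() or generated with poly(); by default poly() uses orthogonal polynomials, which give the same fitted values as raw powers but with more numerically stable (and less directly interpretable) coefficients. A minimal sketch using the newCaseDate data built above (fit_orth and fit_raw are just illustrative names):
## ORTHOGONAL VS RAW POLYNOMIAL TERMS (SKETCH)
fit_orth <- lm(Daily_Cases ~ poly(Days, 2), data = newCaseDate)             # orthogonal polynomials (default)
fit_raw  <- lm(Daily_Cases ~ poly(Days, 2, raw = TRUE), data = newCaseDate) # raw powers of Days
all.equal(fitted(fit_orth), fitted(fit_raw))                                # same fitted curve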
## DEGREE 1: Linear Model
model_1 <- lm(Daily_Cases ~ poly(Days,1), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_1$fitted.values), color="red", size=1)+
ggtitle("Degree 1")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## Degree 2: Quadratic
model_2 <- lm(Daily_Cases ~ poly(Days,2), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_2$fitted.values), color="red", size=1)+
ggtitle("Degree 2")
## Degree 3: Cubic
model_3 <- lm(Daily_Cases ~ poly(Days,3), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_3$fitted.values), color="red", size=1)+
ggtitle("Degree 3")
## Degree 5
model_5 <- lm(Daily_Cases ~ poly(Days,5), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_5$fitted.values), color="red", size=1)+
ggtitle("Degree 5")
## Degree 10
model_10 <- lm(Daily_Cases ~ poly(Days,10), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_10$fitted.values), color="red", size=1)+
ggtitle("Degree 10")
## Degree 25
model_25 <- lm(Daily_Cases ~ poly(Days,25), data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=model_25$fitted.values), color="red", size=1)+
ggtitle("Degree 25")
To choose the degree of the polynomial, fit a higher-order model and look for which terms are significant.
model <- lm(Daily_Cases ~ poly(Days,6), data = newCaseDate)
summary(model)
##
## Call:
## lm(formula = Daily_Cases ~ poly(Days, 6), data = newCaseDate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37319 -383 -40 183 32652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10562 370 28.546 < 2e-16 ***
## poly(Days, 6)1 179540 5046 35.581 < 2e-16 ***
## poly(Days, 6)2 112061 5046 22.208 < 2e-16 ***
## poly(Days, 6)3 39258 5046 7.780 5.55e-13 ***
## poly(Days, 6)4 4656 5046 0.923 0.357
## poly(Days, 6)5 -4975 5046 -0.986 0.326
## poly(Days, 6)6 -4975 5046 -0.986 0.325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5046 on 179 degrees of freedom
## Multiple R-squared: 0.9106, Adjusted R-squared: 0.9076
## F-statistic: 303.8 on 6 and 179 DF, p-value: < 2.2e-16
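The degree 4, 5, and 6 terms are not significant. One way to check this formally (a sketch, not part of the original lesson code; fit_3 and fit_6 are illustrative names) is an F-test comparing the nested cubic and degree-6 fits with anova():
## F-TEST COMPARING NESTED POLYNOMIAL FITS (SKETCH)
fit_3 <- lm(Daily_Cases ~ poly(Days, 3), data = newCaseDate)
fit_6 <- lm(Daily_Cases ~ poly(Days, 6), data = newCaseDate)
anova(fit_3, fit_6)  # a small p-value would favor keeping the higher-degree terms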
final.model <- lm(Daily_Cases ~ poly(Days,3), data = newCaseDate)
summary(final.model)
##
## Call:
## lm(formula = Daily_Cases ~ poly(Days, 3), data = newCaseDate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36644 -636 -142 613 33371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10561.6 369.8 28.562 < 2e-16 ***
## poly(Days, 3)1 179540.2 5043.1 35.601 < 2e-16 ***
## poly(Days, 3)2 112060.6 5043.1 22.221 < 2e-16 ***
## poly(Days, 3)3 39258.2 5043.1 7.785 5.1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5043 on 182 degrees of freedom
## Multiple R-squared: 0.9092, Adjusted R-squared: 0.9077
## F-statistic: 607.3 on 3 and 182 DF, p-value: < 2.2e-16
Training and Testing!
We should split the data into a training set to build/train a model and then predict values for the testing set. Since we actually know the true observations, we can assess how much error there was. In reality, we aren't able to know how much error there was in our predictions until the event we were trying to forecast is in the past.
trainInd<-sample(1:186, 130)
trainDat<-newCaseDate[trainInd, ]
testDat<-newCaseDate[-trainInd, ]
In order to find the amount of curvature we would like in the model, we will perform a grid search. This means we fit the model on the training set with a given amount of curvature (the degree of the polynomial), then use that model to predict on the testing set and assess the resulting error. This process is repeated until we have moved through the whole grid.
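The error metric used below is the root mean squared error, \(RMSE=\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2}\). As a sketch, this could be wrapped in a small helper function (hypothetical; the loop below writes the formula out inline):
## RMSE HELPER (SKETCH; NOT USED BY THE LOOP BELOW)
rmse <- function(observed, predicted){
  sqrt(mean((observed - predicted)^2))
}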
# setup
RMSE <- data.frame('kth.order' = NA, 'RMSE' = NA, 'TestRMSE'=NA) # empty data frame to store RMSE
vals <- list('Days' = seq(min(newCaseDate$Days), max(newCaseDate$Days), by = 0.01)) # grid of Days values set up for prediction (not used in the loop below)
## THIS IS CALLED A GRID SEARCH
# k-th order
k <- 1:8

# run loop
for (i in 1:length(k)){
  # build models
  model <- lm(Daily_Cases ~ poly(Days,k[i]), data = trainDat)
  # calculate RMSE and store it for further usage
  RMSE[i,1] <- k[i] # store k-th order
  RMSE[i,2] <- sqrt(sum((fitted(model)-trainDat$Daily_Cases)^2)/length(trainDat$Daily_Cases)) # training RMSE
  predTest<-predict(model, testDat)
  RMSE[i,3] <- sqrt(sum((predTest-testDat$Daily_Cases)^2)/length(testDat$Daily_Cases)) # testing RMSE
}
## USE GATHER TO CREATE A NEW COL TO DIFFERENTIATE THE TYPE OF RMSE
RMSE%>%
gather(key="Type", value="thisRMSE", -c(kth.order))%>%
ggplot(aes(x=kth.order, y=thisRMSE, color=Type))+
geom_line()
Generate Synthetic Data
#### CITATION: https://www.geo.fu-berlin.de/en/v/soga/Basics-of-statistics/Linear-Regression/Polynomial-Regression/Polynomial-Regression---An-example/index.html
##### Data generating function #####
set.seed(400) # set seed for reproducibility
n <- 150
x <- runif(n, 0, 1)
y <- sin(2*pi*x) + rnorm(n, 0, 0.35)
poly.data <- data.frame('x' = x, 'y' = y)
library(tidyverse)
ggplot(poly.data , aes(x, y))+
geom_point()
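Because these data are synthetic, we know the true signal. To see how much of the scatter is noise, the noise-free curve \(y=\sin(2\pi x)\) can be overlaid on the points (a sketch):
## OVERLAY THE TRUE (NOISE-FREE) FUNCTION (SKETCH)
ggplot(poly.data, aes(x, y))+
  geom_point()+
  stat_function(fun=function(x) sin(2*pi*x), color="blue")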
In k-fold cross-validation, rather than making a single training/testing split, we divide the data into k folds; each fold takes a turn as the testing set while the model is trained on the remaining folds, and the errors are then averaged across the folds.
It is common to use \(k=5\) or \(k=10\) folds.
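The code below builds the folds by shuffling the row indices and binding on a fold label. An equivalent, more compact way to assign folds (a sketch, not used below; altFold is an illustrative name) is to shuffle the labels directly:
## ALTERNATIVE FOLD ASSIGNMENT (SKETCH, NOT USED BELOW)
altFold<-sample(rep(1:5, length.out=nrow(poly.data)))
table(altFold)  # roughly equal fold sizes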
### HOW MANY FOLDS
kf<-5
### RANDOM SPLIT INTO K FOLDS
### RANDOM INDEXES
ind<-sample(1:150)
### CREATE DF
folds<-data.frame(ind,
                  fold=rep(1:kf, 150/kf))
### ADD ON COLUMNS TO ORIGINAL DATA
foldPoly<-poly.data[ind,]%>%
  cbind(folds)
### INITIALIZE RMSE DATAFRAME TO HOLD OUTPUT
RMSE <- data.frame('fold' = NA, 'kth.order' = NA, 'RMSE' = NA, 'TestRMSE'=NA) # empty data frame to store RMSE
### LOOP FOR CROSS-VALIDATION
for(i in 1:kf){
  trainDat<-foldPoly%>%
    filter(fold!=i)
  testDat<-foldPoly%>%
    filter(fold==i)

  ### INNER LOOP FOR POLY DEGREE
  k <- 1:15 # k-th order
  for (j in 1:length(k)){
    row<-length(k)*(i-1)+j
    # build models
    model <- lm(y ~ poly(x,k[j]), data = trainDat)
    # calculate RMSE and store it for further usage
    RMSE[row,1] <- i
    RMSE[row,2] <- k[j] # store k-th order
    RMSE[row,3] <- sqrt(sum((fitted(model)-trainDat$y)^2)/length(trainDat$y)) # training RMSE
    predTest<-predict(model, testDat)
    RMSE[row,4] <- sqrt(sum((predTest-testDat$y)^2)/length(testDat$y)) # testing RMSE
  }
}
ggplot(RMSE, aes(x=kth.order, y=RMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Training RMSE")
ggplot(RMSE, aes(x=kth.order, y=TestRMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Testing RMSE")
### TIDY GRAPHICS TO PLOT TOGETHER
tidyRMSE<-RMSE%>%
gather(key="Type", value="thisRMSE", -c(fold, kth.order))
ggplot(tidyRMSE, aes(x=as.factor(kth.order), y=thisRMSE, fill=Type))+
geom_boxplot()+
ylim(c(.25, .6))+
ggtitle("Observe the Bias-Variance Trade-off")
## Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).
ggplot(tidyRMSE, aes(x=kth.order, y=thisRMSE,
color=Type, lty=as.factor(fold)))+
geom_line()+
ylim(c(.25, .6))
## Warning: Removed 4 rows containing missing values (`geom_line()`).
### AGGREGATE CROSS_VALIDATION
cvRMSE<-RMSE%>%
group_by(kth.order)%>%
summarise(avgTrainMSE=mean(RMSE),
avgTestMSE=mean(TestRMSE),
sdTrain=sd(RMSE),
sdTest=sd(TestRMSE))
cvRMSE
## # A tibble: 15 × 5
## kth.order avgTrainMSE avgTestMSE sdTrain sdTest
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.555 0.559 0.0169 0.0673
## 2 2 0.552 0.559 0.0182 0.0712
## 3 3 0.375 0.381 0.0170 0.0781
## 4 4 0.374 0.387 0.0170 0.0794
## 5 5 0.366 0.383 0.0135 0.0586
## 6 6 0.366 0.388 0.0135 0.0594
## 7 7 0.365 0.392 0.0133 0.0583
## 8 8 0.364 0.401 0.0132 0.0618
## 9 9 0.363 0.413 0.0130 0.0668
## 10 10 0.362 0.428 0.0135 0.0816
## 11 11 0.361 0.412 0.0134 0.0664
## 12 12 0.359 0.435 0.0128 0.0772
## 13 13 0.358 0.428 0.0111 0.0505
## 14 14 0.357 0.424 0.0117 0.0460
## 15 15 0.353 0.422 0.0119 0.0592
cvRMSE%>%
gather(key="Type", value=thisRMSE, -c(kth.order))%>%
filter(Type %in% c("avgTrainMSE", "avgTestMSE"))%>%
ggplot(aes(x=kth.order, y=thisRMSE, color=Type))+
geom_line()+
geom_point()
### WHICH MINIMIZES
which.min(cvRMSE$avgTestMSE)
## [1] 3
cvRMSE$avgTestMSE[which.min(cvRMSE$avgTestMSE)]
## [1] 0.3813249
## THIS SPACE IS FOR YOUR CODE ##
Generalized additive models (GAMs) provide a flexible modeling technique that can account for non-linear relationships. This type of modeling falls under the generalized linear model framework and is linked with smoothing (spline) functions. The big idea is that the data can be split into sections (separated by knots), with each section modeled using a non-linear method; these sections are then combined and smoothed.
The s() function tells R to automatically find the "best" knots for the smoothing spline.
#install.packages("mgcv")
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
mod_gam<-gam(Daily_Cases ~ s(Days), data = newCaseDate)
summary(mod_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Daily_Cases ~ s(Days)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10562 370 28.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Days) 4.823 5.898 306.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.908 Deviance explained = 91%
## GCV = 2.6286e+07 Scale est. = 2.5463e+07 n = 186
Looking at the output you will see edf, which stands for effective degrees of freedom. We can think of this as "the more wiggly the line is, the higher this number should be". An edf of one would mean that the fit is linear. P-values in this table should be considered approximate.
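By default mgcv chooses the amount of smoothing automatically, but you can cap the flexibility by setting the basis dimension k inside s(), and gam.check() gives rough diagnostics for whether k is large enough. A sketch (mod_gam_k is an illustrative name):
## OPTIONAL: CONTROL THE BASIS DIMENSION AND CHECK IT (SKETCH)
mod_gam_k<-gam(Daily_Cases ~ s(Days, k=20), data = newCaseDate)
gam.check(mod_gam_k)  # residual plots plus an approximate check on the choice of k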
Now, let's make a graph to compare the GAM model with the polynomial model.
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=mod_gam$fitted.values), color="red", size=1)+
geom_line(aes(y=model_3$fitted.values), color="blue", size=1)+
ggtitle("GAM Model (Red) vs Poly 3 (Blue)")
mod_gam2<-gam(y ~ s(x), data = poly.data)
ggplot(poly.data, aes(x=x, y=y))+
geom_point()+
geom_line(aes(y=mod_gam2$fitted.values), color="red", size=1)
LOESS (locally estimated scatterplot smoothing) is a smoothing technique that builds a local model from the data points closest to the value in question. When using LOESS we must specify what proportion of the data around a given point to use; this proportion is known as the span.
loess1<-loess(Daily_Cases ~ Days, data = newCaseDate)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=predict(loess1)), color="red", size=1)
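For quick exploration, ggplot2 can also overlay a LOESS fit directly with geom_smooth(), which accepts the same span argument; a sketch:
## QUICK LOESS OVERLAY WITH geom_smooth (SKETCH)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
  geom_point()+
  geom_smooth(method="loess", span=0.5, se=FALSE, color="red")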
In the following code, we perform a grid search for the span to decide the appropriate amount of "wiggle" to give to the line.
Let's do k-fold cross-validation to find the "best" span.
set.seed(123)
### HOW MANY FOLDS
kf<-5
### RANDOM SPLIT INTO K FOLDS
### RANDOM INDEXES
ind<-sample(1:186)
### CREATE DF
folds<-data.frame(ind,
                  fold=c(rep(1, 37), rep(2, 37),
                         rep(3, 37), rep(4, 37), rep(5, 38)))
### ADD ON COLUMNS TO ORIGINAL DATA
foldPoly<-newCaseDate[ind,]%>%
  cbind(folds)
### INITIALIZE RMSE DATAFRAME TO HOLD OUTPUT
RMSE <- data.frame('fold' = NA, 'span' = NA, 'RMSE' = NA, 'TestRMSE'=NA) # empty data frame to store RMSE
### LOOP FOR CROSS-VALIDATION
for(i in 1:kf){
  trainDat<-foldPoly%>%
    filter(fold!=i)
  testDat<-foldPoly%>%
    filter(fold==i)

  # span values to search over
  sp <- seq(0.1, 1, by=.1)
  for (j in 1:length(sp)){
    row<-length(sp)*(i-1)+j
    # build models
    model <- loess(Daily_Cases ~ Days, data = trainDat,
                   span=sp[j])
    # calculate RMSE and store it for further usage
    RMSE[row,1] <- i
    RMSE[row,2] <- sp[j]
    RMSE[row,3] <- sqrt(sum((predict(model)-trainDat$Daily_Cases)^2)/length(trainDat$Daily_Cases)) # training RMSE
    ## predict.loess RETURNS NA OUTSIDE THE RANGE OF THE TRAINING DATA, SO DROP THOSE
    predTest<-predict(model, testDat)
    err<-predTest-testDat$Daily_Cases
    RMSE[row,4] <- sqrt(sum(err^2, na.rm=TRUE)/(length(testDat$Daily_Cases)-sum(is.na(err)))) # testing RMSE
  }
}
ggplot(RMSE, aes(x=span, y=RMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Training RMSE")
ggplot(RMSE, aes(x=span, y=TestRMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Testing RMSE")
### TIDY GRAPHICS TO PLOT TOGETHER
tidyRMSE<-RMSE%>%
gather(key="Type", value="thisRMSE", -c(fold, span))
ggplot(tidyRMSE, aes(x=span, y=thisRMSE,
color=Type, lty=as.factor(fold)))+
geom_line()
### AGGREGATE CROSS_VALIDATION
cvRMSE<-RMSE%>%
group_by(span)%>%
summarise(avgTrainMSE=mean(RMSE),
avgTestMSE=mean(TestRMSE),
sdTrain=sd(RMSE),
sdTest=sd(TestRMSE))
cvRMSE
## # A tibble: 10 × 5
## span avgTrainMSE avgTestMSE sdTrain sdTest
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.1 4278. 5209. 724. 2742.
## 2 0.2 4643. 4727. 610. 2745.
## 3 0.3 4839. 4702. 632. 2556.
## 4 0.4 4894. 4716. 632. 2539.
## 5 0.5 4901. 4693. 629. 2544.
## 6 0.6 4902. 4685. 625. 2546.
## 7 0.7 4910. 4702. 622. 2528.
## 8 0.8 4931. 4744. 616. 2499.
## 9 0.9 4980. 4826. 612. 2459.
## 10 1 5078. 4968. 602. 2398.
cvRMSE%>%
gather(key="Type", value=thisRMSE, -c(span))%>%
filter(Type %in% c("avgTrainMSE", "avgTestMSE"))%>%
ggplot(aes(x=span, y=thisRMSE, color=Type))+
geom_line()+
geom_point()
### WHICH MINIMIZES
which.min(cvRMSE$avgTestMSE)
## [1] 6
cvRMSE$avgTestMSE[which.min(cvRMSE$avgTestMSE)]
## [1] 4684.551
## BEST SPAN
bestSpan<-cvRMSE$span[which.min(cvRMSE$avgTestMSE)]
bestSpan
## [1] 0.6
Using the chosen span, we fit the LOESS model:
loessB<-loess(Daily_Cases ~ Days, data = newCaseDate,
span=bestSpan)
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=predict(loessB)), color="red", size=1)+
ggtitle(paste("LOESS Model with Span = ", bestSpan, sep=""))
Repeat with the synthetic data.
set.seed(123)
### HOW MANY FOLDS
kf<-5
### RANDOM SPLIT INTO K FOLDS
### RANDOM INDEXES
ind<-sample(1:150)
### CREATE DF
folds<-data.frame(ind,
                  fold=rep(1:kf, 150/kf))
### ADD ON COLUMNS TO ORIGINAL DATA
foldPoly<-poly.data[ind,]%>%
  cbind(folds)
### INITIALIZE RMSE DATAFRAME TO HOLD OUTPUT
RMSE <- data.frame('fold' = NA, 'span' = NA, 'RMSE' = NA, 'TestRMSE'=NA) # empty data frame to store RMSE
### LOOP FOR CROSS-VALIDATION
for(i in 1:kf){
  trainDat<-foldPoly%>%
    filter(fold!=i)
  testDat<-foldPoly%>%
    filter(fold==i)

  # span values to search over
  sp <- seq(0.1, 1, by=.1)
  for (j in 1:length(sp)){
    row<-length(sp)*(i-1)+j
    # build models
    model <- loess(y ~ x, data = trainDat,
                   span=sp[j])
    # calculate RMSE and store it for further usage
    RMSE[row,1] <- i
    RMSE[row,2] <- sp[j]
    RMSE[row,3] <- sqrt(sum((predict(model)-trainDat$y)^2)/length(trainDat$y)) # training RMSE
    ## predict.loess RETURNS NA OUTSIDE THE RANGE OF THE TRAINING DATA, SO DROP THOSE
    predTest<-predict(model, testDat)
    err<-predTest-testDat$y
    RMSE[row,4] <- sqrt(sum(err^2, na.rm=TRUE)/(length(testDat$y)-sum(is.na(err)))) # testing RMSE
  }
}
ggplot(RMSE, aes(x=span, y=RMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Training RMSE")
ggplot(RMSE, aes(x=span, y=TestRMSE, color=as.factor(fold)))+
geom_line()+
geom_point()+
ggtitle("Testing RMSE")
### TIDY GRAPHICS TO PLOT TOGETHER
tidyRMSE<-RMSE%>%
gather(key="Type", value="thisRMSE", -c(fold, span))
ggplot(tidyRMSE, aes(x=span, y=thisRMSE,
color=Type, lty=as.factor(fold)))+
geom_line()
### AGGREGATE CROSS_VALIDATION
cvRMSE<-RMSE%>%
group_by(span)%>%
summarise(avgTrainMSE=mean(RMSE),
avgTestMSE=mean(TestRMSE),
sdTrain=sd(RMSE),
sdTest=sd(TestRMSE))
cvRMSE
## # A tibble: 10 × 5
## span avgTrainMSE avgTestMSE sdTrain sdTest
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.1 0.323 0.420 0.0165 0.0841
## 2 0.2 0.354 0.398 0.0160 0.0694
## 3 0.3 0.362 0.386 0.0141 0.0626
## 4 0.4 0.364 0.382 0.0138 0.0621
## 5 0.5 0.366 0.381 0.0135 0.0631
## 6 0.6 0.366 0.379 0.0134 0.0639
## 7 0.7 0.367 0.379 0.0135 0.0638
## 8 0.8 0.369 0.381 0.0134 0.0646
## 9 0.9 0.377 0.387 0.0140 0.0637
## 10 1 0.406 0.414 0.0136 0.0710
cvRMSE%>%
gather(key="Type", value=thisRMSE, -c(span))%>%
filter(Type %in% c("avgTrainMSE", "avgTestMSE"))%>%
ggplot(aes(x=span, y=thisRMSE, color=Type))+
geom_line()+
geom_point()
### WHICH MINIMIZES
which.min(cvRMSE$avgTestMSE)
## [1] 7
cvRMSE$avgTestMSE[which.min(cvRMSE$avgTestMSE)]
## [1] 0.3791673
## BEST SPAN
bestSpan<-cvRMSE$span[which.min(cvRMSE$avgTestMSE)]
bestSpan
## [1] 0.7
## FINAL MOD
loessB2<-loess(y ~ x, data = poly.data,
span=bestSpan)
ggplot(poly.data, aes(x=x, y=y))+
geom_point()+
geom_line(aes(y=predict(loessB2)), color="red", size=1)+
ggtitle(paste("LOESS Model with Span = ", bestSpan, sep=""))
ggplot(newCaseDate, aes(x=Days, y=Daily_Cases))+
geom_point()+
geom_line(aes(y=mod_gam$fitted.values), color="red", size=1)+
geom_line(aes(y=model_3$fitted.values), color="blue", size=1)+
geom_line(aes(y=predict(loessB)), color="green", size=1)+
ggtitle("GAM Model (Red) vs Poly 3 (Blue) vs LOESS (Green)")
polyMod<-lm(y~poly(x, 10), data=poly.data)
#summary(polyMod)
polyMod5<-lm(y~poly(x, 5), data=poly.data)
vals <- seq(min(poly.data$x), max(poly.data$x), by = 0.01)
## TRUE (NOISE-FREE) CURVE ON A GRID; THE COLUMN IS NAMED x SO predict() CAN FIND IT IN newdata
trueDF<-data.frame(x=vals)%>%
  mutate(y=sin(2*pi*x))
ggplot(poly.data, aes(x=x, y=y))+
geom_point()+
geom_line(aes(y=mod_gam2$fitted.values), color="red", size=1)+
geom_line(aes(y=polyMod5$fitted.values), color="blue", size=1)+
geom_line(aes(y=predict(loessB2)), color="green", size=1)+
geom_line(data=trueDF, aes(x=x, y=y), lty=2)+
ggtitle("GAM Model (Red) vs Poly 5 (Blue) vs LOESS (Green)")
So who is the winner? Drumroll please…
polyMSE<-sum((predict(polyMod5, trueDF)-trueDF$y)^2)/length(trueDF$y)
gamMSE<-sum((predict(mod_gam2, trueDF)-trueDF$y)^2)/length(trueDF$y)
loessGAM<-sum((predict(loessB2, trueDF)-trueDF$y)^2)/length(trueDF$y)
polyMSE
## [1] 1.612422
gamMSE
## [1] 1.567029
loessGAM
## [1] 1.576906