Data Science
Based on Ray Fair’s book, Chapter 3
When on the Server:
# Load rayfair.csv from the server's shared files directory.
presdata <- read.csv(
  file = "/home/rstudioshared/shared_files/data/rayfair.csv",
  header = TRUE
)
When using the shared Google Drive folder:
# Load rayfair.csv through a path relative to the current working directory.
presdata <- read.csv(
  file = "Data_Science_Data/election_forecasting/rayfair.csv",
  header = TRUE
)
library(dplyr); library(ggplot2)
# Split by year: rows through 1996 are used for fitting, rows from 2000
# onward are held out for prediction.
presdata.train <- filter(presdata, YEAR <= 1996)
presdata.test <- filter(presdata, YEAR >= 2000)

# Linear model for VP: DPER and DUR enter as main effects, while G, P and Z
# enter only through their interaction with I.
m <- lm(
  formula = VP ~ I:G + DPER + DUR + I:P + I:Z,
  data = presdata.train
)

# Predictions for the held-out rows.
predict(object = m, newdata = presdata.test)
## 1 2 3 4 5
## 49.08382 43.36632 54.06060 52.06577 46.62942
# Bootstrap the forecast: refit the model on 1,000 resamples of the training
# data (drawn with replacement, same size as the training set) and predict
# the held-out rows each time. The result is a matrix with one row per
# held-out observation and one column per resample.
#
# Seed the RNG so the resamples, and every summary or plot derived from
# them, can be reproduced. The seed value itself is arbitrary.
set.seed(2016)
bootstrapped_predictions <- replicate(1e3, {
  boot_sample <- sample_frac(presdata.train, 1, replace = TRUE)
  # Named boot_fit (not m) so it is not mistaken for the full-sample model.
  boot_fit <- lm(VP ~ I:G + DPER + DUR + I:P + I:Z, data = boot_sample)
  predict(boot_fit, presdata.test)
})
# Years of the held-out rows, in the same order as the prediction rows.
presdata.test[["YEAR"]]
## [1] 2000 2004 2008 2012 2016
# Row-wise mean of the bootstrap predictions (one value per held-out row).
apply(X = bootstrapped_predictions, MARGIN = 1, FUN = mean)
## 1 2 3 4 5
## 49.16461 43.58039 54.15532 52.00042 46.68897
# Row-wise standard deviation of the bootstrap predictions.
apply(X = bootstrapped_predictions, MARGIN = 1, FUN = sd)
## 1 2 3 4 5
## 1.2547021 1.8628002 1.2776056 1.1944066 0.8877778
# Compare the bootstrap distribution with the actual outcome and the
# full-sample point prediction for each held-out row.
m <- lm(VP ~ I:G + DPER + DUR + I:P + I:Z, data = presdata.train)

# Derive the x positions and axis labels from the test set instead of
# hard-coding 1:5 and 2000-2016, so the boxes, labels and points stay
# aligned if the set of held-out rows changes.
test_positions <- seq_len(nrow(presdata.test))

# use.cols = FALSE draws one box per matrix row, i.e. per held-out row.
boxplot(
  bootstrapped_predictions,
  use.cols = FALSE,
  names = presdata.test$YEAR,
  main = "Bootstrapped Predictions and Actual Results (Red)"
)

# Red stars: actual VP values. Blue stars: predictions from m.
points(test_positions, presdata.test$VP, col = "red", pch = 8, cex = 3)
points(
  test_positions,
  predict(m, presdata.test),
  col = "blue", pch = 8, cex = 3
)