Based on Ray Fair’s book, Chapter 3
When on the Server:
presdata <- read.csv('/home/rstudioshared/shared_files/data/rayfair.csv', header=TRUE)
From the shared Google Drive folder:
presdata <- read.csv('Data_Science_Data/election_forecasting/rayfair.csv', header=TRUE)
library(dplyr); library(ggplot2)
presdata.train <- presdata %>% filter(YEAR <= 1996)
presdata.test <- presdata %>% filter(YEAR >= 2000)
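A quick check that the split worked as intended (elections through 1996 for fitting, 2000 onward held out):
range(presdata.train$YEAR)
range(presdata.test$YEAR)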
VP: Democratic share of the two-party presidential vote.
I: 1 if there is a Democratic presidential incumbent at the time of the election and −1 if there is a Republican presidential incumbent.
DPER: 1 if a Democratic presidential incumbent is running again, −1 if a Republican presidential incumbent is running again, and 0 otherwise.
DUR: 0 if either party has been in the White House for one term, 1 [−1] if the Democratic [Republican] party has been in the White House for two consecutive terms, 1.25 [−1.25] if the Democratic [Republican] party has been in the White House for three consecutive terms, 1.50 [−1.50] if the Democratic [Republican] party has been in the White House for four consecutive terms, and so on.
WAR: 1 for the elections of 1918, 1920, 1942, 1944, 1946, and 1948, and 0 otherwise.
G: growth rate of real per capita GDP in the first three quarters of the on-term election year (annual rate).
P: absolute value of the growth rate of the GDP deflator in the first 15 quarters of the administration (annual rate) except for 1920, 1944, and 1948, where the values are zero.
Z: number of quarters in the first 15 quarters of the administration in which the growth rate of real per capita GDP is greater than 3.2 percent at an annual rate except for 1920, 1944, and 1948, where the values are zero.
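A quick look at the columns just defined (this assumes the CSV uses exactly these column names, as the code below does):
presdata %>% select(YEAR, VP, I, DPER, DUR, WAR, G, P, Z) %>% head()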
# IV: the incumbent party's share of the two-party vote
# (VP when the incumbent is a Democrat, 100 - VP when a Republican).
presdata.train <- presdata.train %>% mutate(IV = ifelse(I==1, VP, 100-VP))
coef(lm(IV ~ G, data=presdata.train))
## (Intercept) G
## 51.1745403 0.8681252
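To read this fit: each additional point of annualized growth is worth about 0.87 points of incumbent-party vote share, so with G = 3 the fitted share is 51.17 + 0.868*3, roughly 53.8. The same number via predict():
predict(lm(IV ~ G, data=presdata.train), newdata=data.frame(G=3))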
# Bootstrap: refit the regression on 1000 samples drawn with replacement from
# the training data. replicate() returns a 2 x 1000 matrix of coefficients;
# reshape it to 1000 rows (one per resample) by 2 columns (intercept, slope).
int_and_slope <- replicate(1e3,
  coef(lm(IV ~ G, data=presdata.train[sample(nrow(presdata.train), replace=TRUE),]))) %>%
  as.numeric() %>% matrix(ncol=2, nrow=1000, byrow=TRUE)
apply(int_and_slope, 2, sd)
## [1] 1.2583973 0.2056178
# Dividing each estimate by its bootstrap standard error gives t-statistic-like ratios.
coef(lm(IV ~ G, data=presdata.train))/apply(int_and_slope, 2, sd)
## (Intercept) G
## 40.666442 4.222032
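The same bootstrap draws give rough 95 percent confidence intervals directly, via the percentile method:
apply(int_and_slope, 2, quantile, probs=c(0.025, 0.975))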
m_lab1 <- lm(IV ~ G, data=presdata.train)
summary(m_lab1)
##
## Call:
## lm(formula = IV ~ G, data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.3556 -3.1561 0.4219 3.2397 10.4498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.1745 1.1198 45.698 < 2e-16 ***
## G 0.8681 0.1883 4.611 0.000191 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.978 on 19 degrees of freedom
## Multiple R-squared: 0.5281, Adjusted R-squared: 0.5033
## F-statistic: 21.26 on 1 and 19 DF, p-value: 0.0001905
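Note that the bootstrap standard errors (1.26 and 0.21) come out close to the model-based ones above (1.12 and 0.19), which can be extracted programmatically:
summary(m_lab1)$coefficients[, "Std. Error"]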
Fair interacts incumbency with the economy so that good news helps whichever party holds the White House. In the formula, the wrapper I(...) tells R to use the literal product of the column I and G:
m_gdp <- lm(VP ~ I(I*G), data=presdata.train)
summary(m_gdp)
##
## Call:
## lm(formula = VP ~ I(I * G), data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2041 -3.1047 0.1499 2.6192 8.2454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.2141 1.0541 45.740 < 2e-16 ***
## I(I * G) 0.9625 0.1772 5.432 3.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.772 on 19 degrees of freedom
## Multiple R-squared: 0.6083, Adjusted R-squared: 0.5877
## F-statistic: 29.5 on 1 and 19 DF, p-value: 3.059e-05
R's interaction syntax I:G fits exactly the same model without spelling out the product:
m_gdp <- lm(VP ~ I:G, data=presdata.train)
summary(m_gdp)
##
## Call:
## lm(formula = VP ~ I:G, data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.2041 -3.1047 0.1499 2.6192 8.2454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.2141 1.0541 45.740 < 2e-16 ***
## I:G 0.9625 0.1772 5.432 3.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.772 on 19 degrees of freedom
## Multiple R-squared: 0.6083, Adjusted R-squared: 0.5877
## F-statistic: 29.5 on 1 and 19 DF, p-value: 3.059e-05
m_gnk <- lm(VP ~ I:Z, data=presdata.train)
summary(m_gnk)
##
## Call:
## lm(formula = VP ~ I:Z, data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9834 -3.2695 0.5715 4.6466 12.6564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.1314 1.4033 35.012 <2e-16 ***
## I:Z 0.6597 0.2375 2.777 0.012 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.43 on 19 degrees of freedom
## Multiple R-squared: 0.2888, Adjusted R-squared: 0.2513
## F-statistic: 7.714 on 1 and 19 DF, p-value: 0.012
On its own, Z explains much less than G (R-squared 0.29 versus 0.61). Combining the two economic variables:
m_both <- lm(VP ~ I:Z + I:G, data=presdata.train)
summary(m_both)
##
## Call:
## lm(formula = VP ~ I:Z + I:G, data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7777 -2.6719 0.3914 1.9062 8.4448
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.3519 0.9752 49.580 < 2e-16 ***
## I:Z 0.3617 0.1744 2.074 0.052710 .
## I:G 0.8315 0.1753 4.742 0.000163 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.405 on 18 degrees of freedom
## Multiple R-squared: 0.6838, Adjusted R-squared: 0.6487
## F-statistic: 19.47 on 2 and 18 DF, p-value: 3.158e-05
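Because the VP ~ I:G model (m_gdp) is nested inside m_both, an F-test asks whether adding I:Z is a significant improvement:
anova(m_gdp, m_both) # compare the nested models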
m <- lm(VP~I:G+DPER+DUR+I:P+I:Z, data=presdata.train)
summary(m)
##
## Call:
## lm(formula = VP ~ I:G + DPER + DUR + I:P + I:Z, data = presdata.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9077 -1.0858 -0.5325 0.5757 6.8764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.5511 0.6218 76.479 < 2e-16 ***
## DPER 4.2484 1.0978 3.870 0.00151 **
## DUR -2.9677 0.7401 -4.010 0.00114 **
## I:G 0.6477 0.1096 5.913 2.85e-05 ***
## I:P -0.8995 0.2559 -3.515 0.00313 **
## I:Z 0.6674 0.1746 3.822 0.00167 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.581 on 15 degrees of freedom
## Multiple R-squared: 0.9096, Adjusted R-squared: 0.8794
## F-statistic: 30.18 on 5 and 15 DF, p-value: 2.552e-07
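With the fitted model, forecasting a new election is a single predict() call. The inputs below are made-up scenario values, not any real election: a Republican incumbent running again after one Republican term, 2 percent growth, 2.5 percent inflation, and four good-news quarters.
scenario <- data.frame(I=-1, DPER=-1, DUR=0, G=2, P=2.5, Z=4)
predict(m, scenario) # roughly 41.6 using the rounded coefficients above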
presdata.train$expected.VP <- predict(m, presdata.train)
presdata.train %>% ggplot(aes(expected.VP, VP))+
geom_point()+geom_abline(intercept=0, slope=1)
presdata.train %>% ggplot(aes(expected.VP, VP, label=YEAR))+
geom_label()+geom_abline(intercept=0, slope=1)
RMSE <- function(x, y){sqrt(mean((x-y)^2))} # root mean squared error
presdata.train %>% summarize(RMSE(VP, expected.VP), RMSE(VP, 50), cor(VP, expected.VP))
## RMSE(VP, expected.VP) RMSE(VP, 50) cor(VP, expected.VP)
## 1 2.180931 7.30839 0.9537177
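It is also instructive to see which elections the model misses worst in the training data:
presdata.train %>% mutate(residual = VP - expected.VP) %>%
  arrange(desc(abs(residual))) %>% select(YEAR, VP, expected.VP, residual) %>% head()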
presdata.test$expected.VP <- predict(m, presdata.test)
presdata.test %>% ggplot(aes(expected.VP, VP, label=YEAR))+
geom_label()+geom_abline(intercept=0, slope=1)+ylim(c(40,55))
presdata.test %>% summarize(RMSE(VP, expected.VP), RMSE(VP, 50), cor(VP, expected.VP))
## RMSE(VP, expected.VP) RMSE(VP, 50) cor(VP, expected.VP)
## 1 3.183795 2.022264 0.9210099
Out of sample, the model stumbles: its RMSE of 3.18 on the 2000-onward elections is worse than the 2.02 from simply guessing 50 every time, even though its predictions remain highly correlated with the outcomes.
How many choices did Fair make? Which variables should we use? Which exceptions should we make?
Consider for a moment how many possible variables Fair may have chosen from in order to find the ones with the best fit.
“economy of explanation in conformity with Occam’s razor”
Maybe fewer variables would perform better out of sample.
More complex models always fit the training data at least as well, even when they over-fit it.
Try building different (perhaps simpler) models using the training data and then checking how they perform on the test set.
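As a minimal sketch of that exercise, here is one arbitrary simpler candidate (growth plus the incumbent-running indicator; the variable choice is illustrative, not a recommendation) scored alongside the full model on the test set:
m_simple <- lm(VP ~ I:G + DPER, data=presdata.train)
presdata.test %>% mutate(expected.simple = predict(m_simple, presdata.test)) %>%
  summarize(RMSE(VP, expected.simple), RMSE(VP, expected.VP))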