Step One: Create an ANOVA table and produce the F-statistic and discuss the R-squared value. Show your work with R code (from scratch) and confirm using the lm() and anova() functions.

First, we need to load in the data:

library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v dplyr   1.0.3
## v tibble  3.0.6     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)

opposition<-fec%>%
  filter(support_oppose_indicator=="O", report_year>="2013", candidate_office=="P",
         candidate_id!="P80002801",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Opp=sum(expenditure_amount, na.rm = TRUE),
            n_Opp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
support<-fec%>%
  filter(support_oppose_indicator=="S", report_year>="2013", candidate_office=="P",
         candidate_id!="P00547984",
         candidate_id!="P20002721",
         candidate_id!="P60007671",
         candidate_id!="P60007895",
         candidate_id!="P60009354",
         candidate_id!="P60019239",
         candidate_id!="P60021102",
         candidate_id!="P60022118",
         candidate_id!="P60023215",
         candidate_id!="P80003353",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Supp=sum(expenditure_amount, na.rm = TRUE),
            n_Supp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
tog<-opposition%>%
  inner_join(support)%>%
  mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")

Now we need to find the F-Statistic, and to do that, we need a few (a lot) of parts:

response_matrix<-tog$expenditure_amount_Opp

design_matrix<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])

betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix

hat_matrix<-design_matrix%*%solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)

y_hat_matrix<-design_matrix%*%betaMat

error_vector<-response_matrix-y_hat_matrix

n<-dim(tog)[1]

p<-3

ms_res<-sum(error_vector^2)/(n-p-1)

covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)

err<-sqrt(diag(covMat))

I<-diag(dim(tog)[1])

ss_res<-t(response_matrix)%*%(I-hat_matrix)%*%response_matrix

ones<-matrix(c(rep(1, dim(tog)[1])), 
             nrow=dim(tog)[1])

J<-ones%*%t(ones)

ss_tot<-t(response_matrix)%*%(I-(J/n))%*%response_matrix

ss_reg<-t(response_matrix)%*%(hat_matrix-(J/n))%*%response_matrix

ms_res<-ss_res/(n-p-1)

ms_reg<-ss_reg/p

f_stat<-ms_reg/ms_res

f_stat
##          [,1]
## [1,] 3398.157

Now we need our R-Squared value. From scratch, we will need just two pieces:

r_squared<-ss_reg/ss_tot
r_squared
##           [,1]
## [1,] 0.9974561

This R-Squared value tells us that roughly 99.75% of our data is represented by our model. Thatโ€™s pretty good!

Confirming with summary output:

mlr<-lm(expenditure_amount_Opp~report_year+n_Supp+n_Opp, data = tog)
summary(mlr)
## 
## Call:
## lm(formula = expenditure_amount_Opp ~ report_year + n_Supp + 
##     n_Opp, data = tog)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2718176  -689041  -391122  -158051  5863348 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.107e+08  1.051e+09  -0.581    0.566    
## report_year  3.032e+05  5.217e+05   0.581    0.566    
## n_Supp       1.650e+03  5.539e+01  29.787   <2e-16 ***
## n_Opp        6.834e+03  7.356e+01  92.903   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1839000 on 26 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9972 
## F-statistic:  3398 on 3 and 26 DF,  p-value: < 2.2e-16

Looking good!

Now for the rest of our table:

p_value<-pf(f_stat, p, n-p-1, lower.tail = FALSE)

# ss_res
ss_res
##              [,1]
## [1,] 8.794712e+13
# ss_reg
ss_reg
##              [,1]
## [1,] 3.448363e+16
# ss_tot
ss_tot
##              [,1]
## [1,] 3.457158e+16
# ms_res
ms_res
##              [,1]
## [1,] 3.382581e+12
# ms_reg
ms_reg
##              [,1]
## [1,] 1.149454e+16
# f statistic
f_stat
##          [,1]
## [1,] 3398.157
# p value
p_value
##              [,1]
## [1,] 7.810158e-34
# Degrees of Freedom (regression)
p
## [1] 3
# Degrees of Freedom (residual)
n-p-1
## [1] 26
# Degrees of Freedom (total)
p + (n-p-1)
## [1] 29

Confirming with the ANOVA output:

anova(mlr)
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## report_year  1 1.7211e+15 1.7211e+15  508.81 < 2.2e-16 ***
## n_Supp       1 3.5677e+15 3.5677e+15 1054.73 < 2.2e-16 ***
## n_Opp        1 2.9195e+16 2.9195e+16 8630.92 < 2.2e-16 ***
## Residuals   26 8.7947e+13 3.3826e+12                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Adding up the sum of squares for regression gives us 3.448363e+16, and adding up the mean sum of squares for regression gives us 1.149454e+16.

Step Two: Perform a variable selection to create a final model using the method of your choice (forward stepwise, backward stepwise, full vs reduced, generalized).

Using the forward stepwise function:

p<-1
n<-dim(tog)[1]
alpha <- 0.05
this.df<-n-p-1

# Rejection Region
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.195972
mdl1<-lm(expenditure_amount_Opp~report_year, data = tog)
anova(mdl1) # 1.467
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##             Df     Sum Sq    Mean Sq F value Pr(>F)
## report_year  1 1.7211e+15 1.7211e+15   1.467 0.2359
## Residuals   28 3.2850e+16 1.1732e+15
mdl2<-lm(expenditure_amount_Opp~n_Supp, data = tog)
anova(mdl2) # 4.0503
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##           Df     Sum Sq    Mean Sq F value  Pr(>F)  
## n_Supp     1 4.3689e+15 4.3689e+15  4.0503 0.05387 .
## Residuals 28 3.0203e+16 1.0787e+15                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl3<-lm(expenditure_amount_Opp~n_Opp, data = tog)
anova(mdl3) # 274.65
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##           Df     Sum Sq    Mean Sq F value    Pr(>F)    
## n_Opp      1 3.1373e+16 3.1373e+16  274.65 5.261e-16 ***
## Residuals 28 3.1984e+15 1.1423e+14                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# next!

p<-2
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.210008
mdl1a<-lm(expenditure_amount_Opp~n_Opp+report_year, data = tog)
anova(mdl1a) # 0.9541
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##             Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## n_Opp        1 3.1373e+16 3.1373e+16 274.2034 1.148e-15 ***
## report_year  1 1.0916e+14 1.0916e+14   0.9541    0.3374    
## Residuals   27 3.0892e+15 1.1442e+14                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl2a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova(mdl2a) # 942.33
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##           Df     Sum Sq    Mean Sq F value    Pr(>F)    
## n_Opp      1 3.1373e+16 3.1373e+16 9508.18 < 2.2e-16 ***
## n_Supp     1 3.1093e+15 3.1093e+15  942.33 < 2.2e-16 ***
## Residuals 27 8.9089e+13 3.2996e+12                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# and finally!

p<-3
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.225201
mdl3a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova(mdl3a)
## Analysis of Variance Table
## 
## Response: expenditure_amount_Opp
##           Df     Sum Sq    Mean Sq F value    Pr(>F)    
## n_Opp      1 3.1373e+16 3.1373e+16 9508.18 < 2.2e-16 ***
## n_Supp     1 3.1093e+15 3.1093e+15  942.33 < 2.2e-16 ***
## Residuals 27 8.9089e+13 3.2996e+12                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# based on these tests, we can drop report_year, and thus our final model is:

mod_fin<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)