First, we need to load in the data:
library(tidyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v dplyr 1.0.3
## v tibble 3.0.6 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)
opposition<-fec%>%
filter(support_oppose_indicator=="O", report_year>="2013", candidate_office=="P",
candidate_id!="P80002801",
candidate_id!="")%>%
group_by(candidate_id, report_year)%>%
summarise(expenditure_amount_Opp=sum(expenditure_amount, na.rm = TRUE),
n_Opp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
support<-fec%>%
filter(support_oppose_indicator=="S", report_year>="2013", candidate_office=="P",
candidate_id!="P00547984",
candidate_id!="P20002721",
candidate_id!="P60007671",
candidate_id!="P60007895",
candidate_id!="P60009354",
candidate_id!="P60019239",
candidate_id!="P60021102",
candidate_id!="P60022118",
candidate_id!="P60023215",
candidate_id!="P80003353",
candidate_id!="")%>%
group_by(candidate_id, report_year)%>%
summarise(expenditure_amount_Supp=sum(expenditure_amount, na.rm = TRUE),
n_Supp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
tog<-opposition%>%
inner_join(support)%>%
mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")
Now we need to find the F-Statistic, and to do that, we need a few (a lot) of parts:
response_matrix<-tog$expenditure_amount_Opp
design_matrix<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])
betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix
hat_matrix<-design_matrix%*%solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)
y_hat_matrix<-design_matrix%*%betaMat
error_vector<-response_matrix-y_hat_matrix
n<-dim(tog)[1]
p<-3
ms_res<-sum(error_vector^2)/(n-p-1)
covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)
err<-sqrt(diag(covMat))
I<-diag(dim(tog)[1])
ss_res<-t(response_matrix)%*%(I-hat_matrix)%*%response_matrix
ones<-matrix(c(rep(1, dim(tog)[1])),
nrow=dim(tog)[1])
J<-ones%*%t(ones)
ss_tot<-t(response_matrix)%*%(I-(J/n))%*%response_matrix
ss_reg<-t(response_matrix)%*%(hat_matrix-(J/n))%*%response_matrix
ms_res<-ss_res/(n-p-1)
ms_reg<-ss_reg/p
f_stat<-ms_reg/ms_res
f_stat
## [,1]
## [1,] 3398.157
Now we need our R-Squared value. From scratch, we will need just two pieces:
r_squared<-ss_reg/ss_tot
r_squared
## [,1]
## [1,] 0.9974561
This R-Squared value tells us that roughly 99.75% of our data is represented by our model. Thatโs pretty good!
Confirming with summary output:
mlr<-lm(expenditure_amount_Opp~report_year+n_Supp+n_Opp, data = tog)
summary(mlr)
##
## Call:
## lm(formula = expenditure_amount_Opp ~ report_year + n_Supp +
## n_Opp, data = tog)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2718176 -689041 -391122 -158051 5863348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.107e+08 1.051e+09 -0.581 0.566
## report_year 3.032e+05 5.217e+05 0.581 0.566
## n_Supp 1.650e+03 5.539e+01 29.787 <2e-16 ***
## n_Opp 6.834e+03 7.356e+01 92.903 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1839000 on 26 degrees of freedom
## Multiple R-squared: 0.9975, Adjusted R-squared: 0.9972
## F-statistic: 3398 on 3 and 26 DF, p-value: < 2.2e-16
Looking good!
Now for the rest of our table:
p_value<-pf(f_stat, p, n-p-1, lower.tail = FALSE)
# ss_res
ss_res
## [,1]
## [1,] 8.794712e+13
# ss_reg
ss_reg
## [,1]
## [1,] 3.448363e+16
# ss_tot
ss_tot
## [,1]
## [1,] 3.457158e+16
# ms_res
ms_res
## [,1]
## [1,] 3.382581e+12
# ms_reg
ms_reg
## [,1]
## [1,] 1.149454e+16
# f statistic
f_stat
## [,1]
## [1,] 3398.157
# p value
p_value
## [,1]
## [1,] 7.810158e-34
# Degrees of Freedom (regression)
p
## [1] 3
# Degrees of Freedom (residual)
n-p-1
## [1] 26
# Degrees of Freedom (total)
p + (n-p-1)
## [1] 29
Confirming with the ANOVA output:
anova(mlr)
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## report_year 1 1.7211e+15 1.7211e+15 508.81 < 2.2e-16 ***
## n_Supp 1 3.5677e+15 3.5677e+15 1054.73 < 2.2e-16 ***
## n_Opp 1 2.9195e+16 2.9195e+16 8630.92 < 2.2e-16 ***
## Residuals 26 8.7947e+13 3.3826e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Adding up the sum of squares for regression gives us 3.448363e+16, and adding up the mean sum of squares for regression gives us 1.149454e+16.
Using the forward stepwise function:
p<-1
n<-dim(tog)[1]
alpha <- 0.05
this.df<-n-p-1
# Rejection Region
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.195972
mdl1<-lm(expenditure_amount_Opp~report_year, data = tog)
anova(mdl1) # 1.467
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## report_year 1 1.7211e+15 1.7211e+15 1.467 0.2359
## Residuals 28 3.2850e+16 1.1732e+15
mdl2<-lm(expenditure_amount_Opp~n_Supp, data = tog)
anova(mdl2) # 4.0503
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## n_Supp 1 4.3689e+15 4.3689e+15 4.0503 0.05387 .
## Residuals 28 3.0203e+16 1.0787e+15
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl3<-lm(expenditure_amount_Opp~n_Opp, data = tog)
anova(mdl3) # 274.65
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## n_Opp 1 3.1373e+16 3.1373e+16 274.65 5.261e-16 ***
## Residuals 28 3.1984e+15 1.1423e+14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# next!
p<-2
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.210008
mdl1a<-lm(expenditure_amount_Opp~n_Opp+report_year, data = tog)
anova(mdl1a) # 0.9541
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## n_Opp 1 3.1373e+16 3.1373e+16 274.2034 1.148e-15 ***
## report_year 1 1.0916e+14 1.0916e+14 0.9541 0.3374
## Residuals 27 3.0892e+15 1.1442e+14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mdl2a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova(mdl2a) # 942.33
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## n_Opp 1 3.1373e+16 3.1373e+16 9508.18 < 2.2e-16 ***
## n_Supp 1 3.1093e+15 3.1093e+15 942.33 < 2.2e-16 ***
## Residuals 27 8.9089e+13 3.2996e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# and finally!
p<-3
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.225201
mdl3a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova(mdl3a)
## Analysis of Variance Table
##
## Response: expenditure_amount_Opp
## Df Sum Sq Mean Sq F value Pr(>F)
## n_Opp 1 3.1373e+16 3.1373e+16 9508.18 < 2.2e-16 ***
## n_Supp 1 3.1093e+15 3.1093e+15 942.33 < 2.2e-16 ***
## Residuals 27 8.9089e+13 3.2996e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# based on these tests, we can drop report_year, and thus our final model is:
mod_fin<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)