# loading the data
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)
# loading the required packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(scatterplot3d)
# subsetting the data to filter for oppositional expenditures from 2013 onward for the presidential election
opposition<-fec%>%
  filter(support_oppose_indicator=="O", report_year>=2013, candidate_office=="P",
         candidate_id!="P80002801",
         candidate_id!="")%>%
  group_by(candidate_id)%>%
  summarise(expenditure_amount=sum(expenditure_amount, na.rm = TRUE))
# subsetting the data to filter for supportive expenditures from 2013 onward for the presidential election
support<-fec%>%
  filter(support_oppose_indicator=="S", report_year>=2013, candidate_office=="P",
         candidate_id!="P00547984",
         candidate_id!="P20002721",
         candidate_id!="P60007671",
         candidate_id!="P60007895",
         candidate_id!="P60009354",
         candidate_id!="P60019239",
         candidate_id!="P60021102",
         candidate_id!="P60022118",
         candidate_id!="P60023215",
         candidate_id!="P80003353",
         candidate_id!="")%>%
  group_by(candidate_id)%>%
  summarise(expenditure_amount=sum(expenditure_amount, na.rm = TRUE))
# combining the two summaries column-wise (cbind() relies on both being sorted by
# candidate_id and containing the same candidates), then keeping a single ID column
fec<-cbind(support, opposition)
fec$opp_amt<-fec[,4] # opposition expenditure total
fec$sup_amt<-fec[,2] # support expenditure total
fec2<-fec[-c(2:4)] # drop the duplicate id column and the original amount columns
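# note: because cbind() is order-dependent, an equivalent but safer merge is a
# keyed join (a sketch; the cbind() result above is what the analysis uses):
fec2_alt <- inner_join(support, opposition, by = "candidate_id",
                       suffix = c("_sup", "_opp"))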
# shorter names for the explanatory and response variables to simplify the code below
x<-fec2$sup_amt
y<-fec2$opp_amt
# using ggplot, visualizing the possible linear relationship between the two variables
ggplot(fec2, aes(x = sup_amt, y = opp_amt, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# finding the intercept and slope coefficients from scratch
s_xy <- sum((x-mean(x))*(y-mean(y)))
s_xx <- sum((x-mean(x))^2)
beta_1 <- s_xy/s_xx
beta_0 <- mean(y)-(beta_1*mean(x))
beta_1
## [1] 0.9537725
beta_0
## [1] -4679743
# verifying using lm()
mod<-lm(y~x)
mod$coefficients
## (Intercept) x
## -4.679743e+06 9.537725e-01
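# sanity check (a sketch): the hand-computed estimates should match lm() up to
# floating-point tolerance, so this comparison returns TRUE
coef_check <- all.equal(unname(coef(mod)), c(beta_0, beta_1))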
# stating the reference distribution, degrees of freedom, test statistic, and p-value
n<-dim(fec2)[1]
beta_1<-mod$coefficients[2]
ss_res <- sum(mod$residuals^2)
ms_res<- ss_res/(n-2)
se_b1 <- sqrt(ms_res)/sqrt(sum((x-mean(x))^2))
# the reference distribution is a t distribution with n-2 degrees of freedom
df <- (n-2)
df
## [1] 17
# the test statistic is t = (beta_1 - 0)/se(beta_1)
t_stat <- (beta_1-0)/se_b1
t_stat
## x
## 6.777593
p_value<-pt(abs(t_stat), df = n-2, lower.tail = FALSE)*2 # two-sided (non-directional) p-value
p_value
## x
## 3.226803e-06
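# the same quantities yield a 95% confidence interval for the slope (a sketch
# using the values computed above):
ci_b1 <- beta_1 + c(-1, 1)*qt(0.975, df = n-2)*se_b1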
# Verifying using summary()
summary(mod)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75146430 -2864893 1345945 3672020 44668373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.680e+06 6.098e+06 -0.767 0.453
## x 9.538e-01 1.407e-01 6.778 3.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23450000 on 17 degrees of freedom
## Multiple R-squared: 0.7299, Adjusted R-squared: 0.714
## F-statistic: 45.94 on 1 and 17 DF, p-value: 3.227e-06
# checking the diagnostic plots for potential outliers and influential points
plot(mod)

# (diagnostic plots: Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage)
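# numeric companions to the plots (a sketch): flag observations with standardized
# residuals beyond +/- 2 or leverage above 2*(p+1)/n (p = 1 predictor here)
flagged <- which(abs(rstandard(mod)) > 2 | hatvalues(mod) > 2*(1+1)/n)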
# reloading the data and creating a new dataset that includes more explanatory variables
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)
opposition<-fec%>%
  filter(support_oppose_indicator=="O", report_year>=2013, candidate_office=="P",
         candidate_id!="P80002801",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Opp=sum(expenditure_amount, na.rm = TRUE),
            n_Opp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
support<-fec%>%
  filter(support_oppose_indicator=="S", report_year>=2013, candidate_office=="P",
         candidate_id!="P00547984",
         candidate_id!="P20002721",
         candidate_id!="P60007671",
         candidate_id!="P60007895",
         candidate_id!="P60009354",
         candidate_id!="P60019239",
         candidate_id!="P60021102",
         candidate_id!="P60022118",
         candidate_id!="P60023215",
         candidate_id!="P80003353",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Supp=sum(expenditure_amount, na.rm = TRUE),
            n_Supp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
tog<-opposition%>%
  inner_join(support)%>%
  mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")
# calculating the regression coefficients from scratch via the normal equations
YMat<-tog$expenditure_amount_Opp
XMat<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])
betaMat<-solve(t(XMat)%*%XMat)%*%t(XMat)%*%YMat
betaMat
## [,1]
## [1,] -6.107048e+08
## [2,] 3.031536e+05
## [3,] 1.649807e+03
## [4,] 6.834127e+03
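# note: betaMat solves the normal equations (X'X) beta = X'y; solving the linear
# system directly is more numerically stable than forming the explicit inverse
# (a sketch, equivalent up to floating point):
betaMat_alt <- solve(t(XMat)%*%XMat, t(XMat)%*%YMat)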
# residuals, residual mean square, and coefficient standard errors
eMat<-YMat-(XMat%*%betaMat)
n<-dim(tog)[1]
p<-3
ms_res<-sum(eMat^2)/(n-p-1)
covMat<-ms_res*solve(t(XMat)%*%XMat)
err<-sqrt(diag(covMat))
err
## [1] 1.051444e+09 5.217172e+05 5.538653e+01 7.356215e+01
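# these standard errors are the square roots of the diagonal of
# Var(beta_hat) = MS_res * (X'X)^-1, the covMat computed above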
# and finally, using lm() to verify the results
mlr<-lm(tog$expenditure_amount_Opp~tog$report_year+tog$n_Supp+tog$n_Opp)
summary(mlr)
##
## Call:
## lm(formula = tog$expenditure_amount_Opp ~ tog$report_year + tog$n_Supp +
## tog$n_Opp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2718176 -689041 -391122 -158051 5863348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.107e+08 1.051e+09 -0.581 0.566
## tog$report_year 3.032e+05 5.217e+05 0.581 0.566
## tog$n_Supp 1.650e+03 5.539e+01 29.787 <2e-16 ***
## tog$n_Opp 6.834e+03 7.356e+01 92.903 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1839000 on 26 degrees of freedom
## Multiple R-squared: 0.9975, Adjusted R-squared: 0.9972
## F-statistic: 3398 on 3 and 26 DF, p-value: < 2.2e-16
# finding the F-statistic from scratch
response_matrix<-tog$expenditure_amount_Opp
design_matrix<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])
betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix
hat_matrix<-design_matrix%*%solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)
y_hat_matrix<-design_matrix%*%betaMat
error_vector<-response_matrix-y_hat_matrix
n<-dim(tog)[1]
p<-3
ms_res<-sum(error_vector^2)/(n-p-1)
covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)
err<-sqrt(diag(covMat))
I<-diag(dim(tog)[1])
ss_res<-t(response_matrix)%*%(I-hat_matrix)%*%response_matrix
ones<-matrix(c(rep(1, dim(tog)[1])),
             nrow=dim(tog)[1])
J<-ones%*%t(ones)
ss_tot<-t(response_matrix)%*%(I-(J/n))%*%response_matrix
ss_reg<-t(response_matrix)%*%(hat_matrix-(J/n))%*%response_matrix
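# these quadratic forms implement the ANOVA decomposition ss_tot = ss_reg + ss_res,
# with hat_matrix projecting onto the column space of the design matrix and J/n
# averaging the response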
ms_res<-ss_res/(n-p-1)
ms_reg<-ss_reg/p
f_stat<-ms_reg/ms_res
f_stat
## [,1]
## [1,] 3398.157
# now finding the R-squared value
r_squared<-ss_reg/ss_tot
r_squared
## [,1]
## [1,] 0.9974561
p_value<-pf(f_stat, p, n-p-1, lower.tail = FALSE)
# p value
p_value
## [,1]
## [1,] 7.810158e-34
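# cross-check (a sketch): summary() stores the same F statistic and its degrees
# of freedom, so the p-value can be reproduced directly from the fitted model
f_check <- summary(mlr)$fstatistic # c(value, numdf, dendf)
p_check <- pf(f_check[1], f_check[2], f_check[3], lower.tail = FALSE)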
# forward selection to determine a better model
p<-1
n<-dim(tog)[1]
alpha <- 0.05
this.df<-n-p-1
# rejection region: a predictor enters only if its F statistic exceeds this critical value
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.195972
mdl1<-lm(expenditure_amount_Opp~report_year, data = tog)
anova1<-anova(mdl1) # 1.467
anova1$`F value`[1]
## [1] 1.466979
mdl2<-lm(expenditure_amount_Opp~n_Supp, data = tog)
anova2<-anova(mdl2) # 4.0503
anova2$`F value`[1]
## [1] 4.050264
mdl3<-lm(expenditure_amount_Opp~n_Opp, data = tog)
anova3<-anova(mdl3) # 274.65
anova3$`F value`[1]
## [1] 274.6539
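# n_Opp has by far the largest F statistic (274.65 > 4.196), so it enters the model first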
# determining the next rejection region and predictors to use
p<-2
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.210008
mdl1a<-lm(expenditure_amount_Opp~n_Opp+report_year, data = tog)
anova1a<-anova(mdl1a) # 0.9541
anova1a$`F value`[2]
## [1] 0.9540721
mdl2a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova2a<-anova(mdl2a) # 942.33
anova2a$`F value`[2]
## [1] 942.3254
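# the partial F for n_Supp (942.33) also exceeds the critical value (4.21), so it joins n_Opp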
# and finally the last rejection region and forward selection process
p<-3
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.225201
mdl3a<-lm(expenditure_amount_Opp~n_Opp+n_Supp+report_year, data = tog)
anova3a<-anova(mdl3a)
anova3a$`F value`[3]
## [1] 0.3376409
# the partial F for report_year (0.338) falls below the critical value (4.225), so we can drop report_year, and thus the final model is:
mod_fin<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
# finding possible instances of multicollinearity using pairs plots and vif()
vif(lm(expenditure_amount_Opp~n_Opp+n_Supp+report_year, data = tog))
## n_Opp n_Supp report_year
## 1.032882 1.032798 1.062175
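# all three VIFs are close to 1 (well below the common rule-of-thumb cutoff of
# 5 to 10), so multicollinearity is not a concern for these predictors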
# pairs() cannot plot the character variable candidate_id, and report_year is no longer in the model, so both are dropped before plotting
tog<-tog[-c(1:2)]
pairs(tog)

# rebuilding tog with all of its columns for the visualizations below
tog<-opposition%>%
  inner_join(support)%>%
  mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")
# code to show various visualizations
ggplot(tog, aes(x = n_Opp, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)+
  facet_wrap(~report_year)
## `geom_smooth()` using formula 'y ~ x'

ggplot(tog, aes(x = n_Supp, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)+
  facet_wrap(~report_year)
## `geom_smooth()` using formula 'y ~ x'

ggplot(tog, aes(x = report_year, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

fec_s3d<-scatterplot3d(x=tog$n_Opp, y=tog$n_Supp, z=tog$expenditure_amount_Opp,
                       pch=16, angle = 60, color = tog$report_year)
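# plane3d() accepts a fitted lm object and overlays the plane implied by
# mod_fin (expenditure_amount_Opp ~ n_Opp + n_Supp) on the point cloud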
fec_s3d$plane3d(mod_fin)
