# loading in the data
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)

# calling in the appropriate packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.1
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(scatterplot3d)


# subsetting the data to filter for oppositional expenditures after (and including) 2013 for the presidential election
opposition<-fec%>%
  filter(support_oppose_indicator=="O", report_year>="2013", candidate_office=="P",
         candidate_id!="P80002801",
         candidate_id!="")%>%
  group_by(candidate_id)%>%
  summarise(expenditure_amount=sum(expenditure_amount, na.rm = TRUE))

# subsetting the data to filter for supportive expenditures after (and including) 2013 for the presidential election
support<-fec%>%
  filter(support_oppose_indicator=="S", report_year>="2013", candidate_office=="P",
         candidate_id!="P00547984",
         candidate_id!="P20002721",
         candidate_id!="P60007671",
         candidate_id!="P60007895",
         candidate_id!="P60009354",
         candidate_id!="P60019239",
         candidate_id!="P60021102",
         candidate_id!="P60022118",
         candidate_id!="P60023215",
         candidate_id!="P80003353",
         candidate_id!="")%>%
  group_by(candidate_id)%>%
  summarise(expenditure_amount=sum(expenditure_amount, na.rm = TRUE))

# combining the two sets and removing duplicate columns
fec<-cbind(support, opposition)
fec$opp_amt<-fec[,4]
fec$sup_amt<-fec[,2]
fec2<-fec[-c(2:4)]

# naming variables to make calling them a little easier
x<-fec2$sup_amt
y<-fec2$opp_amt

# using ggplot, this is a visualization of the possible correlation between the two variables
ggplot(fec2, aes(x = sup_amt, y = opp_amt, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

# finding the intercept and slope coefficients from scratch
x2 <- sum((x-mean(x))*(y-mean(y)))
x3 <- sum((x-mean(x))^2)

beta_1 <- x2/x3

beta_0 <- mean(y)-(beta_1*mean(x))

beta_1
## [1] 0.9537725
beta_0
## [1] -4679743
# verifying using lm()
mod<-lm(y~x)
mod$coefficients
##   (Intercept)             x 
## -4.679743e+06  9.537725e-01
#Stating the reference distribution, degrees of freedom, the test statistic, and the p-value
n<-dim(fec2)[1]
beta_1<-mod$coefficients[2]

ss_res <- sum(mod$residuals^2)

ms_res<- ss_res/(n-2)
se_b1 <- sqrt(ms_res)/sqrt(sum((x-mean(x))^2))

ref <- (beta_1-0)/se_b1 # This is the reference distribution

df <- (n-2) # This is how many degrees of freedom
df
## [1] 17
# The test statistic is shown as the following:

t_stat <- beta_1/se_b1 # This is the same as the reference distribution.

t_stat
##        x 
## 6.777593
p_value<-pt(abs(t_stat), df = n-2, lower.tail = FALSE)*2 # This is the P-value for the model for a two sided (non-directional) test.

p_value
##            x 
## 3.226803e-06
# Verifying using summary()

summary(mod)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -75146430  -2864893   1345945   3672020  44668373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.680e+06  6.098e+06  -0.767    0.453    
## x            9.538e-01  1.407e-01   6.778 3.23e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23450000 on 17 degrees of freedom
## Multiple R-squared:  0.7299, Adjusted R-squared:  0.714 
## F-statistic: 45.94 on 1 and 17 DF,  p-value: 3.227e-06
# Finding potential outliers in the data
plot(mod)

# reloading in the data and creating a new dataset that includes more explanatory variables
fec<-read.csv("fec_independent_expenditures.csv", header = TRUE)
opposition<-fec%>%
  filter(support_oppose_indicator=="O", report_year>="2013", candidate_office=="P",
         candidate_id!="P80002801",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Opp=sum(expenditure_amount, na.rm = TRUE),
            n_Opp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
support<-fec%>%
  filter(support_oppose_indicator=="S", report_year>="2013", candidate_office=="P",
         candidate_id!="P00547984",
         candidate_id!="P20002721",
         candidate_id!="P60007671",
         candidate_id!="P60007895",
         candidate_id!="P60009354",
         candidate_id!="P60019239",
         candidate_id!="P60021102",
         candidate_id!="P60022118",
         candidate_id!="P60023215",
         candidate_id!="P80003353",
         candidate_id!="")%>%
  group_by(candidate_id, report_year)%>%
  summarise(expenditure_amount_Supp=sum(expenditure_amount, na.rm = TRUE),
            n_Supp=n())
## `summarise()` has grouped output by 'candidate_id'. You can override using the `.groups` argument.
tog<-opposition%>%
  inner_join(support)%>%
  mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")
# calculating the slope coeffecients from scratch
YMat<-tog$expenditure_amount_Opp
XMat<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])

betaMat<-solve(t(XMat)%*%XMat)%*%t(XMat)%*%YMat

betaMat
##               [,1]
## [1,] -6.107048e+08
## [2,]  3.031536e+05
## [3,]  1.649807e+03
## [4,]  6.834127e+03
# error terms
eMat<-YMat-(XMat%*%betaMat)
n<-dim(tog)[1]
p<-3
ms_res<-sum(eMat^2)/(n-p-1)
covMat<-ms_res*solve(t(XMat)%*%XMat)
err<-sqrt(diag(covMat))
err
## [1] 1.051444e+09 5.217172e+05 5.538653e+01 7.356215e+01
# and finally, using lm() to verify the results
mlr<-lm(tog$expenditure_amount_Opp~tog$report_year+tog$n_Supp+tog$n_Opp)
summary(mlr)
## 
## Call:
## lm(formula = tog$expenditure_amount_Opp ~ tog$report_year + tog$n_Supp + 
##     tog$n_Opp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2718176  -689041  -391122  -158051  5863348 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6.107e+08  1.051e+09  -0.581    0.566    
## tog$report_year  3.032e+05  5.217e+05   0.581    0.566    
## tog$n_Supp       1.650e+03  5.539e+01  29.787   <2e-16 ***
## tog$n_Opp        6.834e+03  7.356e+01  92.903   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1839000 on 26 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.9972 
## F-statistic:  3398 on 3 and 26 DF,  p-value: < 2.2e-16
# finding the f-statistic from scratch
response_matrix<-tog$expenditure_amount_Opp

design_matrix<-matrix(c(rep(1, dim(tog)[1]), tog$report_year, tog$n_Supp, tog$n_Opp), nrow=dim(tog)[1])

betaMat<-solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)%*%response_matrix

hat_matrix<-design_matrix%*%solve(t(design_matrix)%*%design_matrix)%*%t(design_matrix)

y_hat_matrix<-design_matrix%*%betaMat

error_vector<-response_matrix-y_hat_matrix

n<-dim(tog)[1]

p<-3

ms_res<-sum(error_vector^2)/(n-p-1)

covMat<-ms_res*solve(t(design_matrix)%*%design_matrix)

err<-sqrt(diag(covMat))

I<-diag(dim(tog)[1])

ss_res<-t(response_matrix)%*%(I-hat_matrix)%*%response_matrix

ones<-matrix(c(rep(1, dim(tog)[1])), 
             nrow=dim(tog)[1])

J<-ones%*%t(ones)

ss_tot<-t(response_matrix)%*%(I-(J/n))%*%response_matrix

ss_reg<-t(response_matrix)%*%(hat_matrix-(J/n))%*%response_matrix

ms_res<-ss_res/(n-p-1)

ms_reg<-ss_reg/p

f_stat<-ms_reg/ms_res

f_stat
##          [,1]
## [1,] 3398.157
# now finding the r squared value
r_squared<-ss_reg/ss_tot
r_squared
##           [,1]
## [1,] 0.9974561
p_value<-pf(f_stat, p, n-p-1, lower.tail = FALSE)

# p value
p_value
##              [,1]
## [1,] 7.810158e-34
# forward selection to determine a better model
p<-1
n<-dim(tog)[1]
alpha <- 0.05
this.df<-n-p-1

# Rejection Region
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.195972
mdl1<-lm(expenditure_amount_Opp~report_year, data = tog)
anova1<-anova(mdl1) # 1.467
anova1$`F value`[1]
## [1] 1.466979
mdl2<-lm(expenditure_amount_Opp~n_Supp, data = tog)
anova2<-anova(mdl2) # 4.0503
anova2$`F value`[1]
## [1] 4.050264
mdl3<-lm(expenditure_amount_Opp~n_Opp, data = tog)
anova3<-anova(mdl3) # 274.65
anova3$`F value`[1]
## [1] 274.6539
# determining the next rejection region and predictors to use
p<-2
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.210008
mdl1a<-lm(expenditure_amount_Opp~n_Opp+report_year, data = tog)
anova1a<-anova(mdl1a) # 0.9541
anova1a$`F value`[2]
## [1] 0.9540721
mdl2a<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
anova2a<-anova(mdl2a) # 942.33
anova2a$`F value`[2]
## [1] 942.3254
# and finally the last rejection region and forward selection process

p<-3
this.df<-n-p-1
qf(alpha, df1 = 1, df2 = this.df, lower.tail = FALSE)
## [1] 4.225201
mdl3a<-lm(expenditure_amount_Opp~n_Opp+n_Supp+report_year, data = tog)
anova3a<-anova(mdl3a)
anova3a$`F value`[3]
## [1] 0.3376409
# based on these tests, we can drop report_year, and thus the final model is:

mod_fin<-lm(expenditure_amount_Opp~n_Opp+n_Supp, data = tog)
# finding possible instances of multicollinearity using pairs plots and vif()
vif(lm(expenditure_amount_Opp~n_Opp+n_Supp+report_year, data = tog))
##       n_Opp      n_Supp report_year 
##    1.032882    1.032798    1.062175
# because pairs() cannot work with categorical variables, and report_year is no longer in the model, they are dropped from the final dataset
tog<-tog[-c(1:2)]

pairs(tog)

tog<-opposition%>%
  inner_join(support)%>%
  mutate(ratio=n_Opp/n_Supp)
## Joining, by = c("candidate_id", "report_year")
# code to show various visualizations
ggplot(tog, aes(x = n_Opp, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)+
  facet_wrap(tog$report_year)
## `geom_smooth()` using formula 'y ~ x'

ggplot(tog, aes(x = n_Supp, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)+
  facet_wrap(tog$report_year)
## `geom_smooth()` using formula 'y ~ x'

ggplot(tog, aes(x = report_year, y = expenditure_amount_Opp, color = candidate_id))+
  geom_point()+
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

fec_s3d<-scatterplot3d(x=tog$n_Opp, y=tog$n_Supp, z=tog$expenditure_amount_Opp, 
                    pch=16, angle = 60, color = tog$report_year)

fec_s3d$plane3d(mod_fin)