library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidycensus)
# Location of the presidential election results CSV in the
# fivethirtyeight/election-results GitHub repository.
url <- "https://raw.githubusercontent.com/fivethirtyeight/election-results/main/election_results_presidential.csv"

# Download and parse the file; readr guesses the column types.
presidential_elections <- read_csv(file = url)
## Rows: 7423 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): state_abbrev, state, office_name, stage, party, candidate_name, bal...
## dbl (8): id, race_id, office_id, cycle, politician_id, candidate_id, votes, ...
## lgl (4): office_seat_name, special, unopposed, winner
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Q1
# 2020 general-election rows that carry a non-missing state.
# Conditions passed to a single filter() are combined with AND.
pres_2020 <- filter(
  presidential_elections,
  cycle == 2020,
  stage == "general",
  !is.na(state)
)
# Tag every row with the number of rows (candidates) recorded for its
# state, then sort so the states with the most candidates come first.
# The result remains grouped by state.
counts_mutated <- pres_2020 |>
  group_by(state) |>
  mutate(`number of candidates` = n()) |>
  arrange(desc(`number of candidates`))
# Exploratory view: rows whose ballot_party is not "W".
# Rows with a missing ballot_party are dropped as well, because
# NA != "W" evaluates to NA and filter() keeps only TRUE.
filter(pres_2020, ballot_party != "W")

# Exploratory view: only the listed congressional-district rows.
# %in% is TRUE wherever `state` matches an element of the right-hand vector.
filter(
  pres_2020,
  state %in% c("Nebraska CD-1", "Nebraska CD-2", "Maine CD-1", "Maine CD-2")
)

# Exploratory view: everything except rows whose state contains "CD-"
# followed by a digit (str_detect() does the pattern match).
filter(pres_2020, !str_detect(state, "CD-[0-9]"))
# Votes per state for Joe Biden, Donald Trump and all other candidates
# combined, plus each state's total across those three categories.
# One row per state/vote_type pair; the result stays grouped by state.
party_totals <- pres_2020 |>
  # remove split electoral counts (rows whose state contains "CD-<digit>")
  filter(!str_detect(state, "CD-[0-9]")) |>
  # create a three category variable for party vote shares;
  # anything that is not one of the two named candidates becomes "other"
  mutate(
    vote_type = case_when(
      candidate_name == "Joe Biden" ~ "Joe Biden",
      candidate_name == "Donald Trump" ~ "Donald Trump",
      .default = "other"
    )
  ) |>
  # get the total votes for each group
  group_by(state, vote_type) |>
  # summarise() drops the last grouping level, leaving the data grouped
  # by state (it reports this in a message)
  summarise(votes = sum(votes)) |>
  # now get the total number of votes cast in each state
  group_by(state) |>
  mutate(total = sum(votes))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
# Preview the wide layout: one row per state, one column per vote_type.
pivot_wider(party_totals, names_from = vote_type, values_from = votes)

# Same reshape, then discard the "total" and "other" columns so only the
# state and the two named candidates' vote counts remain.
final_results <- party_totals |>
  pivot_wider(names_from = vote_type, values_from = votes) |>
  select(!c(total, other))

print(final_results)
## # A tibble: 51 × 3
## # Groups: state [51]
## state `Donald Trump` `Joe Biden`
## <chr> <dbl> <dbl>
## 1 Alabama 1441170 849624
## 2 Alaska 189951 153778
## 3 Arizona 1661686 1672143
## 4 Arkansas 760647 423932
## 5 California 6006429 11110250
## 6 Colorado 1364607 1804352
## 7 Connecticut 714717 1080831
## 8 Delaware 200603 296268
## 9 District of Columbia 18586 317323
## 10 Florida 5668731 5297045
## # ℹ 41 more rows
Q2
# Biden's share of the two-candidate (Biden + Trump) vote in each state.
biden_share <- mutate(
  final_results,
  biden_vote_share = `Joe Biden` / (`Joe Biden` + `Donald Trump`)
)

print(biden_share)
## # A tibble: 51 × 4
## # Groups: state [51]
## state `Donald Trump` `Joe Biden` biden_vote_share
## <chr> <dbl> <dbl> <dbl>
## 1 Alabama 1441170 849624 0.371
## 2 Alaska 189951 153778 0.447
## 3 Arizona 1661686 1672143 0.502
## 4 Arkansas 760647 423932 0.358
## 5 California 6006429 11110250 0.649
## 6 Colorado 1364607 1804352 0.569
## 7 Connecticut 714717 1080831 0.602
## 8 Delaware 200603 296268 0.596
## 9 District of Columbia 18586 317323 0.945
## 10 Florida 5668731 5297045 0.483
## # ℹ 41 more rows
Q3
# Request ACS variable B19013_001 for every state for 2020; the named
# vector makes the `variable` column read "median_income".
median_income <- get_acs(
  geography = "state",
  variables = c(median_income = "B19013_001"),
  year = 2020
)
## Getting data from the 2016-2020 5-year ACS
# Attach the ACS income columns to each state's vote-share row.
income_and_votes <- biden_share |>
  # States are matched by name (`state` here, `NAME` in the ACS table).
  # Each state should match at most one income row and vice versa, so ask
  # dplyr to raise an error instead of silently duplicating rows if the
  # state names ever stop being unique on either side.
  left_join(
    median_income,
    by = join_by(state == NAME),
    relationship = "one-to-one"
  )

income_and_votes
## # A tibble: 51 × 8
## # Groups: state [51]
## state `Donald Trump` `Joe Biden` biden_vote_share GEOID variable estimate
## <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 Alabama 1441170 849624 0.371 01 median_… 52035
## 2 Alaska 189951 153778 0.447 02 median_… 77790
## 3 Arizona 1661686 1672143 0.502 04 median_… 61529
## 4 Arkansas 760647 423932 0.358 05 median_… 49475
## 5 Californ… 6006429 11110250 0.649 06 median_… 78672
## 6 Colorado 1364607 1804352 0.569 08 median_… 75231
## 7 Connecti… 714717 1080831 0.602 09 median_… 79855
## 8 Delaware 200603 296268 0.596 10 median_… 69110
## 9 District… 18586 317323 0.945 11 median_… 90842
## 10 Florida 5668731 5297045 0.483 12 median_… 57703
## # ℹ 41 more rows
## # ℹ 1 more variable: moe <dbl>
Q4
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter plot of Biden's vote share against the income estimate.
# `label = state` is mapped but not drawn by geom_point(); presumably it
# is there so the state name shows up in the interactive plotly version.
biden_plot <- ggplot(
  data = income_and_votes,
  mapping = aes(x = estimate, y = biden_vote_share, label = state)
) +
  geom_point()

# Convert the static ggplot into an interactive plotly widget.
ggplotly(biden_plot)
# Simple linear regression of Biden's vote share on the income estimate.
biden_model <- lm(
  formula = biden_vote_share ~ estimate,
  data = income_and_votes
)

summary(biden_model)
##
## Call:
## lm(formula = biden_vote_share ~ estimate, data = income_and_votes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.223857 -0.034904 -0.002384 0.044009 0.241841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.203e-02 7.327e-02 -0.301 0.765
## estimate 7.979e-06 1.111e-06 7.183 3.45e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08681 on 49 degrees of freedom
## Multiple R-squared: 0.5129, Adjusted R-squared: 0.503
## F-statistic: 51.6 on 1 and 49 DF, p-value: 3.453e-09
Median income has a statistically significant, positive relationship
with Biden’s vote share. According to the regression model, each $1,000
increase in a state’s median income is associated with an increase of
approximately 0.007979 in Biden’s vote share, i.e. about 0.8 percentage
points. According to the adjusted R-squared value, approximately 50.3% of
the variation in Biden’s vote share can be explained by median income.
The residual standard error of 0.08681 indicates that the model’s
predictions are typically off by about 8.7 percentage points. While the
model is statistically significant and provides useful insights, roughly
half of the variation in vote share remains unexplained; its explanatory
power could be bolstered by the addition of other demographic variables.