library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidycensus)
url<-"https://raw.githubusercontent.com/fivethirtyeight/election-results/main/election_results_presidential.csv"
presidential_elections<-read_csv(url)
## Rows: 7423 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): state_abbrev, state, office_name, stage, party, candidate_name, bal...
## dbl (8): id, race_id, office_id, cycle, politician_id, candidate_id, votes, ...
## lgl (4): office_seat_name, special, unopposed, winner
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Q1

# Keep only 2020 general-election rows that carry a state value; rows whose
# `state` is missing are dropped.
pres_2020 <- presidential_elections |>
  filter(
    cycle == 2020,
    stage == "general",
    !is.na(state)
  )

# Attach to every row the number of rows recorded for its state, then sort so
# the states with the most rows come first. The result stays grouped by state.
counts_mutated <- pres_2020 |>
  group_by(state) |>
  mutate(`number of candidates` = n()) |>
  arrange(desc(`number of candidates`))

# Rows whose ballot party is anything other than "W". filter() also drops rows
# where the comparison evaluates to NA.
pres_2020 |>
  filter(ballot_party != "W")

# Only the four district-level entries: %in% is TRUE when `state` matches any
# element of the vector on the right-hand side.
pres_2020 |>
  filter(
    state %in% c("Nebraska CD-1", "Nebraska CD-2", "Maine CD-1", "Maine CD-2")
  )

# The complement of the above, found by pattern instead of by name: drop every
# row whose `state` contains "CD-" followed by a digit.
pres_2020 |>
  filter(!str_detect(state, "CD-[0-9]"))

# Votes per state split into three buckets (Joe Biden, Donald Trump, other),
# with the state-wide total repeated on each row. The result is grouped by
# state.
party_totals <- pres_2020 |>
  # remove split electoral counts (the "CD-<digit>" district rows)
  filter(!str_detect(state, "CD-[0-9]")) |>
  # collapse every candidate other than the two named ones into "other"
  mutate(
    vote_type = case_when(
      candidate_name == "Joe Biden" ~ "Joe Biden",
      candidate_name == "Donald Trump" ~ "Donald Trump",
      .default = "other"
    )
  ) |>
  # total votes for each bucket within each state
  group_by(state, vote_type) |>
  summarise(votes = sum(votes)) |>
  # total votes cast in each state, across all three buckets
  group_by(state) |>
  mutate(total = sum(votes))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
# Preview the wide layout: one row per state, one column per vote_type.
party_totals |>
  pivot_wider(names_from = vote_type, values_from = votes)

# Same reshape, then discard the "total" and "other" columns so that only the
# Biden and Trump vote counts remain alongside the state.
final_results <- party_totals |>
  pivot_wider(names_from = vote_type, values_from = votes) |>
  select(!c(total, other))

print(final_results)
## # A tibble: 51 × 3
## # Groups:   state [51]
##    state                `Donald Trump` `Joe Biden`
##    <chr>                         <dbl>       <dbl>
##  1 Alabama                     1441170      849624
##  2 Alaska                       189951      153778
##  3 Arizona                     1661686     1672143
##  4 Arkansas                     760647      423932
##  5 California                  6006429    11110250
##  6 Colorado                    1364607     1804352
##  7 Connecticut                  714717     1080831
##  8 Delaware                     200603      296268
##  9 District of Columbia          18586      317323
## 10 Florida                     5668731     5297045
## # ℹ 41 more rows

Q2

# Biden's share of the two-candidate (Biden + Trump) vote in each state;
# votes for other candidates are not part of the denominator.
biden_share <- mutate(
  final_results,
  biden_vote_share = `Joe Biden` / (`Joe Biden` + `Donald Trump`)
)

print(biden_share)
## # A tibble: 51 × 4
## # Groups:   state [51]
##    state                `Donald Trump` `Joe Biden` biden_vote_share
##    <chr>                         <dbl>       <dbl>            <dbl>
##  1 Alabama                     1441170      849624            0.371
##  2 Alaska                       189951      153778            0.447
##  3 Arizona                     1661686     1672143            0.502
##  4 Arkansas                     760647      423932            0.358
##  5 California                  6006429    11110250            0.649
##  6 Colorado                    1364607     1804352            0.569
##  7 Connecticut                  714717     1080831            0.602
##  8 Delaware                     200603      296268            0.596
##  9 District of Columbia          18586      317323            0.945
## 10 Florida                     5668731     5297045            0.483
## # ℹ 41 more rows

Q3

# Request ACS variable B19013_001 at the state level for 2020; the named
# vector labels it "median_income" in the returned `variable` column.
median_income <- get_acs(
  geography = "state",
  variables = c(median_income = "B19013_001"),
  year = 2020
)
## Getting data from the 2016-2020 5-year ACS
# Attach each state's ACS estimate to its vote figures by matching the
# election `state` name against the ACS `NAME` column. Every row of
# `biden_share` is kept (left join); a state with no ACS match gets NA.
#
# Each state is expected to match at most one ACS row, so the join is declared
# one-to-one: dplyr then raises an error instead of silently duplicating
# states if either table ever contains a repeated key.
income_and_votes <- biden_share |>
  left_join(
    median_income,
    by = join_by(state == NAME),
    relationship = "one-to-one"
  )

income_and_votes
## # A tibble: 51 × 8
## # Groups:   state [51]
##    state     `Donald Trump` `Joe Biden` biden_vote_share GEOID variable estimate
##    <chr>              <dbl>       <dbl>            <dbl> <chr> <chr>       <dbl>
##  1 Alabama          1441170      849624            0.371 01    median_…    52035
##  2 Alaska            189951      153778            0.447 02    median_…    77790
##  3 Arizona          1661686     1672143            0.502 04    median_…    61529
##  4 Arkansas          760647      423932            0.358 05    median_…    49475
##  5 Californ…        6006429    11110250            0.649 06    median_…    78672
##  6 Colorado         1364607     1804352            0.569 08    median_…    75231
##  7 Connecti…         714717     1080831            0.602 09    median_…    79855
##  8 Delaware          200603      296268            0.596 10    median_…    69110
##  9 District…          18586      317323            0.945 11    median_…    90842
## 10 Florida          5668731     5297045            0.483 12    median_…    57703
## # ℹ 41 more rows
## # ℹ 1 more variable: moe <dbl>

Q4

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter plot of Biden's vote share against the ACS income estimate, one
# point per state. `label = state` is mapped but not drawn by geom_point();
# presumably it is there so the state name shows in the ggplotly tooltip.
biden_plot <- ggplot(
  income_and_votes,
  aes(x = estimate, y = biden_vote_share, label = state)
) +
  geom_point()
ggplotly(biden_plot)

# Simple linear regression of Biden's vote share on the income estimate.
biden_model <- lm(biden_vote_share ~ estimate, data = income_and_votes)
summary(biden_model)
## 
## Call:
## lm(formula = biden_vote_share ~ estimate, data = income_and_votes)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.223857 -0.034904 -0.002384  0.044009  0.241841 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.203e-02  7.327e-02  -0.301    0.765    
## estimate     7.979e-06  1.111e-06   7.183 3.45e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08681 on 49 degrees of freedom
## Multiple R-squared:  0.5129, Adjusted R-squared:  0.503 
## F-statistic:  51.6 on 1 and 49 DF,  p-value: 3.453e-09

Median income has a statistically significant, positive relationship with Biden’s vote share (p < 0.001). According to the regression model, each $1,000 increase in median income is associated with an increase of approximately 0.007979 in Biden’s vote share, or about 0.8 percentage points. According to the adjusted R-squared value, approximately 50.3% of the variation in Biden’s vote share across states can be explained by median income. The residual standard error of 0.08681 indicates that the model’s predictions are typically off by about 8.7 percentage points. While the model is statistically significant and provides useful insights, roughly half of the variation remains unexplained, so its explanatory power could be bolstered by the addition of other demographic variables.