library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(stats)
file_path <- "../data/propsix.xlsx" # Path to file
data <- read_excel(file_path, sheet = "Prop6_keyvalues") # Import file, sheet 1
#standardize header names
data <- data %>%
clean_names()
head(data)
## # A tibble: 6 × 13
## geography cofips county prop6_result prop6_margin_ppts median_household_inc…¹
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 0500000US… 06001 Alame… YES 18 122488
## 2 0500000US… 06003 Alpine YES 4 101125
## 3 0500000US… 06005 Amador NO -44 74853
## 4 0500000US… 06007 Butte NO -22 66085
## 5 0500000US… 06009 Calav… NO -42 77526
## 6 0500000US… 06011 Colusa NO -48 69619
## # ℹ abbreviated name: ¹median_household_income
## # ℹ 7 more variables: percent_income_below_the_poverty_level <dbl>,
## # pct_black <dbl>, pct_hispanic <dbl>, percent_non_white <dbl>,
## # total_adult_imprisonments_per_100_000_population_age_18_69_2016 <dbl>,
## # margin_harris_negative <dbl>, pct_collegegrads <dbl>
summary(data) #can view quartile ranges here
## geography cofips county prop6_result
## Length:58 Length:58 Length:58 Length:58
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## prop6_margin_ppts median_household_income
## Min. :-56.00 Min. : 47317
## 1st Qu.:-37.50 1st Qu.: 64143
## Median :-22.00 Median : 76148
## Mean :-20.14 Mean : 82967
## 3rd Qu.: -6.00 3rd Qu.: 98694
## Max. : 34.00 Max. :153792
## percent_income_below_the_poverty_level pct_black pct_hispanic
## Min. : 6.40 Min. :0.00000 Min. :0.0754
## 1st Qu.:10.43 1st Qu.:0.01153 1st Qu.:0.1535
## Median :13.10 Median :0.01747 Median :0.2682
## Mean :13.18 Mean :0.02860 Mean :0.3172
## 3rd Qu.:16.20 3rd Qu.:0.03223 3rd Qu.:0.4587
## Max. :22.00 Max. :0.12944 Max. :0.8541
## percent_non_white
## Min. :0.1337
## 1st Qu.:0.3116
## Median :0.4998
## Mean :0.4844
## 3rd Qu.:0.6730
## Max. :0.9043
## total_adult_imprisonments_per_100_000_population_age_18_69_2016
## Min. : 126.6
## 1st Qu.: 304.9
## Median : 471.4
## Mean : 484.0
## 3rd Qu.: 620.1
## Max. :1338.1
## margin_harris_negative pct_collegegrads
## Min. :-65.000 Min. :0.2113
## 1st Qu.:-27.000 1st Qu.:0.2985
## Median : 1.700 Median :0.3508
## Mean : -2.958 Mean :0.3840
## 3rd Qu.: 21.000 3rd Qu.:0.4808
## Max. : 54.000 Max. :0.6745
# Identify missing values - there are none
data %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 13
## geography cofips county prop6_result prop6_margin_ppts median_household_income
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0
## # ℹ 7 more variables: percent_income_below_the_poverty_level <int>,
## # pct_black <int>, pct_hispanic <int>, percent_non_white <int>,
## # total_adult_imprisonments_per_100_000_population_age_18_69_2016 <int>,
## # margin_harris_negative <int>, pct_collegegrads <int>
By splitting our numeric variables into quartiles, we can look at whether counties in each buckets had different levels of support for Prop 6.
*** NOTE – NOT DONE YET ***
# List of columns to split into quartiles - all numeric variables
columns_to_quartile <- c(
"median_household_income",
"percent_income_below_the_poverty_level",
"pct_black",
"pct_hispanic",
"percent_non_white",
"total_adult_imprisonments_per_100_000_population_age_18_69_2016",
"margin_harris_negative",
"pct_collegegrads"
)
# Add new quartile columns for each variable
data <- data %>%
mutate(across(all_of(columns_to_quartile),
~ ntile(.x, 4),
.names = "{.col}_quartile"))
## NOTE - NOT DONE YET
Now we want to see whether the relationship between these variables is significant, so we are going to put them into a regression. The outcome variable we are interested in is the margin of victory for prop 6, where a passing vote for prop 6 is positive and a failing vote is negative.
#run regression on prop 6 results and presidential results
regression_harris <- lm(prop6_margin_ppts ~ margin_harris_negative, data=data)
summary(regression_harris) #super strong correlation, so we are running this one separately so it doesn't dwarf other relationships in full model
##
## Call:
## lm(formula = prop6_margin_ppts ~ margin_harris_negative, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8422 -3.2617 0.1281 3.5270 13.5149
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -22.07456 0.62015 -35.60 <2e-16 ***
## margin_harris_negative -0.65476 0.02112 -31.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.699 on 56 degrees of freedom
## Multiple R-squared: 0.945, Adjusted R-squared: 0.944
## F-statistic: 961.4 on 1 and 56 DF, p-value: < 2.2e-16
#run regression model on the remainder of the variables to see what else correlates
regression_full_model <- lm(
prop6_margin_ppts ~ median_household_income + percent_income_below_the_poverty_level + percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + pct_collegegrads, data=data
)
summary(regression_full_model)
##
## Call:
## lm(formula = prop6_margin_ppts ~ median_household_income + percent_income_below_the_poverty_level +
## percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 +
## pct_collegegrads, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2780 -7.0976 -0.2619 5.9943 29.0581
##
## Coefficients:
## Estimate
## (Intercept) -1.008e+02
## median_household_income -1.537e-04
## percent_income_below_the_poverty_level 1.152e+00
## percent_non_white 3.762e+01
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -1.154e-02
## pct_collegegrads 1.707e+02
## Std. Error
## (Intercept) 1.580e+01
## median_household_income 2.027e-04
## percent_income_below_the_poverty_level 6.754e-01
## percent_non_white 1.133e+01
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 7.982e-03
## pct_collegegrads 3.238e+01
## t value
## (Intercept) -6.379
## median_household_income -0.759
## percent_income_below_the_poverty_level 1.706
## percent_non_white 3.320
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -1.446
## pct_collegegrads 5.272
## Pr(>|t|)
## (Intercept) 4.82e-08 ***
## median_household_income 0.45152
## percent_income_below_the_poverty_level 0.09389 .
## percent_non_white 0.00165 **
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 0.15409
## pct_collegegrads 2.64e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.1 on 52 degrees of freedom
## Multiple R-squared: 0.7149, Adjusted R-squared: 0.6875
## F-statistic: 26.08 on 5 and 52 DF, p-value: 4.459e-13
# results - significant correlations for counties with higher percent of college grads and higher percent of nonwhite people
# college grads is definitely the most significant
What do these regressions tell us? Counties with more support for Kamala Harris is by far the strongest indicator of counties with more support for Prop 6, out of all the variables in our data. In addition, counties with more college graduates and a higher percentage of nonwhite people are also more likely to vote for prop 6.
Support for Prop 6 and Kamala Harris are highly positively correlated, but we know this is not the full explanation – while Harris won the majority of voters in California, the majority of voters voted against Prop 6. Which counties had the greatest different between how they voted for Harris and for Prop 6? Which demographic factors might play a role?
# Add new column to make vote share for Harris positive values
# Quick math assumes Trump and Kamala are the only candidates
data <- data %>%
mutate(harris_voteshare = (100 - margin_harris_negative) / 2)
# Add new column to make vote share for Prop 6 positive values
data <- data %>%
mutate(prop6_voteshare = (100 + prop6_margin_ppts) / 2)
# Add new column for Prop 6 minus Harris voteshare
# Higher values means counties with relatively more harris support compared to prop6 support
data <- data %>%
mutate(harris_minus_prop6 = harris_voteshare - prop6_voteshare)
# summary table showing these variables
prop6_harris_support <- data[,c('county','harris_minus_prop6','percent_non_white','pct_collegegrads','prop6_margin_ppts','prop6_voteshare','margin_harris_negative','harris_voteshare')]
# run regression on the difference between harris and prop6 support
regression_harris_minus_prop6 <- lm(
harris_minus_prop6 ~ median_household_income + percent_income_below_the_poverty_level + percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + pct_collegegrads, data=data
)
summary(regression_harris_minus_prop6)
##
## Call:
## lm(formula = harris_minus_prop6 ~ median_household_income + percent_income_below_the_poverty_level +
## percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 +
## pct_collegegrads, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6669 -1.6254 0.1209 1.7809 6.5754
##
## Coefficients:
## Estimate
## (Intercept) -2.560e+00
## median_household_income 5.010e-05
## percent_income_below_the_poverty_level -5.693e-02
## percent_non_white 7.377e+00
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -2.739e-03
## pct_collegegrads 2.202e+01
## Std. Error
## (Intercept) 4.822e+00
## median_household_income 6.187e-05
## percent_income_below_the_poverty_level 2.062e-01
## percent_non_white 3.459e+00
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 2.437e-03
## pct_collegegrads 9.885e+00
## t value
## (Intercept) -0.531
## median_household_income 0.810
## percent_income_below_the_poverty_level -0.276
## percent_non_white 2.133
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -1.124
## pct_collegegrads 2.227
## Pr(>|t|)
## (Intercept) 0.5978
## median_household_income 0.4217
## percent_income_below_the_poverty_level 0.7835
## percent_non_white 0.0377 *
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 0.2662
## pct_collegegrads 0.0303 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.388 on 52 degrees of freedom
## Multiple R-squared: 0.6656, Adjusted R-squared: 0.6334
## F-statistic: 20.7 on 5 and 52 DF, p-value: 2.559e-11
# results - significant correlations for counties with more college grads and greater pct nonwhite
What can we conclude? Counties with relatively more Harris support and less prop 6 support are more highly educated (more college grads) and have more non-white people in them. Importantly, this flips our thinking of these variables from our last regression. In other words: While counties with more college grads and a greater percent of non-white people did support Prop 6 more overall, they also showed a greater gap between Harris support and Prop 6 support. These counties help explain why California ended up voting for Harris while voting against Prop 6.
The data shows the biggest vote differentials were in bay area counties, so that region could be good for further exploration.
-- choropleth map of vote differential on the county level - will show concentrations in bay area counties
-- could repeat regression but at the precinct level for the bay area, comparing precinct support for harris and prop6 - possible explanation could be more Asian people in these areas are driving the difference in vote outcomes
-- go back and create summary tables for all the quartiles