library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(stats)

file_path <- "../data/propsix.xlsx" # Path to file
data <- read_excel(file_path, sheet = "Prop6_keyvalues") # Import file, sheet 1
#standardize header names
data <- data %>%
  clean_names()

head(data)
## # A tibble: 6 × 13
##   geography  cofips county prop6_result prop6_margin_ppts median_household_inc…¹
##   <chr>      <chr>  <chr>  <chr>                    <dbl>                  <dbl>
## 1 0500000US… 06001  Alame… YES                         18                 122488
## 2 0500000US… 06003  Alpine YES                          4                 101125
## 3 0500000US… 06005  Amador NO                         -44                  74853
## 4 0500000US… 06007  Butte  NO                         -22                  66085
## 5 0500000US… 06009  Calav… NO                         -42                  77526
## 6 0500000US… 06011  Colusa NO                         -48                  69619
## # ℹ abbreviated name: ¹​median_household_income
## # ℹ 7 more variables: percent_income_below_the_poverty_level <dbl>,
## #   pct_black <dbl>, pct_hispanic <dbl>, percent_non_white <dbl>,
## #   total_adult_imprisonments_per_100_000_population_age_18_69_2016 <dbl>,
## #   margin_harris_negative <dbl>, pct_collegegrads <dbl>
summary(data) #can view quartile ranges here
##   geography            cofips             county          prop6_result      
##  Length:58          Length:58          Length:58          Length:58         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  prop6_margin_ppts median_household_income
##  Min.   :-56.00    Min.   : 47317         
##  1st Qu.:-37.50    1st Qu.: 64143         
##  Median :-22.00    Median : 76148         
##  Mean   :-20.14    Mean   : 82967         
##  3rd Qu.: -6.00    3rd Qu.: 98694         
##  Max.   : 34.00    Max.   :153792         
##  percent_income_below_the_poverty_level   pct_black        pct_hispanic   
##  Min.   : 6.40                          Min.   :0.00000   Min.   :0.0754  
##  1st Qu.:10.43                          1st Qu.:0.01153   1st Qu.:0.1535  
##  Median :13.10                          Median :0.01747   Median :0.2682  
##  Mean   :13.18                          Mean   :0.02860   Mean   :0.3172  
##  3rd Qu.:16.20                          3rd Qu.:0.03223   3rd Qu.:0.4587  
##  Max.   :22.00                          Max.   :0.12944   Max.   :0.8541  
##  percent_non_white
##  Min.   :0.1337   
##  1st Qu.:0.3116   
##  Median :0.4998   
##  Mean   :0.4844   
##  3rd Qu.:0.6730   
##  Max.   :0.9043   
##  total_adult_imprisonments_per_100_000_population_age_18_69_2016
##  Min.   : 126.6                                                 
##  1st Qu.: 304.9                                                 
##  Median : 471.4                                                 
##  Mean   : 484.0                                                 
##  3rd Qu.: 620.1                                                 
##  Max.   :1338.1                                                 
##  margin_harris_negative pct_collegegrads
##  Min.   :-65.000        Min.   :0.2113  
##  1st Qu.:-27.000        1st Qu.:0.2985  
##  Median :  1.700        Median :0.3508  
##  Mean   : -2.958        Mean   :0.3840  
##  3rd Qu.: 21.000        3rd Qu.:0.4808  
##  Max.   : 54.000        Max.   :0.6745
# Identify missing values - there are none
data %>%
  summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 13
##   geography cofips county prop6_result prop6_margin_ppts median_household_income
##       <int>  <int>  <int>        <int>             <int>                   <int>
## 1         0      0      0            0                 0                       0
## # ℹ 7 more variables: percent_income_below_the_poverty_level <int>,
## #   pct_black <int>, pct_hispanic <int>, percent_non_white <int>,
## #   total_adult_imprisonments_per_100_000_population_age_18_69_2016 <int>,
## #   margin_harris_negative <int>, pct_collegegrads <int>

Split numeric data into quartiles

By splitting our numeric variables into quartiles, we can look at whether counties in each buckets had different levels of support for Prop 6.

*** NOTE – NOT DONE YET ***

# List of columns to split into quartiles - all numeric variables
columns_to_quartile <- c(
  "median_household_income",
  "percent_income_below_the_poverty_level", 
  "pct_black", 
  "pct_hispanic", 
  "percent_non_white", 
  "total_adult_imprisonments_per_100_000_population_age_18_69_2016", 
  "margin_harris_negative", 
  "pct_collegegrads"
)

# Add new quartile columns for each variable
data <- data %>%
  mutate(across(all_of(columns_to_quartile), 
                ~ ntile(.x, 4), 
                .names = "{.col}_quartile"))

## NOTE - NOT DONE YET

Run regressions

Now we want to see whether the relationship between these variables is significant, so we are going to put them into a regression. The outcome variable we are interested in is the margin of victory for prop 6, where a passing vote for prop 6 is positive and a failing vote is negative.

#run regression on prop 6 results and presidential results
regression_harris <- lm(prop6_margin_ppts ~ margin_harris_negative, data=data)
summary(regression_harris) #super strong correlation, so we are running this one separately so it doesn't dwarf other relationships in full model
## 
## Call:
## lm(formula = prop6_margin_ppts ~ margin_harris_negative, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8422 -3.2617  0.1281  3.5270 13.5149 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -22.07456    0.62015  -35.60   <2e-16 ***
## margin_harris_negative  -0.65476    0.02112  -31.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.699 on 56 degrees of freedom
## Multiple R-squared:  0.945,  Adjusted R-squared:  0.944 
## F-statistic: 961.4 on 1 and 56 DF,  p-value: < 2.2e-16
#run regression model on the remainder of the variables to see what else correlates
regression_full_model <- lm(
  prop6_margin_ppts ~ median_household_income + percent_income_below_the_poverty_level + percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + pct_collegegrads, data=data
)
summary(regression_full_model)
## 
## Call:
## lm(formula = prop6_margin_ppts ~ median_household_income + percent_income_below_the_poverty_level + 
##     percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + 
##     pct_collegegrads, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2780  -7.0976  -0.2619   5.9943  29.0581 
## 
## Coefficients:
##                                                                   Estimate
## (Intercept)                                                     -1.008e+02
## median_household_income                                         -1.537e-04
## percent_income_below_the_poverty_level                           1.152e+00
## percent_non_white                                                3.762e+01
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -1.154e-02
## pct_collegegrads                                                 1.707e+02
##                                                                 Std. Error
## (Intercept)                                                      1.580e+01
## median_household_income                                          2.027e-04
## percent_income_below_the_poverty_level                           6.754e-01
## percent_non_white                                                1.133e+01
## total_adult_imprisonments_per_100_000_population_age_18_69_2016  7.982e-03
## pct_collegegrads                                                 3.238e+01
##                                                                 t value
## (Intercept)                                                      -6.379
## median_household_income                                          -0.759
## percent_income_below_the_poverty_level                            1.706
## percent_non_white                                                 3.320
## total_adult_imprisonments_per_100_000_population_age_18_69_2016  -1.446
## pct_collegegrads                                                  5.272
##                                                                 Pr(>|t|)    
## (Intercept)                                                     4.82e-08 ***
## median_household_income                                          0.45152    
## percent_income_below_the_poverty_level                           0.09389 .  
## percent_non_white                                                0.00165 ** 
## total_adult_imprisonments_per_100_000_population_age_18_69_2016  0.15409    
## pct_collegegrads                                                2.64e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.1 on 52 degrees of freedom
## Multiple R-squared:  0.7149, Adjusted R-squared:  0.6875 
## F-statistic: 26.08 on 5 and 52 DF,  p-value: 4.459e-13
# results - significant correlations for counties with higher percent of college grads and higher percent of nonwhite people
# college grads is definitely the most significant

What do these regressions tell us? Counties with more support for Kamala Harris is by far the strongest indicator of counties with more support for Prop 6, out of all the variables in our data. In addition, counties with more college graduates and a higher percentage of nonwhite people are also more likely to vote for prop 6.

Prop 6 results vs. presidential results

Support for Prop 6 and Kamala Harris are highly positively correlated, but we know this is not the full explanation – while Harris won the majority of voters in California, the majority of voters voted against Prop 6. Which counties had the greatest different between how they voted for Harris and for Prop 6? Which demographic factors might play a role?

# Add new column to make vote share for Harris positive values
# Quick math assumes Trump and Kamala are the only candidates
data <- data %>%
  mutate(harris_voteshare = (100 - margin_harris_negative) / 2)

# Add new column to make vote share for Prop 6 positive values
data <- data %>%
  mutate(prop6_voteshare = (100 + prop6_margin_ppts) / 2)

# Add new column for Prop 6 minus Harris voteshare
# Higher values means counties with relatively more harris support compared to prop6 support
data <- data %>%
  mutate(harris_minus_prop6 = harris_voteshare - prop6_voteshare) 

# summary table showing these variables
prop6_harris_support <- data[,c('county','harris_minus_prop6','percent_non_white','pct_collegegrads','prop6_margin_ppts','prop6_voteshare','margin_harris_negative','harris_voteshare')]


# run regression on the difference between harris and prop6 support
regression_harris_minus_prop6 <- lm(
  harris_minus_prop6 ~ median_household_income + percent_income_below_the_poverty_level + percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + pct_collegegrads, data=data
)
summary(regression_harris_minus_prop6) 
## 
## Call:
## lm(formula = harris_minus_prop6 ~ median_household_income + percent_income_below_the_poverty_level + 
##     percent_non_white + total_adult_imprisonments_per_100_000_population_age_18_69_2016 + 
##     pct_collegegrads, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6669 -1.6254  0.1209  1.7809  6.5754 
## 
## Coefficients:
##                                                                   Estimate
## (Intercept)                                                     -2.560e+00
## median_household_income                                          5.010e-05
## percent_income_below_the_poverty_level                          -5.693e-02
## percent_non_white                                                7.377e+00
## total_adult_imprisonments_per_100_000_population_age_18_69_2016 -2.739e-03
## pct_collegegrads                                                 2.202e+01
##                                                                 Std. Error
## (Intercept)                                                      4.822e+00
## median_household_income                                          6.187e-05
## percent_income_below_the_poverty_level                           2.062e-01
## percent_non_white                                                3.459e+00
## total_adult_imprisonments_per_100_000_population_age_18_69_2016  2.437e-03
## pct_collegegrads                                                 9.885e+00
##                                                                 t value
## (Intercept)                                                      -0.531
## median_household_income                                           0.810
## percent_income_below_the_poverty_level                           -0.276
## percent_non_white                                                 2.133
## total_adult_imprisonments_per_100_000_population_age_18_69_2016  -1.124
## pct_collegegrads                                                  2.227
##                                                                 Pr(>|t|)  
## (Intercept)                                                       0.5978  
## median_household_income                                           0.4217  
## percent_income_below_the_poverty_level                            0.7835  
## percent_non_white                                                 0.0377 *
## total_adult_imprisonments_per_100_000_population_age_18_69_2016   0.2662  
## pct_collegegrads                                                  0.0303 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.388 on 52 degrees of freedom
## Multiple R-squared:  0.6656, Adjusted R-squared:  0.6334 
## F-statistic:  20.7 on 5 and 52 DF,  p-value: 2.559e-11
# results - significant correlations for counties with more college grads and greater pct nonwhite

What can we conclude? Counties with relatively more Harris support and less prop 6 support are more highly educated (more college grads) and have more non-white people in them. Importantly, this flips our thinking of these variables from our last regression. In other words: While counties with more college grads and a greater percent of non-white people did support Prop 6 more overall, they also showed a greater gap between Harris support and Prop 6 support. These counties help explain why California ended up voting for Harris while voting against Prop 6.

The data shows the biggest vote differentials were in bay area counties, so that region could be good for further exploration.

Next steps

-- choropleth map of vote differential on the county level - will show concentrations in bay area counties
-- could repeat regression but at the precinct level for the bay area, comparing precinct support for harris and prop6 - possible explanation could be more Asian people in these areas are driving the difference in vote outcomes
-- go back and create summary tables for all the quartiles