library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
data <- read_csv("/Users/vero/Desktop/UTSA Grad School/FALL 2025/CAPSTONE/TX_2024_Election_ACS_with_Poverty.csv")
## Rows: 16 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): county, county_type
## dbl (10): dem_votes, rep_votes, total_votes, rep_pct, dem_pct, pct_latino_fo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and standardize county_type labels
data <- data %>%
  mutate(
    county_type = case_when(
      str_detect(county_type, "Urban") ~ "urban",
      str_detect(county_type, "Border") ~ "border",
      TRUE ~ county_type
    ),
    county_type = factor(county_type)
  )
glimpse(data)
## Rows: 16
## Columns: 12
## $ county                 <chr> "BEXAR", "BREWSTER", "CAMERON", "DALLAS", "HARR…
## $ dem_votes              <dbl> 411389, 1969, 54258, 511118, 808771, 104517, 27…
## $ rep_votes              <dbl> 337545, 2545, 60991, 322569, 722695, 110760, 75…
## $ total_votes            <dbl> 748934, 4514, 115249, 833687, 1531466, 215277, …
## $ rep_pct                <dbl> 0.4507006, 0.5638015, 0.5292107, 0.3869186, 0.4…
## $ dem_pct                <dbl> 0.5492994, 0.4361985, 0.4707893, 0.6130814, 0.5…
## $ pct_latino_foreignborn <dbl> 0.14215219, 0.08768844, 0.15651261, 0.16727009,…
## $ pct_latino_bach_high   <dbl> 0.21410653, 0.27959698, 0.17366767, 0.13991612,…
## $ pct_spanish_home       <dbl> 0.46175684, 0.40749665, 0.22789238, 0.18326234,…
## $ latino_median_income   <dbl> 62583, 54630, 48181, 65311, 62240, 50783, 46563…
## $ pct_latino_poverty     <dbl> 0.17210248, 0.07547656, 0.25638002, 0.16682765,…
## $ county_type            <fct> urban, border, border, urban, urban, border, bo…
summary(data$county_type)
## border  urban 
##     12      4
###############################################
#DESCRIPTIVE STATISTICS BY COUNTY TYPE
###############################################

demo_vars <- c(
  "rep_pct",
  "pct_latino_foreignborn",
  "pct_latino_bach_high",
  "pct_spanish_home",
  "pct_latino_poverty",
  "latino_median_income"
)

descriptives <- data %>%
  select(county_type, all_of(demo_vars)) %>%
  pivot_longer(
    cols = -county_type,
    names_to = "variable",
    values_to = "value"
  ) %>%
  group_by(county_type, variable) %>%
  summarise(
    n    = n(),
    mean = mean(value, na.rm = TRUE),
    sd   = sd(value, na.rm = TRUE),
    min  = min(value, na.rm = TRUE),
    max  = max(value, na.rm = TRUE),
    .groups = "drop"
  )

descriptives
## # A tibble: 12 × 7
##    county_type variable                   n      mean        sd      min     max
##    <fct>       <chr>                  <int>     <dbl>     <dbl>    <dbl>   <dbl>
##  1 border      latino_median_income      12 50499.    8905.      3.56e+4 6.39e+4
##  2 border      pct_latino_bach_high      12     0.146    0.0588  5.11e-2 2.80e-1
##  3 border      pct_latino_foreignborn    12     0.130    0.0439  5.73e-2 1.77e-1
##  4 border      pct_latino_poverty        12     0.215    0.0997  4.40e-2 3.83e-1
##  5 border      pct_spanish_home          12     0.189    0.114   6.60e-2 4.07e-1
##  6 border      rep_pct                   12     0.610    0.0961  5.11e-1 7.75e-1
##  7 urban       latino_median_income       4 66948.    7270.      6.22e+4 7.77e+4
##  8 urban       pct_latino_bach_high       4     0.212    0.0852  1.40e-1 3.31e-1
##  9 urban       pct_latino_foreignborn     4     0.153    0.0129  1.41e-1 1.67e-1
## 10 urban       pct_latino_poverty         4     0.175    0.0182  1.61e-1 2.02e-1
## 11 urban       pct_spanish_home           4     0.311    0.138   1.83e-1 4.62e-1
## 12 urban       rep_pct                    4     0.402    0.0773  3.00e-1 4.72e-1
###############################################
#CORRELATIONS BY COUNTY TYPE
###############################################

corr_by_type <- function(df, type_label) {
  df %>%
    filter(county_type == type_label) %>%
    select(all_of(demo_vars)) %>%
    cor(use = "complete.obs", method = "pearson")
}

corr_border <- corr_by_type(data, "border")
corr_urban  <- corr_by_type(data, "urban")

corr_border
##                            rep_pct pct_latino_foreignborn pct_latino_bach_high
## rep_pct                 1.00000000             -0.7237771           -0.5712163
## pct_latino_foreignborn -0.72377706              1.0000000            0.2527132
## pct_latino_bach_high   -0.57121628              0.2527132            1.0000000
## pct_spanish_home       -0.01215235             -0.3862963            0.4062150
## pct_latino_poverty     -0.53295384              0.8055738           -0.1629919
## latino_median_income    0.28502588             -0.3824470            0.3385300
##                        pct_spanish_home pct_latino_poverty latino_median_income
## rep_pct                     -0.01215235         -0.5329538            0.2850259
## pct_latino_foreignborn      -0.38629630          0.8055738           -0.3824470
## pct_latino_bach_high         0.40621497         -0.1629919            0.3385300
## pct_spanish_home             1.00000000         -0.5490292            0.2721415
## pct_latino_poverty          -0.54902920          1.0000000           -0.7035255
## latino_median_income         0.27214148         -0.7035255            1.0000000
corr_urban
##                           rep_pct pct_latino_foreignborn pct_latino_bach_high
## rep_pct                 1.0000000              0.2997772           -0.7288661
## pct_latino_foreignborn  0.2997772              1.0000000           -0.8355317
## pct_latino_bach_high   -0.7288661             -0.8355317            1.0000000
## pct_spanish_home       -0.2010519             -0.9576157            0.6934689
## pct_latino_poverty      0.7803154              0.3593599           -0.5164873
## latino_median_income   -0.9557104             -0.4651765            0.8687021
##                        pct_spanish_home pct_latino_poverty latino_median_income
## rep_pct                      -0.2010519          0.7803154           -0.9557104
## pct_latino_foreignborn       -0.9576157          0.3593599           -0.4651765
## pct_latino_bach_high          0.6934689         -0.5164873            0.8687021
## pct_spanish_home              1.0000000         -0.4478251            0.3062052
## pct_latino_poverty           -0.4478251          1.0000000           -0.6315773
## latino_median_income          0.3062052         -0.6315773            1.0000000
###############################################
#MULTIPLE LINEAR REGRESSION (ALL 16 COUNTIES)
###############################################

model <- lm(
  rep_pct ~ pct_latino_foreignborn +
    pct_latino_bach_high +
    pct_spanish_home +
    pct_latino_poverty +
    latino_median_income,
  data = data
)

summary(model)
## 
## Call:
## lm(formula = rep_pct ~ pct_latino_foreignborn + pct_latino_bach_high + 
##     pct_spanish_home + pct_latino_poverty + latino_median_income, 
##     data = data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.110907 -0.027420  0.005401  0.028766  0.115281 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.197e+00  1.962e-01   6.104 0.000115 ***
## pct_latino_foreignborn -1.625e+00  9.506e-01  -1.709 0.118185    
## pct_latino_bach_high   -4.983e-01  4.587e-01  -1.086 0.302781    
## pct_spanish_home       -2.591e-01  2.275e-01  -1.139 0.281392    
## pct_latino_poverty     -3.261e-01  5.356e-01  -0.609 0.556217    
## latino_median_income   -3.922e-06  3.183e-06  -1.232 0.246067    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07791 on 10 degrees of freedom
## Multiple R-squared:  0.7561, Adjusted R-squared:  0.6342 
## F-statistic: 6.201 on 5 and 10 DF,  p-value: 0.007202
model_tidy   <- tidy(model)
model_glance <- glance(model)

model_tidy
## # A tibble: 6 × 5
##   term                      estimate  std.error statistic  p.value
##   <chr>                        <dbl>      <dbl>     <dbl>    <dbl>
## 1 (Intercept)             1.20       0.196          6.10  0.000115
## 2 pct_latino_foreignborn -1.62       0.951         -1.71  0.118   
## 3 pct_latino_bach_high   -0.498      0.459         -1.09  0.303   
## 4 pct_spanish_home       -0.259      0.228         -1.14  0.281   
## 5 pct_latino_poverty     -0.326      0.536         -0.609 0.556   
## 6 latino_median_income   -0.00000392 0.00000318    -1.23  0.246
model_glance
## # A tibble: 1 × 12
##   r.squared adj.r.squared  sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.756         0.634 0.0779      6.20 0.00720     5   21.9 -29.8 -24.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
###############################################
# ASSUMPTION CHECKS
###############################################

# Diagnostic plots
par(mfrow = c(2, 2))
plot(model)

par(mfrow = c(1, 1))

# Normality of residuals
resid_model <- residuals(model)
shapiro.test(resid_model)
## 
##  Shapiro-Wilk normality test
## 
## data:  resid_model
## W = 0.96382, p-value = 0.7314
# Homoscedasticity (Breusch-Pagan)
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.2419, df = 5, p-value = 0.2033
# Multicollinearity (VIF)
vif(model)
## pct_latino_foreignborn   pct_latino_bach_high       pct_spanish_home 
##               3.454444               2.528402               2.074080 
##     pct_latino_poverty   latino_median_income 
##               5.428821               3.075615
summary(model)
## 
## Call:
## lm(formula = rep_pct ~ pct_latino_foreignborn + pct_latino_bach_high + 
##     pct_spanish_home + pct_latino_poverty + latino_median_income, 
##     data = data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.110907 -0.027420  0.005401  0.028766  0.115281 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.197e+00  1.962e-01   6.104 0.000115 ***
## pct_latino_foreignborn -1.625e+00  9.506e-01  -1.709 0.118185    
## pct_latino_bach_high   -4.983e-01  4.587e-01  -1.086 0.302781    
## pct_spanish_home       -2.591e-01  2.275e-01  -1.139 0.281392    
## pct_latino_poverty     -3.261e-01  5.356e-01  -0.609 0.556217    
## latino_median_income   -3.922e-06  3.183e-06  -1.232 0.246067    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07791 on 10 degrees of freedom
## Multiple R-squared:  0.7561, Adjusted R-squared:  0.6342 
## F-statistic: 6.201 on 5 and 10 DF,  p-value: 0.007202
model_glance
## # A tibble: 1 × 12
##   r.squared adj.r.squared  sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl>  <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.756         0.634 0.0779      6.20 0.00720     5   21.9 -29.8 -24.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>