library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data <- read_csv("/Users/vero/Desktop/UTSA Grad School/FALL 2025/CAPSTONE/TX_2024_Election_ACS_with_Poverty.csv")
## Rows: 16 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): county, county_type
## dbl (10): dem_votes, rep_votes, total_votes, rep_pct, dem_pct, pct_latino_fo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean and standardize county_type labels
data <- data %>%
mutate(
county_type = case_when(
str_detect(county_type, "Urban") ~ "urban",
str_detect(county_type, "Border") ~ "border",
TRUE ~ county_type
),
county_type = factor(county_type)
)
glimpse(data)
## Rows: 16
## Columns: 12
## $ county <chr> "BEXAR", "BREWSTER", "CAMERON", "DALLAS", "HARR…
## $ dem_votes <dbl> 411389, 1969, 54258, 511118, 808771, 104517, 27…
## $ rep_votes <dbl> 337545, 2545, 60991, 322569, 722695, 110760, 75…
## $ total_votes <dbl> 748934, 4514, 115249, 833687, 1531466, 215277, …
## $ rep_pct <dbl> 0.4507006, 0.5638015, 0.5292107, 0.3869186, 0.4…
## $ dem_pct <dbl> 0.5492994, 0.4361985, 0.4707893, 0.6130814, 0.5…
## $ pct_latino_foreignborn <dbl> 0.14215219, 0.08768844, 0.15651261, 0.16727009,…
## $ pct_latino_bach_high <dbl> 0.21410653, 0.27959698, 0.17366767, 0.13991612,…
## $ pct_spanish_home <dbl> 0.46175684, 0.40749665, 0.22789238, 0.18326234,…
## $ latino_median_income <dbl> 62583, 54630, 48181, 65311, 62240, 50783, 46563…
## $ pct_latino_poverty <dbl> 0.17210248, 0.07547656, 0.25638002, 0.16682765,…
## $ county_type <fct> urban, border, border, urban, urban, border, bo…
summary(data$county_type)
## border urban
## 12 4
###############################################
#DESCRIPTIVE STATISTICS BY COUNTY TYPE
###############################################
demo_vars <- c(
"rep_pct",
"pct_latino_foreignborn",
"pct_latino_bach_high",
"pct_spanish_home",
"pct_latino_poverty",
"latino_median_income"
)
descriptives <- data %>%
select(county_type, all_of(demo_vars)) %>%
pivot_longer(
cols = -county_type,
names_to = "variable",
values_to = "value"
) %>%
group_by(county_type, variable) %>%
summarise(
n = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
min = min(value, na.rm = TRUE),
max = max(value, na.rm = TRUE),
.groups = "drop"
)
descriptives
## # A tibble: 12 × 7
## county_type variable n mean sd min max
## <fct> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 border latino_median_income 12 50499. 8905. 3.56e+4 6.39e+4
## 2 border pct_latino_bach_high 12 0.146 0.0588 5.11e-2 2.80e-1
## 3 border pct_latino_foreignborn 12 0.130 0.0439 5.73e-2 1.77e-1
## 4 border pct_latino_poverty 12 0.215 0.0997 4.40e-2 3.83e-1
## 5 border pct_spanish_home 12 0.189 0.114 6.60e-2 4.07e-1
## 6 border rep_pct 12 0.610 0.0961 5.11e-1 7.75e-1
## 7 urban latino_median_income 4 66948. 7270. 6.22e+4 7.77e+4
## 8 urban pct_latino_bach_high 4 0.212 0.0852 1.40e-1 3.31e-1
## 9 urban pct_latino_foreignborn 4 0.153 0.0129 1.41e-1 1.67e-1
## 10 urban pct_latino_poverty 4 0.175 0.0182 1.61e-1 2.02e-1
## 11 urban pct_spanish_home 4 0.311 0.138 1.83e-1 4.62e-1
## 12 urban rep_pct 4 0.402 0.0773 3.00e-1 4.72e-1
###############################################
#CORRELATIONS BY COUNTY TYPE
###############################################
corr_by_type <- function(df, type_label) {
df %>%
filter(county_type == type_label) %>%
select(all_of(demo_vars)) %>%
cor(use = "complete.obs", method = "pearson")
}
corr_border <- corr_by_type(data, "border")
corr_urban <- corr_by_type(data, "urban")
corr_border
## rep_pct pct_latino_foreignborn pct_latino_bach_high
## rep_pct 1.00000000 -0.7237771 -0.5712163
## pct_latino_foreignborn -0.72377706 1.0000000 0.2527132
## pct_latino_bach_high -0.57121628 0.2527132 1.0000000
## pct_spanish_home -0.01215235 -0.3862963 0.4062150
## pct_latino_poverty -0.53295384 0.8055738 -0.1629919
## latino_median_income 0.28502588 -0.3824470 0.3385300
## pct_spanish_home pct_latino_poverty latino_median_income
## rep_pct -0.01215235 -0.5329538 0.2850259
## pct_latino_foreignborn -0.38629630 0.8055738 -0.3824470
## pct_latino_bach_high 0.40621497 -0.1629919 0.3385300
## pct_spanish_home 1.00000000 -0.5490292 0.2721415
## pct_latino_poverty -0.54902920 1.0000000 -0.7035255
## latino_median_income 0.27214148 -0.7035255 1.0000000
corr_urban
## rep_pct pct_latino_foreignborn pct_latino_bach_high
## rep_pct 1.0000000 0.2997772 -0.7288661
## pct_latino_foreignborn 0.2997772 1.0000000 -0.8355317
## pct_latino_bach_high -0.7288661 -0.8355317 1.0000000
## pct_spanish_home -0.2010519 -0.9576157 0.6934689
## pct_latino_poverty 0.7803154 0.3593599 -0.5164873
## latino_median_income -0.9557104 -0.4651765 0.8687021
## pct_spanish_home pct_latino_poverty latino_median_income
## rep_pct -0.2010519 0.7803154 -0.9557104
## pct_latino_foreignborn -0.9576157 0.3593599 -0.4651765
## pct_latino_bach_high 0.6934689 -0.5164873 0.8687021
## pct_spanish_home 1.0000000 -0.4478251 0.3062052
## pct_latino_poverty -0.4478251 1.0000000 -0.6315773
## latino_median_income 0.3062052 -0.6315773 1.0000000
###############################################
#MULTIPLE LINEAR REGRESSION (ALL 16 COUNTIES)
###############################################
model <- lm(
rep_pct ~ pct_latino_foreignborn +
pct_latino_bach_high +
pct_spanish_home +
pct_latino_poverty +
latino_median_income,
data = data
)
summary(model)
##
## Call:
## lm(formula = rep_pct ~ pct_latino_foreignborn + pct_latino_bach_high +
## pct_spanish_home + pct_latino_poverty + latino_median_income,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.110907 -0.027420 0.005401 0.028766 0.115281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.197e+00 1.962e-01 6.104 0.000115 ***
## pct_latino_foreignborn -1.625e+00 9.506e-01 -1.709 0.118185
## pct_latino_bach_high -4.983e-01 4.587e-01 -1.086 0.302781
## pct_spanish_home -2.591e-01 2.275e-01 -1.139 0.281392
## pct_latino_poverty -3.261e-01 5.356e-01 -0.609 0.556217
## latino_median_income -3.922e-06 3.183e-06 -1.232 0.246067
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07791 on 10 degrees of freedom
## Multiple R-squared: 0.7561, Adjusted R-squared: 0.6342
## F-statistic: 6.201 on 5 and 10 DF, p-value: 0.007202
model_tidy <- tidy(model)
model_glance <- glance(model)
model_tidy
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.20 0.196 6.10 0.000115
## 2 pct_latino_foreignborn -1.62 0.951 -1.71 0.118
## 3 pct_latino_bach_high -0.498 0.459 -1.09 0.303
## 4 pct_spanish_home -0.259 0.228 -1.14 0.281
## 5 pct_latino_poverty -0.326 0.536 -0.609 0.556
## 6 latino_median_income -0.00000392 0.00000318 -1.23 0.246
model_glance
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.756 0.634 0.0779 6.20 0.00720 5 21.9 -29.8 -24.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
###############################################
# ASSUMPTION CHECKS
###############################################
# Diagnostic plots
par(mfrow = c(2, 2))
plot(model)

par(mfrow = c(1, 1))
# Normality of residuals
resid_model <- residuals(model)
shapiro.test(resid_model)
##
## Shapiro-Wilk normality test
##
## data: resid_model
## W = 0.96382, p-value = 0.7314
# Homoscedasticity (Breusch-Pagan)
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 7.2419, df = 5, p-value = 0.2033
# Multicollinearity (VIF)
vif(model)
## pct_latino_foreignborn pct_latino_bach_high pct_spanish_home
## 3.454444 2.528402 2.074080
## pct_latino_poverty latino_median_income
## 5.428821 3.075615
summary(model)
##
## Call:
## lm(formula = rep_pct ~ pct_latino_foreignborn + pct_latino_bach_high +
## pct_spanish_home + pct_latino_poverty + latino_median_income,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.110907 -0.027420 0.005401 0.028766 0.115281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.197e+00 1.962e-01 6.104 0.000115 ***
## pct_latino_foreignborn -1.625e+00 9.506e-01 -1.709 0.118185
## pct_latino_bach_high -4.983e-01 4.587e-01 -1.086 0.302781
## pct_spanish_home -2.591e-01 2.275e-01 -1.139 0.281392
## pct_latino_poverty -3.261e-01 5.356e-01 -0.609 0.556217
## latino_median_income -3.922e-06 3.183e-06 -1.232 0.246067
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07791 on 10 degrees of freedom
## Multiple R-squared: 0.7561, Adjusted R-squared: 0.6342
## F-statistic: 6.201 on 5 and 10 DF, p-value: 0.007202
model_glance
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.756 0.634 0.0779 6.20 0.00720 5 21.9 -29.8 -24.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>