code

Author

plumber

Published

January 18, 2025

来自： https://freerangestats.info/blog/2024/12/23/depression-and-vote

DATA

# https://stacks.cdc.gov/view/cdc/129404
# 抑郁症数据 dep
dep <- readxl::read_excel(here('dataset', 'cdc_129404_DS1.xlsx'), skip = 1)


# 选举数据 votes
# "https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-24/refs/heads/master/2024_US_County_Level_Presidential_Results.csv"
fn <- here('dataset', "2024_US_County_Level_Presidential_Results.csv")
votes <- read_csv(fn)

Rows: 3160 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): state_name, county_fips, county_name
dbl (7): votes_gop, votes_dem, total_votes, diff, per_gop, per_dem, per_poin...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 合并数据
combined <- votes |>
  # 基于fips合并
  inner_join(dep, by = join_by(`county_fips` == `CountyFIPS code`)) |>
  mutate(cpe = `Crude Prevalence Estimate` / 100, # 抑郁率
         aape = `Age-adjusted Prevalence Estimate` / 100) # 年龄调整的抑郁率

OLS

ols_model <- lm(per_gop ~ cpe, data = combined)
tidy(ols_model)

# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    0.399    0.0186      21.5 2.53e-95
2 cpe            1.28     0.0871      14.7 3.39e-47

查看图片

the_caption = "Source: data from tonmcg and CDC; analysis by freerangestats.info"

combined |>
  ggplot(aes(x= cpe, y = per_gop)) +
  geom_point(colour = "steelblue", alpha = 0.5) +
  geom_smooth(method = "lm", fill = "black", colour = "white", alpha = 0.8) +
  scale_x_continuous(label = percent) +
  scale_y_continuous(label = percent) +
  labs(x = "Crude prevalence estimate of depression",
       y = "Percentage vote for Trump in 2024 election",
       subtitle = "Line is ordinary least squares fit to all county data together",
       title = "Counties with more depression voted more for Trump",
       caption = the_caption)

`geom_smooth()` using formula = 'y ~ x'

quasibinomial response

# logit模型，作为对比
logit_model <- glm(per_gop ~ cpe,
                   data = combined,
                   family = binomial(),
                   weights = total_votes)
tidy(logit_model)

# A tibble: 2 × 5
  term        estimate std.error statistic p.value
  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    -1.83   0.00105    -1748.       0
2 cpe             9.29   0.00524     1771.       0

# quasi-binomial模型
# 允许方差按某种一致的比例增大（或减小
quasi_model <- glm(per_gop ~ cpe,
                   data = combined,
                   family = quasibinomial(),
                   weights = total_votes)
tidy(quasi_model)

# A tibble: 2 × 5
  term        estimate std.error statistic   p.value
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)    -1.83    0.0683     -26.8 1.75e-142
2 cpe             9.29    0.342       27.2 5.82e-146

preds2 <- predict(quasi_model, type = "response", se.fit = TRUE)

# draw chart:
combined |>
  mutate(fit = preds2$fit,
         se = preds2$se.fit,
         lower = fit - 1.96 * se,
         upper = fit + 1.96 * se) |>
  ggplot(aes(x = cpe, y = per_gop)) +
  geom_point(colour = "steelblue", alpha = 0.5) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "black", alpha = 0.5) +
  geom_line(aes(y = fit), colour = "white") +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 1), label = percent) +
  scale_x_continuous(label  = percent)  +
  labs(x = "Crude prevalence estimate of depression",
       y = "Percentage vote for Trump in 2024 election",
       subtitle = "Line is generalized linear model with quasibinomial response, fit to all county data together",
       title = "Counties with more depression voted more for Trump",
       caption = the_caption)

Random intercepts

combined <- mutate(combined, state_name = as.factor(state_name))

randinter <- gam(per_gop ~ cpe + s(state_name, bs = 're') , 
              family = quasibinomial, weights = total_votes,
              data = combined)
summary(randinter)


Family: quasibinomial 
Link function: logit 

Formula:
per_gop ~ cpe + s(state_name, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -3.2435     0.2792  -11.62   <2e-16 ***
cpe          16.3126     0.6222   26.22   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                edf Ref.df     F p-value    
s(state_name) 48.43     49 20.43  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.345   Deviance explained = 40.2%
GCV = 3364.8  Scale est. = 3418.8    n = 3107

绘图

preds <- predict(randinter, se.fit = TRUE, type = "response")
combined |>
  mutate(fit = preds$fit,
         se = preds$se.fit,
         lower = fit - 1.96 * se,
         upper = fit + 1.96 * se) |>
  ggplot(aes(x = cpe, group = state_name)) +
  geom_point(aes(y = per_gop, colour = state_name), alpha = 0.5) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "black", alpha = 0.5) +
  geom_line(aes(y = fit, colour = state_name)) +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 1), label = percent) +
  scale_x_continuous(label  = percent)  +
  labs(x = "Crude prevalence estimate of depression",
       y = "Percentage vote for Trump in 2024 election",
       subtitle = "Lines are quasibinomial generalized additive model with state-level random intercept",
       title = "Counties with more depression voted more for Trump",
       caption = the_caption)

Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

spatial autocorrelation

Add a fixed effect that is a sort of rubber sheet laid across the geography of interest

# 所有县的geometry
counties <- st_read(here('dataset', 'cb_2023_us_county_500k', "cb_2023_us_county_500k.shp"))

Reading layer `cb_2023_us_county_500k' from data source 
  `/Users/dongshuhao/Desktop/trump/dataset/cb_2023_us_county_500k/cb_2023_us_county_500k.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 3235 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.1467 ymin: -14.5487 xmax: 179.7785 ymax: 71.38782
Geodetic CRS:  NAD83

# 提取质心
county_cent <- counties |>
  st_centroid()

Warning: st_centroid assumes attributes are constant over geometries

sc <- st_coordinates(county_cent) # X和Y坐标
county_cent <- county_cent |>
  mutate(x = sc[, 1],
         y = sc[, 2],
         # combine the two digit state code with the 3 digit county code:
         county_fips = paste0(STATEFP, COUNTYFP))

# 合并数据并检查
combined2 <- combined |>
  left_join(county_cent, by = "county_fips")

ggplot(combined2, aes(x = x, y = y, colour = state_name)) + 
  geom_point() +
  theme(legend.position = "none") +
  coord_map() +
  labs(title = "Centres of counties after merging data")

# 拟合模型
model6 <- gam(per_gop ~ cpe + s(x, y) + s(state_name, bs = 're') , 
              family = quasibinomial, weights = total_votes,
              data = combined2)

# the spatial rubber mat that is correcting for spatial correlation for us;
# scheme=1 is what makes it draw a perspective plot rather than contour or
# heatmap):
plot(model6, select = 1, scheme = 1, main = "Higher vote for GOP")

绘制预期

preds6 <- predict(model6, se.fit = TRUE, type = "response")

combined2 |>
  mutate(fit = preds6$fit,
         se = preds6$se.fit,
         lower = fit - 1.96 * se,
         upper = fit + 1.96 * se) |>
  ggplot(aes(x = cpe, group = state_name)) +
  geom_point(aes(y = per_gop, colour = state_name), alpha = 0.5) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "black", alpha = 0.5) +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 1), label = percent) +
  scale_x_continuous(label  = percent)  +
  labs(x = "Crude prevalence estimate of depression",
       y = "Percentage vote for Trump in 2024 election",
       subtitle = "Grey ribbons are 95% confidence intervals from quasibinomial generalized additive model with spatial effect and state-level random intercept effect",
       title = "Counties with more depression voted more for Trump",
       caption = the_caption) +
  facet_wrap(~state_name)

Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
-Inf

summary(model6)


Family: quasibinomial 
Link function: logit 

Formula:
per_gop ~ cpe + s(x, y) + s(state_name, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -2.790     33.100  -0.084    0.933    
cpe           15.542      0.678  22.922   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                edf Ref.df     F p-value    
s(x,y)        28.37  28.94 7.968  <2e-16 ***
s(state_name) 49.00  49.00 7.873  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.385   Deviance explained = 44.8%
GCV = 3168.3  Scale est. = 3201.2    n = 3107