We will 1. Build a dataset for percentage african-american by state (including DC) 2. read in excel file from Charles Gaba with %total pop vaxxed and Trump % by states & DC 3. join these together
# first get data for pct aa
# downloaded from 2020 census at
# https://www.censusscope.org/us/rank_race_blackafricanamerican.html
# made into datafile with {datapasta}
dat <- tibble::tribble(
~Rank, ~State, ~`Number.Selecting.Race.of.Black/African.American.Alone`, ~Total.Population, ~`Percent.Selecting.Race.of.Black/African.American.Alone`,
1, "District of Columbia", 343213, 572059, "60.00%",
2, "Mississippi", 1033437, 2844658, "36.33%",
3, "Louisiana", 1444566, 4468976, "32.32%",
4, "South Carolina", 1182727, 4012012, "29.48%",
5, "Georgia", 2342110, 8186453, "28.61%",
6, "Maryland", 1468243, 5296486, "27.72%",
7, "Alabama", 1153044, 4447100, "25.93%",
8, "North Carolina", 1734154, 8049313, "21.54%",
9, "Virginia", 1384008, 7078515, "19.55%",
10, "Delaware", 148823, 783600, "18.99%",
11, "Tennessee", 929864, 5689283, "16.34%",
12, "New York", 2986242, 18976457, "15.74%",
13, "Arkansas", 417881, 2673400, "15.63%",
14, "Illinois", 1864619, 12419293, "15.01%",
15, "Florida", 2312105, 15982378, "14.47%",
16, "Michigan", 1401723, 9938444, "14.10%",
17, "New Jersey", 1127266, 8414350, "13.40%",
18, "Texas", 2385554, 20851820, "11.44%",
19, "Ohio", 1288359, 11353140, "11.35%",
20, "Missouri", 622087, 5595211, "11.12%",
21, "Pennsylvania", 1211669, 12281054, "9.87%",
22, "Connecticut", 305902, 3405565, "8.98%",
23, "Indiana", 504449, 6080485, "8.30%",
24, "Oklahoma", 258532, 3450654, "7.49%",
25, "Kentucky", 293915, 4041769, "7.27%",
26, "Nevada", 132490, 1998257, "6.63%",
27, "California", 2219190, 33871648, "6.55%",
28, "Kansas", 150584, 2688418, "5.60%",
29, "Wisconsin", 300355, 5363675, "5.60%",
30, "Massachusetts", 337157, 6349097, "5.31%",
31, "Rhode Island", 45236, 1048319, "4.32%",
32, "Nebraska", 67435, 1711263, "3.94%",
33, "Colorado", 159279, 4301261, "3.70%",
34, "Alaska", 21968, 626932, "3.50%",
35, "Minnesota", 167857, 4919479, "3.41%",
36, "Washington", 185052, 5894121, "3.14%",
37, "West Virginia", 55999, 1808344, "3.10%",
38, "Arizona", 154316, 5130632, "3.01%",
39, "Iowa", 59758, 2926324, "2.04%",
40, "New Mexico", 33513, 1819046, "1.84%",
41, "Hawaii", 20945, 1211537, "1.73%",
42, "Oregon", 53032, 3421399, "1.55%",
43, "New Hampshire", 8984, 1235786, "0.73%",
44, "Utah", 16150, 2233169, "0.72%",
45, "Wyoming", 3126, 493782, "0.63%",
46, "South Dakota", 4518, 754844, "0.60%",
47, "North Dakota", 3673, 642200, "0.57%",
48, "Vermont", 2981, 608827, "0.49%",
49, "Maine", 6047, 1274923, "0.47%",
50, "Idaho", 5244, 1293953, "0.41%",
51, "Montana", 2359, 902195, "0.26%"
)
# Now build a dataframe of state names and abbreviations
states <- data.frame(state.abb, state.name)
# add DC
de <- data.frame("DC", "District of Columbia")
names(de) <- c("state.abb", "state.name")
# bind together
states <- bind_rows(states, de)
# clean up
dat2 <- left_join(dat, states, by = c("State" = "state.name")) %>% select(-Rank) %>%
relocate(State, state.abb) %>%
set_names('state', 'state_abb', 'aa_num', 'tot_pop', 'pct_aa') %>% mutate(pct_aa = parse_number(pct_aa))
dat2
## # A tibble: 51 x 5
## state state_abb aa_num tot_pop pct_aa
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 District of Columbia DC 343213 572059 60
## 2 Mississippi MS 1033437 2844658 36.3
## 3 Louisiana LA 1444566 4468976 32.3
## 4 South Carolina SC 1182727 4012012 29.5
## 5 Georgia GA 2342110 8186453 28.6
## 6 Maryland MD 1468243 5296486 27.7
## 7 Alabama AL 1153044 4447100 25.9
## 8 North Carolina NC 1734154 8049313 21.5
## 9 Virginia VA 1384008 7078515 19.6
## 10 Delaware DE 148823 783600 19.0
## # ... with 41 more rows
# read in vax data, clean names,
# convert from proportion to percent
vax <- readxl::read_excel("vaxxes_by_state_red_blue_other.xlsx") %>% janitor::clean_names() %>%
mutate(trump_percent = 100* trump_percent) %>%
mutate(percent_total_pop_vaxxed = 100* percent_total_pop_vaxxed)
You can also embed plots, for example:
vax_demog <- left_join(dat2, vax) %>%
janitor::clean_names()
## Joining, by = "state"
vax_demog
## # A tibble: 51 x 17
## state state_abb aa_num tot_pop pct_aa trump_percent cumul_cases_1k
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 District of Col~ DC 3.43e5 572059 60 5.4 70.4
## 2 Mississippi MS 1.03e6 2844658 36.3 57.6 106.
## 3 Louisiana LA 1.44e6 4468976 32.3 58.5 99.8
## 4 South Carolina SC 1.18e6 4012012 29.5 55.1 115.
## 5 Georgia GA 2.34e6 8186453 28.6 49.2 104.
## 6 Maryland MD 1.47e6 5296486 27.7 32.2 73.8
## 7 Alabama AL 1.15e6 4447100 25.9 62.0 107.
## 8 North Carolina NC 1.73e6 8049313 21.5 49.9 94.8
## 9 Virginia VA 1.38e6 7078515 19.6 44 77.6
## 10 Delaware DE 1.49e5 783600 19.0 39.8 108.
## # ... with 41 more rows, and 10 more variables: cumul_deaths_10k <dbl>,
## # bachelors <dbl>, med_hh_income <dbl>, total_pop <dbl>,
## # dose_equiv_needed <dbl>, doses <dbl>, doses_per_100k <dbl>, j_j_est <dbl>,
## # dose_equivs <dbl>, percent_total_pop_vaxxed <dbl>
vax_demog$percent_total_pop_vaxxed
## [1] 54.22960 29.31156 32.46006 36.08423 34.05570 46.98692 29.61555 38.43342
## [9] 45.95004 45.74028 34.15777 46.31530 33.73774 43.44102 42.06562 41.59108
## [17] 46.92433 37.92969 41.02814 37.27400 46.27757 53.76829 36.27240 37.05195
## [25] 39.59623 39.17783 46.26807 38.79716 45.57027 53.96797 50.78738 42.50621
## [33] 46.41418 41.24348 45.82595 46.22629 35.62278 40.47817 43.66605 48.99970
## [41] 51.72683 44.64318 46.10259 37.92714 33.93485 43.45425 39.60440 55.72396
## [49] 52.78062 33.10704 39.30872
vax_demog %>%
lm(percent_total_pop_vaxxed ~ trump_percent, data = .) %>% broom::glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.653 0.646 3.92 92.2 7.64e-13 1 -141. 288. 294.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
This model, with only one predictor, predicts 64.6% of the variance in vaccination. This is a LOT for a social outcomes model.
vax_demog %>%
lm(percent_total_pop_vaxxed ~ trump_percent, data = .) %>% broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 64.1 2.34 27.4 2.33e-31
## 2 trump_percent -0.444 0.0462 -9.60 7.64e-13
in a hypothetical US state with no Trump voters, 64.1% of the people would be vaccinated (Intercept). For every one percent increase in Trump voters in each state (on average), the percent vaccinated goes down by 0.44%. This is a highly significant effect (p value 7.64 x 10^-13).
vax_demog %>%
lm(percent_total_pop_vaxxed ~ pct_aa + trump_percent, data = .) %>% broom::glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.843 0.836 2.67 128. 5.35e-20 2 -121. 250. 257.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
This model, with two predictors, predicts 83.6% of the variance in vaccination. This is an ENORMOUS AMOUNT for a social outcomes model.
vax_demog %>%
lm(percent_total_pop_vaxxed ~ pct_aa + trump_percent, data = .) %>% broom::tidy()
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 70.5 1.80 39.1 4.54e-38
## 2 pct_aa -0.255 0.0335 -7.61 8.73e-10
## 3 trump_percent -0.518 0.0329 -15.7 1.41e-20
in a hypothetical US state with no Trump voters and no African-Americans, 70.5% of the people would be vaccinated (Intercept). For every one percent increase in African-Americans in each state (on average), the percent vaccinated goes down by 0.26%. This is a highly significant effect (p value 8.73 x 10^-10). For every one percent increase in Trump voters in each state (on average), the percent vaccinated goes down by 0.518%. This is also a highly significant effect (p value 1.41 x 10^-20). By controlling for the effect of African-American population, we have revealed a stronger (and more significant) effect of Trump voting. We also have a better overall prediction model.
Let’s use this better model to make predictions about each state, and see how accurate these are. In states that are less accurate, other variables (that we have not included in the model) may be important, and could be causing the variance between our predictions and the actual percent_total_pop_vaxxed.
predicted <- vax_demog %>%
lm(percent_total_pop_vaxxed ~ pct_aa + trump_percent, data = .) %>%
broom::augment() %>%
rename(predicted = .fitted)
predictions_by_state <- vax_demog %>%
select(state, trump_percent) %>%
right_join(predicted)
## Joining, by = "trump_percent"
predictions_by_state %>%
select(state, trump_percent, pct_aa, percent_total_pop_vaxxed, predicted) %>%
flextable::flextable()
state | trump_percent | pct_aa | percent_total_pop_vaxxed | predicted |
District of Columbia | 5.40 | 60.00 | 54.22960 | 52.41792 |
Mississippi | 57.60 | 36.33 | 29.31156 | 31.38639 |
Louisiana | 58.46 | 32.32 | 32.46006 | 31.96181 |
South Carolina | 55.11 | 29.48 | 36.08423 | 34.42167 |
Georgia | 49.24 | 28.61 | 34.05570 | 37.68615 |
Maryland | 32.15 | 27.72 | 46.98692 | 46.77199 |
Alabama | 62.03 | 25.93 | 29.61555 | 31.73854 |
North Carolina | 49.93 | 21.54 | 38.43342 | 39.12901 |
Virginia | 44.00 | 19.55 | 45.95004 | 42.70982 |
Delaware | 39.77 | 18.99 | 45.74028 | 45.04520 |
Tennessee | 60.66 | 16.34 | 34.15777 | 34.89105 |
New York | 37.75 | 15.74 | 46.31530 | 46.92003 |
Arkansas | 62.40 | 15.63 | 33.73774 | 34.16988 |
Illinois | 40.55 | 15.01 | 43.44102 | 45.65446 |
Florida | 51.22 | 14.47 | 42.06562 | 40.26083 |
Michigan | 47.84 | 14.10 | 41.59108 | 42.10720 |
New Jersey | 41.40 | 13.40 | 46.92433 | 45.62386 |
Texas | 52.06 | 11.44 | 37.92969 | 40.59705 |
Ohio | 53.27 | 11.35 | 41.02814 | 39.99272 |
Missouri | 56.80 | 11.12 | 37.27400 | 38.22140 |
Pennsylvania | 48.84 | 9.87 | 46.27757 | 42.66608 |
Connecticut | 39.19 | 8.98 | 53.76829 | 47.89514 |
Indiana | 57.02 | 8.30 | 36.27240 | 38.82554 |
Oklahoma | 65.37 | 7.49 | 37.05195 | 34.70331 |
Kentucky | 62.09 | 7.27 | 39.59623 | 36.45964 |
Nevada | 47.67 | 6.63 | 39.17783 | 44.09773 |
California | 34.32 | 6.55 | 46.26807 | 51.03853 |
Kansas | 56.21 | 5.60 | 38.79716 | 39.93304 |
Wisconsin | 48.82 | 5.60 | 45.57027 | 43.76390 |
Massachusetts | 32.14 | 5.31 | 53.96797 | 52.48440 |
Rhode Island | 38.61 | 4.32 | 50.78738 | 49.38258 |
Nebraska | 58.22 | 3.94 | 42.50621 | 39.31385 |
Colorado | 41.90 | 3.70 | 46.41418 | 47.83500 |
Alaska | 52.83 | 3.50 | 41.24348 | 42.21999 |
Minnesota | 45.28 | 3.41 | 45.82595 | 46.15671 |
Washington | 38.77 | 3.14 | 46.22629 | 49.60016 |
West Virginia | 68.62 | 3.10 | 35.62278 | 34.13658 |
Arizona | 49.06 | 3.01 | 40.47817 | 44.29909 |
Iowa | 53.09 | 2.04 | 43.66605 | 42.45704 |
New Mexico | 43.50 | 1.84 | 48.99970 | 47.47927 |
Hawaii | 34.27 | 1.73 | 51.72683 | 52.29197 |
Oregon | 40.37 | 1.55 | 44.64318 | 49.17567 |
New Hampshire | 45.36 | 0.73 | 46.10259 | 46.79777 |
Utah | 58.13 | 0.72 | 37.92714 | 40.18055 |
Wyoming | 69.94 | 0.63 | 33.93485 | 34.08136 |
South Dakota | 61.77 | 0.60 | 43.45425 | 38.32419 |
North Dakota | 65.11 | 0.57 | 39.60440 | 36.60043 |
Vermont | 30.67 | 0.49 | 55.72396 | 54.47395 |
Maine | 44.02 | 0.47 | 52.78062 | 47.55862 |
Idaho | 63.84 | 0.41 | 33.10704 | 37.29953 |
Montana | 56.92 | 0.26 | 39.30872 | 40.92494 |
Most of these predictions are pretty accurate, between 1-3% of the actual values. There may be a west coast vaccine fear factor, as several states are lower than predicted by these 2 variables, and the Dakotas are higher than predicted.