Init

Packages and settings.

options(digits = 2,
        tibble.print_max = 30)
library(pacman)
p_load(kirkegaard, haven, readxl, rms, spatialstatstools, sf, broom, doFuture, stringdist, gtools, formula.tools)
theme_set(theme_bw())

Parallel

Parallelization for speeding up some tasks.

registerDoFuture()
plan(multiprocess(workers = 4))
## Warning in supportsMulticoreAndRStudio(...): [ONE-TIME WARNING] Forked
## processing ('multicore') is not supported when running R from RStudio
## because it is considered unstable. For more details, how to control forked
## processing or not, and how to silence this warning in future R sessions, see ?
## parallelly::supportsMulticore

Functions

Ad hoc functions.

#bugfree sample
sample = function(x, size, replace = F, prob = NULL) {
  if (length(x) == 1) return(x)
  base::sample(x, size = size, replace = replace, prob = prob)
}

#describe without bug
describe = function(...) {y = psych::describe(...); class(y) = "data.frame"; y}

#overwrite name
map = purrr::map

#merge units as desired by group variable
st_union_by = function(geo, group) {
  # browser()
  geo
  group

  y2 = list()
  #loop over by groups and merge units
  for (i in unique(group)) {
    #which units
    z = geo[group == i]

    #merge
    y = Reduce(st_union, z)
    y2[[i]] = y
  }

  st_sfc(y2)
}

#model anova partial R2s
model_partialR2s = function(x) {
  # browser()
  #try calculate
  y = try({plot(anova(x), what='proportion R2', pl = F)}, silent = T)
  
  #if error, return NULL
  if (is.error(y)) return(NULL)
  
  #fix names
  if (!has_names(y)) names(y) = names(coef(x)[-1])
  
  #data frame result
  y %>% as.list() %>% as_tibble()
}

Data

Read data files, merge, rename, recode.

#Kirkegaard 2016 datafile
#social, demographics, SES etc.
maindata = read_csv("data/data_merged.csv") %>% filter(!duplicated(FIPS))
## Warning: Missing column names filled in: 'X1' [1]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   County = col_character(),
##   State = col_character(),
##   temp_bins = col_character(),
##   lat_bins = col_character(),
##   lon_bins = col_character(),
##   precip_bins = col_character(),
##   elevation_bins = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#remove units without names
maindata %<>% filter(!is.na(County))

#fix data error
maindata$lon[which(maindata$County == "Aleutians West Census Area, Alaska")] = -178

#rename old elevation
maindata %<>% rename(
  elevation_old = elevation
)

#remove unit types
rm_unit_type = function(x) {
  x %>% 
    str_replace(" City and Borough$", "") %>% 
    #these cause duplicates unfortunately
    # str_replace(" City$", "") %>% 
    # str_replace(" city$", "") %>% 
    str_replace(" Borough$", "") %>% 
    str_replace(" County$", "") %>% 
    str_replace(" Parish$", "") %>% 
    str_replace(" Census( Area)?$", "") %>% 
    str_replace(" Municipality$", "")
}

#mutate for joining with datasets without FIPS
maindata %<>% mutate(
  county_name = County %>% str_match("(.+), ") %>% .[, 2],
  county_name2 = County %>% str_match("(.+), ") %>% .[, 2] %>% 
    rm_unit_type(),
  state_name = County %>% str_match(", (.+)") %>% .[, 2],
  state_abbrev = mapvalues(state_name, from = state.name, to = state.abb),
  county_state = str_glue("{county_name2} - {state_abbrev}") %>% str_to_title(),
  county_state2 = str_glue("{county_name2} - {state_name}") %>% str_to_title()
)

#dups in new id?
any(duplicated(maindata$county_state))
## [1] FALSE
#SEDA pooled data
#https://edopportunity.org/get-the-data/seda-archive-downloads/
seda_CS = haven::read_dta("data/SEDA/3.0/seda_county_pool_cs_v30.dta") %>% 
  rename(
    FIPS = countyid,
    ) %>% 
  mutate(
    FIPS = FIPS %>% as.character() %>% as.numeric()
    )

#spread mean and SE by SIRE
seda_CS_wide = cbind(
  {
    seda_CS %>% 
    select(FIPS, subgroup, mn_avg_ol) %>% 
    spread(key = subgroup, value = mn_avg_ol) %>% 
    select(FIPS, all, asn, blk, hsp, wht, wbg, whg, wag)
    
  },
  {
    . = seda_CS %>% 
    select(FIPS, subgroup, mn_avg_ol_se) %>% 
    spread(key = subgroup, value = mn_avg_ol_se) %>% 
    select(FIPS, all, asn, blk, hsp, wht, wbg, whg, wag)
    
    #add _se to names
    colnames(.)[-1] = colnames(.)[-1] + "_se"
    .[-1]
  }
)

#SEDA covariates
seda_covars = read_dta("data/SEDA/3.0/SEDA_cov_county_pool_v30.dta") %>% 
  rename(
    FIPS = countyid,
    #SES composites
    SES_all = sesavgall,
    SES_white = sesavgwht,
    SES_black = sesavgblk,
    SES_hispanic = sesavghsp,
    #SES gaps
    SES_d_bw = sesavgwhtblk,
    SES_d_hw = sesavgwhthsp,
    #demographics
    frc_amerindian = perind,
    frc_asian = perasn,
    frc_hispanic = perhsp,
    frc_black = perblk,
    frc_white = perwht,
  ) %>% 
  mutate(
    FIPS = FIPS %>% as.character() %>% as.numeric()
    )

#climate
#https://wonder.cdc.gov/nasa-nldas.html
climate = readxl::read_excel("data/North America Land Data Assimilation System (NLDAS) Daily Air Temperatures and Heat Index (1979-2011).xlsx") %>% 
  rename(FIPS = Fips)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting logical in A3113 / R3113C1: got 'Total'
#UV
#https://gis.cancer.gov/tools/uv-exposure/
uv = readxl::read_excel("data/uv-county.xlsx")
uv$FIPS = uv$COUNTY_FIPS %>% as.numeric()

#altitude data AHRF
#https://www.usgs.gov/core-science-systems/ngp/board-on-geographic-names/download-gnis-data
AHRF = read_sas("data/AHRF_2018-2019_SAS/ahrf2019.sas7bdat")
AHRF_dict = AHRF %>% df_var_table()
AHRF$FIPS = AHRF$f00002 %>% as.numeric()
AHRF$elevation = AHRF$f0081176 * 0.3048
AHRF$elevation_z = AHRF$elevation %>% standardize()

#disease data
disease_sheets = readxl::excel_sheets("data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014_NATIONAL_Y2018M03D27.XLSX")
# disease = read_excel("data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014_NATIONAL_Y2018M03D27.XLSX", skip = 1)
disease = map_df(disease_sheets, function(s) {
  # browser()
  #read sheet
  d_s = read_excel("data/IHME_USA_COUNTY_INFECT_DIS_MORT_1980_2014_NATIONAL_Y2018M03D27.XLSX", skip = 1,
                   sheet = s)
  
  #restructure to long form
  d_s %>% 
    filter(!is.na(FIPS)) %>% 
    select(-11) %>% 
    mutate(
      disease = s
    ) %>% 
    gather(key = year, value = rate_orig, starts_with("Mortality"))
}) %>% 
  mutate(
    rate = str_match(rate_orig, "(\\d+?\\.?\\d+?) \\((\\d+?\\.?\\d+?), (\\d+?\\.?\\d+?)\\)")[, 2] %>% as.numeric(),
    CI_lower = str_match(rate_orig, "(\\d+?\\.?\\d+?) \\((\\d+?\\.?\\d+?), (\\d+?\\.?\\d+?)\\)")[, 3] %>% as.numeric(),
    CI_upper = str_match(rate_orig, "(\\d+?\\.?\\d+?) \\((\\d+?\\.?\\d+?), (\\d+?\\.?\\d+?)\\)")[, 4] %>% as.numeric(),
    year = str_match(year, "\\d+")[, 1] %>% as.numeric()
  )

#disease wide for 2014 data
disease_wide = disease %>% 
  filter(year == 2014) %>% 
  select(FIPS, disease, rate) %>% 
  spread(key = disease, value = rate) %>% 
  df_legalize_names()


#save dataset for manual treatment
fluoride_county = read_csv("data/fluoride_county.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   county_state = col_character(),
##   state = col_character(),
##   waterworks = col_double(),
##   population_served = col_double(),
##   fluoridated = col_double(),
##   fluoride_conc = col_double(),
##   exact = col_logical(),
##   FIPS = col_double()
## )
#lead
#https://mytapscore.com/pages/the-lead-map
#screenshot in figures folder if site goes down
lead_county = read_csv("data/lead_county.csv") %>% 
  mutate(
    lead_index = ageidx + gw_corr_idx
  )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   name_2 = col_character(),
##   name = col_character(),
##   ca_average_age = col_double(),
##   ageidx = col_double(),
##   gw_corr_idx = col_double(),
##   state_abbr = col_character(),
##   county_state = col_character(),
##   exact = col_logical(),
##   FIPS = col_double()
## )
#lead poisoning counts
#https://www.reuters.com/investigates/special-report/usa-lead-testing/
#kinda problematic data source

#voting data in 2016
voting = read_csv("data/votingdata_KirkPesta2019.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   name_16 = col_character(),
##   votes16_jacobp = col_logical(),
##   votes16_whitej = col_logical(),
##   votes16_mooreheadm = col_logical(),
##   votes16_none_of_these_candidates = col_logical(),
##   votes16_duncanr = col_logical(),
##   votes16_skewesp = col_logical(),
##   votes16_giordanir = col_logical(),
##   name_prev = col_character(),
##   ST = col_character(),
##   statecode_prev = col_character(),
##   County = col_character(),
##   State = col_character(),
##   temp_bins = col_character(),
##   lat_bins = col_character(),
##   lon_bins = col_character(),
##   precip_bins = col_character(),
##   elevation_bins = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#no dups
assert_that(!any(duplicated(maindata$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(seda_CS_wide$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(climate$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(uv$COUNTY_FIPS)))
## [1] TRUE
assert_that(!any(duplicated(AHRF$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(voting$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(seda_covars$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(lead_county$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(fluoride_county$FIPS)))
## [1] TRUE
assert_that(!any(duplicated(disease_wide$FIPS)))
## [1] TRUE
#merge
d = full_join(maindata, climate, by = "FIPS") %>% 
  full_join(uv, by = "FIPS") %>% 
  full_join(AHRF %>% select(FIPS, elevation, elevation_z), by = "FIPS") %>% 
  full_join(seda_CS_wide, by = "FIPS") %>% 
  full_join(voting %>% select(FIPS, rep16_frac:dem16_frac2), by = "FIPS") %>% 
  full_join(seda_covars %>% select(FIPS, frc_amerindian:frc_white, SES_all:SES_d_hw), by = "FIPS") %>% 
  full_join(lead_county %>% select(FIPS, lead_index), by = "FIPS") %>% 
  full_join(fluoride_county %>% select(FIPS, waterworks:fluoride_conc), by = "FIPS") %>% 
  left_join(disease_wide, by = "FIPS")

#remove units without CA data
#do we need
# d %<>% filter(!is.na(County.x), !is.na(CA))

#rename
d$UV = d$`UV_ Wh/m²` %>% standardize()
d$UV_orig = d$`UV_ Wh/m²`
d$avg_max_temp = d$`Avg Daily Max Heat Index (C)` %>% standardize()
d$avg_max_temp_orig = d$`Avg Daily Max Heat Index (C)`
d$avg_min_air_temp = d$`Avg Daily Min Air Temperature (C)` %>% standardize()
d$avg_min_air_temp_orig = d$`Avg Daily Min Air Temperature (C)`
d$avg_temp = d$`Avg Temperature (1979 to 2011)` %>% standardize()
d$avg_temp_orig = d$`Avg Temperature (1979 to 2011)`

#z score geoposition
d$lat_z = d$lat %>% standardize()
d$lon_z = d$lon %>% standardize()

#cognitive ability standardization
#keep originals as well
d$old_CA = d$CA
d$CA_orig = d$all
d$CA_all_orig = d$CA_orig #synonym
d$CA_asian_orig = d$asn
d$CA_black_orig = d$blk
d$CA_hispanic_orig = d$hsp
d$CA_white_orig = d$wht

d$CA = d$all %>% standardize()
d$CA_all = d$CA #synonym
d$CA_asian = d$asn %>% standardize()
d$CA_black = d$blk %>% standardize()
d$CA_hispanic = d$hsp %>% standardize()
d$CA_white = d$wht %>% standardize()
d$CA_d_bw = d$wbg
d$CA_d_hw = d$whg
d$CA_d_aw = d$wag

#SES variables standardization
d$SES_all_orig = d$SES_all
d$SES_black_orig = d$SES_black
d$SES_hispanic_orig = d$SES_hispanic
d$SES_white_orig = d$SES_white

d$SES_all = d$SES_all %>% standardize()
d$SES_black = d$SES_black %>% standardize()
d$SES_hispanic = d$SES_hispanic %>% standardize()
d$SES_white = d$SES_white %>% standardize()

#more population sizes
d$pop_amerindian = d$Amerindian * d$Total.Population
d$pop_asian = d$Asian * d$Total.Population
d$pop_black = d$Black * d$Total.Population
d$pop_hispanic = d$Hispanic * d$Total.Population
d$pop_other = d$Other * d$Total.Population
d$pop_white = d$White * d$Total.Population
d$pop_all = d$Total.Population
d$pop_weight = d$Total.Population %>% sqrt()

#other renamings / z scorings
d$lead_index = d$lead_index
d$lead_index_z = d$lead_index %>% standardize()
d$fluoride_conc_z = d$fluoride_conc %>% standardize()
d$Diarrheal_diseases_z = d$Diarrheal_diseases %>% standardize()
d$Hepatitis_z = d$Hepatitis %>% standardize()
d$Tuberculosis_z = d$Tuberculosis %>% standardize()
d$HIV_AIDS_z = d$HIV_AIDS %>% standardize()
d$Lower_respiratory_infections_z = d$Lower_respiratory_infections %>% standardize()
d$Meningitis_z = d$Meningitis %>% standardize()



#demographics divide by 100
#these are the old variables from ACS and Kirk 2016
race_vars = c("White", "Black", "Asian", "Hispanic", "Amerindian", "Other")
for (v in race_vars) d[[v]] = d[[v]] / 100
d[race_vars] %>% rowSums() %>% psych::describe() %>% as.matrix()
##    vars    n mean      sd median trimmed     mad min max range  skew kurtosis
## X1    1 3143    1 0.00049      1       1 0.00074   1   1 0.003 0.036    -0.12
##         se
## X1 8.7e-06
#new ones from CEDA
race_vars2 = c("frc_white", "frc_black", "frc_asian", "frc_hispanic", "frc_amerindian")
d[race_vars2] %>% rowSums() %>% psych::describe() %>% as.matrix()
##    vars    n mean      sd median trimmed     mad min max   range skew kurtosis
## X1    1 3124    1 2.1e-09      1       1 1.4e-09   1   1 2.5e-08 0.16        4
##         se
## X1 3.7e-11
#var lists
climate_vars = c("UV", "avg_temp", "lat", "lon", "elevation", "lat_z", "lon_z", "elevation_z", "fluoride_conc", "fluoride_conc_z", "lead_index", "lead_index_z")
climate_vars_model = c("UV", "avg_temp", "lat_z", "lon_z", "elevation_z")
water_vars = c("lead_index_z", "fluoride_conc_z")
disease_vars = c("Diarrheal_diseases_z", "Hepatitis_z", "HIV_AIDS_z", "Lower_respiratory_infections_z", "Meningitis_z", "Tuberculosis_z")
CA_vars = c("CA", "CA_all", "CA_asian", "CA_black", "CA_hispanic", "CA_white")
SES_vars = c("SES_all", "SES_black", "SES_hispanic", "SES_white")
main_vars = c(CA_vars, SES_vars, race_vars2, climate_vars)
main_vars2 = c(CA_vars, SES_vars, climate_vars)
main_vars3 = c(main_vars2, water_vars, disease_vars)

#make spatial lag variables
#slow because it recalculates distances every time
#we cant just precompute because the function cannot handle missing data properly
for (v in main_vars) {
  #no NA
  tmp = d[!is.na(d[[v]]) & !is.na(d$lat) & !is.na(d$lon), ]
  tmp_var = str_glue("{v}_lag") %>% as.character()
  
  #estimate
  tmp[[tmp_var]] = SAC_knsnr(tmp, v, k = 3)
  
  #join
  d[[tmp_var]] = NULL
  d = left_join(d, tmp %>% select(FIPS, !!tmp_var), by = "FIPS")
}

Spatial data

#load shapefile
#https://catalog.data.gov/dataset/tiger-line-shapefile-2017-nation-u-s-current-county-and-equivalent-national-shapefile
county_sf = st_read("data/spatial/tl_2017_us_county.shp")
## Reading layer `tl_2017_us_county' from data source `/media/luks8tb2/Projects/UV USA counties/data/spatial/tl_2017_us_county.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 17 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -180 ymin: -15 xmax: 180 ymax: 71
## CRS:            4269
#make FIPS
county_sf$FIPS = (as.character(county_sf$STATEFP) + as.character(county_sf$COUNTYFP)) %>% as.numeric()
county_sf = county_sf %>% filter(FIPS %in% d$FIPS)

#join in data
county_sf_data = left_join(county_sf, d, by = "FIPS")

#calculate area
county_sf_data$area = st_area(county_sf_data) %>% as.numeric()

#same as existing variable? yep! perfectly sums to 1
ols(ALAND ~ area, data = county_sf_data)
## Linear Regression Model
##  
##  ols(formula = ALAND ~ area, data = county_sf_data)
##  
##                     Model Likelihood    Discrimination    
##                           Ratio Test           Indexes    
##  Obs     3224    LR chi2    13211.32    R2       0.983    
##  sigma1.2e+09    d.f.              1    R2 adj   0.983    
##  d.f.    3222    Pr(> chi2)   0.0000    g      3.2e+09    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -3.162e+10  5.051e+07  9.160e+07  1.402e+08  2.012e+10 
##  
##  
##            Coef     S.E.    t      Pr(>|t|)
##  Intercept -1.6e+07 2.2e+07  -0.74 0.4566  
##  area       9.3e-01 2.1e-03 436.77 <0.0001 
## 
ols(area ~ ALAND + AWATER, data = county_sf_data)
## Linear Regression Model
##  
##  ols(formula = area ~ ALAND + AWATER, data = county_sf_data)
##  
##                       Model Likelihood    Discrimination    
##                             Ratio Test           Indexes    
##  Obs      3224    LR chi2    111735.33    R2       1.000    
##  sigma289.1779    d.f.               2    R2 adj   1.000    
##  d.f.     3221    Pr(> chi2)    0.0000    g      3.4e+09    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -3752.401  -110.288   -77.642     1.031  3868.906 
##  
##  
##            Coef     S.E.   t       Pr(>|t|)
##  Intercept 110.0715 5.3347 2.1e+01 <0.0001 
##  ALAND       1.0000 0.0000 1.7e+09 <0.0001 
##  AWATER      1.0000 0.0000 2.5e+08 <0.0001 
## 
#water proportion
county_sf_data$water_area_prop = county_sf_data$AWATER / county_sf_data$area

#nicer names
county_sf_data$land_area = county_sf_data$ALAND
county_sf_data$water_area = county_sf_data$AWATER

#new latitude and longitudide
#https://stackoverflow.com/questions/52522872/r-sf-package-centroid-within-polygon
county_sf_data$lat2 = map_dbl(county_sf_data$geometry, ~st_point_on_surface(.x)[[2]])
county_sf_data$lon2 = map_dbl(county_sf_data$geometry, ~st_point_on_surface(.x)[[1]])
         

#z score versions
county_sf_data %<>% mutate(
  land_area_z = land_area %>% standardize(),
  water_area_z = water_area %>% standardize(),
  water_area_prop_z = water_area_prop %>% standardize(),
  
  #pop density
  pop_density = Total.Population / land_area,
  pop_density_log10 = pop_density %>% log10(),
  pop_density_log10_z = pop_density_log10 %>% standardize()
)

#back to normal
d = county_sf_data %>% st_drop_geometry()

State level data

Aggregate to state level and compute spatial lags.

#aggregate
d_state = d %>% 
  filter(!State.x %in% c("Hawaii", "Alaska")) %>% 
  plyr::ddply("State.x", .fun = function(dd) {
    # browser()
    # if (dd$State.x[1] == "District of Columbia") browser()
    #sum populations
    y = tibble(
      #population counts
      population = wtd_sum(dd$Total.Population),
      pop_all = population,
      pop_amerindian = wtd_sum(dd$Total.Population * dd$frc_amerindian),
      pop_asian = wtd_sum(dd$Total.Population * dd$frc_asian),
      pop_black = wtd_sum(dd$Total.Population * dd$frc_black),
      pop_hispanic = wtd_sum(dd$Total.Population * dd$frc_hispanic),
      # pop_other = wtd_sum(dd$Total.Population * dd$Other),
      pop_white = wtd_sum(dd$Total.Population * dd$frc_white),
      
      #fractions
      frc_amerindian = pop_amerindian / population,
      frc_asian = pop_asian / population,
      frc_black = pop_black / population,
      frc_hispanic = pop_hispanic / population,
      # Other = pop_other / population,
      frc_white = pop_white / population
    )
    
    #loop vars
    #not population composition
    for (v in c(main_vars3)) {
      #compute
      y[[v]] = wtd_mean(dd[[v]], w = dd$Total.Population)
    }
    
    y
})

#pop weight
d_state$pop_weight = d_state$population %>% sqrt()

#standardize again
#not population composition
for (v in c(main_vars2)) d_state[[v]] = d_state[[v]] %>% standardize()

#make spatial lag variables
for (v in c(main_vars)) {
  #no NA
  tmp = d_state[!is.na(d_state[[v]]) & !is.na(d_state$lat) & !is.na(d_state$lon), ]
  tmp_var = str_glue("{v}_lag") %>% as.character()
  
  #estimate
  tmp[[tmp_var]] = SAC_knsnr(tmp, v, k = 3)
  
  #join
  d_state[[tmp_var]] = NULL
  d_state = left_join(d_state, tmp %>% select(State.x, !!tmp_var), by = "State.x")
}

National data

Needed for subanalysis.

#UV
#http://apps.who.int/gho/data/view.main.35300
UV_UN = read_csv("data/UV countries.csv", col_names = c("Country", "UV"), skip = 1) %>% 
  #skip world
  .[-1, ] %>% 
  #ISO
  mutate(
    ISO = pu_translate(Country)
  )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Country = col_character(),
##   UV = col_double()
## )
#rainfall
#https://data.worldbank.org/indicator/AG.LND.PRCP.MM?end=2014&start=2014&view=map
rainfall = read_excel("data/API_AG.LND.PRCP.MM_DS2_en_excel_v2_202879.xls", skip = 3) %>% 
  df_legalize_names() %>% 
  rename(
    ISO = Country_Code,
    rainfall2012 = x2012
  )

#merge with collection to get latitude etc.
mega = read_csv("data/Megadataset_v2.0p.csv") %>% 
  rename(
    ISO = X1
  )
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   X1 = col_character(),
##   LVnotes = col_character(),
##   Names = col_character(),
##   Capital = col_character(),
##   se_Land = col_character(),
##   se_abbrev = col_character(),
##   de_Country = col_character(),
##   de_abbrev = col_character(),
##   de_Country_EN = col_character(),
##   de_Western_neighbor = col_logical(),
##   de_NW_European = col_logical(),
##   dk_Origin = col_character(),
##   Country_EN = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#merge
assert_that(!any(duplicated(mega$ISO)))
## [1] TRUE
assert_that(!any(duplicated(UV_UN$ISO)))
## [1] TRUE
assert_that(!any(duplicated(rainfall$ISO)))
## [1] TRUE
natl = full_join(UV_UN, mega, by = "ISO") %>% 
  full_join(rainfall, by ="ISO") %>% 
    mutate(
    abs_lat = abs(lat)
  )

Standardized variants

#do it
#exclude the population sizes
d_std = d %>% 
  df_standardize(exclude = d %>% names() %>% str_subset("pop_"),
                             messages = F)

#states too
d_state_std = d_state %>% 
  df_standardize(exclude = d_state %>% names() %>% str_subset("pop_"),
                 messages = F)

Analyses

Descriptives

#descriptives for all main variables
#use original scale variables here
d %>% select(CA_all_orig:CA_white_orig, 
             SES_all_orig:SES_white_orig, 
             frc_white:frc_amerindian,
             UV_orig, avg_temp_orig, lat, lon, elevation, fluoride_conc, lead_index) %>% describe()
#descriptives for old SIRE variables
d %>% select(!!race_vars) %>% describe()
#correlations, unweighted
wtd.cors(d %>% select(White:Other, frc_amerindian:frc_white))
##                White   Black Hispanic   Asian Amerindian  Other frc_amerindian
## White           1.00 -0.6225   -0.585 -0.2640    -0.2878 -0.169         -0.258
## Black          -0.62  1.0000   -0.107  0.0129    -0.1004 -0.087         -0.119
## Hispanic       -0.59 -0.1068    1.000  0.1387    -0.0383 -0.025         -0.051
## Asian          -0.26  0.0129    0.139  1.0000    -0.0092  0.424          0.029
## Amerindian     -0.29 -0.1004   -0.038 -0.0092     1.0000  0.275          0.948
## Other          -0.17 -0.0872   -0.025  0.4240     0.2748  1.000          0.538
## frc_amerindian -0.26 -0.1189   -0.051  0.0293     0.9475  0.538          1.000
## frc_asian      -0.19 -0.0028    0.106  0.9140    -0.0234  0.356         -0.015
## frc_hispanic   -0.56 -0.1239    0.973  0.1638    -0.0598 -0.019         -0.075
## frc_black      -0.61  0.9839   -0.123  0.0250    -0.1013 -0.094         -0.122
## frc_white       0.97 -0.6302   -0.566 -0.2414    -0.2494 -0.163         -0.255
##                frc_asian frc_hispanic frc_black frc_white
## White            -0.1913       -0.556   -0.6066      0.97
## Black            -0.0028       -0.124    0.9839     -0.63
## Hispanic          0.1061        0.973   -0.1230     -0.57
## Asian             0.9140        0.164    0.0250     -0.24
## Amerindian       -0.0234       -0.060   -0.1013     -0.25
## Other             0.3564       -0.019   -0.0944     -0.16
## frc_amerindian   -0.0148       -0.075   -0.1220     -0.26
## frc_asian         1.0000        0.132    0.0091     -0.20
## frc_hispanic      0.1318        1.000   -0.1361     -0.57
## frc_black         0.0091       -0.136    1.0000     -0.63
## frc_white        -0.1983       -0.565   -0.6314      1.00
#with sample sizes
pairwiseCount(d %>% select(White:Other, frc_amerindian:frc_white))
##                White Black Hispanic Asian Amerindian Other frc_amerindian
## White           3140  3140     3140  3140       3140  3140           3122
## Black           3140  3140     3140  3140       3140  3140           3122
## Hispanic        3140  3140     3140  3140       3140  3140           3122
## Asian           3140  3140     3140  3140       3140  3140           3122
## Amerindian      3140  3140     3140  3140       3140  3140           3122
## Other           3140  3140     3140  3140       3140  3140           3122
## frc_amerindian  3122  3122     3122  3122       3122  3122           3124
## frc_asian       3122  3122     3122  3122       3122  3122           3124
## frc_hispanic    3122  3122     3122  3122       3122  3122           3124
## frc_black       3122  3122     3122  3122       3122  3122           3124
## frc_white       3122  3122     3122  3122       3122  3122           3124
##                frc_asian frc_hispanic frc_black frc_white
## White               3122         3122      3122      3122
## Black               3122         3122      3122      3122
## Hispanic            3122         3122      3122      3122
## Asian               3122         3122      3122      3122
## Amerindian          3122         3122      3122      3122
## Other               3122         3122      3122      3122
## frc_amerindian      3124         3124      3124      3124
## frc_asian           3124         3124      3124      3124
## frc_hispanic        3124         3124      3124      3124
## frc_black           3124         3124      3124      3124
## frc_white           3124         3124      3124      3124

Old and new data

#cors
combine_upperlower(
  wtd.cors(d %>% select(S, SES_all, old_CA, CA)),
  wtd.cors(d %>% select(S, SES_all, old_CA, CA), weight = d$pop_weight)
)
##            S SES_all old_CA   CA
## S         NA    0.92   0.71 0.76
## SES_all 0.93      NA   0.69 0.73
## old_CA  0.71    0.69     NA 0.85
## CA      0.77    0.75   0.89   NA
#scatterpots of new and old CA
GG_scatter(d, "CA", "old_CA") +
  xlab("New cognitive data (SEDA 3.0)") +
  ylab("Old cognitive data (Kirkegaard 2016)")
## `geom_smooth()` using formula 'y ~ x'

GG_scatter(d, "CA", "old_CA", weights = "pop_weight") +
  xlab("New cognitive data (SEDA 3.0)") +
  ylab("Old cognitive data (Kirkegaard 2016)")
## `geom_smooth()` using formula 'y ~ x'

#SES/S
GG_scatter(d, "SES_all", "S") +
  xlab("New SES data (SEDA 3.0)") +
  ylab("Old SES data (Kirkegaard 2016)")
## `geom_smooth()` using formula 'y ~ x'

GG_scatter(d, "SES_all", "S", weights = "pop_weight") +
  xlab("New SES data (SEDA 3.0)") +
  ylab("Old SES data (Kirkegaard 2016)")
## `geom_smooth()` using formula 'y ~ x'

Correlations

Get impression of data by looking at the correlations

#single output
combine_upperlower(
  #no weights
  d %>% select(!!main_vars2) %>% select(-contains("_z")) %>% wtd.cors(),

  #weights
  d %>% select(!!main_vars2) %>% select(-contains("_z")) %>% wtd.cors(weight = d$pop_weight)
)
##                   CA CA_all CA_asian CA_black CA_hispanic CA_white SES_all
## CA                NA  1.000    0.447    0.672       0.598    0.757   0.735
## CA_all         1.000     NA    0.447    0.672       0.598    0.757   0.735
## CA_asian       0.478  0.478       NA    0.306       0.389    0.529   0.249
## CA_black       0.647  0.647    0.343       NA       0.540    0.470   0.455
## CA_hispanic    0.607  0.607    0.386    0.588          NA    0.421   0.243
## CA_white       0.687  0.687    0.587    0.405       0.350       NA   0.581
## SES_all        0.748  0.748    0.331    0.432       0.249    0.605      NA
## SES_black      0.419  0.419    0.308    0.554       0.242    0.413   0.646
## SES_hispanic   0.337  0.337    0.288    0.269       0.352    0.326   0.559
## SES_white      0.401  0.401    0.436    0.209       0.083    0.756   0.728
## UV            -0.367 -0.367   -0.054   -0.171      -0.161   -0.105  -0.195
## avg_temp      -0.392 -0.392    0.069   -0.194      -0.032   -0.132  -0.340
## lat            0.312  0.312   -0.126    0.132      -0.028    0.099   0.326
## lon            0.199  0.199    0.312    0.115       0.305    0.111  -0.055
## elevation      0.053  0.053   -0.180    0.110      -0.065   -0.024   0.115
## fluoride_conc  0.020  0.020    0.018   -0.028      -0.042    0.110   0.098
## lead_index    -0.045 -0.045    0.104   -0.073      -0.036    0.021  -0.130
##               SES_black SES_hispanic SES_white     UV avg_temp      lat     lon
## CA               0.4349        0.284     0.369 -0.327  -0.4221  0.32822  0.0824
## CA_all           0.4349        0.284     0.369 -0.327  -0.4221  0.32822  0.0824
## CA_asian         0.2467        0.236     0.327 -0.065   0.0508 -0.12758  0.3277
## CA_black         0.4958        0.222     0.198 -0.252  -0.3226  0.27202  0.0609
## CA_hispanic      0.2365        0.278     0.099 -0.167  -0.0729  0.04058  0.2467
## CA_white         0.3953        0.285     0.646 -0.101  -0.2017  0.19139 -0.0075
## SES_all          0.6459        0.502     0.726 -0.190  -0.4050  0.39275 -0.1503
## SES_black            NA        0.496     0.495 -0.022  -0.1249  0.14866 -0.0947
## SES_hispanic     0.5525           NA     0.461  0.048   0.0085  0.00074 -0.0503
## SES_white        0.5582        0.498        NA  0.076  -0.1219  0.19703 -0.2119
## UV               0.1162        0.101     0.058     NA   0.7304 -0.73481 -0.3360
## avg_temp         0.0187        0.078    -0.051  0.774       NA -0.92471  0.0563
## lat              0.0093       -0.068     0.078 -0.752  -0.9347       NA -0.2873
## lon             -0.1454       -0.089    -0.079 -0.416  -0.0986 -0.09121      NA
## elevation        0.0840        0.016     0.023  0.247  -0.2664  0.18372 -0.4131
## fluoride_conc    0.0601        0.071     0.149  0.043  -0.0426  0.03357 -0.0351
## lead_index      -0.1934       -0.222    -0.038 -0.376  -0.1830  0.09036  0.5146
##               elevation fluoride_conc lead_index
## CA                0.124         0.030    -0.1271
## CA_all            0.124         0.030    -0.1271
## CA_asian         -0.140         0.031     0.0655
## CA_black          0.143         0.017    -0.1470
## CA_hispanic      -0.094        -0.046    -0.0529
## CA_white          0.106         0.108    -0.1126
## SES_all           0.239         0.101    -0.1854
## SES_black         0.140         0.101    -0.2256
## SES_hispanic      0.049         0.041    -0.1901
## SES_white         0.195         0.149    -0.1272
## UV                0.296         0.135    -0.2076
## avg_temp         -0.336         0.025     0.0267
## lat               0.227        -0.033    -0.0894
## lon              -0.555        -0.025     0.4611
## elevation            NA         0.170    -0.3779
## fluoride_conc     0.156            NA     0.0027
## lead_index       -0.382        -0.033         NA
#sample sizes
d %>% select(!!main_vars2) %>% select(-contains("_z")) %>% pairwiseCount()
##                 CA CA_all CA_asian CA_black CA_hispanic CA_white SES_all
## CA            3134   3134     1483     2135        2647     3112    3124
## CA_all        3134   3134     1483     2135        2647     3112    3124
## CA_asian      1483   1483     1483     1394        1475     1483    1473
## CA_black      2135   2135     1394     2135        2015     2120    2125
## CA_hispanic   2647   2647     1475     2015        2647     2643    2637
## CA_white      3112   3112     1483     2120        2643     3112    3102
## SES_all       3124   3124     1473     2125        2637     3102    3124
## SES_black     2108   2108     1373     2108        1989     2093    2108
## SES_hispanic  2624   2624     1465     1998        2624     2620    2624
## SES_white     3099   3099     1473     2109        2631     3099    3099
## UV            3099   3099     1465     2125        2629     3080    3093
## avg_temp      3098   3098     1464     2124        2628     3079    3092
## lat           3132   3132     1483     2135        2647     3112    3122
## lon           3132   3132     1483     2135        2647     3112    3122
## elevation     3070   3070     1444     2096        2601     3050    3064
## fluoride_conc 2209   2209      930     1493        1838     2190    2207
## lead_index    3130   3130     1482     2133        2645     3110    3120
##               SES_black SES_hispanic SES_white   UV avg_temp  lat  lon
## CA                 2108         2624      3099 3099     3098 3132 3132
## CA_all             2108         2624      3099 3099     3098 3132 3132
## CA_asian           1373         1465      1473 1465     1464 1483 1483
## CA_black           2108         1998      2109 2125     2124 2135 2135
## CA_hispanic        1989         2624      2631 2629     2628 2647 2647
## CA_white           2093         2620      3099 3080     3079 3112 3112
## SES_all            2108         2624      3099 3093     3092 3122 3122
## SES_black          2108         1982      2092 2102     2101 2108 2108
## SES_hispanic       1982         2624      2618 2610     2609 2624 2624
## SES_white          2092         2618      3099 3071     3070 3099 3099
## UV                 2102         2610      3071 3106     3105 3106 3106
## avg_temp           2101         2609      3070 3105     3105 3105 3105
## lat                2108         2624      3099 3106     3105 3140 3140
## lon                2108         2624      3099 3106     3105 3140 3140
## elevation          2073         2582      3041 3073     3072 3074 3074
## fluoride_conc      1485         1826      2186 2189     2188 2216 2216
## lead_index         2106         2622      3097 3104     3103 3138 3138
##               elevation fluoride_conc lead_index
## CA                 3070          2209       3130
## CA_all             3070          2209       3130
## CA_asian           1444           930       1482
## CA_black           2096          1493       2133
## CA_hispanic        2601          1838       2645
## CA_white           3050          2190       3110
## SES_all            3064          2207       3120
## SES_black          2073          1485       2106
## SES_hispanic       2582          1826       2622
## SES_white          3041          2186       3097
## UV                 3073          2189       3104
## avg_temp           3072          2188       3103
## lat                3074          2216       3138
## lon                3074          2216       3138
## elevation          3075          2160       3073
## fluoride_conc      2160          2216       2214
## lead_index         3073          2214       3138
#national correlations
natl %>% 
  select(UV, abs_lat) %>% 
  wtd.cors()
##            UV abs_lat
## UV       1.00   -0.89
## abs_lat -0.89    1.00
#plot of UV and latitude
GG_scatter(d, "lat", "UV", case_names = "County.x", case_names_color = "green") +
  xlab("Latitude") +
  ylab("UV index (z score)")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/lat_UV_counties.png")
## `geom_smooth()` using formula 'y ~ x'

Basic spatial stats

#how much spatial autocorrelation is there?
#we measure this by seeing how much we can predict each county by its 3 closest neighbors' values
#counties
wtd.cors(d %>% select(!!!main_vars, !!!(main_vars + "_lag")))[main_vars, (main_vars + "_lag")] %>% 
  {
    y = diag(.)
    names(y) = rownames(.)
    y
  }
##              CA          CA_all        CA_asian        CA_black     CA_hispanic 
##            0.60            0.60            0.39            0.46            0.42 
##        CA_white         SES_all       SES_black    SES_hispanic       SES_white 
##            0.60            0.75            0.56            0.43            0.80 
##       frc_white       frc_black       frc_asian    frc_hispanic  frc_amerindian 
##            0.80            0.84            0.64            0.87            0.70 
##              UV        avg_temp             lat             lon       elevation 
##            1.00            0.99            1.00            1.00            0.96 
##           lat_z           lon_z     elevation_z   fluoride_conc fluoride_conc_z 
##            1.00            1.00            0.96            0.58            0.58 
##      lead_index    lead_index_z 
##            0.81            0.81
#states
wtd.cors(d_state %>% select(!!!main_vars, !!!(main_vars + "_lag")))[main_vars, (main_vars + "_lag")] %>% 
  {
    y = diag(.)
    names(y) = rownames(.)
    y
  }
##              CA          CA_all        CA_asian        CA_black     CA_hispanic 
##            0.58            0.58            0.57            0.36            0.42 
##        CA_white         SES_all       SES_black    SES_hispanic       SES_white 
##            0.20            0.68            0.38            0.57            0.31 
##       frc_white       frc_black       frc_asian    frc_hispanic  frc_amerindian 
##            0.68            0.72            0.13            0.63            0.53 
##              UV        avg_temp             lat             lon       elevation 
##            0.94            0.94            0.98            0.98            0.68 
##           lat_z           lon_z     elevation_z   fluoride_conc fluoride_conc_z 
##            0.98            0.98            0.68            0.56            0.56 
##      lead_index    lead_index_z 
##            0.82            0.82

Main regressions

Carry out the usual regressions.

#formulas for models
form_1 = CA ~ UV
form_2 = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian
form_3 = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV
form_3b = CA ~ UV + frc_white
form_3c = CA ~ CA_lag + UV + frc_black + frc_asian + frc_hispanic + frc_amerindian+ avg_temp
form_4 = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV + avg_temp + lat_z + lon_z + elevation_z
form_5 = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV + avg_temp + lat_z + lon_z + elevation_z + CA_lag
form_5b = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV + avg_temp + lat_z + lon_z + elevation_z + fluoride_conc_z + CA_lag
form_5c = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV + avg_temp + lat_z + lon_z + elevation_z + lead_index_z + fluoride_conc_z + CA_lag
form_5d = CA ~ frc_black + frc_asian + frc_hispanic + frc_amerindian + UV + avg_temp + lat_z + lon_z + elevation_z + Diarrheal_diseases_z + Hepatitis_z + HIV_AIDS_z + Lower_respiratory_infections_z + Meningitis_z + Tuberculosis_z + CA_lag
form_6 = CA ~ CA_lag + frc_black + frc_asian + frc_hispanic + frc_amerindian + rcs(UV) + avg_temp + lat_z + lon_z + elevation_z
form_7 = CA ~ CA_lag + UV + frc_black*UV + frc_asian*UV + frc_hispanic*UV + frc_amerindian*UV + avg_temp + lat_z + lon_z + elevation_z

#UV only
(model_1 = ols(form_1, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##        CA        UV (weights) 
##        90       118        84 
## 
## Linear Regression Model
##  
##  ols(formula = form_1, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     3099    LR chi2    449.25    R2       0.135    
##  sigma13.7075    d.f.            1    R2 adj   0.135    
##  d.f.    3097    Pr(> chi2) 0.0000    g        0.376    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -4.76357 -0.63754 -0.03289  0.51446  3.34724 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept  0.0942 0.0164   5.75 <0.0001 
##  UV        -0.3347 0.0152 -21.98 <0.0001 
## 
#demographics only
(model_2 = ols(form_2, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##             90            100            100            100            100 
##      (weights) 
##             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_2, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3122    LR chi2    1734.97    R2       0.426    
##  sigma11.1986    d.f.             4    R2 adj   0.426    
##  d.f.    3117    Pr(> chi2)  0.0000    g        0.689    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.7222 -0.5960 -0.1093  0.3481  2.4635 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.7274 0.0224  32.52 <0.0001 
##  frc_black      -2.8440 0.0729 -39.01 <0.0001 
##  frc_asian       5.1768 0.2949  17.56 <0.0001 
##  frc_hispanic   -1.9295 0.0749 -25.76 <0.0001 
##  frc_amerindian -3.7858 0.1891 -20.02 <0.0001 
## 
#demo + UV
(model_3 = ols(form_3, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##             90            100            100            100            100 
##             UV      (weights) 
##            118             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_3, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3093    LR chi2    1880.51    R2       0.456    
##  sigma10.8898    d.f.             5    R2 adj   0.455    
##  d.f.    3087    Pr(> chi2)  0.0000    g        0.705    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.99222 -0.55837 -0.07525  0.36663  2.40816 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.6224 0.0247  25.22 <0.0001 
##  frc_black      -2.7262 0.0741 -36.81 <0.0001 
##  frc_asian       6.7410 0.3306  20.39 <0.0001 
##  frc_hispanic   -1.6454 0.0927 -17.74 <0.0001 
##  frc_amerindian -3.4211 0.2052 -16.67 <0.0001 
##  UV             -0.1074 0.0155  -6.92 <0.0001 
## 
#climate
(model_4 = ols(form_4, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##             90            100            100            100            100 
##             UV       avg_temp          lat_z          lon_z    elevation_z 
##            118            119             84             84            149 
##      (weights) 
##             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_4, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3062    LR chi2    2283.97    R2       0.526    
##  sigma10.1785    d.f.             9    R2 adj   0.524    
##  d.f.    3052    Pr(> chi2)  0.0000    g        0.739    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.99195 -0.53719 -0.06725  0.38842  2.38029 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.5717 0.0238  24.05 <0.0001 
##  frc_black      -3.0225 0.0784 -38.53 <0.0001 
##  frc_asian       9.0644 0.3395  26.70 <0.0001 
##  frc_hispanic   -1.4146 0.0898 -15.75 <0.0001 
##  frc_amerindian -3.0865 0.1970 -15.67 <0.0001 
##  UV             -0.1638 0.0315  -5.21 <0.0001 
##  avg_temp        0.3868 0.0512   7.56 <0.0001 
##  lat_z           0.2668 0.0501   5.32 <0.0001 
##  lon_z           0.3505 0.0195  17.95 <0.0001 
##  elevation_z     0.2984 0.0247  12.07 <0.0001 
## 
#spatial autocorrelation in residual?
d$model_4_resid = resid(model_4) %>% standardize()
SAC_measures(d %>% filter(!is.na(model_4_resid)), "model_4_resid", measures = "KNSNR")
#spatial lag
(model_5 = ols(form_5, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##             90            100            100            100            100 
##             UV       avg_temp          lat_z          lon_z    elevation_z 
##            118            119             84             84            149 
##         CA_lag      (weights) 
##             92             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_5, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    3062    LR chi2    2620.25    R2       0.575    
##  sigma9.6362    d.f.            10    R2 adj   0.574    
##  d.f.   3051    Pr(> chi2)  0.0000    g        0.793    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.85759 -0.48683 -0.07155  0.35909  2.47774 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.5440 0.0226  24.12 <0.0001 
##  frc_black      -2.6827 0.0764 -35.10 <0.0001 
##  frc_asian       7.7561 0.3289  23.59 <0.0001 
##  frc_hispanic   -1.5484 0.0853 -18.15 <0.0001 
##  frc_amerindian -2.7955 0.1872 -14.94 <0.0001 
##  UV             -0.1036 0.0299  -3.46 0.0005  
##  avg_temp        0.3508 0.0485   7.24 <0.0001 
##  lat_z           0.1712 0.0477   3.59 0.0003  
##  lon_z           0.2434 0.0193  12.59 <0.0001 
##  elevation_z     0.2236 0.0237   9.42 <0.0001 
##  CA_lag          0.3495 0.0186  18.82 <0.0001 
## 
#No SAC in residuals now
d$model_5_resid = resid(model_5) %>% standardize()
SAC_measures(d %>% filter(!is.na(model_5_resid)), "model_5_resid", measures = "KNSNR")
#nonlinear UV
(model_6 = ols(form_6, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA         CA_lag      frc_black      frc_asian   frc_hispanic 
##             90             92            100            100            100 
## frc_amerindian             UV       avg_temp          lat_z          lon_z 
##            100            118            119             84             84 
##    elevation_z      (weights) 
##            149             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_6, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    3062    LR chi2    2630.20    R2       0.576    
##  sigma9.6253    d.f.            13    R2 adj   0.575    
##  d.f.   3048    Pr(> chi2)  0.0000    g        0.791    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.80324 -0.48663 -0.07103  0.35421  2.55881 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.4676 0.0914   5.12 <0.0001 
##  CA_lag          0.3458 0.0187  18.50 <0.0001 
##  frc_black      -2.7309 0.0780 -35.01 <0.0001 
##  frc_asian       7.6536 0.3312  23.11 <0.0001 
##  frc_hispanic   -1.4838 0.0938 -15.82 <0.0001 
##  frc_amerindian -2.8328 0.1877 -15.09 <0.0001 
##  UV             -0.1156 0.0708  -1.63 0.1028  
##  UV'             0.1902 0.6160   0.31 0.7575  
##  UV''            0.3642 1.8813   0.19 0.8465  
##  UV'''          -2.2352 2.2602  -0.99 0.3228  
##  avg_temp        0.3393 0.0490   6.93 <0.0001 
##  lat_z           0.2192 0.0501   4.38 <0.0001 
##  lon_z           0.2663 0.0215  12.40 <0.0001 
##  elevation_z     0.2144 0.0241   8.91 <0.0001 
## 
#UV x race
(model_7 = ols(form_7, data = d, weights = d$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA         CA_lag             UV      frc_black      frc_asian 
##             90             92            118            100            100 
##   frc_hispanic frc_amerindian       avg_temp          lat_z          lon_z 
##            100            100            119             84             84 
##    elevation_z      (weights) 
##            149             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_7, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    3062    LR chi2    2763.71    R2       0.594    
##  sigma9.4193    d.f.            14    R2 adj   0.593    
##  d.f.   3047    Pr(> chi2)  0.0000    g        0.777    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.78002 -0.49616 -0.07814  0.35145  2.54808 
##  
##  
##                      Coef    S.E.   t      Pr(>|t|)
##  Intercept            0.5525 0.0227  24.31 <0.0001 
##  CA_lag               0.3346 0.0187  17.90 <0.0001 
##  UV                   0.1051 0.0391   2.69 0.0073  
##  frc_black           -2.9387 0.0797 -36.87 <0.0001 
##  frc_asian            8.3827 0.3342  25.08 <0.0001 
##  frc_hispanic        -1.4080 0.1051 -13.40 <0.0001 
##  frc_amerindian      -2.8321 0.1885 -15.02 <0.0001 
##  avg_temp             0.1898 0.0494   3.84 0.0001  
##  lat_z                0.1342 0.0477   2.81 0.0049  
##  lon_z                0.2331 0.0198  11.75 <0.0001 
##  elevation_z          0.1453 0.0243   5.97 <0.0001 
##  UV * frc_black       0.5083 0.1099   4.63 <0.0001 
##  UV * frc_asian      -2.8707 0.2659 -10.80 <0.0001 
##  UV * frc_hispanic   -0.1943 0.0603  -3.23 0.0013  
##  UV * frc_amerindian -0.3310 0.1440  -2.30 0.0216  
## 
#effect sizes comparisons
#proportion R2s
map_df(mget("model_" + 1:7), model_partialR2s)
#numerical
(model_5_partialR2s = plot(anova(model_5), what='proportion R2', pl = F))
##      frc_black      frc_asian         CA_lag   frc_hispanic frc_amerindian 
##         0.2984         0.1347         0.0858         0.0798         0.0540 
##          lon_z    elevation_z       avg_temp          lat_z             UV 
##         0.0384         0.0215         0.0127         0.0031         0.0029
sum(model_5_partialR2s[c(race_vars2[-1])])
## [1] 0.57
sum(model_5_partialR2s[climate_vars_model])
## [1] 0.079
(model_7_partialR2s = plot(anova(model_7), what='proportion R2', pl = F))
##           frc_black           frc_asian        frc_hispanic              CA_lag 
##              0.3093              0.1484              0.0760              0.0717 
##      frc_amerindian                  UV               lon_z      UV * frc_asian 
##              0.0575              0.0355              0.0309              0.0261 
##         elevation_z      UV * frc_black            avg_temp   UV * frc_hispanic 
##              0.0080              0.0048              0.0033              0.0023 
##               lat_z UV * frc_amerindian 
##              0.0018              0.0012
sum(model_7_partialR2s[c(race_vars2[-1])])
## [1] 0.59
sum(model_5_partialR2s[climate_vars_model])
## [1] 0.079
#all UV * SIRE interactions
sum(model_7_partialR2s[str_subset(names(model_7_partialR2s), "\\*")])
## [1] 0.034
#summarize model results
mget("model_" + 1:7) %>% 
  summarize_models()
#supplementary models
list(
  ols(form_3b, data = d, weights = d$pop_weight),
  ols(form_3c, data = d, weights = d$pop_weight),
  ols(form_5b, data = d, weights = d$pop_weight),
  ols(form_5c, data = d, weights = d$pop_weight),
  ols(form_5d, data = d, weights = d$pop_weight)
) %>% 
  walk(function(x) {
    print(x)
    cat("Relative importance: partial R2s\n")
    print(model_partialR2s(x))
  })
## Frequencies of Missing Values Due to Each Variable
##        CA        UV frc_white (weights) 
##        90       118       100        84 
## 
## Linear Regression Model
##  
##  ols(formula = form_3b, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3093    LR chi2     999.59    R2       0.276    
##  sigma12.5504    d.f.             2    R2 adj   0.276    
##  d.f.    3090    Pr(> chi2)  0.0000    g        0.550    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.8842 -0.7157 -0.1845  0.3050  2.8013 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept -1.0168 0.0481 -21.15 <0.0001 
##  UV        -0.1241 0.0165  -7.53 <0.0001 
##  frc_white  1.6926 0.0693  24.41 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 2
##   frc_white     UV
##       <dbl>  <dbl>
## 1     0.505 0.0481
## Frequencies of Missing Values Due to Each Variable
##             CA         CA_lag             UV      frc_black      frc_asian 
##             90             92            118            100            100 
##   frc_hispanic frc_amerindian       avg_temp      (weights) 
##            100            100            119             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_3c, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3092    LR chi2    2435.78    R2       0.545    
##  sigma 9.9585    d.f.             7    R2 adj   0.544    
##  d.f.    3084    Pr(> chi2)  0.0000    g        0.804    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.82297 -0.49647 -0.08481  0.34240  2.90949 
##  
##  
##                 Coef    S.E.   t      Pr(>|t|)
##  Intercept       0.5788 0.0228  25.40 <0.0001 
##  CA_lag          0.4348 0.0181  24.01 <0.0001 
##  UV             -0.0381 0.0205  -1.86 0.0634  
##  frc_black      -2.4592 0.0753 -32.68 <0.0001 
##  frc_asian       5.9749 0.3068  19.47 <0.0001 
##  frc_hispanic   -1.7787 0.0857 -20.75 <0.0001 
##  frc_amerindian -2.9232 0.1902 -15.37 <0.0001 
##  avg_temp        0.1073 0.0213   5.03 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 7
##   frc_black CA_lag frc_hispanic frc_asian frc_amerindian avg_temp       UV
##       <dbl>  <dbl>        <dbl>     <dbl>          <dbl>    <dbl>    <dbl>
## 1     0.289  0.156        0.116     0.103         0.0639  0.00683 0.000933
## Frequencies of Missing Values Due to Each Variable
##              CA       frc_black       frc_asian    frc_hispanic  frc_amerindian 
##              90             100             100             100             100 
##              UV        avg_temp           lat_z           lon_z     elevation_z 
##             118             119              84              84             149 
## fluoride_conc_z          CA_lag       (weights) 
##            1008              92              84 
## 
## Linear Regression Model
##  
##  ols(formula = form_5b, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    2153    LR chi2    2015.71    R2       0.608    
##  sigma8.7183    d.f.            11    R2 adj   0.606    
##  d.f.   2141    Pr(> chi2)  0.0000    g        0.771    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.13986 -0.47069 -0.05363  0.37739  2.07743 
##  
##  
##                  Coef    S.E.   t      Pr(>|t|)
##  Intercept        0.4229 0.0251  16.84 <0.0001 
##  frc_black       -2.7802 0.0819 -33.94 <0.0001 
##  frc_asian       13.8716 0.5711  24.29 <0.0001 
##  frc_hispanic    -1.4864 0.0991 -15.00 <0.0001 
##  frc_amerindian  -2.8079 0.4083  -6.88 <0.0001 
##  UV               0.3135 0.0499   6.29 <0.0001 
##  avg_temp        -0.1883 0.0839  -2.24 0.0249  
##  lat_z           -0.0368 0.0693  -0.53 0.5953  
##  lon_z            0.1362 0.0348   3.92 <0.0001 
##  elevation_z     -0.0669 0.0410  -1.63 0.1029  
##  fluoride_conc_z  0.0155 0.0179   0.86 0.3883  
##  CA_lag           0.2859 0.0215  13.28 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 11
##   frc_black frc_asian frc_hispanic CA_lag frc_amerindian     UV   lon_z avg_temp
##       <dbl>     <dbl>        <dbl>  <dbl>          <dbl>  <dbl>   <dbl>    <dbl>
## 1     0.347     0.178       0.0678 0.0531         0.0143 0.0119 0.00462  0.00152
## # … with 3 more variables: elevation_z <dbl>, fluoride_conc_z <dbl>,
## #   lat_z <dbl>
## Frequencies of Missing Values Due to Each Variable
##              CA       frc_black       frc_asian    frc_hispanic  frc_amerindian 
##              90             100             100             100             100 
##              UV        avg_temp           lat_z           lon_z     elevation_z 
##             118             119              84              84             149 
##    lead_index_z fluoride_conc_z          CA_lag       (weights) 
##              86            1008              92              84 
## 
## Linear Regression Model
##  
##  ols(formula = form_5c, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    2152    LR chi2    2119.16    R2       0.626    
##  sigma8.5111    d.f.            12    R2 adj   0.624    
##  d.f.   2139    Pr(> chi2)  0.0000    g        0.792    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.89522 -0.43880 -0.02404  0.38089  2.11818 
##  
##  
##                  Coef    S.E.   t      Pr(>|t|)
##  Intercept        0.3616 0.0252  14.34 <0.0001 
##  frc_black       -2.5580 0.0828 -30.89 <0.0001 
##  frc_asian       12.6287 0.5706  22.13 <0.0001 
##  frc_hispanic    -1.4105 0.0970 -14.54 <0.0001 
##  frc_amerindian  -2.9846 0.3989  -7.48 <0.0001 
##  UV               0.4304 0.0500   8.61 <0.0001 
##  avg_temp        -0.2507 0.0821  -3.05 0.0023  
##  lat_z            0.0376 0.0681   0.55 0.5812  
##  lon_z            0.3346 0.0392   8.55 <0.0001 
##  elevation_z     -0.1151 0.0403  -2.86 0.0043  
##  lead_index_z    -0.1719 0.0167 -10.29 <0.0001 
##  fluoride_conc_z  0.0153 0.0175   0.88 0.3807  
##  CA_lag           0.2758 0.0210  13.11 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 12
##   frc_black frc_asian frc_hispanic CA_lag lead_index_z     UV  lon_z
##       <dbl>     <dbl>        <dbl>  <dbl>        <dbl>  <dbl>  <dbl>
## 1     0.266     0.137       0.0589 0.0479       0.0295 0.0207 0.0204
## # … with 5 more variables: frc_amerindian <dbl>, avg_temp <dbl>,
## #   elevation_z <dbl>, fluoride_conc_z <dbl>, lat_z <dbl>
## Frequencies of Missing Values Due to Each Variable
##                             CA                      frc_black 
##                             90                            100 
##                      frc_asian                   frc_hispanic 
##                            100                            100 
##                 frc_amerindian                             UV 
##                            100                            118 
##                       avg_temp                          lat_z 
##                            119                             84 
##                          lon_z                    elevation_z 
##                             84                            149 
##           Diarrheal_diseases_z                    Hepatitis_z 
##                             82                             82 
##                     HIV_AIDS_z Lower_respiratory_infections_z 
##                             82                             82 
##                   Meningitis_z                 Tuberculosis_z 
##                             82                             82 
##                         CA_lag                      (weights) 
##                             92                             84 
## 
## Linear Regression Model
##  
##  ols(formula = form_5d, data = d, weights = d$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs    3062    LR chi2    3215.73    R2       0.650    
##  sigma8.7520    d.f.            16    R2 adj   0.648    
##  d.f.   3045    Pr(> chi2)  0.0000    g        0.860    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.23696 -0.46236 -0.06136  0.31593  3.05156 
##  
##  
##                                 Coef    S.E.   t      Pr(>|t|)
##  Intercept                       0.3922 0.0272  14.42 <0.0001 
##  frc_black                      -1.5491 0.1249 -12.40 <0.0001 
##  frc_asian                       6.8560 0.3134  21.88 <0.0001 
##  frc_hispanic                   -1.5441 0.0865 -17.85 <0.0001 
##  frc_amerindian                 -1.5919 0.1983  -8.03 <0.0001 
##  UV                             -0.1753 0.0276  -6.34 <0.0001 
##  avg_temp                        0.4788 0.0444  10.78 <0.0001 
##  lat_z                           0.1021 0.0440   2.32 0.0205  
##  lon_z                           0.2308 0.0199  11.58 <0.0001 
##  elevation_z                     0.3044 0.0224  13.58 <0.0001 
##  Diarrheal_diseases_z           -0.0205 0.0121  -1.70 0.0898  
##  Hepatitis_z                    -0.0957 0.0164  -5.83 <0.0001 
##  HIV_AIDS_z                      0.0470 0.0168   2.79 0.0052  
##  Lower_respiratory_infections_z -0.1108 0.0171  -6.47 <0.0001 
##  Meningitis_z                   -0.3350 0.0269 -12.45 <0.0001 
##  Tuberculosis_z                 -0.0170 0.0275  -0.62 0.5369  
##  CA_lag                          0.2871 0.0176  16.30 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 16
##   frc_asian frc_hispanic CA_lag elevation_z Meningitis_z frc_black  lon_z
##       <dbl>        <dbl>  <dbl>       <dbl>        <dbl>     <dbl>  <dbl>
## 1    0.0846       0.0563 0.0469      0.0326       0.0274    0.0272 0.0237
## # … with 9 more variables: avg_temp <dbl>, frc_amerindian <dbl>,
## #   Lower_respiratory_infections_z <dbl>, UV <dbl>, Hepatitis_z <dbl>,
## #   HIV_AIDS_z <dbl>, lat_z <dbl>, Diarrheal_diseases_z <dbl>,
## #   Tuberculosis_z <dbl>

Standardized betas

Alternative way to compare importance as numerical predictors, ignoring the variance as a function of prevalence issue.

list(
  ols(form_1, data = d_std, weights = d_std$pop_weight),
  ols(form_2, data = d_std, weights = d_std$pop_weight),
  ols(form_3, data = d_std, weights = d_std$pop_weight),
  ols(form_4, data = d_std, weights = d_std$pop_weight),
  ols(form_5, data = d_std, weights = d_std$pop_weight),
  ols(form_6, data = d_std, weights = d_std$pop_weight),
  ols(form_7, data = d_std, weights = d_std$pop_weight)
) %>% 
  summarize_models()

Robust regressions

Lasker requested these, but they are pretty much the same as above but with worse output.

#UV
(model_1r = MASS::rlm(form_1, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_1, data = d, weights = d$pop_weight)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -75.9602  -8.2043  -0.0881   6.6366  76.9205 
## 
## Coefficients:
##             Value   Std. Error t value
## (Intercept)   0.066   0.014      4.736
## UV           -0.324   0.013    -24.895
## 
## Residual standard error: 11 on 3097 degrees of freedom
##   (125 observations deleted due to missingness)
#demos
(model_2r = MASS::rlm(form_2, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_2, data = d, weights = d$pop_weight)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -223.642   -6.325   -0.564    5.157   55.888 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)      0.585   0.018     31.730
## frc_black       -2.939   0.060    -48.903
## frc_asian       11.294   0.243     46.461
## frc_hispanic    -1.884   0.062    -30.512
## frc_amerindian  -3.468   0.156    -22.251
## 
## Residual standard error: 8.5 on 3117 degrees of freedom
##   (102 observations deleted due to missingness)
#UV and demos
(model_3r = MASS::rlm(form_3, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_3, data = d, weights = d$pop_weight)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.762   -6.142   -0.477    5.155   55.395 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)      0.536   0.021     25.922
## frc_black       -2.869   0.062    -46.208
## frc_asian       11.965   0.277     43.168
## frc_hispanic    -1.724   0.078    -22.169
## frc_amerindian  -3.193   0.172    -18.566
## UV              -0.054   0.013     -4.148
## 
## Residual standard error: 8.3 on 3087 degrees of freedom
##   (131 observations deleted due to missingness)
#all climate
(model_4r = MASS::rlm(form_4, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_4, data = d, weights = d$pop_weight)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -123.591   -6.087   -0.484    4.937   46.267 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)      0.520   0.021     24.918
## frc_black       -3.019   0.069    -43.832
## frc_asian       12.715   0.298     42.651
## frc_hispanic    -1.591   0.079    -20.166
## frc_amerindian  -3.015   0.173    -17.429
## UV              -0.113   0.028     -4.084
## avg_temp         0.395   0.045      8.801
## lat_z            0.329   0.044      7.486
## lon_z            0.321   0.017     18.728
## elevation_z      0.269   0.022     12.398
## 
## Residual standard error: 8.2 on 3052 degrees of freedom
##   (162 observations deleted due to missingness)
#all main variables + spatial lag
(model_5r = MASS::rlm(form_5, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_5, data = d, weights = d$pop_weight)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -110.881   -5.488   -0.385    4.717   51.188 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)      0.461   0.020     23.622
## frc_black       -2.476   0.066    -37.418
## frc_asian       10.439   0.285     36.661
## frc_hispanic    -1.651   0.074    -22.348
## frc_amerindian  -2.657   0.162    -16.394
## UV              -0.060   0.026     -2.325
## avg_temp         0.338   0.042      8.064
## lat_z            0.208   0.041      5.025
## lon_z            0.201   0.017     12.029
## elevation_z      0.201   0.021      9.803
## CA_lag           0.384   0.016     23.902
## 
## Residual standard error: 7.6 on 3051 degrees of freedom
##   (162 observations deleted due to missingness)
#rcs UV
(model_6r = MASS::rlm(form_6, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_6, data = d, weights = d$pop_weight)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -117.00   -5.45   -0.47    4.71   50.87 
## 
## Coefficients:
##                Value   Std. Error t value
## (Intercept)      0.506   0.079      6.405
## CA_lag           0.372   0.016     23.037
## frc_black       -2.546   0.067    -37.760
## frc_asian       10.590   0.286     37.001
## frc_hispanic    -1.567   0.081    -19.339
## frc_amerindian  -2.692   0.162    -16.599
## rcs(UV)UV        0.028   0.061      0.463
## rcs(UV)UV'      -0.718   0.532     -1.349
## rcs(UV)UV''      3.172   1.626      1.951
## rcs(UV)UV'''    -5.599   1.953     -2.867
## avg_temp         0.307   0.042      7.259
## lat_z            0.239   0.043      5.524
## lon_z            0.215   0.019     11.603
## elevation_z      0.192   0.021      9.236
## 
## Residual standard error: 7.5 on 3048 degrees of freedom
##   (162 observations deleted due to missingness)
#demo interactions
(model_7r = MASS::rlm(form_7, data = d, weights = d$pop_weight) %>% summary())
## 
## Call: rlm(formula = form_7, data = d, weights = d$pop_weight)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.265  -5.501  -0.448   4.669  50.355 
## 
## Coefficients:
##                   Value   Std. Error t value
## (Intercept)         0.470   0.020     23.534
## CA_lag              0.371   0.016     22.603
## UV                  0.124   0.034      3.607
## frc_black          -2.752   0.070    -39.315
## frc_asian          10.399   0.294     35.424
## frc_hispanic       -1.461   0.092    -15.835
## frc_amerindian     -2.648   0.166    -15.992
## avg_temp            0.193   0.043      4.439
## lat_z               0.166   0.042      3.976
## lon_z               0.210   0.017     12.070
## elevation_z         0.140   0.021      6.554
## UV:frc_black        0.467   0.096      4.840
## UV:frc_asian       -3.696   0.234    -15.826
## UV:frc_hispanic    -0.172   0.053     -3.250
## UV:frc_amerindian  -0.276   0.127     -2.182
## 
## Residual standard error: 7.6 on 3047 degrees of freedom
##   (162 observations deleted due to missingness)

Correlations vs. OLS?

Can weighted regression betas differ from weighted correlations? Apparently, yes.

d %>% 
  filter(!is.na(CA), !is.na(UV), !is.na(pop_weight)) %>% {
    #restandardized in complete cases subset
    .$CA2 = .$CA %>% standardize()
    .$UV2 = .$UV %>% standardize()
    
    #compute
    list(
      CA = describe(.$CA),
      UV = describe(.$UV),
      cor = wtd.cors(select(., CA, UV)),
      cor_wt = wtd.cors(select(., CA, UV), w = .$pop_weight),
      ols = ols(CA ~ UV, data = .) %>% coef(),
      ols_wt = ols(CA ~ UV, data = ., weights = .$pop_weight) %>% coef(),
      ols_restd = ols(CA2 ~ UV2, data = .) %>% coef(),
      ols_wt_restd = ols(CA2 ~ UV2, data = ., weights = .$pop_weight) %>% coef()
      )
}
## $CA
##    vars    n   mean   sd median trimmed  mad  min max range  skew kurtosis
## X1    1 3099 0.0087 0.98  0.073   0.051 0.92 -4.5 2.9   7.4 -0.49     0.86
##       se
## X1 0.018
## 
## $UV
##    vars    n     mean sd median trimmed mad  min max range skew kurtosis    se
## X1    1 3099 -0.00013  1 -0.064  -0.058 1.1 -3.1 3.4   6.4 0.48     0.16 0.018
## 
## $cor
##       CA    UV
## CA  1.00 -0.33
## UV -0.33  1.00
## 
## $cor_wt
##       CA    UV
## CA  1.00 -0.37
## UV -0.37  1.00
## 
## $ols
## Intercept        UV 
##    0.0087   -0.3219 
## 
## $ols_wt
## Intercept        UV 
##     0.094    -0.335 
## 
## $ols_restd
## Intercept       UV2 
##  -1.1e-17  -3.3e-01 
## 
## $ols_wt_restd
## Intercept       UV2 
##     0.087    -0.340

State level

For comparison with Leon.

#UV only
(model_1s = ols(form_1, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##        CA        UV (weights) 
##         1         1         1 
## 
## Linear Regression Model
##  
##  ols(formula = form_1, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     19.57    R2       0.329    
##  sigma35.8247    d.f.            1    R2 adj   0.315    
##  d.f.      47    Pr(> chi2) 0.0000    g        0.546    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.1375 -0.4679  0.1638  0.4223  1.6269 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.0125 0.1091  0.11 0.9095  
##  UV        -0.5058 0.1053 -4.80 <0.0001 
## 
#demographics only
(model_2s = ols(form_2, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##              1              1              1              1              1 
##      (weights) 
##              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_2, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     27.40    R2       0.428    
##  sigma34.1821    d.f.            4    R2 adj   0.376    
##  d.f.      44    Pr(> chi2) 0.0000    g        0.753    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.98570 -0.47002  0.05129  0.50455  1.52791 
##  
##  
##                 Coef    S.E.   t     Pr(>|t|)
##  Intercept       1.0991 0.2857  3.85 0.0004  
##  frc_black      -4.1687 0.8797 -4.74 <0.0001 
##  frc_asian       5.6319 4.3970  1.28 0.2070  
##  frc_hispanic   -3.1272 0.8245 -3.79 0.0005  
##  frc_amerindian -6.5548 3.8188 -1.72 0.0931  
## 
#demo + UV
(model_3s = ols(form_3, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##              1              1              1              1              1 
##             UV      (weights) 
##              1              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_3, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     33.69    R2       0.497    
##  sigma32.4295    d.f.            5    R2 adj   0.439    
##  d.f.      43    Pr(> chi2) 0.0000    g        0.788    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.86540 -0.39426  0.04279  0.31334  1.40189 
##  
##  
##                 Coef    S.E.   t     Pr(>|t|)
##  Intercept       0.6179 0.3359  1.84 0.0727  
##  frc_black      -3.1410 0.9359 -3.36 0.0017  
##  frc_asian       1.0041 4.5871  0.22 0.8278  
##  frc_hispanic   -0.4505 1.3526 -0.33 0.7407  
##  frc_amerindian -3.8699 3.7883 -1.02 0.3127  
##  UV             -0.4202 0.1732 -2.43 0.0195  
## 
#climate
(model_4s = ols(form_4, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##              1              1              1              1              1 
##             UV       avg_temp          lat_z          lon_z    elevation_z 
##              1              1              1              1              1 
##      (weights) 
##              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_4, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     62.56    R2       0.721    
##  sigma25.3630    d.f.            9    R2 adj   0.657    
##  d.f.      39    Pr(> chi2) 0.0000    g        0.909    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.06879 -0.18211  0.05978  0.27205  0.90978 
##  
##  
##                 Coef    S.E.   t     Pr(>|t|)
##  Intercept       0.4332 0.2774  1.56 0.1265  
##  frc_black      -4.4850 0.9401 -4.77 <0.0001 
##  frc_asian      16.4494 4.9652  3.31 0.0020  
##  frc_hispanic   -1.3160 1.1108 -1.18 0.2433  
##  frc_amerindian -3.6634 3.1824 -1.15 0.2567  
##  UV             -0.7981 0.2461 -3.24 0.0024  
##  avg_temp        1.7740 0.5268  3.37 0.0017  
##  lat_z           1.1112 0.4665  2.38 0.0222  
##  lon_z           0.8534 0.1716  4.97 <0.0001 
##  elevation_z     0.9042 0.2147  4.21 0.0001  
## 
#spatial autocorrelation in residual?
d_state$model_4s_resid = resid(model_4s) %>% standardize()
SAC_measures(d_state %>% filter(!is.na(model_4s_resid)), "model_4s_resid", measures = "KNSNR")
#spatial lag
(model_5s = ols(form_5, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA      frc_black      frc_asian   frc_hispanic frc_amerindian 
##              1              1              1              1              1 
##             UV       avg_temp          lat_z          lon_z    elevation_z 
##              1              1              1              1              1 
##         CA_lag      (weights) 
##              1              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_5, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     62.91    R2       0.723    
##  sigma25.6011    d.f.           10    R2 adj   0.650    
##  d.f.      38    Pr(> chi2) 0.0000    g        0.910    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.9694 -0.2172  0.0677  0.2924  0.8829 
##  
##  
##                 Coef    S.E.   t     Pr(>|t|)
##  Intercept       0.4010 0.2866  1.40 0.1699  
##  frc_black      -4.3825 0.9687 -4.52 <0.0001 
##  frc_asian      17.3630 5.3030  3.27 0.0023  
##  frc_hispanic   -1.3673 1.1255 -1.21 0.2319  
##  frc_amerindian -3.5101 3.2254 -1.09 0.2833  
##  UV             -0.8181 0.2512 -3.26 0.0024  
##  avg_temp        1.8307 0.5426  3.37 0.0017  
##  lat_z           1.0687 0.4778  2.24 0.0313  
##  lon_z           0.8399 0.1751  4.80 <0.0001 
##  elevation_z     0.9240 0.2199  4.20 0.0002  
##  CA_lag          0.1103 0.2092  0.53 0.6011  
## 
#SAC in residuals now
d_state$model_5s_resid = resid(model_5s) %>% standardize()
SAC_measures(d_state %>% filter(!is.na(model_5s_resid)), "model_5s_resid", measures = "KNSNR")
#nonlinear UV
(model_6s = ols(form_6, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA         CA_lag      frc_black      frc_asian   frc_hispanic 
##              1              1              1              1              1 
## frc_amerindian             UV       avg_temp          lat_z          lon_z 
##              1              1              1              1              1 
##    elevation_z      (weights) 
##              1              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_6, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     65.08    R2       0.735    
##  sigma26.0919    d.f.           13    R2 adj   0.637    
##  d.f.      35    Pr(> chi2) 0.0000    g        0.927    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.85503 -0.27715  0.07099  0.26890  0.83428 
##  
##  
##                 Coef    S.E.    t     Pr(>|t|)
##  Intercept       0.3295  0.8160  0.40 0.6888  
##  CA_lag         -0.0253  0.2432 -0.10 0.9179  
##  frc_black      -4.5325  1.0000 -4.53 <0.0001 
##  frc_asian      14.4935  5.8904  2.46 0.0190  
##  frc_hispanic   -0.6532  1.3079 -0.50 0.6206  
##  frc_amerindian -3.9672  3.3849 -1.17 0.2491  
##  UV             -0.6174  0.8143 -0.76 0.4534  
##  UV'            -2.2669 27.4190 -0.08 0.9346  
##  UV''            5.4466 41.0354  0.13 0.8952  
##  UV'''          -5.9599 17.6434 -0.34 0.7375  
##  avg_temp        1.8167  0.5897  3.08 0.0040  
##  lat_z           1.4342  0.6023  2.38 0.0228  
##  lon_z           0.9417  0.2077  4.54 <0.0001 
##  elevation_z     0.8046  0.2487  3.24 0.0027  
## 
#UV x race
(model_7s = ols(form_7, data = d_state, weights = d_state$pop_weight))
## Frequencies of Missing Values Due to Each Variable
##             CA         CA_lag             UV      frc_black      frc_asian 
##              1              1              1              1              1 
##   frc_hispanic frc_amerindian       avg_temp          lat_z          lon_z 
##              1              1              1              1              1 
##    elevation_z      (weights) 
##              1              1 
## 
## Linear Regression Model
##  
##  ols(formula = form_7, data = d_state, weights = d_state$pop_weight)
##  
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs       49    LR chi2     83.29    R2       0.817    
##  sigma21.9833    d.f.           14    R2 adj   0.742    
##  d.f.      34    Pr(> chi2) 0.0000    g        0.974    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.31880 -0.23782  0.04668  0.23206  0.74668 
##  
##  
##                      Coef     S.E.   t     Pr(>|t|)
##  Intercept             0.5546 0.2743  2.02 0.0511  
##  CA_lag               -0.0809 0.2059 -0.39 0.6966  
##  UV                    0.9556 0.5687  1.68 0.1021  
##  frc_black            -4.9719 0.8734 -5.69 <0.0001 
##  frc_asian            13.0377 5.1196  2.55 0.0156  
##  frc_hispanic         -0.4472 1.2957 -0.35 0.7321  
##  frc_amerindian        1.7082 3.6537  0.47 0.6431  
##  avg_temp              0.1981 0.6405  0.31 0.7590  
##  lat_z                 0.3427 0.5266  0.65 0.5196  
##  lon_z                 0.6677 0.1867  3.58 0.0011  
##  elevation_z           0.2202 0.2728  0.81 0.4252  
##  UV * frc_black       -0.5415 1.4087 -0.38 0.7031  
##  UV * frc_asian      -10.8260 3.5178 -3.08 0.0041  
##  UV * frc_hispanic    -0.7802 0.7706 -1.01 0.3185  
##  UV * frc_amerindian -10.4274 4.0766 -2.56 0.0152  
## 
#effect sizes comparisons
#proportion R2
map_df(mget("model_" + 1:7 + "s"), model_partialR2s)
#numerical values
(model_5s_partialR2s = plot(anova(model_5s), what='proportion R2', pl = F))
##          lon_z      frc_black    elevation_z       avg_temp      frc_asian 
##         0.2319         0.2063         0.1779         0.1148         0.1081 
##             UV          lat_z   frc_hispanic frc_amerindian         CA_lag 
##         0.1069         0.0504         0.0149         0.0119         0.0028
sum(model_5s_partialR2s[c(race_vars2[-1])])
## [1] 0.34
sum(model_5s_partialR2s[climate_vars_model])
## [1] 0.68
(model_7s_partialR2s = plot(anova(model_7s), what='proportion R2', pl = F))
##           frc_black                  UV           frc_asian               lon_z 
##             0.21573             0.20987             0.09793             0.08414 
##      UV * frc_asian      frc_amerindian UV * frc_amerindian        frc_hispanic 
##             0.06227             0.05794             0.04302             0.01384 
##   UV * frc_hispanic         elevation_z               lat_z              CA_lag 
##             0.00674             0.00428             0.00278             0.00102 
##      UV * frc_black            avg_temp 
##             0.00097             0.00063
sum(model_7s_partialR2s[c(race_vars2[-1])])
## [1] 0.39
sum(model_7s_partialR2s[climate_vars_model])
## [1] 0.3
sum(model_7s_partialR2s[str_subset(names(model_7s_partialR2s), "\\*")])
## [1] 0.11
#summarize model results
list(model_1s, model_2s, model_3s, model_4s, model_5s, model_6s, model_7s) %>% 
  summarize_models()
#supplement models
list(
  ols(form_3b, data = d, weights = d$pop_weight)
) %>% 
  walk(function(x) {
    print(x)
    cat("Relative importance: partial R2s\n")
    print(model_partialR2s(x))
  })
## Frequencies of Missing Values Due to Each Variable
##        CA        UV frc_white (weights) 
##        90       118       100        84 
## 
## Linear Regression Model
##  
##  ols(formula = form_3b, data = d, weights = d$pop_weight)
##  
##  
##                    Model Likelihood    Discrimination    
##                          Ratio Test           Indexes    
##  Obs     3093    LR chi2     999.59    R2       0.276    
##  sigma12.5504    d.f.             2    R2 adj   0.276    
##  d.f.    3090    Pr(> chi2)  0.0000    g        0.550    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.8842 -0.7157 -0.1845  0.3050  2.8013 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept -1.0168 0.0481 -21.15 <0.0001 
##  UV        -0.1241 0.0165  -7.53 <0.0001 
##  frc_white  1.6926 0.0693  24.41 <0.0001 
##  
## Relative importance: partial R2s
## # A tibble: 1 x 2
##   frc_white     UV
##       <dbl>  <dbl>
## 1     0.505 0.0481

Standardized

list(
  ols(form_1, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_2, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_3, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_4, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_5, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_6, data = d_state_std, weights = d_state_std$pop_weight),
  ols(form_7, data = d_state_std, weights = d_state_std$pop_weight)
) %>% summarize_models()

Pseudo-states

Examples

#algorithm
assign_random_states = function(x, x_neighbors = NULL, n_states = 48, var = "new_state", verbose = F) {
  
  #pick random counties as seeds
  #set state status
  var_s = sym(var)
  x$..rowi = 1:nrow(x)
  x[[var]] = NA
  x[[var]][sample(1:nrow(x), size = n_states)] = 1:48
  
  #loop while adding counties until none left
  #construct neighbor matrix if needed
  if (is.null(x_neighbors)) x_neighbors = spdep::poly2nb(as(x, "Spatial"), queen = FALSE)
  
  #states without free neighbors remaining
  states_nofree_nbs = c()
  
  #itooliketolivedangerously.png
  stuck_count = 0
  last_unit = -1
  states_tried = c()
  while (T) {
    #status
    if (verbose) message(str_glue("Units done {sum(!is.na(x[[var]]))}"))
    
    # browser()

    #check if done
    #all units assigned
    if (!any(is.na(x[[var]]))) {
      break
    }
    #no states have any county to expand to
    if (length(states_nofree_nbs) == n_states) {
      break
    }
    
    #pick a random state to expand (with free units remaining)
    state_i = sample(setdiff(1:n_states, states_nofree_nbs), size = 1)
    states_tried = c(states_tried, state_i)
    
    #stuck?
    if (last_unit == state_i) {
      stuck_count = stuck_count + 1
    }
    if (stuck_count >= 10) browser()
    
    #units in state
    state_i_counties = x %>% filter(!!var_s == state_i) %>% as.data.frame() %>% pull(..rowi)
    
    #find neighboring units
    state_i_neighbors = x_neighbors[state_i_counties] %>% unlist() %>% unique()
    
    #filter to those not already owned
    state_i_neighbors = x[state_i_neighbors, ] %>% filter(is.na(!!var_s))
    
    #any of them not taken?
    #if all taken, try again
    if (nrow(state_i_neighbors) == 0) {
      #add state to list of those without more free neighbors
      states_nofree_nbs = c(states_nofree_nbs, state_i)
      next
    }
    
    #assign take a random one
    x[[var]][sample(state_i_neighbors$..rowi, size = 1)] = state_i
    
    #last unit
    last_unit = state_i
    #stuck count
    stuck_count = 0
  }
  
  x
}

#without outlying states
county_sf_data_no = county_sf_data %>% 
    # filter(!State.x %in% c("Alaska", "Hawaii")) %>% 
    #remove Alaska and Hawaii, and outlying islands
    filter(
      !STATEFP %in% c("02", "15"),
      #tricky, but this removes them
      as.numeric(as.character(STATEFP)) <= 56
    )

#find neighbors
#https://github.com/r-spatial/sf/issues/234
#this method is slower
county_sf_data_no_nbs = spdep::poly2nb(as(county_sf_data_no, "Spatial"), queen = FALSE)
## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape
#fast but includes corners and self and messes up results for us
# county_sf_data_no_nbs2 = st_intersects(county_sf_data_no, county_sf_data_no)

#inspect
county_sf_data_no_nbs
## Neighbour list object:
## Number of regions: 3108 
## Number of nonzero links: 17744 
## Percentage nonzero weights: 0.18 
## Average number of links: 5.7
#table of counts of neighbors
county_sf_data_no_nbs_counts = county_sf_data_no_nbs %>% map_int(function(x) {
  if (identical(x, 0L)) return(0L)
  length(x)
})
(county_sf_data_no_nbs_counts_table = county_sf_data_no_nbs_counts %>% table())
## .
##    1    2    3    4    5    6    7    8    9   10   11   13 
##   13   26   68  343  860 1066  515  166   40    9    1    1
#they are all cities in Virginia which are enclaved but their exclaves are missing due to no IQ data
#because the county they are enclaved in does not have any data and were filtered earlier
#so we need to remove them as well
county_sf_data_no = county_sf_data_no[county_sf_data_no_nbs %>% map_lgl(~!identical(., 0L)), ]
county_sf_data_no_nbs = spdep::poly2nb(as(county_sf_data_no, "Spatial"), queen = FALSE)

#assign new states
set.seed(1)
new_states_example = assign_random_states(county_sf_data_no, county_sf_data_no_nbs, verbose = F)

#merge into state geometries
new_states_example_states = tibble(
  new_state = new_states_example$new_state %>% unique()
)
st_geometry(new_states_example_states) = st_union_by(new_states_example$geometry, new_states_example$new_state)

#plot states
new_states_example_states %>% 
  ggplot() +
  #plot by color, with borders since some are hard to see
  geom_sf(aes(fill = factor(new_state)), lwd = 1) +
  scale_fill_discrete("State", guide = F) +
  #add a label on the center
  geom_sf_text(aes(label = new_state)) +
  theme_classic()

GG_save("figs/example_new_states.png")

#real states
#merge
real_states = tibble(
  real_state = county_sf_data_no$State.x %>% unique()
)
st_geometry(real_states) = st_union_by(county_sf_data_no$geometry, county_sf_data_no$State.x)

real_states %>% 
  ggplot() +
  #plot by color, with borders since some are hard to see
  geom_sf(aes(fill = factor(real_state)), lwd = 1) +
  scale_fill_discrete("State", guide = F) +
  #add a label on the center
  geom_sf_text(aes(label = real_state), check_overlap = T, size = 3) +
  theme_classic()
## Warning: Removed 1 rows containing missing values (geom_text).

GG_save("figs/real_states.png")
## Warning: Removed 1 rows containing missing values (geom_text).

Analysis

We generate many new state maps as above. We save the cache of this because it takes a few hours to complete.

#settings
set.seed(1)
random_states_n = 1000

#generate
random_states = cache_object({
  plyr::llply(1:random_states_n, .parallel = T, .fun = function(i) {
    new_states = assign_random_states(county_sf_data_no, county_sf_data_no_nbs, verbose = F)
    new_states$new_state
  })
}, filename = "cache/new_states.rds", renew = F)
## Cache found, reading object from disk
#aggregate datasets
random_states_datasets = cache_object({plyr::llply(random_states, .parallel = T, .fun = function(x) {
  #add vector
  county_sf_data_no %>% 
    mutate(new_state = x) %>% 
    #aggregate
    plyr::ddply("new_state", .fun = function(dd) {
      # browser()
      #weighted mean of main variables
      #sum population
      y = tibble(
        population = sum(dd$Total.Population),
        pop_weight = sqrt(population)
      )
      
      #loop vars
      for (v in c(main_vars, "lon")) y[[v]] = wtd_mean(dd[[v]], w = dd$Total.Population)
      
      y
  })
})}, filename = "cache/random_states_datasets.rds")
## Cache found, reading object from disk
#fit regressions
random_states_results = plyr::ldply(seq_along(random_states_datasets), .fun = function(i) {
  #fetch
  i_state = random_states_datasets[[i]]
  
  #have to do obnoxious stuff with the formula because it has the global env bound
  #formula.tools:::as.character.formula is convenience function to avoid issues with long formulas and base R
  
  #run regressions
  #UV only
  model_1 = ols(as.formula(formula.tools:::as.character.formula(form_1)), data = i_state, weights = i_state$pop_weight)
  
  #demographics only
  model_2 = ols(as.formula(formula.tools:::as.character.formula(form_2)), data = i_state, weights = i_state$pop_weight)
  
  #demo + UV
  model_3 = ols(as.formula(formula.tools:::as.character.formula(form_3)), data = i_state, weights = i_state$pop_weight)
  
  #white + UV
  model_3b = ols(as.formula(formula.tools:::as.character.formula(form_3b)), data = i_state, weights = i_state$pop_weight)
  
  #save stats
  bind_rows(
    model_1 %>% 
      summary.lm() %>% 
      tidy() %>% 
      mutate(model = "1",
             adj_r2 = model_1 %>% summary.lm() %>% 
               .$adj.r.squared),
    
    model_2 %>% 
      summary.lm() %>% 
      tidy() %>% 
      mutate(model = "2",
             adj_r2 = model_2 %>% 
               summary.lm() %>% 
               .$adj.r.squared, 
             
             partial_r2 = c(NA, rms:::plot.anova.rms(anova(model_2), what = 'partial R2', pl = F, sort = "none"))
                                                   ),
    model_3 %>% 
      summary.lm() %>% 
      tidy() %>% 
      mutate(model = "3", adj_r2 = model_3 %>% summary.lm() %>% 
               .$adj.r.squared,
             partial_r2 = c(NA, rms:::plot.anova.rms(anova(model_3), what = 'partial R2', pl = F, sort = "none"))
                                                   ),
    model_3b %>% 
      summary.lm() %>% 
      tidy() %>% 
      mutate(model = "3b",
             adj_r2 = model_3b %>% summary.lm() %>% 
               .$adj.r.squared,
             partial_r2 = c(NA, rms:::plot.anova.rms(anova(model_3b), what = 'partial R2', pl = F, sort = "none")))
  ) %>% 
    mutate(model = ordered(model),
           i = i)
})

#plots
random_states_results %>% 
  filter(term == "Intercept") %>% 
  ggplot(aes(model, adj_r2)) +
  geom_boxplot() +
  theme_bw()

GG_save("figs/pseudostates_model_r2s.png")

#partial_r2
#all demos
random_states_results %>% 
  filter(term != "Intercept", model == 3) %>% 
  ggplot(aes(term, partial_r2, fill = term)) +
  geom_boxplot() +
  theme_bw()

GG_save("figs/pseudostates_model3_partialR2s.png")

#demos vs. UV
random_states_results %>% 
  filter(term != "Intercept", model == 3) %>% 
  select(i, term, partial_r2) %>% 
  spread(key = term, value = partial_r2) %>% 
  mutate(
    demo = frc_black + frc_asian + frc_hispanic + frc_amerindian,
    ratio = demo / UV,
    demo_greater = (demo > UV) %>% as.numeric()
  ) %>% 
  select(demo, UV, ratio, demo_greater) %>% 
  describe()
#white vs. UV
random_states_results %>% 
  filter(term != "Intercept", model == "3b") %>% 
  ggplot(aes(term, partial_r2, fill = term)) +
  geom_boxplot() +
  theme_bw()

GG_save("figs/pseudostates_model3b_partialR2s.png")

Gap analysis

Within county gaps.

#gap sizes within units
#white-black
seda_CS %>% 
  filter(subgroup == "wbg") %>% 
  {
    wtd_mean(.$mn_avg_ol) %>% print()
    wtd_mean(.$mn_avg_ol, w = 1/.$mn_avg_ol_se) %>% print()
  }
## [1] 0.53
## [1] 0.6
#white hispanic
seda_CS %>% 
  filter(subgroup == "whg") %>% 
  {
    wtd_mean(.$mn_avg_ol) %>% print()
    wtd_mean(.$mn_avg_ol, 1/.$mn_avg_ol_se) %>% print()
  }
## [1] 0.36
## [1] 0.44
#white asian
seda_CS %>% 
  filter(subgroup == "wag") %>% 
  {
    wtd_mean(.$mn_avg_ol) %>% print()
    wtd_mean(.$mn_avg_ol, .$mn_avg_ol_se) %>% print()
  }
## [1] -0.17
## [1] -0.16
#gap relationships
d %>% 
  select(FIPS, CA, CA_asian:CA_white, CA_d_bw:CA_d_aw, SES_all:SES_d_hw, UV, frc_white, dem16_frac, pop_weight) %>% 
  select(-FIPS) %>% 
  {
    combine_upperlower(
      wtd.cors(., w = .$pop_weight),
      wtd.cors(.),
      .diag = 1
    ) %>% 
      .[-nrow(.), -ncol(.)]
  }
##                   CA CA_asian CA_black CA_hispanic CA_white CA_d_bw CA_d_hw
## CA            1.0000  4.8e-01    0.647       0.607    0.687   0.129   0.150
## CA_asian      0.4473  1.0e+00    0.343       0.386    0.587   0.252   0.224
## CA_black      0.6717  3.1e-01    1.000       0.588    0.405  -0.457  -0.089
## CA_hispanic   0.5978  3.9e-01    0.540       1.000    0.350  -0.165  -0.481
## CA_white      0.7570  5.3e-01    0.470       0.421    1.000   0.626   0.649
## CA_d_bw       0.1655  2.2e-01   -0.458      -0.088    0.565   1.000   0.710
## CA_d_hw       0.1909  1.5e-01   -0.024      -0.498    0.569   0.608   1.000
## CA_d_aw      -0.0067 -7.9e-01   -0.047      -0.159    0.102   0.147   0.241
## SES_all       0.7348  2.5e-01    0.455       0.243    0.581   0.221   0.346
## SES_black     0.4349  2.5e-01    0.496       0.236    0.395  -0.063   0.176
## SES_hispanic  0.2843  2.4e-01    0.222       0.278    0.285   0.089   0.021
## SES_white     0.3689  3.3e-01    0.198       0.099    0.646   0.491   0.541
## SES_d_bw     -0.2673 -7.1e-02   -0.452      -0.218   -0.074   0.348   0.127
## SES_d_hw     -0.0245  5.2e-02   -0.076      -0.221    0.196   0.271   0.386
## UV           -0.3271 -6.5e-02   -0.252      -0.167   -0.101   0.055   0.015
## frc_white     0.6020 -6.5e-05    0.397       0.321    0.029  -0.296  -0.259
## dem16_frac   -0.2049  2.4e-01   -0.117      -0.080    0.224   0.383   0.327
##              CA_d_aw SES_all SES_black SES_hispanic SES_white SES_d_bw SES_d_hw
## CA           -0.0646   0.748     0.419        0.337     0.401  -0.2372   -0.039
## CA_asian     -0.7572   0.331     0.308        0.288     0.436  -0.0812    0.068
## CA_black     -0.1003   0.432     0.554        0.269     0.209  -0.5343   -0.108
## CA_hispanic  -0.1883   0.249     0.242        0.352     0.083  -0.2459   -0.319
## CA_white      0.0820   0.605     0.413        0.326     0.756   0.0011    0.281
## CA_d_bw       0.1762   0.242    -0.072        0.084     0.566   0.4577    0.365
## CA_d_hw       0.2327   0.363     0.192        0.017     0.645   0.2029    0.523
## CA_d_aw       1.0000   0.060    -0.076       -0.083     0.097   0.1396    0.147
## SES_all       0.1339   1.000     0.646        0.559     0.728  -0.3311   -0.064
## SES_black    -0.0274   0.646     1.000        0.553     0.558  -0.8427   -0.149
## SES_hispanic -0.0371   0.502     0.496        1.000     0.498  -0.3538   -0.685
## SES_white     0.1514   0.726     0.495        0.461     1.000  -0.0571    0.223
## SES_d_bw      0.0884  -0.360    -0.865       -0.314    -0.048   1.0000    0.346
## SES_d_hw      0.1159  -0.064    -0.162       -0.717     0.163   0.3055    1.000
## UV           -0.0129  -0.190    -0.022        0.048     0.076  -0.0167   -0.049
## frc_white    -0.0517   0.442     0.118        0.065    -0.226  -0.2232   -0.209
## dem16_frac    0.0025  -0.196     0.038        0.011     0.292   0.1752    0.254
##                   UV frc_white dem16_frac
## CA           -0.3674     0.515    -0.1020
## CA_asian     -0.0542    -0.069     0.2629
## CA_black     -0.1711     0.307    -0.1090
## CA_hispanic  -0.1609     0.323    -0.1527
## CA_white     -0.1045    -0.156     0.4479
## CA_d_bw       0.0156    -0.387     0.5334
## CA_d_hw       0.0067    -0.399     0.5473
## CA_d_aw      -0.0300    -0.049     0.0550
## SES_all      -0.1952     0.305     0.0039
## SES_black     0.1162    -0.050     0.1341
## SES_hispanic  0.1009     0.019     0.0438
## SES_white     0.0580    -0.349     0.5035
## SES_d_bw     -0.1500    -0.140     0.1836
## SES_d_hw     -0.1061    -0.283     0.3808
## UV            1.0000    -0.530    -0.0393
## frc_white    -0.5315     1.000    -0.6488
## dem16_frac   -0.0913    -0.583     1.0000
#pairwise sample sizes
d %>% 
  select(CA, CA_asian:CA_white, CA_d_bw:CA_d_aw, SES_all:SES_d_hw, UV, frc_white, dem16_frac) %>% 
  pairwiseCount()
##                CA CA_asian CA_black CA_hispanic CA_white CA_d_bw CA_d_hw
## CA           3134     1483     2135        2647     3112    2116    2639
## CA_asian     1483     1483     1394        1475     1483    1393    1475
## CA_black     2135     1394     2135        2015     2120    2116    2012
## CA_hispanic  2647     1475     2015        2647     2643    2011    2639
## CA_white     3112     1483     2120        2643     3112    2116    2639
## CA_d_bw      2116     1393     2116        2011     2116    2116    2009
## CA_d_hw      2639     1475     2012        2639     2639    2009    2639
## CA_d_aw      1483     1483     1394        1475     1483    1393    1475
## SES_all      3124     1473     2125        2637     3102    2106    2629
## SES_black    2108     1373     2108        1989     2093    2091    1986
## SES_hispanic 2624     1465     1998        2624     2620    1994    2618
## SES_white    3099     1473     2109        2631     3099    2106    2627
## SES_d_bw     2092     1373     2092        1988     2092    2091    1986
## SES_d_hw     2618     1465     1997        2618     2618    1994    2616
## UV           3099     1465     2125        2629     3080    2106    2621
## frc_white    3124     1473     2125        2637     3102    2106    2629
## dem16_frac   3103     1469     2129        2633     3084    2110    2625
##              CA_d_aw SES_all SES_black SES_hispanic SES_white SES_d_bw SES_d_hw
## CA              1483    3124      2108         2624      3099     2092     2618
## CA_asian        1483    1473      1373         1465      1473     1373     1465
## CA_black        1394    2125      2108         1998      2109     2092     1997
## CA_hispanic     1475    2637      1989         2624      2631     1988     2618
## CA_white        1483    3102      2093         2620      3099     2092     2618
## CA_d_bw         1393    2106      2091         1994      2106     2091     1994
## CA_d_hw         1475    2629      1986         2618      2627     1986     2616
## CA_d_aw         1483    1473      1373         1465      1473     1373     1465
## SES_all         1473    3124      2108         2624      3099     2092     2618
## SES_black       1373    2108      2108         1982      2092     2092     1981
## SES_hispanic    1465    2624      1982         2624      2618     1981     2618
## SES_white       1473    3099      2092         2618      3099     2092     2618
## SES_d_bw        1373    2092      2092         1981      2092     2092     1981
## SES_d_hw        1465    2618      1981         2618      2618     1981     2618
## UV              1465    3093      2102         2610      3071     2086     2604
## frc_white       1473    3124      2108         2624      3099     2092     2618
## dem16_frac      1469    3094      2103         2611      3072     2087     2605
##                UV frc_white dem16_frac
## CA           3099      3124       3103
## CA_asian     1465      1473       1469
## CA_black     2125      2125       2129
## CA_hispanic  2629      2637       2633
## CA_white     3080      3102       3084
## CA_d_bw      2106      2106       2110
## CA_d_hw      2621      2629       2625
## CA_d_aw      1465      1473       1469
## SES_all      3093      3124       3094
## SES_black    2102      2108       2103
## SES_hispanic 2610      2624       2611
## SES_white    3071      3099       3072
## SES_d_bw     2086      2092       2087
## SES_d_hw     2604      2618       2605
## UV           3106      3093       3106
## frc_white    3093      3124       3094
## dem16_frac   3106      3094       3111

Within SIRE regressions

Alterative method is to regression within SIRE.

#regression function
do_regs = function(group, data = d, y = "CA") {
  # browser()

  #models
  # wts = data[[str_glue("{str_to_sentence(y)}_pop")]] %>% sqrt()
  wts = data[[str_glue("pop_{group}")]] %>% sqrt()
  
  mods = list(
    ols(as.formula(str_glue("{y}_{group} ~ UV")), data = data, weights = wts),
    
    ols(as.formula(str_glue("{y}_{group} ~ {y}_{group}_lag + UV + avg_temp")), data = data, weights = wts),
    
    ols(as.formula(str_glue("{y}_{group} ~ {str_c(climate_vars_model, collapse = ' + ')}")), data = data, weights = wts),

    ols(as.formula(str_glue("{y}_{group} ~ {str_c(climate_vars_model, collapse = ' + ')} + {y}_{group}_lag")), data = data, weights = wts),
    
    ols(as.formula(str_glue("{y}_{group} ~ {str_c(c(climate_vars_model, water_vars), collapse = ' + ')} + {y}_{group}_lag")), data = data, weights = wts)
  )
  
  mods
}

#do regs and summarize
do_regs("black") %>% summarize_models()
do_regs("hispanic") %>% summarize_models()
do_regs("asian") %>% summarize_models()
do_regs("white") %>% summarize_models()
do_regs("all") %>% summarize_models() #just for comparison
#state level
do_regs("black", data = d_state) %>% summarize_models()
do_regs("hispanic", data = d_state) %>% summarize_models()
do_regs("asian", data = d_state) %>% summarize_models()
do_regs("white", data = d_state) %>% summarize_models()
do_regs("all", data = d_state) %>% summarize_models() #just for comparison

Standardized

#do regs and summarize
do_regs("black", data = d_std) %>% summarize_models()
do_regs("hispanic", data = d_std) %>% summarize_models()
do_regs("asian", data = d_std) %>% summarize_models()
do_regs("white", data = d_std) %>% summarize_models()
do_regs("all", data = d_std) %>% summarize_models() #just for comparison
#state level
do_regs("black", data = d_state_std) %>% summarize_models()
do_regs("hispanic", data = d_state_std) %>% summarize_models()
do_regs("asian", data = d_state_std) %>% summarize_models()
do_regs("white", data = d_state_std) %>% summarize_models()
do_regs("all", data = d_state_std) %>% summarize_models() #just for comparison

Diseases

#correlations
combine_upperlower(
  #no weights
  d %>% select(CA_all:CA_white, !!disease_vars) %>% wtd.cors(),

  #weights
  d %>% select(CA_all:CA_white, !!disease_vars) %>% wtd.cors(weight = d$pop_weight)
)
##                                CA_all CA_asian CA_black CA_hispanic CA_white
## CA_all                             NA  0.44730   0.6717       0.598    0.757
## CA_asian                        0.478       NA   0.3058       0.389    0.529
## CA_black                        0.647  0.34316       NA       0.540    0.470
## CA_hispanic                     0.607  0.38592   0.5881          NA    0.421
## CA_white                        0.687  0.58697   0.4050       0.350       NA
## Diarrheal_diseases_z            0.024  0.09960  -0.0081       0.061   -0.043
## Hepatitis_z                    -0.352 -0.14011  -0.2081      -0.265   -0.072
## HIV_AIDS_z                     -0.335  0.11489  -0.2141      -0.076    0.122
## Lower_respiratory_infections_z -0.358 -0.08421  -0.2033      -0.043   -0.367
## Meningitis_z                   -0.632 -0.11404  -0.3970      -0.173   -0.288
## Tuberculosis_z                 -0.575  0.00068  -0.3409      -0.208   -0.068
##                                Diarrheal_diseases_z Hepatitis_z HIV_AIDS_z
## CA_all                                      -0.0461      -0.278     -0.353
## CA_asian                                     0.1739      -0.126      0.103
## CA_black                                     0.0404      -0.168     -0.260
## CA_hispanic                                  0.0853      -0.177     -0.060
## CA_white                                    -0.0864      -0.117     -0.054
## Diarrheal_diseases_z                             NA       0.102      0.099
## Hepatitis_z                                  0.0169          NA      0.583
## HIV_AIDS_z                                   0.0225       0.503         NA
## Lower_respiratory_infections_z               0.2242       0.013      0.193
## Meningitis_z                                 0.1287       0.295      0.556
## Tuberculosis_z                              -0.0075       0.407      0.595
##                                Lower_respiratory_infections_z Meningitis_z
## CA_all                                                 -0.401       -0.684
## CA_asian                                               -0.033       -0.077
## CA_black                                               -0.274       -0.471
## CA_hispanic                                            -0.065       -0.179
## CA_white                                               -0.369       -0.371
## Diarrheal_diseases_z                                    0.291        0.203
## Hepatitis_z                                             0.123        0.308
## HIV_AIDS_z                                              0.290        0.558
## Lower_respiratory_infections_z                             NA        0.639
## Meningitis_z                                            0.608           NA
## Tuberculosis_z                                          0.340        0.722
##                                Tuberculosis_z
## CA_all                                -0.5899
## CA_asian                              -0.0051
## CA_black                              -0.4057
## CA_hispanic                           -0.1801
## CA_white                              -0.1679
## Diarrheal_diseases_z                   0.0821
## Hepatitis_z                            0.2857
## HIV_AIDS_z                             0.5209
## Lower_respiratory_infections_z         0.3906
## Meningitis_z                           0.7441
## Tuberculosis_z                             NA

Predict SES instead

Requested by reviewer. These are the models 1-7 from “Main regressions” above, but swap outcome to SES_all.

#update forms
SES_y_forms = mget("form_" + 1:7) %>% 
  #change dependent to 
  map(~update(., SES_all ~ .))

#fits
SES_y_forms %>% 
  map(function(x) {
    ols(x, data = d, weights = d$pop_weight)
  }) %>% 
  summarize_models()

Standardized

#fits
SES_y_forms %>% 
  map(function(x) {
    ols(x, data = d_std, weights = d_std$pop_weight)
  }) %>% 
  summarize_models()

Within SIRE

Predict SES within SIRE. Note there is no SES_Asian variable in CEDA.

#do regs and summarize
do_regs("black", y = "SES") %>% summarize_models()
do_regs("hispanic", y = "SES") %>% summarize_models()
#do_regs("asian", y = "SES") %>% summarize_models()
do_regs("white", y = "SES") %>% summarize_models()
do_regs("all", y = "SES") %>% summarize_models() #just for comparison
#state level
do_regs("black", data = d_state, y = "SES") %>% summarize_models()
do_regs("hispanic", data = d_state, y = "SES") %>% summarize_models()
# do_regs("asian", data = d_state, y = "SES") %>% summarize_models()
do_regs("white", data = d_state, y = "SES") %>% summarize_models()
do_regs("all", data = d_state, y = "SES") %>% summarize_models() #just for comparison

Standardized

#do regs and summarize
do_regs("black", y = "SES", data = d_std) %>% summarize_models()
do_regs("hispanic", y = "SES", data = d_std) %>% summarize_models()
#do_regs("asian", y = "SES") %>% summarize_models()
do_regs("white", y = "SES", data = d_std) %>% summarize_models()
do_regs("all", y = "SES", data = d_std) %>% summarize_models() #just for comparison
#state level
do_regs("black", data = d_state_std, y = "SES") %>% summarize_models()
do_regs("hispanic", data = d_state_std, y = "SES") %>% summarize_models()
# do_regs("asian", data = d_state_std, y = "SES") %>% summarize_models()
do_regs("white", data = d_state_std, y = "SES") %>% summarize_models()
do_regs("all", data = d_state_std, y = "SES") %>% summarize_models() #just for comparison

Maps / plots

Make a lot of maps and bivariate scatterplots.

#plot maps
#we plot one for each main variable for future use
var_to_plot = tibble(
  var = main_vars[-1] %>% str_subset("_z", negate = T) %>% setdiff(c("lat", "lon")),
  nice_name = c("Cognitive ability (All)",
                "Cognitive ability (Asian)",
                "Cognitive ability (Black)",
                "Cognitive ability (Hispanic)",
                "Cognitive ability (White)",
                "Socioeconomic status (All)",
                "Socioeconomic status (Black)",
                "Socioeconomic status (Hispanic)",
                "Socioeconomic status (White)",
                "Proportion White",
                "Proportion Black",
                "Proportion Asian",
                "Proportion Hispanic",
                "Proportion Amerindian",
                "Ultraviolet Radiation",
                "Average temperature",
                "Elevation", 
                "Fluoride concentration in water",
                "Lead index")
)

#plot!
for (i in seq_along_rows(var_to_plot)) {
  i_var = var_to_plot$var[i]
  message(i_var)
  i_var2 = as.symbol(i_var)
  
  #limits
  lims = c(NA, NA)
  local_data = county_sf_data

  if (str_detect(i_var, "^(SES|CA)")) {
    lims = c(-3, 3)
    #winsorize data to limits
    local_data[[i_var]] = local_data[[i_var]] %>% winsorise(lower = -3, 3)
  }
  if (i_var %in% race_vars2) {
    lims = c(0, 1)
  }
  
  #plot
  local_data %>% 
    # filter(!State.x %in% c("Alaska", "Hawaii")) %>% 
    #remove Alaska and Hawaii, and outlying islands
    filter(
      !STATEFP %in% c("02", "15"),
      #tricky, but this removes them
      as.numeric(as.character(STATEFP)) <= 56
      ) %>% 
    ggplot() +
    geom_sf(aes(fill = !!i_var2), lwd = 0) +
    scale_fill_gradient2(var_to_plot$nice_name[i], low = "red", high = "blue", mid = "green", limits = lims, na.value = "lightgrey") +
    theme_classic() ->
    i_plot
  
  GG_save(plot = i_plot, str_glue("figs/maps/{i_var}_map.png"))
}
## CA_all
## CA_asian
## CA_black
## CA_hispanic
## CA_white
## SES_all
## SES_black
## SES_hispanic
## SES_white
## frc_white
## frc_black
## frc_asian
## frc_hispanic
## frc_amerindian
## UV
## avg_temp
## elevation
## fluoride_conc
## lead_index
#bivariate scatterplots
#pairs
pairs_to_plot = gtools::combinations(nrow(var_to_plot), r = 2, v = var_to_plot$var)

for (i in 1:nrow(pairs_to_plot)) {
  message(str_glue("{i} of {nrow(pairs_to_plot)}"))
  
  #var names
  i_var1 = pairs_to_plot[i, 1]
  i_var2 = pairs_to_plot[i, 2]
  
  #weights are tricky, we use group weights when possible
  #and then harmonic combination when 2 group variables
  i_groups = tibble(
    asian = str_detect(c(i_var1, i_var2), "_asian") %>% any(),
    black = str_detect(c(i_var1, i_var2), "_black") %>% any(),
    hispanic = str_detect(c(i_var1, i_var2), "_hispanic") %>% any(),
    white = str_detect(c(i_var1, i_var2), "_white") %>% any()
  )
  
  #weights defaults to standard, but changes to group specific if needed
  i_wt = d$pop_weight
  
  #if one group
  if (sum(as.matrix(i_groups)) == 1) {
    i_group = which(as.matrix(i_groups))
    
    i_wt = switch(i_group,
      "1" = d$pop_asian %>% sqrt(),
      "2" = d$pop_black %>% sqrt(),
      "3" = d$pop_hispanic %>% sqrt(),
      "4" = d$pop_white %>% sqrt()
    )
  }
  #if two
  if (sum(as.matrix(i_groups)) == 2) {
    i_group = which(as.matrix(i_groups)) %>% as.numeric()
    
    #if then for each of the 6 combos
    if (identical(i_group, c(1, 2))) i_wt = matrix(c(d$pop_asian %>% sqrt(),
                                                     d$pop_black %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
    
    if (identical(i_group, c(1, 3))) i_wt = matrix(c(d$pop_asian %>% sqrt(),
                                                     d$pop_hispanic %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
    
    if (identical(i_group, c(1, 4))) i_wt = matrix(c(d$pop_asian %>% sqrt(),
                                                     d$pop_white %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
    
    if (identical(i_group, c(2, 3))) i_wt = matrix(c(d$pop_black %>% sqrt(),
                                                     d$pop_hispanic %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
    
    if (identical(i_group, c(2, 4))) i_wt = matrix(c(d$pop_black %>% sqrt(),
                                                     d$pop_white %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
    
    if (identical(i_group, c(3, 4))) i_wt = matrix(c(d$pop_hispanic %>% sqrt(),
                                                     d$pop_white %>% sqrt()),
                                                   ncol = 2) %>% t() %>% harmonic.mean()
  }
  
  
  #plot with good labels and correct weights!
  #we hope
  GG_scatter(d, i_var1, i_var2, weights = i_wt) +
    xlab(var_to_plot$nice_name[var_to_plot$var==i_var1]) + 
    ylab(var_to_plot$nice_name[var_to_plot$var==i_var2])
  
  GG_save(str_glue("figs/pairs/{i_var1} x {i_var2}.png"))
}
## 1 of 171
## `geom_smooth()` using formula 'y ~ x'
## 2 of 171
## `geom_smooth()` using formula 'y ~ x'
## 3 of 171
## `geom_smooth()` using formula 'y ~ x'
## 4 of 171
## `geom_smooth()` using formula 'y ~ x'
## 5 of 171
## `geom_smooth()` using formula 'y ~ x'
## 6 of 171
## `geom_smooth()` using formula 'y ~ x'
## 7 of 171
## `geom_smooth()` using formula 'y ~ x'
## 8 of 171
## `geom_smooth()` using formula 'y ~ x'
## 9 of 171
## `geom_smooth()` using formula 'y ~ x'
## 10 of 171
## `geom_smooth()` using formula 'y ~ x'
## 11 of 171
## `geom_smooth()` using formula 'y ~ x'
## 12 of 171
## `geom_smooth()` using formula 'y ~ x'
## 13 of 171
## `geom_smooth()` using formula 'y ~ x'
## 14 of 171
## `geom_smooth()` using formula 'y ~ x'
## 15 of 171
## `geom_smooth()` using formula 'y ~ x'
## 16 of 171
## `geom_smooth()` using formula 'y ~ x'
## 17 of 171
## `geom_smooth()` using formula 'y ~ x'
## 18 of 171
## `geom_smooth()` using formula 'y ~ x'
## 19 of 171
## `geom_smooth()` using formula 'y ~ x'
## 20 of 171
## `geom_smooth()` using formula 'y ~ x'
## 21 of 171
## `geom_smooth()` using formula 'y ~ x'
## 22 of 171
## `geom_smooth()` using formula 'y ~ x'
## 23 of 171
## `geom_smooth()` using formula 'y ~ x'
## 24 of 171
## `geom_smooth()` using formula 'y ~ x'
## 25 of 171
## `geom_smooth()` using formula 'y ~ x'
## 26 of 171
## `geom_smooth()` using formula 'y ~ x'
## 27 of 171
## `geom_smooth()` using formula 'y ~ x'
## 28 of 171
## `geom_smooth()` using formula 'y ~ x'
## 29 of 171
## `geom_smooth()` using formula 'y ~ x'
## 30 of 171
## `geom_smooth()` using formula 'y ~ x'
## 31 of 171
## `geom_smooth()` using formula 'y ~ x'
## 32 of 171
## `geom_smooth()` using formula 'y ~ x'
## 33 of 171
## `geom_smooth()` using formula 'y ~ x'
## 34 of 171
## `geom_smooth()` using formula 'y ~ x'
## 35 of 171
## `geom_smooth()` using formula 'y ~ x'
## 36 of 171
## `geom_smooth()` using formula 'y ~ x'
## 37 of 171
## `geom_smooth()` using formula 'y ~ x'
## 38 of 171
## `geom_smooth()` using formula 'y ~ x'
## 39 of 171
## `geom_smooth()` using formula 'y ~ x'
## 40 of 171
## `geom_smooth()` using formula 'y ~ x'
## 41 of 171
## `geom_smooth()` using formula 'y ~ x'
## 42 of 171
## `geom_smooth()` using formula 'y ~ x'
## 43 of 171
## `geom_smooth()` using formula 'y ~ x'
## 44 of 171
## `geom_smooth()` using formula 'y ~ x'
## 45 of 171
## `geom_smooth()` using formula 'y ~ x'
## 46 of 171
## `geom_smooth()` using formula 'y ~ x'
## 47 of 171
## `geom_smooth()` using formula 'y ~ x'
## 48 of 171
## `geom_smooth()` using formula 'y ~ x'
## 49 of 171
## `geom_smooth()` using formula 'y ~ x'
## 50 of 171
## `geom_smooth()` using formula 'y ~ x'
## 51 of 171
## `geom_smooth()` using formula 'y ~ x'
## 52 of 171
## `geom_smooth()` using formula 'y ~ x'
## 53 of 171
## `geom_smooth()` using formula 'y ~ x'
## 54 of 171
## `geom_smooth()` using formula 'y ~ x'
## 55 of 171
## `geom_smooth()` using formula 'y ~ x'
## 56 of 171
## `geom_smooth()` using formula 'y ~ x'
## 57 of 171
## `geom_smooth()` using formula 'y ~ x'
## 58 of 171
## `geom_smooth()` using formula 'y ~ x'
## 59 of 171
## `geom_smooth()` using formula 'y ~ x'
## 60 of 171
## `geom_smooth()` using formula 'y ~ x'
## 61 of 171
## `geom_smooth()` using formula 'y ~ x'
## 62 of 171
## `geom_smooth()` using formula 'y ~ x'
## 63 of 171
## `geom_smooth()` using formula 'y ~ x'
## 64 of 171
## `geom_smooth()` using formula 'y ~ x'
## 65 of 171
## `geom_smooth()` using formula 'y ~ x'
## 66 of 171
## `geom_smooth()` using formula 'y ~ x'
## 67 of 171
## `geom_smooth()` using formula 'y ~ x'
## 68 of 171
## `geom_smooth()` using formula 'y ~ x'
## 69 of 171
## `geom_smooth()` using formula 'y ~ x'
## 70 of 171
## `geom_smooth()` using formula 'y ~ x'
## 71 of 171
## `geom_smooth()` using formula 'y ~ x'
## 72 of 171
## `geom_smooth()` using formula 'y ~ x'
## 73 of 171
## `geom_smooth()` using formula 'y ~ x'
## 74 of 171
## `geom_smooth()` using formula 'y ~ x'
## 75 of 171
## `geom_smooth()` using formula 'y ~ x'
## 76 of 171
## `geom_smooth()` using formula 'y ~ x'
## 77 of 171
## `geom_smooth()` using formula 'y ~ x'
## 78 of 171
## `geom_smooth()` using formula 'y ~ x'
## 79 of 171
## `geom_smooth()` using formula 'y ~ x'
## 80 of 171
## `geom_smooth()` using formula 'y ~ x'
## 81 of 171
## `geom_smooth()` using formula 'y ~ x'
## 82 of 171
## `geom_smooth()` using formula 'y ~ x'
## 83 of 171
## `geom_smooth()` using formula 'y ~ x'
## 84 of 171
## `geom_smooth()` using formula 'y ~ x'
## 85 of 171
## `geom_smooth()` using formula 'y ~ x'
## 86 of 171
## `geom_smooth()` using formula 'y ~ x'
## 87 of 171
## `geom_smooth()` using formula 'y ~ x'
## 88 of 171
## `geom_smooth()` using formula 'y ~ x'
## 89 of 171
## `geom_smooth()` using formula 'y ~ x'
## 90 of 171
## `geom_smooth()` using formula 'y ~ x'
## 91 of 171
## `geom_smooth()` using formula 'y ~ x'
## 92 of 171
## `geom_smooth()` using formula 'y ~ x'
## 93 of 171
## `geom_smooth()` using formula 'y ~ x'
## 94 of 171
## `geom_smooth()` using formula 'y ~ x'
## 95 of 171
## `geom_smooth()` using formula 'y ~ x'
## 96 of 171
## `geom_smooth()` using formula 'y ~ x'
## 97 of 171
## `geom_smooth()` using formula 'y ~ x'
## 98 of 171
## `geom_smooth()` using formula 'y ~ x'
## 99 of 171
## `geom_smooth()` using formula 'y ~ x'
## 100 of 171
## `geom_smooth()` using formula 'y ~ x'
## 101 of 171
## `geom_smooth()` using formula 'y ~ x'
## 102 of 171
## `geom_smooth()` using formula 'y ~ x'
## 103 of 171
## `geom_smooth()` using formula 'y ~ x'
## 104 of 171
## `geom_smooth()` using formula 'y ~ x'
## 105 of 171
## `geom_smooth()` using formula 'y ~ x'
## 106 of 171
## `geom_smooth()` using formula 'y ~ x'
## 107 of 171
## `geom_smooth()` using formula 'y ~ x'
## 108 of 171
## `geom_smooth()` using formula 'y ~ x'
## 109 of 171
## `geom_smooth()` using formula 'y ~ x'
## 110 of 171
## `geom_smooth()` using formula 'y ~ x'
## 111 of 171
## `geom_smooth()` using formula 'y ~ x'
## 112 of 171
## `geom_smooth()` using formula 'y ~ x'
## 113 of 171
## `geom_smooth()` using formula 'y ~ x'
## 114 of 171
## `geom_smooth()` using formula 'y ~ x'
## 115 of 171
## `geom_smooth()` using formula 'y ~ x'
## 116 of 171
## `geom_smooth()` using formula 'y ~ x'
## 117 of 171
## `geom_smooth()` using formula 'y ~ x'
## 118 of 171
## `geom_smooth()` using formula 'y ~ x'
## 119 of 171
## `geom_smooth()` using formula 'y ~ x'
## 120 of 171
## `geom_smooth()` using formula 'y ~ x'
## 121 of 171
## `geom_smooth()` using formula 'y ~ x'
## 122 of 171
## `geom_smooth()` using formula 'y ~ x'
## 123 of 171
## `geom_smooth()` using formula 'y ~ x'
## 124 of 171
## `geom_smooth()` using formula 'y ~ x'
## 125 of 171
## `geom_smooth()` using formula 'y ~ x'
## 126 of 171
## `geom_smooth()` using formula 'y ~ x'
## 127 of 171
## `geom_smooth()` using formula 'y ~ x'
## 128 of 171
## `geom_smooth()` using formula 'y ~ x'
## 129 of 171
## `geom_smooth()` using formula 'y ~ x'
## 130 of 171
## `geom_smooth()` using formula 'y ~ x'
## 131 of 171
## `geom_smooth()` using formula 'y ~ x'
## 132 of 171
## `geom_smooth()` using formula 'y ~ x'
## 133 of 171
## `geom_smooth()` using formula 'y ~ x'
## 134 of 171
## `geom_smooth()` using formula 'y ~ x'
## 135 of 171
## `geom_smooth()` using formula 'y ~ x'
## 136 of 171
## `geom_smooth()` using formula 'y ~ x'
## 137 of 171
## `geom_smooth()` using formula 'y ~ x'
## 138 of 171
## `geom_smooth()` using formula 'y ~ x'
## 139 of 171
## `geom_smooth()` using formula 'y ~ x'
## 140 of 171
## `geom_smooth()` using formula 'y ~ x'
## 141 of 171
## `geom_smooth()` using formula 'y ~ x'
## 142 of 171
## `geom_smooth()` using formula 'y ~ x'
## 143 of 171
## `geom_smooth()` using formula 'y ~ x'
## 144 of 171
## `geom_smooth()` using formula 'y ~ x'
## 145 of 171
## `geom_smooth()` using formula 'y ~ x'
## 146 of 171
## `geom_smooth()` using formula 'y ~ x'
## 147 of 171
## `geom_smooth()` using formula 'y ~ x'
## 148 of 171
## `geom_smooth()` using formula 'y ~ x'
## 149 of 171
## `geom_smooth()` using formula 'y ~ x'
## 150 of 171
## `geom_smooth()` using formula 'y ~ x'
## 151 of 171
## `geom_smooth()` using formula 'y ~ x'
## 152 of 171
## `geom_smooth()` using formula 'y ~ x'
## 153 of 171
## `geom_smooth()` using formula 'y ~ x'
## 154 of 171
## `geom_smooth()` using formula 'y ~ x'
## 155 of 171
## `geom_smooth()` using formula 'y ~ x'
## 156 of 171
## `geom_smooth()` using formula 'y ~ x'
## 157 of 171
## `geom_smooth()` using formula 'y ~ x'
## 158 of 171
## `geom_smooth()` using formula 'y ~ x'
## 159 of 171
## `geom_smooth()` using formula 'y ~ x'
## 160 of 171
## `geom_smooth()` using formula 'y ~ x'
## 161 of 171
## `geom_smooth()` using formula 'y ~ x'
## 162 of 171
## `geom_smooth()` using formula 'y ~ x'
## 163 of 171
## `geom_smooth()` using formula 'y ~ x'
## 164 of 171
## `geom_smooth()` using formula 'y ~ x'
## 165 of 171
## `geom_smooth()` using formula 'y ~ x'
## 166 of 171
## `geom_smooth()` using formula 'y ~ x'
## 167 of 171
## `geom_smooth()` using formula 'y ~ x'
## 168 of 171
## `geom_smooth()` using formula 'y ~ x'
## 169 of 171
## `geom_smooth()` using formula 'y ~ x'
## 170 of 171
## `geom_smooth()` using formula 'y ~ x'
## 171 of 171
## `geom_smooth()` using formula 'y ~ x'
##old code
# county_sf_data %>% 
#   filter(!State.x %in% c("Alaska", "Hawaii")) %>% 
#   ggplot() +
#   geom_sf(aes(fill = CA), lwd = 0) +
#   scale_fill_gradient2("Intelligence", low = "red", high = "blue", mid = "green") +
#   theme_classic()
# GG_save("figs/CA_map.png")
# 
# county_sf_data %>% 
#   filter(!State.x %in% c("Alaska", "Hawaii")) %>% 
#   ggplot() +
#   geom_sf(aes(fill = UV), lwd = 0) +
#   scale_fill_gradient2(low = "red", high = "blue", mid = "green") +
#   theme_classic()
# GG_save("figs/UV_map.png")
# 
# county_sf_data %>% 
#   filter(!State.x %in% c("Alaska", "Hawaii")) %>% 
#   ggplot() +
#   geom_sf(aes(fill = avg_temp), lwd = 0) +
#   scale_fill_gradient2(low = "red", high = "blue", mid = "green") +
#   theme_classic()
# GG_save("figs/avg_temp_map.png")

Output data

#data dictionary
d_dict = d %>% df_var_table()
d_dict
#county data
d %>% write_rds("data/data_out_counties.rds", compress = "xz")
d %>% write_rds("data/data_out_counties.csv.gz")

#states
d_state %>% write_rds("data/data_out_states.rds", compress = "xz")
d_state %>% write_rds("data/data_out_states.csv.gz")

Versions

write_sessioninfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] formula.tools_1.7.1     gtools_3.8.2            stringdist_0.9.6.3     
##  [4] doFuture_0.12.0         future_1.21.0           foreach_1.5.1          
##  [7] broom_0.7.4             sf_0.9-7                spatialstatstools_0.1.0
## [10] ape_5.4-1               geosphere_1.5-10        rmngb_0.6-1            
## [13] fields_11.6             spam_2.6-0              dotCall64_1.0-0        
## [16] rms_6.1-0               SparseM_1.78            readxl_1.3.1           
## [19] haven_2.3.1             kirkegaard_2021-02-08   metafor_2.4-0          
## [22] Matrix_1.3-2            psych_2.0.12            magrittr_2.0.1         
## [25] assertthat_0.2.1        weights_1.0.1           mice_3.13.0            
## [28] gdata_2.18.0            Hmisc_4.4-2             Formula_1.2-4          
## [31] survival_3.2-7          lattice_0.20-41         forcats_0.5.1          
## [34] stringr_1.4.0           dplyr_1.0.4             purrr_0.3.4            
## [37] readr_1.4.0             tidyr_1.1.2             tibble_3.0.6           
## [40] ggplot2_3.3.3           tidyverse_1.3.0         pacman_0.5.1           
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1      lwgeom_0.2-5         plyr_1.8.6          
##   [4] sp_1.4-5             splines_4.0.3        operator.tools_1.6.3
##   [7] listenv_0.8.0        TH.data_1.0-10       digest_0.6.27       
##  [10] htmltools_0.5.1.1    fansi_0.4.2          checkmate_2.0.0     
##  [13] cluster_2.1.0        globals_0.14.0       modelr_0.1.8        
##  [16] gmodels_2.18.1       matrixStats_0.57.0   sandwich_3.0-0      
##  [19] jpeg_0.1-8.1         colorspace_2.0-0     rvest_0.3.6         
##  [22] xfun_0.20            crayon_1.4.0         jsonlite_1.7.2      
##  [25] zoo_1.8-8            iterators_1.0.13     glue_1.4.2          
##  [28] gtable_0.3.0         MatrixModels_0.4-1   maps_3.3.0          
##  [31] scales_1.1.1         mvtnorm_1.1-1        DBI_1.1.1           
##  [34] Rcpp_1.0.6           spData_0.3.8         htmlTable_2.1.0     
##  [37] tmvnsim_1.0-2        units_0.6-7          spdep_1.1-5         
##  [40] foreign_0.8-81       htmlwidgets_1.5.3    httr_1.4.2          
##  [43] RColorBrewer_1.1-2   ellipsis_0.3.1       pkgconfig_2.0.3     
##  [46] farver_2.0.3         deldir_0.2-9         nnet_7.3-14         
##  [49] dbplyr_2.0.0         utf8_1.1.4           tidyselect_1.1.0    
##  [52] labeling_0.4.2       rlang_0.4.10         munsell_0.5.0       
##  [55] multilevel_2.6       cellranger_1.1.0     tools_4.0.3         
##  [58] cli_2.3.0            generics_0.1.0       evaluate_0.14       
##  [61] yaml_2.2.1           knitr_1.31           fs_1.5.0            
##  [64] nlme_3.1-151         quantreg_5.82        xml2_1.3.2          
##  [67] psychometric_2.2     compiler_4.0.3       rstudioapi_0.13     
##  [70] png_0.1-7            e1071_1.7-4          reprex_1.0.0.9000   
##  [73] stringi_1.5.3        highr_0.8            classInt_0.4-3      
##  [76] vctrs_0.3.6          LearnBayes_2.15.1    pillar_1.4.7        
##  [79] lifecycle_0.2.0      data.table_1.13.6    conquer_1.0.2       
##  [82] raster_3.4-5         R6_2.5.0             latticeExtra_0.6-29 
##  [85] KernSmooth_2.23-18   gridExtra_2.3        parallelly_1.23.0   
##  [88] codetools_0.2-18     polspline_1.1.19     boot_1.3-25         
##  [91] MASS_7.3-53          withr_2.4.1          mnormt_2.0.2        
##  [94] multcomp_1.4-15      expm_0.999-6         mgcv_1.8-33         
##  [97] parallel_4.0.3       hms_1.0.0            rpart_4.1-15        
## [100] coda_0.19-4          class_7.3-17         rmarkdown_2.6       
## [103] lubridate_1.7.9.2    base64enc_0.1-3

Upload to OSF

#run manually
if (F) {
  library(osfr)
  
  #auth
  osf_auth(read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/wzs3v/")
  
  #zip the data dir, too large
  system("zip -r data.zip data")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(osf_proj, conflicts = "overwrite", path = c(
    "figs", "data.zip",
    "notebook.Rmd", "notebook.html", "session_info.txt",
    "SAC.Rmd", "SAC.html",
    "fluoride_merging.R", "lead_merging.R", "merge_example.R", "scrape_lead_data.R"
  ))
}