Initialize

Packages.

options(digits=2)
library(pacman)
p_load(kirkegaard, readr, dplyr, rms)

Load data.

#water data
water_data = readr::read_csv("data/fluoride_conc_bycounty.csv")
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   county = col_character(),
##   fluoridated = col_character(),
##   conc = col_double()
## )
#county data
d = read_rds("data/USA_county_data.rds")

#save a complete copy
d_orig = d

#recode water
water_data %<>% mutate(
  #get state
  state = str_match(ID, pattern = "\\w+") %>% as.vector,
  
  #get county name without fluff
  county = str_replace(county, pattern = " \\(.+\\)$", replacement = ""),
  
  #combined state-county
  state_county = state + " - " + county,
  
  #rename
  fluoride_content = conc
)

#recode county
d %<>% mutate(
  #make a combined id for merging
  state_county = ST + " - " + name_16,
  
  #sqrt_pop, for weights
  sqrt_pop = sqrt(Total.Population)
)

Aggregate water data within counties.

#for each county, calc the mean fluoride content. For the non-numerical cols, use the first value.
water_data_county = plyr::ddply(water_data, .variables = "state_county", function(x) {
  data_frame(
    ID = x$ID[1],
    county = x$county[1],
    fluoridated = wtd_mean(x$fluoridated == "Yes"),
    fluoride_content = wtd_mean(x$fluoride_content),
    state = x$state[1],
    state_county = x$state_county[1]
  )
})

Merge data.

#remove units with missing ids
water_data_county %<>% filter(!is.na(state_county))
d %<>% filter(!is.na(state_county))

#overlapping data
intersect(d$state_county, water_data_county$state_county) %>% length
## [1] 1704
#assert no duplicates!
assert_that(!any(duplicated(d$state_county)))
## [1] TRUE
assert_that(!any(duplicated(water_data_county$state_county)))
## [1] TRUE
#merge!
d = dplyr::inner_join(d, water_data_county, by = "state_county")

#remove cases without CA data
d %<>% filter(!is.na(CA))

#spatial lag
d$CA_spatial = SAC_knsnr(d, dependent = "CA", lat_var = "lat", lon_var = "lon")

#standardized version
#because Emil's fancy modeling function does not support interaction terms!
d_std = d[c("CA", "fluoride_content", "fluoridated", "Black", "Hispanic", "Asian", "dem16_frac", "CA_spatial")] %>% df_standardize

Modeling

Regression models.

#correlation
GG_scatter(d, "fluoride_content", "CA", case_names = F)

ggsave("figures/F_content_intel.png")
## Saving 7 x 5 in image
GG_scatter(d, "fluoride_content", "CA", case_names = F, weights = d$sqrt_pop)

ggsave("figures/F_content_intel_wtd.png")
## Saving 7 x 5 in image
#with racial covar
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian", data = d) %>% MOD_summary()
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## fluoride_content  0.18 0.025     0.13    0.231
## fluoridated      -0.12 0.024    -0.17   -0.074
## Black            -0.68 0.018    -0.71   -0.642
## Hispanic         -0.18 0.020    -0.22   -0.138
## Asian             0.17 0.018     0.13    0.201
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1681 0.48    0.48  0.47
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## fluoride_content 0.130        0.18
## fluoridated      0.089        0.12
## Black            0.656        0.67
## Hispanic         0.158        0.21
## Asian            0.165        0.22
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## fluoride_content  0.14 0.026    0.091    0.194
## fluoridated      -0.10 0.023   -0.148   -0.056
## Black            -0.66 0.017   -0.694   -0.626
## Hispanic         -0.17 0.017   -0.200   -0.132
## Asian             0.14 0.010    0.119    0.159
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1681 0.49    0.49  0.48
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## fluoride_content NaN           1
## fluoridated      NaN           1
## Black            NaN           1
## Hispanic         NaN           1
## Asian            NaN           1
#with political covar
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac", data = d) %>% MOD_summary
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## fluoride_content  0.179 0.025    0.129    0.229
## fluoridated      -0.117 0.024   -0.164   -0.069
## Black            -0.667 0.024   -0.714   -0.620
## Hispanic         -0.171 0.021   -0.213   -0.130
## Asian             0.172 0.019    0.134    0.210
## dem16_frac       -0.019 0.025   -0.069    0.031
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1680 0.48    0.48  0.47
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## fluoride_content 0.123       0.169
## fluoridated      0.085       0.117
## Black            0.491       0.562
## Hispanic         0.143       0.194
## Asian            0.156       0.211
## dem16_frac       0.013       0.018
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
##                   Beta    SE CI_lower CI_upper
## fluoride_content  0.15 0.027    0.095    0.202
## fluoridated      -0.11 0.024   -0.155   -0.060
## Black            -0.67 0.022   -0.716   -0.629
## Hispanic         -0.17 0.019   -0.212   -0.135
## Asian             0.13 0.012    0.111    0.157
## dem16_frac        0.02 0.023   -0.025    0.066
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1680 0.49    0.49  0.47
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## fluoride_content NaN           1
## fluoridated      NaN           1
## Black            NaN           1
## Hispanic         NaN           1
## Asian            NaN           1
## dem16_frac       NaN           1
#with spatial lag
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial", data = d) %>% MOD_summary
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## fluoride_content  0.135 0.024    0.089   0.1816
## fluoridated      -0.093 0.022   -0.137  -0.0490
## Black            -0.407 0.027   -0.460  -0.3536
## Hispanic         -0.140 0.020   -0.179  -0.1017
## Asian             0.134 0.018    0.098   0.1692
## dem16_frac       -0.053 0.024   -0.099  -0.0062
## CA_spatial        0.367 0.022    0.325   0.4103
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1679 0.55    0.55  0.54
## 
## 
## Etas from analysis of variance
##                    Eta Eta_partial
## fluoride_content 0.093       0.138
## fluoridated      0.068       0.101
## Black            0.246       0.345
## Hispanic         0.116       0.172
## Asian            0.120       0.177
## dem16_frac       0.036       0.054
## CA_spatial       0.274       0.379
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
##                    Beta    SE CI_lower CI_upper
## fluoride_content  0.109 0.026    0.058   0.1595
## fluoridated      -0.092 0.023   -0.137  -0.0475
## Black            -0.479 0.025   -0.527  -0.4296
## Hispanic         -0.141 0.019   -0.177  -0.1044
## Asian             0.107 0.011    0.085   0.1290
## dem16_frac       -0.037 0.022   -0.080   0.0072
## CA_spatial        0.288 0.020    0.248   0.3268
## 
## 
## Model meta-data
##   outcome    N   df   R2 R2-adj. R2-cv
## 1      CA 1687 1679 0.55    0.54  0.53
## 
## 
## Etas from analysis of variance
##                  Eta Eta_partial
## fluoride_content NaN           1
## fluoridated      NaN           1
## Black            NaN           1
## Hispanic         NaN           1
## Asian            NaN           1
## dem16_frac       NaN           1
## CA_spatial       NaN           1
#interaction?
ols(CA ~ fluoride_content * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##  
##  ols(formula = CA ~ fluoride_content * fluoridated + Black + Hispanic + 
##      Asian + dem16_frac + CA_spatial, data = d_std)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1367.36    R2       0.555    
##  sigma0.6684    d.f.             8    R2 adj   0.553    
##  d.f.   1678    Pr(> chi2)  0.0000    g        0.759    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -4.2166950 -0.4013999  0.0009606  0.4054652  2.3990577 
##  
##  
##                                 Coef    S.E.   t      Pr(>|t|)
##  Intercept                       0.0428 0.0225   1.90 0.0576  
##  fluoride_content                0.2031 0.0343   5.93 <0.0001 
##  fluoridated                    -0.1398 0.0282  -4.97 <0.0001 
##  Black                          -0.4101 0.0270 -15.20 <0.0001 
##  Hispanic                       -0.1451 0.0197  -7.37 <0.0001 
##  Asian                           0.1324 0.0181   7.31 <0.0001 
##  dem16_frac                     -0.0525 0.0236  -2.22 0.0264  
##  CA_spatial                      0.3611 0.0219  16.46 <0.0001 
##  fluoride_content * fluoridated -0.0690 0.0251  -2.75 0.0061  
## 
ols(CA ~ fluoride_content * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##  
##  ols(formula = CA ~ fluoride_content * fluoridated + Black + Hispanic + 
##      Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1336.09    R2       0.547    
##  sigma8.7927    d.f.             8    R2 adj   0.545    
##  d.f.   1678    Pr(> chi2)  0.0000    g        0.720    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -4.31443 -0.43367 -0.02315  0.38062  2.49536 
##  
##  
##                                 Coef    S.E.   t      Pr(>|t|)
##  Intercept                       0.0691 0.0229   3.02 0.0026  
##  fluoride_content                0.1696 0.0347   4.89 <0.0001 
##  fluoridated                    -0.1374 0.0286  -4.80 <0.0001 
##  Black                          -0.4800 0.0249 -19.26 <0.0001 
##  Hispanic                       -0.1459 0.0186  -7.84 <0.0001 
##  Asian                           0.1048 0.0113   9.28 <0.0001 
##  dem16_frac                     -0.0349 0.0222  -1.57 0.1163  
##  CA_spatial                      0.2854 0.0199  14.31 <0.0001 
##  fluoride_content * fluoridated -0.0666 0.0256  -2.60 0.0093  
## 
#nonlinear?
ols(CA ~ rcs(fluoride_content, 3) + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##  
##  ols(formula = CA ~ rcs(fluoride_content, 3) + fluoridated + Black + 
##      Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1375.34    R2       0.557    
##  sigma0.6668    d.f.             8    R2 adj   0.555    
##  d.f.   1678    Pr(> chi2)  0.0000    g        0.764    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -4.231865 -0.397997  0.003881  0.407008  2.339993 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept          0.1390 0.0388   3.58 0.0004  
##  fluoride_content   0.4834 0.0915   5.29 <0.0001 
##  fluoride_content' -0.2402 0.0609  -3.94 <0.0001 
##  fluoridated       -0.1835 0.0320  -5.73 <0.0001 
##  Black             -0.4137 0.0270 -15.35 <0.0001 
##  Hispanic          -0.1495 0.0197  -7.59 <0.0001 
##  Asian              0.1332 0.0181   7.38 <0.0001 
##  dem16_frac        -0.0499 0.0236  -2.12 0.0344  
##  CA_spatial         0.3549 0.0220  16.14 <0.0001 
## 
ols(CA ~ rcs(fluoride_content, 3) + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##  
##  ols(formula = CA ~ rcs(fluoride_content, 3) + fluoridated + Black + 
##      Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, 
##      weights = d$sqrt_pop)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1334.11    R2       0.547    
##  sigma8.7979    d.f.             8    R2 adj   0.544    
##  d.f.   1678    Pr(> chi2)  0.0000    g        0.720    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -4.31784 -0.43673 -0.01989  0.37814  2.42350 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept          0.1045 0.0387   2.70 0.0070  
##  fluoride_content   0.2930 0.0878   3.34 0.0009  
##  fluoride_content' -0.1300 0.0593  -2.19 0.0286  
##  fluoridated       -0.1426 0.0324  -4.41 <0.0001 
##  Black             -0.4820 0.0250 -19.30 <0.0001 
##  Hispanic          -0.1471 0.0187  -7.85 <0.0001 
##  Asian              0.1051 0.0113   9.29 <0.0001 
##  dem16_frac        -0.0331 0.0223  -1.49 0.1373  
##  CA_spatial         0.2848 0.0200  14.26 <0.0001 
## 
#nonlinear + interaction?
ols(CA ~ rcs(fluoride_content, 3) * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##  
##  ols(formula = CA ~ rcs(fluoride_content, 3) * fluoridated + Black + 
##      Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1377.41    R2       0.558    
##  sigma0.6668    d.f.            10    R2 adj   0.555    
##  d.f.   1676    Pr(> chi2)  0.0000    g        0.765    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -4.235754 -0.400459  0.006811  0.403027  2.341620 
##  
##  
##                                  Coef    S.E.   t      Pr(>|t|)
##  Intercept                        0.1803 0.0502   3.59 0.0003  
##  fluoride_content                 0.6098 0.1600   3.81 0.0001  
##  fluoride_content'               -0.3837 0.1276  -3.01 0.0027  
##  fluoridated                     -0.1974 0.0368  -5.37 <0.0001 
##  Black                           -0.4143 0.0270 -15.36 <0.0001 
##  Hispanic                        -0.1498 0.0198  -7.57 <0.0001 
##  Asian                            0.1341 0.0181   7.42 <0.0001 
##  dem16_frac                      -0.0488 0.0236  -2.06 0.0392  
##  CA_spatial                       0.3540 0.0220  16.06 <0.0001 
##  fluoride_content * fluoridated   0.0489 0.0899   0.54 0.5864  
##  fluoride_content' * fluoridated  0.0218 0.0708   0.31 0.7580  
## 
ols(CA ~ rcs(fluoride_content, 3) * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##  
##  ols(formula = CA ~ rcs(fluoride_content, 3) * fluoridated + Black + 
##      Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, 
##      weights = d$sqrt_pop)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1687    LR chi2    1336.13    R2       0.547    
##  sigma8.7978    d.f.            10    R2 adj   0.544    
##  d.f.   1676    Pr(> chi2)  0.0000    g        0.719    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -4.31160 -0.43582 -0.02268  0.38208  2.50707 
##  
##  
##                                  Coef    S.E.   t      Pr(>|t|)
##  Intercept                        0.0602 0.0498   1.21 0.2262  
##  fluoride_content                 0.1454 0.1473   0.99 0.3239  
##  fluoride_content'                0.0249 0.1249   0.20 0.8417  
##  fluoridated                     -0.1331 0.0375  -3.55 0.0004  
##  Black                           -0.4796 0.0251 -19.15 <0.0001 
##  Hispanic                        -0.1454 0.0188  -7.73 <0.0001 
##  Asian                            0.1049 0.0113   9.27 <0.0001 
##  dem16_frac                      -0.0353 0.0224  -1.58 0.1144  
##  CA_spatial                       0.2856 0.0200  14.29 <0.0001 
##  fluoride_content * fluoridated  -0.0727 0.0832  -0.87 0.3824  
##  fluoride_content' * fluoridated -0.0039 0.0700  -0.06 0.9556  
## 

There seems to be some non-linear effects and some interaction between the F variables, but both are too small to matter much including in conjunction.