Packages.
options(digits=2)
library(pacman)
p_load(kirkegaard, readr, dplyr, rms)
Load data.
#water data
water_data = readr::read_csv("data/fluoride_conc_bycounty.csv")
## Parsed with column specification:
## cols(
## ID = col_character(),
## county = col_character(),
## fluoridated = col_character(),
## conc = col_double()
## )
#county data
d = read_rds("data/USA_county_data.rds")
#save a complete copy
d_orig = d
#recode water
water_data %<>% mutate(
#get state
state = str_match(ID, pattern = "\\w+") %>% as.vector,
#get county name without fluff
county = str_replace(county, pattern = " \\(.+\\)$", replacement = ""),
#combined state-county
state_county = state + " - " + county,
#rename
fluoride_content = conc
)
#recode county
d %<>% mutate(
#make a combined id for merging
state_county = ST + " - " + name_16,
#sqrt_pop, for weights
sqrt_pop = sqrt(Total.Population)
)
Aggregate water data within counties.
#for each county, calc the mean fluoride content. For the non-numerical cols, use the first value.
water_data_county = plyr::ddply(water_data, .variables = "state_county", function(x) {
data_frame(
ID = x$ID[1],
county = x$county[1],
fluoridated = wtd_mean(x$fluoridated == "Yes"),
fluoride_content = wtd_mean(x$fluoride_content),
state = x$state[1],
state_county = x$state_county[1]
)
})
Merge data.
#remove units with missing ids
water_data_county %<>% filter(!is.na(state_county))
d %<>% filter(!is.na(state_county))
#overlapping data
intersect(d$state_county, water_data_county$state_county) %>% length
## [1] 1704
#assert no duplicates!
assert_that(!any(duplicated(d$state_county)))
## [1] TRUE
assert_that(!any(duplicated(water_data_county$state_county)))
## [1] TRUE
#merge!
d = dplyr::inner_join(d, water_data_county, by = "state_county")
#remove cases without CA data
d %<>% filter(!is.na(CA))
#spatial lag
d$CA_spatial = SAC_knsnr(d, dependent = "CA", lat_var = "lat", lon_var = "lon")
#standardized version
#because Emil's fancy modeling function does not support interaction terms!
d_std = d[c("CA", "fluoride_content", "fluoridated", "Black", "Hispanic", "Asian", "dem16_frac", "CA_spatial")] %>% df_standardize
Regression models.
#correlation
GG_scatter(d, "fluoride_content", "CA", case_names = F)
ggsave("figures/F_content_intel.png")
## Saving 7 x 5 in image
GG_scatter(d, "fluoride_content", "CA", case_names = F, weights = d$sqrt_pop)
ggsave("figures/F_content_intel_wtd.png")
## Saving 7 x 5 in image
#with racial covar
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian", data = d) %>% MOD_summary()
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.18 0.025 0.13 0.231
## fluoridated -0.12 0.024 -0.17 -0.074
## Black -0.68 0.018 -0.71 -0.642
## Hispanic -0.18 0.020 -0.22 -0.138
## Asian 0.17 0.018 0.13 0.201
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1681 0.48 0.48 0.47
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content 0.130 0.18
## fluoridated 0.089 0.12
## Black 0.656 0.67
## Hispanic 0.158 0.21
## Asian 0.165 0.22
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.14 0.026 0.091 0.194
## fluoridated -0.10 0.023 -0.148 -0.056
## Black -0.66 0.017 -0.694 -0.626
## Hispanic -0.17 0.017 -0.200 -0.132
## Asian 0.14 0.010 0.119 0.159
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1681 0.49 0.49 0.48
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content NaN 1
## fluoridated NaN 1
## Black NaN 1
## Hispanic NaN 1
## Asian NaN 1
#with political covar
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac", data = d) %>% MOD_summary
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.179 0.025 0.129 0.229
## fluoridated -0.117 0.024 -0.164 -0.069
## Black -0.667 0.024 -0.714 -0.620
## Hispanic -0.171 0.021 -0.213 -0.130
## Asian 0.172 0.019 0.134 0.210
## dem16_frac -0.019 0.025 -0.069 0.031
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1680 0.48 0.48 0.47
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content 0.123 0.169
## fluoridated 0.085 0.117
## Black 0.491 0.562
## Hispanic 0.143 0.194
## Asian 0.156 0.211
## dem16_frac 0.013 0.018
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.15 0.027 0.095 0.202
## fluoridated -0.11 0.024 -0.155 -0.060
## Black -0.67 0.022 -0.716 -0.629
## Hispanic -0.17 0.019 -0.212 -0.135
## Asian 0.13 0.012 0.111 0.157
## dem16_frac 0.02 0.023 -0.025 0.066
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1680 0.49 0.49 0.47
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content NaN 1
## fluoridated NaN 1
## Black NaN 1
## Hispanic NaN 1
## Asian NaN 1
## dem16_frac NaN 1
#with spatial lag
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial", data = d) %>% MOD_summary
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.135 0.024 0.089 0.1816
## fluoridated -0.093 0.022 -0.137 -0.0490
## Black -0.407 0.027 -0.460 -0.3536
## Hispanic -0.140 0.020 -0.179 -0.1017
## Asian 0.134 0.018 0.098 0.1692
## dem16_frac -0.053 0.024 -0.099 -0.0062
## CA_spatial 0.367 0.022 0.325 0.4103
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1679 0.55 0.55 0.54
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content 0.093 0.138
## fluoridated 0.068 0.101
## Black 0.246 0.345
## Hispanic 0.116 0.172
## Asian 0.120 0.177
## dem16_frac 0.036 0.054
## CA_spatial 0.274 0.379
lm("CA ~ fluoride_content + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial", data = d, weights = d$sqrt_pop) %>% MOD_summary
## Warning in sqrt(.): NaNs produced
## Model coefficients
## Beta SE CI_lower CI_upper
## fluoride_content 0.109 0.026 0.058 0.1595
## fluoridated -0.092 0.023 -0.137 -0.0475
## Black -0.479 0.025 -0.527 -0.4296
## Hispanic -0.141 0.019 -0.177 -0.1044
## Asian 0.107 0.011 0.085 0.1290
## dem16_frac -0.037 0.022 -0.080 0.0072
## CA_spatial 0.288 0.020 0.248 0.3268
##
##
## Model meta-data
## outcome N df R2 R2-adj. R2-cv
## 1 CA 1687 1679 0.55 0.54 0.53
##
##
## Etas from analysis of variance
## Eta Eta_partial
## fluoride_content NaN 1
## fluoridated NaN 1
## Black NaN 1
## Hispanic NaN 1
## Asian NaN 1
## dem16_frac NaN 1
## CA_spatial NaN 1
#interaction?
ols(CA ~ fluoride_content * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##
## ols(formula = CA ~ fluoride_content * fluoridated + Black + Hispanic +
## Asian + dem16_frac + CA_spatial, data = d_std)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1367.36 R2 0.555
## sigma0.6684 d.f. 8 R2 adj 0.553
## d.f. 1678 Pr(> chi2) 0.0000 g 0.759
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.2166950 -0.4013999 0.0009606 0.4054652 2.3990577
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0428 0.0225 1.90 0.0576
## fluoride_content 0.2031 0.0343 5.93 <0.0001
## fluoridated -0.1398 0.0282 -4.97 <0.0001
## Black -0.4101 0.0270 -15.20 <0.0001
## Hispanic -0.1451 0.0197 -7.37 <0.0001
## Asian 0.1324 0.0181 7.31 <0.0001
## dem16_frac -0.0525 0.0236 -2.22 0.0264
## CA_spatial 0.3611 0.0219 16.46 <0.0001
## fluoride_content * fluoridated -0.0690 0.0251 -2.75 0.0061
##
ols(CA ~ fluoride_content * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##
## ols(formula = CA ~ fluoride_content * fluoridated + Black + Hispanic +
## Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1336.09 R2 0.547
## sigma8.7927 d.f. 8 R2 adj 0.545
## d.f. 1678 Pr(> chi2) 0.0000 g 0.720
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.31443 -0.43367 -0.02315 0.38062 2.49536
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0691 0.0229 3.02 0.0026
## fluoride_content 0.1696 0.0347 4.89 <0.0001
## fluoridated -0.1374 0.0286 -4.80 <0.0001
## Black -0.4800 0.0249 -19.26 <0.0001
## Hispanic -0.1459 0.0186 -7.84 <0.0001
## Asian 0.1048 0.0113 9.28 <0.0001
## dem16_frac -0.0349 0.0222 -1.57 0.1163
## CA_spatial 0.2854 0.0199 14.31 <0.0001
## fluoride_content * fluoridated -0.0666 0.0256 -2.60 0.0093
##
#nonlinear?
ols(CA ~ rcs(fluoride_content, 3) + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##
## ols(formula = CA ~ rcs(fluoride_content, 3) + fluoridated + Black +
## Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1375.34 R2 0.557
## sigma0.6668 d.f. 8 R2 adj 0.555
## d.f. 1678 Pr(> chi2) 0.0000 g 0.764
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.231865 -0.397997 0.003881 0.407008 2.339993
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1390 0.0388 3.58 0.0004
## fluoride_content 0.4834 0.0915 5.29 <0.0001
## fluoride_content' -0.2402 0.0609 -3.94 <0.0001
## fluoridated -0.1835 0.0320 -5.73 <0.0001
## Black -0.4137 0.0270 -15.35 <0.0001
## Hispanic -0.1495 0.0197 -7.59 <0.0001
## Asian 0.1332 0.0181 7.38 <0.0001
## dem16_frac -0.0499 0.0236 -2.12 0.0344
## CA_spatial 0.3549 0.0220 16.14 <0.0001
##
ols(CA ~ rcs(fluoride_content, 3) + fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##
## ols(formula = CA ~ rcs(fluoride_content, 3) + fluoridated + Black +
## Hispanic + Asian + dem16_frac + CA_spatial, data = d_std,
## weights = d$sqrt_pop)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1334.11 R2 0.547
## sigma8.7979 d.f. 8 R2 adj 0.544
## d.f. 1678 Pr(> chi2) 0.0000 g 0.720
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.31784 -0.43673 -0.01989 0.37814 2.42350
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1045 0.0387 2.70 0.0070
## fluoride_content 0.2930 0.0878 3.34 0.0009
## fluoride_content' -0.1300 0.0593 -2.19 0.0286
## fluoridated -0.1426 0.0324 -4.41 <0.0001
## Black -0.4820 0.0250 -19.30 <0.0001
## Hispanic -0.1471 0.0187 -7.85 <0.0001
## Asian 0.1051 0.0113 9.29 <0.0001
## dem16_frac -0.0331 0.0223 -1.49 0.1373
## CA_spatial 0.2848 0.0200 14.26 <0.0001
##
#nonlinear + interaction?
ols(CA ~ rcs(fluoride_content, 3) * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
## Linear Regression Model
##
## ols(formula = CA ~ rcs(fluoride_content, 3) * fluoridated + Black +
## Hispanic + Asian + dem16_frac + CA_spatial, data = d_std)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1377.41 R2 0.558
## sigma0.6668 d.f. 10 R2 adj 0.555
## d.f. 1676 Pr(> chi2) 0.0000 g 0.765
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.235754 -0.400459 0.006811 0.403027 2.341620
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1803 0.0502 3.59 0.0003
## fluoride_content 0.6098 0.1600 3.81 0.0001
## fluoride_content' -0.3837 0.1276 -3.01 0.0027
## fluoridated -0.1974 0.0368 -5.37 <0.0001
## Black -0.4143 0.0270 -15.36 <0.0001
## Hispanic -0.1498 0.0198 -7.57 <0.0001
## Asian 0.1341 0.0181 7.42 <0.0001
## dem16_frac -0.0488 0.0236 -2.06 0.0392
## CA_spatial 0.3540 0.0220 16.06 <0.0001
## fluoride_content * fluoridated 0.0489 0.0899 0.54 0.5864
## fluoride_content' * fluoridated 0.0218 0.0708 0.31 0.7580
##
ols(CA ~ rcs(fluoride_content, 3) * fluoridated + Black + Hispanic + Asian + dem16_frac + CA_spatial, data = d_std, weights = d$sqrt_pop)
## Linear Regression Model
##
## ols(formula = CA ~ rcs(fluoride_content, 3) * fluoridated + Black +
## Hispanic + Asian + dem16_frac + CA_spatial, data = d_std,
## weights = d$sqrt_pop)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 1687 LR chi2 1336.13 R2 0.547
## sigma8.7978 d.f. 10 R2 adj 0.544
## d.f. 1676 Pr(> chi2) 0.0000 g 0.719
##
## Residuals
##
## Min 1Q Median 3Q Max
## -4.31160 -0.43582 -0.02268 0.38208 2.50707
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0602 0.0498 1.21 0.2262
## fluoride_content 0.1454 0.1473 0.99 0.3239
## fluoride_content' 0.0249 0.1249 0.20 0.8417
## fluoridated -0.1331 0.0375 -3.55 0.0004
## Black -0.4796 0.0251 -19.15 <0.0001
## Hispanic -0.1454 0.0188 -7.73 <0.0001
## Asian 0.1049 0.0113 9.27 <0.0001
## dem16_frac -0.0353 0.0224 -1.58 0.1144
## CA_spatial 0.2856 0.0200 14.29 <0.0001
## fluoride_content * fluoridated -0.0727 0.0832 -0.87 0.3824
## fluoride_content' * fluoridated -0.0039 0.0700 -0.06 0.9556
##
There seems to be some non-linear effects and some interaction between the F variables, but both are too small to matter much including in conjunction.