Linear and Logistic Models for Superfund Sites, Long Island, NY

Descriptive Statistics

Demographic Factors

#PNPL Standardized Split

MEDIAN <- round(median(df$PNPL),4)
LABEL0 <- paste("PNPL Score < Median:", round(MEDIAN,2))
LABEL1 <- paste("PNPL Score >= Median:", round(MEDIAN,2))

df <- df %>% 
  mutate(PNPL_SPLIT = as.numeric(PNPL >= MEDIAN),
        PNPL_SPLIT = factor(PNPL_SPLIT,
                             levels = 0:1,
                             labels = c(LABEL0, LABEL1)))

tapply(df$PNPL, df$PNPL_SPLIT, range)

## $`PNPL Score < Median: 0.21`
## [1] 0.03104137 0.21264430
## 
## $`PNPL Score >= Median: 0.21`
## [1] 0.2128068 5.1370115

tbl_dem <- 
  df %>% 
  dplyr::select(VULEOPCT, MINORPCT, LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT, PNPL_SPLIT) %>%
  tbl_summary(
    # The "by" variable
    by =PNPL_SPLIT,
    statistic = list(all_continuous()  ~ "{mean} ({sd})"),
    digits = list(all_continuous()  ~ c(2, 2)),
    type = list(VULEOPCT ~ "continuous",
                MINORPCT ~ "continuous",
                LOWINCPCT ~ "continuous",
                UNEMPPCT ~ "continuous",
                LESSHSPCT ~ "continuous",
                LINGISOPCT ~ "continuous",
                UNDER5PCT ~ "continuous",
                OVER64PCT ~ "continuous"),
    
    label  = list(VULEOPCT ~ "Vulerability Index %",
                  MINORPCT ~ "Racial Minority %",
                  LOWINCPCT   ~ "Low Income %",
                  UNEMPPCT ~ "Unemployed %",
                  LESSHSPCT ~ "Less than HS Ed. %",
                  LINGISOPCT ~ "Lingustic Isolation %",
                  UNDER5PCT ~ "Under Age 5 %",
                  OVER64PCT ~ "Over Age 64 %")
  ) %>%
  modify_header(
    label = "**Variable**",
    # The following adds the % to the column total label
    # <br> is the location of a line break
    all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
  ) %>%
  modify_caption("Demographic Characteristics") %>%
  bold_labels()  %>%
  # Include an "overall" column
  add_overall(
    last = FALSE,
    # The ** make it bold
    col_label = "**All census block groups**<br>N = {N}"
  ) %>% 
  add_p(
    test = list(all_continuous()  ~ "t.test",
                all_categorical() ~ "chisq.test"),
    pvalue_fun = function(x) style_pvalue(x, digits = 3)
  ) 

tbl_dem

Demographic Characteristics
Variable	All census block groups N = 2190¹	PNPL Score < Median: 0.21 N = 1095 (50.0%)¹	PNPL Score >= Median: 0.21 N = 1095 (50.0%)¹	p-value²
Vulerability Index %	0.25 (0.17)	0.22 (0.16)	0.27 (0.18)	<0.001
Racial Minority %	0.34 (0.28)	0.29 (0.25)	0.40 (0.29)	<0.001
Low Income %	0.15 (0.13)	0.15 (0.13)	0.15 (0.14)	0.872
Less than HS Ed. %	0.08 (0.09)	0.08 (0.08)	0.09 (0.10)	<0.001
Lingustic Isolation %	0.04 (0.07)	0.03 (0.06)	0.05 (0.07)	<0.001
Unemployed %	0.04 (0.05)	0.04 (0.05)	0.04 (0.05)	0.201
Under Age 5 %	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.105
Over Age 64 %	0.19 (0.11)	0.19 (0.13)	0.18 (0.09)	0.001
¹ Mean (SD)
² Welch Two Sample t-test

tbl_dem_strat <- 
  df %>% 
  dplyr::select(VULEOPCT, MINORPCT, LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT, PNPL_SPLIT, CNTY_NAME) %>%
  mutate(CNTY_NAME = paste(CNTY_NAME,"County")) %>%
  tbl_strata(
    strata = CNTY_NAME,
    .tbl_fun =
      ~ .x %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(VULEOPCT ~ "continuous",
                    MINORPCT ~ "continuous",
                    LOWINCPCT ~ "continuous",
                    UNEMPPCT ~ "continuous",
                    LESSHSPCT ~ "continuous",
                    LINGISOPCT ~ "continuous",
                    UNDER5PCT ~ "continuous",
                    OVER64PCT ~ "continuous"),
        
        label  = list(VULEOPCT ~ "Vulerability Index %",
                      MINORPCT ~ "Racial Minority %",
                      LOWINCPCT   ~ "Low Income %",
                      UNEMPPCT ~ "Unemployed %",
                      LESSHSPCT ~ "Less than HS Ed. %",
                      LINGISOPCT ~ "Lingustic Isolation %",
                      UNDER5PCT ~ "Under Age 5 %",
                      OVER64PCT ~ "Over Age 64 %")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Demographic Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) )

tbl_dem_strat

Demographic Characteristics - Stratified by County
Variable	Nassau County County				Suffolk County County
Variable	All census block groups N = 1134¹	PNPL Score < Median: 0.21 N = 358 (31.6%)¹	PNPL Score >= Median: 0.21 N = 776 (68.4%)¹	p-value²	All census block groups N = 1056¹	PNPL Score < Median: 0.21 N = 737 (69.8%)¹	PNPL Score >= Median: 0.21 N = 319 (30.2%)¹	p-value²
Vulerability Index %	0.26 (0.18)	0.24 (0.19)	0.27 (0.18)	0.007	0.23 (0.16)	0.21 (0.15)	0.28 (0.19)	<0.001
Racial Minority %	0.39 (0.29)	0.35 (0.30)	0.40 (0.28)	0.002	0.30 (0.26)	0.26 (0.22)	0.39 (0.31)	<0.001
Low Income %	0.13 (0.13)	0.13 (0.13)	0.14 (0.13)	0.478	0.16 (0.13)	0.16 (0.13)	0.17 (0.14)	0.100
Less than HS Ed. %	0.08 (0.09)	0.07 (0.09)	0.09 (0.09)	0.006	0.09 (0.10)	0.08 (0.08)	0.11 (0.12)	<0.001
Lingustic Isolation %	0.05 (0.08)	0.04 (0.08)	0.06 (0.08)	<0.001	0.03 (0.05)	0.02 (0.05)	0.04 (0.06)	<0.001
Unemployed %	0.04 (0.05)	0.04 (0.06)	0.04 (0.05)	0.310	0.04 (0.04)	0.04 (0.04)	0.04 (0.05)	0.837
Under Age 5 %	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.592	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.221
Over Age 64 %	0.18 (0.09)	0.18 (0.09)	0.18 (0.09)	0.700	0.19 (0.13)	0.20 (0.14)	0.17 (0.11)	<0.001
¹ Mean (SD)
² Welch Two Sample t-test

Environmental Factors

tbl_env <- 
  df %>% 
  dplyr::select(PRE1960PCT, PTRAF, PWDIS, PRMP, PTSDF, OZONE, PM25, UST, PNPL_SPLIT) %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(PRE1960PCT ~ "continuous",
                    PTRAF ~ "continuous",
                    PWDIS ~ "continuous",
                    PRMP ~ "continuous",
                    PTSDF ~ "continuous",
                    OZONE ~ "continuous",
                    PM25 ~ "continuous",
                    UST ~ "continuous"),
        
        label  = list(PRE1960PCT ~ "% Houses Built Pre 1960",
                      PTRAF ~ "Proximity to Traffic",
                      PWDIS ~ "Proximity to Waterwater Discharge Facility",
                      PRMP   ~ "Proximity to Regulated Management Plan Facility",
                      PTSDF ~ "Proximity to Hazardous Waste Facility",
                      OZONE ~ "Average Seasonal Ozone Concentration",
                      PM25 ~ "Annual Average PM2.5 Concentration",
                      UST ~ "Proximity to Underground Storage Tank")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Environmental Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) 

tbl_env

Environmental Characteristics - Stratified by County
Variable	All census block groups N = 2190¹	PNPL Score < Median: 0.21 N = 1095 (50.0%)¹	PNPL Score >= Median: 0.21 N = 1095 (50.0%)¹	p-value²
% Houses Built Pre 1960	0.53 (0.29)	0.46 (0.27)	0.61 (0.29)	<0.001
Proximity to Traffic	504.84 (685.51)	387.27 (540.78)	622.41 (787.50)	<0.001
Proximity to Waterwater Discharge Facility	0.01 (0.19)	0.02 (0.19)	0.01 (0.19)	0.225
Proximity to Regulated Management Plan Facility	0.16 (0.21)	0.09 (0.06)	0.23 (0.27)	<0.001
Proximity to Hazardous Waste Facility	2.02 (1.74)	1.20 (1.03)	2.84 (1.91)	<0.001
Average Seasonal Ozone Concentration	43.56 (0.61)	43.51 (0.70)	43.61 (0.49)	<0.001
Annual Average PM2.5 Concentration	7.48 (0.46)	7.26 (0.44)	7.70 (0.38)	<0.001
Proximity to Underground Storage Tank	1.16 (1.98)	0.91 (1.54)	1.41 (2.32)	<0.001
¹ Mean (SD)
² Welch Two Sample t-test

tbl_env_strat <- 
  df %>% 
  dplyr::select(PRE1960PCT, PTRAF, PWDIS, PRMP, PTSDF, OZONE, PM25, UST, PNPL_SPLIT, CNTY_NAME) %>%
  mutate(CNTY_NAME = paste(CNTY_NAME,"County")) %>%
  tbl_strata(
    strata = CNTY_NAME,
    .tbl_fun =
      ~ .x %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(PRE1960PCT ~ "continuous",
                    PTRAF ~ "continuous",
                    PWDIS ~ "continuous",
                    PRMP ~ "continuous",
                    PTSDF ~ "continuous",
                    OZONE ~ "continuous",
                    PM25 ~ "continuous",
                    UST ~ "continuous"),
        
        label  = list(PRE1960PCT ~ "% Houses Built Pre 1960",
                      PTRAF ~ "Proximity to Traffic",
                      PWDIS ~ "Proximity to Waterwater Discharge Facility",
                      PRMP   ~ "Proximity to Regulated Management Plan Facility",
                      PTSDF ~ "Proximity to Hazardous Waste Facility",
                      OZONE ~ "Average Seasonal Ozone Concentration",
                      PM25 ~ "Annual Average PM2.5 Concentration",
                      UST ~ "Proximity to Underground Storage Tank")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Environmental Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) )

tbl_env_strat

Environmental Characteristics - Stratified by County
Variable	Nassau County County				Suffolk County County
Variable	All census block groups N = 1134¹	PNPL Score < Median: 0.21 N = 358 (31.6%)¹	PNPL Score >= Median: 0.21 N = 776 (68.4%)¹	p-value²	All census block groups N = 1056¹	PNPL Score < Median: 0.21 N = 737 (69.8%)¹	PNPL Score >= Median: 0.21 N = 319 (30.2%)¹	p-value²
% Houses Built Pre 1960	0.72 (0.21)	0.68 (0.22)	0.74 (0.20)	<0.001	0.33 (0.22)	0.35 (0.23)	0.28 (0.20)	<0.001
Proximity to Traffic	664.91 (809.44)	587.33 (729.43)	700.70 (841.84)	0.021	332.94 (462.90)	290.08 (384.30)	431.97 (595.81)	<0.001
Proximity to Waterwater Discharge Facility	0.03 (0.27)	0.06 (0.33)	0.01 (0.23)	0.020	0.00 (0.00)	0.00 (0.00)	0.00 (0.00)	0.019
Proximity to Regulated Management Plan Facility	0.23 (0.24)	0.14 (0.07)	0.27 (0.28)	<0.001	0.09 (0.12)	0.07 (0.03)	0.16 (0.19)	<0.001
Proximity to Hazardous Waste Facility	2.38 (1.72)	1.44 (0.97)	2.81 (1.82)	<0.001	1.64 (1.67)	1.09 (1.04)	2.91 (2.11)	<0.001
Average Seasonal Ozone Concentration	43.60 (0.51)	43.50 (0.55)	43.64 (0.48)	<0.001	43.53 (0.70)	43.52 (0.77)	43.55 (0.50)	0.408
Annual Average PM2.5 Concentration	7.81 (0.25)	7.62 (0.17)	7.89 (0.23)	<0.001	7.13 (0.38)	7.09 (0.42)	7.23 (0.24)	<0.001
Proximity to Underground Storage Tank	1.71 (2.48)	1.56 (2.15)	1.78 (2.62)	0.129	0.57 (0.93)	0.60 (0.99)	0.51 (0.77)	0.126
¹ Mean (SD)
² Welch Two Sample t-test

Regressions

Vulnerability Index was removed from the regrssion due to having a high correlation with the other variables.

Linear Regression

### Linear Regression 

### Adjusted, PTRAF, PWDIS, PRMP, and PTSDF were turned into categorical variabels (1-5) due to having non-linearity 

fit.adjusted <- lm(PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT +
                     LINGISOPCT + UNDER5PCT + OVER64PCT
                   + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df)

summary(fit.adjusted)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
##     LINGISOPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8736 -0.4653 -0.1013  0.3984  2.4621 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  0.927103   1.107757   0.837              0.40273    
## MINORPCT    -0.458205   0.070759  -6.476       0.000000000116 ***
## LOWINCPCT    0.187780   0.125374   1.498              0.13434    
## LESSHSPCT    0.110720   0.201946   0.548              0.58357    
## LINGISOPCT   0.609317   0.246230   2.475              0.01341 *  
## UNDER5PCT    0.364534   0.355793   1.025              0.30568    
## OVER64PCT   -0.110311   0.135906  -0.812              0.41707    
## PRE1960PCT  -0.191164   0.062178  -3.074              0.00214 ** 
## PTRAF_Q      0.031599   0.011500   2.748              0.00605 ** 
## PWDIS_Q      0.189355   0.012090  15.662 < 0.0000000000000002 ***
## PRMP_Q      -0.245570   0.014202 -17.292 < 0.0000000000000002 ***
## PTDF_Q      -0.176359   0.011465 -15.383 < 0.0000000000000002 ***
## OZONE       -0.161639   0.029262  -5.524       0.000000037112 ***
## PM25         0.711391   0.052906  13.446 < 0.0000000000000002 ***
## UST          0.001443   0.007834   0.184              0.85382    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6514 on 2175 degrees of freedom
## Multiple R-squared:  0.5236, Adjusted R-squared:  0.5206 
## F-statistic: 170.8 on 14 and 2175 DF,  p-value: < 0.00000000000000022

# Table
NOBS <- length(fit.adjusted$residuals)

library(gtsummary)
fit.adjusted %>%
  tbl_regression(
                 estimate_fun = function(x) style_sigfig(x, digits = 3),
                 pvalue_fun   = function(x) style_pvalue(x, digits = 3),
                 label  = list(MINORPCT ~ "Racial Minority %",
                               LOWINCPCT   ~ "Low Income %",
                               LESSHSPCT ~ "Less than HS Ed. %",
                               LINGISOPCT ~ "Linguistic Isolation %",
                               UNDER5PCT ~ "Under 5%",
                               OVER64PCT   ~ "Over 64%",
                               PRE1960PCT ~ "Houses Built Pre-1960 %",
                               PTRAF_Q ~ "Proximity to Traffic (log)",
                               PWDIS_Q ~ "Proximity to Wastewater Discharge Facility",
                               PRMP_Q ~ "Proximity to RMP Facility",
                               PTDF_Q ~ "Proximity to Hazardous Waste Facility",
                               OZONE ~ "Average O3 Ambient Air Concentration",
                               PM25 ~ "Average Annual PM 2.5 Ambient Air Concentration",
                               UST ~ "Proximity to Underground Storage Facility"
                              )) %>%
  add_global_p(keep = T) %>% 
  modify_caption(paste("Linear regression results for proximity to Superfund sites (log transformed) by census block group (N = ", NOBS, ")", sep=""))

Linear regression results for proximity to Superfund sites (log transformed) by census block group (N = 2190)
Characteristic	Beta	95% CI¹	p-value
Racial Minority %	-0.458	-0.597, -0.319	<0.001
Low Income %	0.188	-0.058, 0.434	0.134
Less than HS Ed. %	0.111	-0.285, 0.507	0.584
Linguistic Isolation %	0.609	0.126, 1.09	0.013
Under 5%	0.365	-0.333, 1.06	0.306
Over 64%	-0.110	-0.377, 0.156	0.417
Houses Built Pre-1960 %	-0.191	-0.313, -0.069	0.002
Proximity to Traffic (log)	0.032	0.009, 0.054	0.006
Proximity to Wastewater Discharge Facility	0.189	0.166, 0.213	<0.001
Proximity to RMP Facility	-0.246	-0.273, -0.218	<0.001
Proximity to Hazardous Waste Facility	-0.176	-0.199, -0.154	<0.001
Average O3 Ambient Air Concentration	-0.162	-0.219, -0.104	<0.001
Average Annual PM 2.5 Ambient Air Concentration	0.711	0.608, 0.815	<0.001
Proximity to Underground Storage Facility	0.001	-0.014, 0.017	0.854
¹ CI = Confidence Interval

# plot histogram of residuals
hist(resid(fit.adjusted), breaks = 30, col = "grey", main = "Histogram of Residuals")

# create QQ-plot of residuals
qqnorm(resid(fit.adjusted))
qqline(resid(fit.adjusted))

# Fit the linear regression model
model <- glm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
    LINGISOPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + 
    PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df)

Fitting best linear model

# Removing variables with low coefficents or those that are insignficant

reduced_model <- step(fit.adjusted, direction = "backward")

## Start:  AIC=-1862.75
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Sum of Sq     RSS     AIC
## - UST         1     0.014  922.79 -1864.7
## - LESSHSPCT   1     0.128  922.91 -1864.4
## - OVER64PCT   1     0.280  923.06 -1864.1
## - UNDER5PCT   1     0.445  923.22 -1863.7
## <none>                     922.78 -1862.8
## - LOWINCPCT   1     0.952  923.73 -1862.5
## - LINGISOPCT  1     2.598  925.38 -1858.6
## - PTRAF_Q     1     3.203  925.98 -1857.2
## - PRE1960PCT  1     4.010  926.79 -1855.2
## - OZONE       1    12.946  935.72 -1834.2
## - MINORPCT    1    17.791  940.57 -1822.9
## - PM25        1    76.709  999.49 -1689.9
## - PTDF_Q      1   100.397 1023.18 -1638.6
## - PWDIS_Q     1   104.065 1026.84 -1630.7
## - PRMP_Q      1   126.856 1049.63 -1582.7
## 
## Step:  AIC=-1864.71
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25
## 
##              Df Sum of Sq     RSS     AIC
## - LESSHSPCT   1     0.126  922.92 -1866.4
## - OVER64PCT   1     0.282  923.07 -1866.0
## - UNDER5PCT   1     0.439  923.23 -1865.7
## <none>                     922.79 -1864.7
## - LOWINCPCT   1     0.953  923.75 -1864.5
## - LINGISOPCT  1     2.641  925.43 -1860.5
## - PTRAF_Q     1     3.200  925.99 -1859.1
## - PRE1960PCT  1     3.999  926.79 -1857.2
## - OZONE       1    13.787  936.58 -1834.2
## - MINORPCT    1    17.808  940.60 -1824.8
## - PM25        1    79.654 1002.45 -1685.4
## - PTDF_Q      1   100.517 1023.31 -1640.3
## - PWDIS_Q     1   104.950 1027.74 -1630.8
## - PRMP_Q      1   127.430 1050.22 -1583.4
## 
## Step:  AIC=-1866.41
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LINGISOPCT + UNDER5PCT + 
##     OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + 
##     OZONE + PM25
## 
##              Df Sum of Sq     RSS     AIC
## - OVER64PCT   1     0.289  923.21 -1867.7
## - UNDER5PCT   1     0.444  923.36 -1867.4
## <none>                     922.92 -1866.4
## - LOWINCPCT   1     1.259  924.18 -1865.4
## - LINGISOPCT  1     3.171  926.09 -1860.9
## - PTRAF_Q     1     3.202  926.12 -1860.8
## - PRE1960PCT  1     4.082  927.00 -1858.8
## - OZONE       1    13.684  936.60 -1836.2
## - MINORPCT    1    19.503  942.42 -1822.6
## - PM25        1    79.948 1002.87 -1686.5
## - PTDF_Q      1   101.304 1024.22 -1640.3
## - PWDIS_Q     1   104.919 1027.84 -1632.6
## - PRMP_Q      1   128.395 1051.31 -1583.2
## 
## Step:  AIC=-1867.73
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LINGISOPCT + UNDER5PCT + 
##     PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + 
##     PM25
## 
##              Df Sum of Sq     RSS     AIC
## - UNDER5PCT   1     0.683  923.89 -1868.1
## <none>                     923.21 -1867.7
## - LOWINCPCT   1     1.180  924.39 -1866.9
## - LINGISOPCT  1     3.146  926.35 -1862.3
## - PTRAF_Q     1     3.163  926.37 -1862.2
## - PRE1960PCT  1     3.930  927.14 -1860.4
## - OZONE       1    13.513  936.72 -1837.9
## - MINORPCT    1    19.569  942.78 -1823.8
## - PM25        1    79.696 1002.90 -1688.4
## - PTDF_Q      1   101.704 1024.91 -1640.8
## - PWDIS_Q     1   104.928 1028.14 -1634.0
## - PRMP_Q      1   128.330 1051.54 -1584.7
## 
## Step:  AIC=-1868.11
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LINGISOPCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25
## 
##              Df Sum of Sq     RSS     AIC
## <none>                     923.89 -1868.1
## - LOWINCPCT   1     1.307  925.20 -1867.0
## - PTRAF_Q     1     3.082  926.97 -1862.8
## - LINGISOPCT  1     3.136  927.03 -1862.7
## - PRE1960PCT  1     3.795  927.69 -1861.1
## - OZONE       1    13.553  937.44 -1838.2
## - MINORPCT    1    19.096  942.99 -1825.3
## - PM25        1    79.587 1003.48 -1689.1
## - PTDF_Q      1   101.747 1025.64 -1641.3
## - PWDIS_Q     1   104.938 1028.83 -1634.5
## - PRMP_Q      1   128.251 1052.14 -1585.4

summary(reduced_model)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LINGISOPCT + 
##     PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + 
##     PM25, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8108 -0.4678 -0.1027  0.3972  2.4349 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  0.91945    1.08174   0.850              0.39543    
## MINORPCT    -0.42234    0.06293  -6.711      0.0000000000246 ***
## LOWINCPCT    0.21010    0.11966   1.756              0.07926 .  
## LINGISOPCT   0.64417    0.23685   2.720              0.00659 ** 
## PRE1960PCT  -0.18421    0.06157  -2.992              0.00281 ** 
## PTRAF_Q      0.03073    0.01140   2.696              0.00707 ** 
## PWDIS_Q      0.18911    0.01202  15.732 < 0.0000000000000002 ***
## PRMP_Q      -0.24569    0.01413 -17.392 < 0.0000000000000002 ***
## PTDF_Q      -0.17710    0.01143 -15.491 < 0.0000000000000002 ***
## OZONE       -0.16096    0.02847  -5.654      0.0000000177607 ***
## PM25         0.70780    0.05166  13.701 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6512 on 2179 degrees of freedom
## Multiple R-squared:  0.523,  Adjusted R-squared:  0.5209 
## F-statistic:   239 on 10 and 2179 DF,  p-value: < 0.00000000000000022

reduced_model %>%
  tbl_regression(intercept = T,
                 estimate_fun = function(x) style_sigfig(x, digits = 2),
                 pvalue_fun   = function(x) style_pvalue(x, digits = 3),
                 label  = list(MINORPCT ~ "Racial Minority %",
                               LOWINCPCT   ~ "Low Income %",
                               LINGISOPCT ~ "Linguistic Isolation %",
                               PRE1960PCT ~ "Houses Built Pre-1960 %",
                               PTRAF_Q ~ "Proximity to Traffic",
                               PWDIS_Q ~ "Proximity to Wastewater Discharge Facility",
                               PRMP_Q ~ "Proximity to RMP Facility",
                               PTDF_Q ~ "Proximity to Hazardous Waste Facility",
                               OZONE ~ "Average O3 Ambient Air Concentration",
                               PM25 ~ "Average Annual PM 2.5 Ambient Air Concentration"                              )) %>%
  add_global_p(keep = T) %>% 
  modify_caption(paste("Reduced Linear regression results for proximity to Superfund sites (log transformed) by census block group (N = ", NOBS, ")", sep=""))

Reduced Linear regression results for proximity to Superfund sites (log transformed) by census block group (N = 2190)
Characteristic	Beta	95% CI¹	p-value
(Intercept)	0.92	-1.2, 3.0	0.395
Racial Minority %	-0.42	-0.55, -0.30	<0.001
Low Income %	0.21	-0.02, 0.44	0.079
Linguistic Isolation %	0.64	0.18, 1.1	0.007
Houses Built Pre-1960 %	-0.18	-0.30, -0.06	0.003
Proximity to Traffic	0.03	0.01, 0.05	0.007
Proximity to Wastewater Discharge Facility	0.19	0.17, 0.21	<0.001
Proximity to RMP Facility	-0.25	-0.27, -0.22	<0.001
Proximity to Hazardous Waste Facility	-0.18	-0.20, -0.15	<0.001
Average O3 Ambient Air Concentration	-0.16	-0.22, -0.11	<0.001
Average Annual PM 2.5 Ambient Air Concentration	0.71	0.61, 0.81	<0.001
¹ CI = Confidence Interval

# Get the coefficient estimates and confidence intervals
tidy_fit_reduced <- tidy(reduced_model, exponentiate = TRUE, conf.int = TRUE)

tidy_fit_reduced <- tidy(reduced_model, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Facility Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit_reduced %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	2.51	12.30	1.35	1217420362.37
Percent Minority	0.66	1.93	1.79	2.10
Percent Low Income	1.23	3.42	2.66	4.76
Percent Limited English Proficiency	1.90	6.69	3.32	20.70
Percent Pre-1960 Housing	0.83	2.29	2.10	2.56
Traffic Density Quartile	1.03	2.80	2.75	2.86
Proximity to Wastewater Discharge Facility Quartile	1.21	3.35	3.25	3.46
Proximity to RMP Facility Quartile	0.78	2.18	2.14	2.23
Proximity to Hazardous Waste Facility Quartile	0.84	2.32	2.27	2.36
Ozone Concentration (ppb)	0.85	2.34	2.25	2.46
PM2.5 Concentration (ug/m3)	2.03	7.61	6.23	9.49

Exponentiate Model

### Exponentiation Model

# Get the coefficient estimates and confidence intervals
tidy_fit <- tidy(fit.adjusted, exponentiate = TRUE, conf.int = TRUE)

tidy_fit <- tidy(fit.adjusted, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Facility Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Proximity to Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	2.53	12.55	1.34	4335054416.82
Percent Minority	0.63	1.88	1.73	2.08
Percent Low Income	1.21	3.35	2.56	4.66
Percent Less than High School Education	1.12	3.06	2.12	5.26
Percent Limited English Proficiency	1.84	6.30	3.10	19.69
Percent Under 5 Years Old	1.44	4.22	2.05	17.99
Percent Over 64 Years Old	0.90	2.46	1.99	3.22
Percent Pre-1960 Housing	0.83	2.29	2.08	2.53
Traffic Density Quartile	1.03	2.80	2.75	2.89
Proximity to Wastewater Discharge Facility Quartile	1.21	3.35	3.25	3.46
Proximity to RMP Facility Quartile	0.78	2.18	2.14	2.23
Proximity to Hazardous Waste Facility Quartile	0.84	2.32	2.27	2.36
Ozone Concentration (ppb)	0.85	2.34	2.23	2.46
PM2.5 Concentration (ug/m3)	2.04	7.69	6.30	9.58
Proximity to Underground Storage Tanks	1.00	2.72	2.69	2.77

# Get the coefficient estimates and confidence intervals
tidy_fit <- tidy(fit.adjusted, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Quartile",
      term == "PRMP_Q" ~ "Percent in Poverty Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

# Filter significant and positive effect variables
tidy_fit_significant <- tidy_fit %>%
  filter(p.value < 0.05, exp_estimate > 1) %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  arrange(desc(exp_estimate))

# Display the results
tidy_fit_significant %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
PM2.5 Concentration (ug/m3)	2.04	7.69	6.30	9.58
Percent Limited English Proficiency	1.84	6.30	3.10	19.69
Proximity to Wastewater Discharge Quartile	1.21	3.35	3.25	3.46
Traffic Density Quartile	1.03	2.80	2.75	2.89
Ozone Concentration (ppb)	0.85	2.34	2.23	2.46
Proximity to Hazardous Waste Facility Quartile	0.84	2.32	2.27	2.36
Percent Pre-1960 Housing	0.83	2.29	2.08	2.53
Percent in Poverty Quartile	0.78	2.18	2.14	2.23
Percent Minority	0.63	1.88	1.73	2.08

Linear Regression - Nassau

NassauEJ <- subset(EJScreen, CNTY_NAME == "Nassau County")
df_nassau = as.data.frame(NassauEJ)

fit.adjusted.nassau <- lm(PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT +
                     LINGISOPCT + UNDER5PCT + OVER64PCT
                   + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df_nassau)

# Obtain model summary
summary(fit.adjusted.nassau)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
##     LINGISOPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df_nassau)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.01836 -0.43066 -0.03288  0.42088  2.39012 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -11.97764    2.31475  -5.174 0.000000270717775821 ***
## MINORPCT     -0.77608    0.09451  -8.212 0.000000000000000597 ***
## LOWINCPCT     0.36371    0.19091   1.905               0.0570 .  
## LESSHSPCT     0.54359    0.30127   1.804               0.0714 .  
## LINGISOPCT    0.24242    0.31480   0.770               0.4414    
## UNDER5PCT     0.72408    0.52116   1.389               0.1650    
## OVER64PCT    -0.01349    0.24772  -0.054               0.9566    
## PRE1960PCT    0.20892    0.10379   2.013               0.0444 *  
## PTRAF_Q      -0.00969    0.01659  -0.584               0.5593    
## PWDIS_Q       0.20242    0.01713  11.820 < 0.0000000000000002 ***
## PRMP_Q       -0.16811    0.02655  -6.332 0.000000000348690279 ***
## PTDF_Q       -0.15018    0.01729  -8.686 < 0.0000000000000002 ***
## OZONE         0.06728    0.05137   1.310               0.1905    
## PM25          1.03121    0.09194  11.216 < 0.0000000000000002 ***
## UST           0.01773    0.00880   2.015               0.0441 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6779 on 1119 degrees of freedom
## Multiple R-squared:  0.4278, Adjusted R-squared:  0.4207 
## F-statistic: 59.76 on 14 and 1119 DF,  p-value: < 0.00000000000000022

# Calculate VIF values
vif(fit.adjusted.nassau)

##   MINORPCT  LOWINCPCT  LESSHSPCT LINGISOPCT  UNDER5PCT  OVER64PCT PRE1960PCT 
##   1.835252   1.553097   1.918606   1.515844   1.078144   1.150854   1.170427 
##    PTRAF_Q    PWDIS_Q     PRMP_Q     PTDF_Q      OZONE       PM25        UST 
##   1.180701   1.362653   1.669211   1.276865   1.681449   1.312099   1.175993

# Get the coefficient estimates and confidence intervals
tidy_fit_nassau <- tidy(fit.adjusted.nassau, exponentiate = TRUE, conf.int = TRUE)

tidy_fit_nassau <- tidy(fit.adjusted.nassau, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Facility Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Proximity to Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit_nassau %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	0.00	1.00	1.00	1.00
Percent Minority	0.46	1.58	1.46	1.73
Percent Low Income	1.44	4.22	2.69	8.08
Percent Less than High School Education	1.72	5.58	2.59	22.42
Percent Limited English Proficiency	1.27	3.56	1.99	10.59
Percent Under 5 Years Old	2.06	7.85	2.10	311.06
Percent Over 64 Years Old	0.99	2.69	1.84	4.95
Percent Pre-1960 Housing	1.23	3.42	2.75	4.53
Traffic Density Quartile	0.99	2.69	2.61	2.77
Proximity to Wastewater Discharge Facility Quartile	1.22	3.39	3.25	3.56
Proximity to RMP Facility Quartile	0.85	2.34	2.23	2.44
Proximity to Hazardous Waste Facility Quartile	0.86	2.36	2.29	2.44
Ozone Concentration (ppb)	1.07	2.92	2.64	3.25
PM2.5 Concentration (ug/m3)	2.80	16.44	10.38	28.79
Proximity to Underground Storage Tanks	1.02	2.77	2.72	2.83

### Significant results

# Get the coefficient estimates and confidence intervals
tidy_fit_nassau <- tidy(fit.adjusted.nassau, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Quartile",
      term == "PRMP_Q" ~ "Percent in Poverty Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

# Filter significant and positive effect variables
tidy_fit_significant_nassau <- tidy_fit_nassau %>%
  filter(p.value < 0.05, exp_estimate > 1) %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  arrange(desc(exp_estimate))

# Display the results
tidy_fit_significant_nassau %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
PM2.5 Concentration (ug/m3)	2.80	16.44	10.38	28.79
Percent Pre-1960 Housing	1.23	3.42	2.75	4.53
Proximity to Wastewater Discharge Quartile	1.22	3.39	3.25	3.56
Underground Storage Tanks	1.02	2.77	2.72	2.83
Proximity to Hazardous Waste Facility Quartile	0.86	2.36	2.29	2.44
Percent in Poverty Quartile	0.85	2.34	2.23	2.44
Percent Minority	0.46	1.58	1.46	1.73

## Reduced Model for Nassau

reduced_model_nassau <- step(fit.adjusted.nassau, direction = "backward")

## Start:  AIC=-866.76
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Sum of Sq    RSS     AIC
## - OVER64PCT   1     0.001 514.25 -868.76
## - PTRAF_Q     1     0.157 514.41 -868.41
## - LINGISOPCT  1     0.273 514.52 -868.16
## - OZONE       1     0.789 515.04 -867.02
## - UNDER5PCT   1     0.887 515.14 -866.81
## <none>                    514.25 -866.76
## - LESSHSPCT   1     1.496 515.75 -865.47
## - LOWINCPCT   1     1.668 515.92 -865.09
## - PRE1960PCT  1     1.862 516.11 -864.66
## - UST         1     1.866 516.12 -864.65
## - PRMP_Q      1    18.429 532.68 -828.83
## - MINORPCT    1    30.988 545.24 -802.41
## - PTDF_Q      1    34.672 548.92 -794.77
## - PM25        1    57.811 572.06 -747.95
## - PWDIS_Q     1    64.209 578.46 -735.34
## 
## Step:  AIC=-868.76
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + 
##     OZONE + PM25 + UST
## 
##              Df Sum of Sq    RSS     AIC
## - PTRAF_Q     1     0.156 514.41 -870.41
## - LINGISOPCT  1     0.273 514.53 -870.15
## - OZONE       1     0.788 515.04 -869.02
## <none>                    514.25 -868.76
## - UNDER5PCT   1     0.951 515.20 -868.66
## - LESSHSPCT   1     1.495 515.75 -867.46
## - LOWINCPCT   1     1.667 515.92 -867.09
## - UST         1     1.872 516.13 -866.64
## - PRE1960PCT  1     1.906 516.16 -866.56
## - PRMP_Q      1    18.470 532.72 -830.74
## - MINORPCT    1    32.096 546.35 -802.10
## - PTDF_Q      1    34.746 549.00 -796.62
## - PM25        1    58.211 572.46 -749.15
## - PWDIS_Q     1    64.219 578.47 -737.31
## 
## Step:  AIC=-870.41
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + PRE1960PCT + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + 
##     PM25 + UST
## 
##              Df Sum of Sq    RSS     AIC
## - LINGISOPCT  1     0.300 514.71 -871.75
## - OZONE       1     0.786 515.20 -870.68
## <none>                    514.41 -870.41
## - UNDER5PCT   1     0.984 515.39 -870.25
## - LESSHSPCT   1     1.478 515.89 -869.16
## - LOWINCPCT   1     1.658 516.07 -868.76
## - PRE1960PCT  1     1.963 516.37 -868.09
## - UST         1     2.029 516.44 -867.95
## - PRMP_Q      1    19.300 533.71 -830.65
## - MINORPCT    1    32.077 546.49 -803.82
## - PTDF_Q      1    36.376 550.78 -794.93
## - PM25        1    58.258 572.67 -750.75
## - PWDIS_Q     1    64.669 579.08 -738.13
## 
## Step:  AIC=-871.75
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + UNDER5PCT + PRE1960PCT + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Sum of Sq    RSS     AIC
## - OZONE       1     0.870 515.58 -871.84
## <none>                    514.71 -871.75
## - UNDER5PCT   1     0.980 515.69 -871.59
## - LOWINCPCT   1     1.965 516.67 -869.43
## - PRE1960PCT  1     1.972 516.68 -869.42
## - LESSHSPCT   1     2.099 516.81 -869.14
## - UST         1     2.179 516.89 -868.96
## - PRMP_Q      1    19.142 533.85 -832.34
## - MINORPCT    1    31.826 546.53 -805.72
## - PTDF_Q      1    36.352 551.06 -796.36
## - PM25        1    59.988 574.70 -748.74
## - PWDIS_Q     1    64.391 579.10 -740.08
## 
## Step:  AIC=-871.84
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + UNDER5PCT + PRE1960PCT + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + PM25 + UST
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    515.58 -871.84
## - UNDER5PCT   1     0.945 516.52 -871.76
## - PRE1960PCT  1     1.518 517.10 -870.50
## - UST         1     1.899 517.48 -869.67
## - LESSHSPCT   1     1.972 517.55 -869.51
## - LOWINCPCT   1     2.017 517.60 -869.41
## - PRMP_Q      1    28.221 543.80 -813.41
## - MINORPCT    1    35.144 550.72 -799.06
## - PTDF_Q      1    35.975 551.55 -797.35
## - PM25        1    61.478 577.06 -746.09
## - PWDIS_Q     1    79.838 595.42 -710.57

summary(reduced_model_nassau)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
##     UNDER5PCT + PRE1960PCT + PWDIS_Q + PRMP_Q + PTDF_Q + PM25 + 
##     UST, data = df_nassau)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.84426 -0.44494 -0.02027  0.42442  2.39773 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) -9.14761    0.70112 -13.047 < 0.0000000000000002 ***
## MINORPCT    -0.78168    0.08934  -8.749 < 0.0000000000000002 ***
## LOWINCPCT    0.39388    0.18791   2.096               0.0363 *  
## LESSHSPCT    0.59415    0.28665   2.073               0.0384 *  
## UNDER5PCT    0.72722    0.50700   1.434               0.1517    
## PRE1960PCT   0.18247    0.10035   1.818               0.0693 .  
## PWDIS_Q      0.21055    0.01597  13.187 < 0.0000000000000002 ***
## PRMP_Q      -0.18520    0.02362  -7.840   0.0000000000000104 ***
## PTDF_Q      -0.14539    0.01642  -8.852 < 0.0000000000000002 ***
## PM25         1.04287    0.09012  11.572 < 0.0000000000000002 ***
## UST          0.01757    0.00864   2.034               0.0422 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6776 on 1123 degrees of freedom
## Multiple R-squared:  0.4263, Adjusted R-squared:  0.4212 
## F-statistic: 83.46 on 10 and 1123 DF,  p-value: < 0.00000000000000022

# Get the coefficient estimates and confidence intervals
tidy_fit_nassau_reduced <- tidy(reduced_model_nassau, exponentiate = TRUE, conf.int = TRUE)

tidy_fit_nassau_reduced <- tidy(reduced_model_nassau, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Facility Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Proximity to Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit_nassau_reduced %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	0.00	1.00	1.00	1.00
Percent Minority	0.46	1.58	1.46	1.73
Percent Low Income	1.48	4.39	2.80	8.50
Percent Less than High School Education	1.81	6.11	2.80	24.05
Percent Under 5 Years Old	2.07	7.92	2.16	270.43
Percent Pre-1960 Housing	1.20	3.32	2.69	4.31
Proximity to Wastewater Discharge Facility Quartile	1.23	3.42	3.32	3.56
Proximity to RMP Facility Quartile	0.83	2.29	2.20	2.39
Proximity to Hazardous Waste Facility Quartile	0.86	2.36	2.32	2.44
PM2.5 Concentration (ug/m3)	2.84	17.12	10.80	29.67
Proximity to Underground Storage Tanks	1.02	2.77	2.72	2.83

SuffolkEJ <- subset(EJScreen, CNTY_NAME == "Suffolk County")
df_Suffolk = as.data.frame(SuffolkEJ)

fit.adjusted.Suffolk <- lm(PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT +
                     LINGISOPCT + UNDER5PCT + OVER64PCT
                   + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + UST, data = df_Suffolk)


# Obtain model summary
summary(fit.adjusted.Suffolk)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
##     LINGISOPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + UST, data = df_Suffolk)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17511 -0.34711 -0.09261  0.23723  2.14710 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  0.96196    1.16077   0.829              0.40745    
## MINORPCT     0.13057    0.09857   1.325              0.18558    
## LOWINCPCT   -0.06567    0.14488  -0.453              0.65048    
## LESSHSPCT   -0.49640    0.24550  -2.022              0.04343 *  
## LINGISOPCT   0.21371    0.37562   0.569              0.56950    
## UNDER5PCT   -0.38233    0.42412  -0.901              0.36755    
## OVER64PCT   -0.33442    0.14357  -2.329              0.02004 *  
## PRE1960PCT  -0.53673    0.07934  -6.765      0.0000000000221 ***
## PTRAF_Q      0.02129    0.01430   1.489              0.13678    
## PWDIS_Q      0.02004    0.02058   0.974              0.33047    
## PRMP_Q      -0.26506    0.01629 -16.272 < 0.0000000000000002 ***
## PTDF_Q      -0.24489    0.01399 -17.499 < 0.0000000000000002 ***
## OZONE       -0.01676    0.02563  -0.654              0.51322    
## UST         -0.05473    0.01934  -2.829              0.00475 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.541 on 1042 degrees of freedom
## Multiple R-squared:  0.513,  Adjusted R-squared:  0.5069 
## F-statistic: 84.44 on 13 and 1042 DF,  p-value: < 0.00000000000000022

# Calculate VIF values
vif(fit.adjusted.Suffolk)

##   MINORPCT  LOWINCPCT  LESSHSPCT LINGISOPCT  UNDER5PCT  OVER64PCT PRE1960PCT 
##   2.288209   1.339459   2.048313   1.378826   1.139644   1.325095   1.116388 
##    PTRAF_Q    PWDIS_Q     PRMP_Q     PTDF_Q      OZONE        UST 
##   1.381084   1.118711   1.336111   1.421702   1.147994   1.164093

# PM2.5 was taken out of the minal due to having a VIF of 9.32


# Get the coefficient estimates and confidence intervals
tidy_fit_Suffolk <- tidy(fit.adjusted.Suffolk, exponentiate = TRUE, conf.int = TRUE)

tidy_fit_Suffolk <- tidy(fit.adjusted.Suffolk, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Facility Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Proximity to Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit_Suffolk %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	2.62	13.74	1.31	122331449863.11
Percent Minority	1.14	3.13	2.56	3.97
Percent Low Income	0.94	2.56	2.01	3.46
Percent Less than High School Education	0.61	1.84	1.46	2.69
Percent Limited English Proficiency	1.24	3.46	1.80	13.33
Percent Under 5 Years Old	0.68	1.97	1.35	4.81
Percent Over 64 Years Old	0.72	2.05	1.72	2.59
Percent Pre-1960 Housing	0.58	1.79	1.65	1.97
Traffic Density Quartile	1.02	2.77	2.69	2.86
Proximity to Wastewater Discharge Facility Quartile	1.02	2.77	2.66	2.89
Proximity to RMP Facility Quartile	0.77	2.16	2.10	2.20
Proximity to Hazardous Waste Facility Quartile	0.78	2.18	2.14	2.23
Ozone Concentration (ppb)	0.98	2.66	2.56	2.80
Proximity to Underground Storage Tanks	0.95	2.59	2.48	2.66

### Significant results

# Get the coefficient estimates and confidence intervals
tidy_fit_Suffolk <- tidy(fit.adjusted.Suffolk, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "MINORPCT" ~ "Percent Minority",
      term == "LOWINCPCT" ~ "Percent Low Income",
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "LINGISOPCT" ~ "Percent Limited English Proficiency",
      term == "UNDER5PCT" ~ "Percent Under 5 Years Old",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PWDIS_Q" ~ "Proximity to Wastewater Discharge Quartile",
      term == "PRMP_Q" ~ "Percent in Poverty Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

# Filter significant and positive effect variables
tidy_fit_significant_Suffolk <- tidy_fit_Suffolk %>%
  filter(p.value < 0.05, exp_estimate > 1) %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  arrange(desc(exp_estimate))

# Display the results
tidy_fit_significant_Suffolk %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
Underground Storage Tanks	0.95	2.59	2.48	2.66
Proximity to Hazardous Waste Facility Quartile	0.78	2.18	2.14	2.23
Percent in Poverty Quartile	0.77	2.16	2.10	2.20
Percent Over 64 Years Old	0.72	2.05	1.72	2.59
Percent Less than High School Education	0.61	1.84	1.46	2.69
Percent Pre-1960 Housing	0.58	1.79	1.65	1.97

## Reduced Model for Suffolk

reduced_model_Suffolk <- step(fit.adjusted.Suffolk, direction = "backward")

## Start:  AIC=-1283.64
## PNPL_LOGAR ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + UST
## 
##              Df Sum of Sq    RSS     AIC
## - LOWINCPCT   1     0.060 305.01 -1285.4
## - LINGISOPCT  1     0.095 305.05 -1285.3
## - OZONE       1     0.125 305.08 -1285.2
## - UNDER5PCT   1     0.238 305.19 -1284.8
## - PWDIS_Q     1     0.277 305.23 -1284.7
## - MINORPCT    1     0.514 305.47 -1283.9
## <none>                    304.95 -1283.6
## - PTRAF_Q     1     0.649 305.60 -1283.4
## - LESSHSPCT   1     1.197 306.15 -1281.5
## - OVER64PCT   1     1.588 306.54 -1280.2
## - UST         1     2.343 307.30 -1277.6
## - PRE1960PCT  1    13.395 318.35 -1240.2
## - PRMP_Q      1    77.490 382.44 -1046.5
## - PTDF_Q      1    89.617 394.57 -1013.6
## 
## Step:  AIC=-1285.43
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + LINGISOPCT + UNDER5PCT + 
##     OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + 
##     OZONE + UST
## 
##              Df Sum of Sq    RSS     AIC
## - LINGISOPCT  1     0.082 305.10 -1287.2
## - OZONE       1     0.108 305.12 -1287.1
## - UNDER5PCT   1     0.257 305.27 -1286.5
## - PWDIS_Q     1     0.289 305.30 -1286.4
## - MINORPCT    1     0.477 305.49 -1285.8
## <none>                    305.01 -1285.4
## - PTRAF_Q     1     0.660 305.67 -1285.2
## - LESSHSPCT   1     1.415 306.43 -1282.5
## - OVER64PCT   1     1.679 306.69 -1281.6
## - UST         1     2.381 307.39 -1279.2
## - PRE1960PCT  1    13.344 318.36 -1242.2
## - PRMP_Q      1    79.224 384.24 -1043.6
## - PTDF_Q      1    89.557 394.57 -1015.6
## 
## Step:  AIC=-1287.15
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + UST
## 
##              Df Sum of Sq    RSS     AIC
## - OZONE       1     0.109 305.20 -1288.8
## - UNDER5PCT   1     0.253 305.35 -1288.3
## - PWDIS_Q     1     0.273 305.37 -1288.2
## <none>                    305.10 -1287.2
## - MINORPCT    1     0.586 305.68 -1287.1
## - PTRAF_Q     1     0.636 305.73 -1287.0
## - LESSHSPCT   1     1.334 306.43 -1284.5
## - OVER64PCT   1     1.663 306.76 -1283.4
## - UST         1     2.381 307.48 -1280.9
## - PRE1960PCT  1    13.351 318.45 -1243.9
## - PRMP_Q      1    79.203 384.30 -1045.4
## - PTDF_Q      1    89.532 394.63 -1017.4
## 
## Step:  AIC=-1288.77
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + UST
## 
##              Df Sum of Sq    RSS     AIC
## - UNDER5PCT   1     0.240 305.44 -1289.9
## - PWDIS_Q     1     0.373 305.58 -1289.5
## <none>                    305.20 -1288.8
## - MINORPCT    1     0.601 305.81 -1288.7
## - PTRAF_Q     1     0.668 305.87 -1288.5
## - LESSHSPCT   1     1.290 306.49 -1286.3
## - OVER64PCT   1     1.631 306.84 -1285.1
## - UST         1     2.301 307.51 -1282.8
## - PRE1960PCT  1    14.009 319.21 -1243.4
## - PRMP_Q      1    79.451 384.66 -1046.5
## - PTDF_Q      1    89.652 394.86 -1018.8
## 
## Step:  AIC=-1289.94
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + OVER64PCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + UST
## 
##              Df Sum of Sq    RSS     AIC
## - PWDIS_Q     1     0.367 305.81 -1290.7
## - MINORPCT    1     0.516 305.96 -1290.2
## <none>                    305.44 -1289.9
## - PTRAF_Q     1     0.676 306.12 -1289.6
## - LESSHSPCT   1     1.247 306.69 -1287.6
## - OVER64PCT   1     1.431 306.88 -1287.0
## - UST         1     2.307 307.75 -1284.0
## - PRE1960PCT  1    14.140 319.58 -1244.2
## - PRMP_Q      1    79.682 385.13 -1047.2
## - PTDF_Q      1    89.585 395.03 -1020.4
## 
## Step:  AIC=-1290.68
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + OVER64PCT + PRE1960PCT + 
##     PTRAF_Q + PRMP_Q + PTDF_Q + UST
## 
##              Df Sum of Sq    RSS     AIC
## - PTRAF_Q     1     0.575 306.39 -1290.7
## <none>                    305.81 -1290.7
## - MINORPCT    1     0.601 306.41 -1290.6
## - LESSHSPCT   1     1.313 307.12 -1288.2
## - OVER64PCT   1     1.420 307.23 -1287.8
## - UST         1     2.290 308.10 -1284.8
## - PRE1960PCT  1    14.951 320.76 -1242.3
## - PRMP_Q      1    81.375 387.19 -1043.5
## - PTDF_Q      1    89.493 395.30 -1021.6
## 
## Step:  AIC=-1290.69
## PNPL_LOGAR ~ MINORPCT + LESSHSPCT + OVER64PCT + PRE1960PCT + 
##     PRMP_Q + PTDF_Q + UST
## 
##              Df Sum of Sq    RSS     AIC
## - MINORPCT    1     0.581 306.97 -1290.7
## <none>                    306.39 -1290.7
## - OVER64PCT   1     1.332 307.72 -1288.1
## - LESSHSPCT   1     1.364 307.75 -1288.0
## - UST         1     2.982 309.37 -1282.5
## - PRE1960PCT  1    15.453 321.84 -1240.7
## - PRMP_Q      1    81.385 387.77 -1043.9
## - PTDF_Q      1    92.687 399.07 -1013.6
## 
## Step:  AIC=-1290.69
## PNPL_LOGAR ~ LESSHSPCT + OVER64PCT + PRE1960PCT + PRMP_Q + PTDF_Q + 
##     UST
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    306.97 -1290.7
## - LESSHSPCT   1     0.789 307.75 -1290.0
## - OVER64PCT   1     2.125 309.09 -1285.4
## - UST         1     2.899 309.87 -1282.8
## - PRE1960PCT  1    16.497 323.46 -1237.4
## - PRMP_Q      1    83.900 390.87 -1037.5
## - PTDF_Q      1    97.029 403.99 -1002.6

summary(reduced_model_Suffolk)

## 
## Call:
## lm(formula = PNPL_LOGAR ~ LESSHSPCT + OVER64PCT + PRE1960PCT + 
##     PRMP_Q + PTDF_Q + UST, data = df_Suffolk)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21203 -0.34717 -0.09136  0.24412  2.18213 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  0.39988    0.08024   4.983    0.000000730179649 ***
## LESSHSPCT   -0.29955    0.18247  -1.642              0.10095    
## OVER64PCT   -0.35501    0.13175  -2.695              0.00716 ** 
## PRE1960PCT  -0.57751    0.07692  -7.508    0.000000000000128 ***
## PRMP_Q      -0.26556    0.01568 -16.933 < 0.0000000000000002 ***
## PTDF_Q      -0.23903    0.01313 -18.209 < 0.0000000000000002 ***
## UST         -0.05893    0.01872  -3.148              0.00169 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.541 on 1049 degrees of freedom
## Multiple R-squared:  0.5098, Adjusted R-squared:  0.507 
## F-statistic: 181.8 on 6 and 1049 DF,  p-value: < 0.00000000000000022

# Get the coefficient estimates and confidence intervals
tidy_fit_Suffolk_reduced <- tidy(reduced_model_Suffolk, exponentiate = TRUE, conf.int = TRUE)

tidy_fit_Suffolk_reduced <- tidy(reduced_model_Suffolk, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(
    term = case_when(
      term == "LESSHSPCT" ~ "Percent Less than High School Education",
      term == "OVER64PCT" ~ "Percent Over 64 Years Old",
      term == "PRE1960PCT" ~ "Percent Pre-1960 Housing",
      term == "PTRAF_Q" ~ "Traffic Density Quartile",
      term == "PRMP_Q" ~ "Proximity to RMP Facility Quartile",
      term == "PTDF_Q" ~ "Proximity to Hazardous Waste Facility Quartile",
      term == "OZONE" ~ "Ozone Concentration (ppb)",
      term == "PM25" ~ "PM2.5 Concentration (ug/m3)",
      term == "UST" ~ "Proximity to Underground Storage Tanks"
    ),
    estimate = round(estimate, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp_estimate = round(exp(estimate), 2),
    exp_ci_low = round(exp(conf.low), 2),
    exp_ci_high = round(exp(conf.high), 2)
  )

tidy_fit_Suffolk_reduced %>%
  select(term, estimate, exp_estimate, exp_ci_low, exp_ci_high) %>%
  kable(format = "html", align = "c") %>%
  kable_styling(full_width = FALSE)

term	estimate	exp_estimate	exp_ci_low	exp_ci_high
NA	1.49	4.44	3.56	5.75
Percent Less than High School Education	0.74	2.10	1.68	2.89
Percent Over 64 Years Old	0.70	2.01	1.72	2.48
Percent Pre-1960 Housing	0.56	1.75	1.62	1.92
Proximity to RMP Facility Quartile	0.77	2.16	2.10	2.20
Proximity to Hazardous Waste Facility Quartile	0.79	2.20	2.16	2.25
Proximity to Underground Storage Tanks	0.94	2.56	2.48	2.66

Percent Cange, original linear model

# Calculate coefficients as percentage change for a 1% increase in independent variable
coef_pct_change <- 100*(exp(coef(fit.adjusted)) - 1)

# Calculate confidence intervals for percentage change coefficients
SE <- sqrt(diag(vcov(fit.adjusted)))
lower_ci <- 100*(exp(coef(fit.adjusted))^(1 - 1.96*SE) - 1)
upper_ci <- 100*(exp(coef(fit.adjusted))^(1 + 1.96*SE) - 1)

# Create a new data frame to display the coefficients and confidence intervals
coef_table <- data.frame(Coefficient = names(coef_pct_change),
                         Percentage_change = coef_pct_change,
                         Lower_CI = lower_ci,
                         Upper_CI = upper_ci)

# Round the values in the table to two decimal places
coef_table[,2:4] <- round(coef_table[,2:4], 2)

# Print the table
print(coef_table)

##             Coefficient Percentage_change Lower_CI Upper_CI
## (Intercept) (Intercept)            152.72   -66.24  1791.65
## MINORPCT       MINORPCT            -36.76   -32.61   -40.65
## LOWINCPCT     LOWINCPCT             20.66    15.22    26.35
## LESSHSPCT     LESSHSPCT             11.71     6.92    16.71
## LINGISOPCT   LINGISOPCT             83.92    37.06   146.79
## UNDER5PCT     UNDER5PCT             43.98    11.66    85.66
## OVER64PCT     OVER64PCT            -10.44    -7.77   -13.04
## PRE1960PCT   PRE1960PCT            -17.40   -15.45   -19.30
## PTRAF_Q         PTRAF_Q              3.21     3.14     3.28
## PWDIS_Q         PWDIS_Q             20.85    20.31    21.39
## PRMP_Q           PRMP_Q            -21.77   -21.24   -22.31
## PTDF_Q           PTDF_Q            -16.17   -15.84   -16.50
## OZONE             OZONE            -14.93   -14.13   -15.71
## PM25               PM25            103.68    89.20   119.28
## UST                 UST              0.14     0.14     0.15

Logistic Regression to compute odds ratios

# CREATE LOGISTIC VARIABLE
MEDIAN <- round(median(df$PNPL),4)

df <- df %>% 
  mutate(PNPL_SPLIT = as.numeric(PNPL >= MEDIAN),
        PNPL_SPLIT = factor(PNPL_SPLIT,
                             levels = 0:1,
                             labels = c(0, 1)))

tapply(df$PNPL, df$PNPL_SPLIT, range)

## $`0`
## [1] 0.03104137 0.21264430
## 
## $`1`
## [1] 0.2128068 5.1370115

fit.logistic <- glm(PNPL_SPLIT ~ MINORPCT + LOWINCPCT + LESSHSPCT +
                     LINGISOPCT + UNDER5PCT + OVER64PCT
                   + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, data = df, family = binomial)

summary(fit.logistic)

## 
## Call:
## glm(formula = PNPL_SPLIT ~ MINORPCT + LOWINCPCT + LESSHSPCT + 
##     LINGISOPCT + UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + 
##     PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.47516  -0.69978   0.06315   0.63253   2.86338  
## 
## Coefficients:
##              Estimate Std. Error z value             Pr(>|z|)    
## (Intercept) 17.671083   5.107511   3.460             0.000541 ***
## MINORPCT    -0.724756   0.288209  -2.515             0.011914 *  
## LOWINCPCT   -0.005107   0.506163  -0.010             0.991949    
## LESSHSPCT    0.084824   0.840405   0.101             0.919604    
## LINGISOPCT   2.812117   1.076620   2.612             0.009002 ** 
## UNDER5PCT    0.126327   1.470678   0.086             0.931548    
## OVER64PCT   -0.132112   0.607200  -0.218             0.827760    
## PRE1960PCT  -1.064281   0.254760  -4.178        0.00002946252 ***
## PTRAF_Q      0.153802   0.045931   3.349             0.000812 ***
## PWDIS_Q      0.478843   0.052927   9.047 < 0.0000000000000002 ***
## PRMP_Q      -0.619451   0.058306 -10.624 < 0.0000000000000002 ***
## PTDF_Q      -0.668534   0.047859 -13.969 < 0.0000000000000002 ***
## OZONE       -0.799839   0.134573  -5.944        0.00000000279 ***
## PM25         2.554376   0.248990  10.259 < 0.0000000000000002 ***
## UST         -0.032458   0.030543  -1.063             0.287923    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3036.0  on 2189  degrees of freedom
## Residual deviance: 1905.6  on 2175  degrees of freedom
## AIC: 1935.6
## 
## Number of Fisher Scoring iterations: 5

fit.logistic %>%
  tbl_regression(exponentiate = TRUE,
                  intercept = T,
                 estimate_fun = function(x) style_sigfig(x, digits = 3),
                 pvalue_fun   = function(x) style_pvalue(x, digits = 3),
                 label  = list(MINORPCT ~ "Racial Minority %",
                               LOWINCPCT   ~ "Low Income %",
                               LESSHSPCT ~ "Less than HS Ed. %",
                               LINGISOPCT ~ "Linguistic Isolation %",
                               UNDER5PCT ~ "Under 5%",
                               OVER64PCT   ~ "Over 64%",
                               PRE1960PCT ~ "Houses Built Pre-1960 %",
                               PTRAF_Q ~ "Proximity to Traffic",
                               PWDIS_Q ~ "Proximity to Wastewater Discharge Facility",
                               PRMP_Q ~ "Proximity to RMP Facility",
                               PTDF_Q ~ "Proximity to Hazardous Waste Facility",
                               OZONE ~ "Average O3 Ambient Air Concentration",
                               PM25 ~ "Average Annual PM 2.5 Ambient Air Concentration",
                               UST ~ "Proximity to Underground Storage Facility"
                              )) %>%
  add_global_p(keep = T) %>% 
  modify_caption(paste("Logistic regression results for proximity to Superfund sites (split by median) by census block group (N = ", NOBS, ")", sep=""))

Logistic regression results for proximity to Superfund sites (split by median) by census block group (N = 2190)
Characteristic	OR¹	95% CI¹	p-value
(Intercept)	47,255,668	2,305, 1,158,569,667,952	<0.001
Racial Minority %	0.484	0.275, 0.851	0.012
Low Income %	0.995	0.370, 2.70	0.992
Less than HS Ed. %	1.09	0.211, 5.70	0.920
Linguistic Isolation %	16.6	2.07, 141	0.008
Under 5%	1.13	0.063, 20.3	0.932
Over 64%	0.876	0.265, 2.87	0.828
Houses Built Pre-1960 %	0.345	0.209, 0.567	<0.001
Proximity to Traffic	1.17	1.07, 1.28	<0.001
Proximity to Wastewater Discharge Facility	1.61	1.46, 1.79	<0.001
Proximity to RMP Facility	0.538	0.480, 0.603	<0.001
Proximity to Hazardous Waste Facility	0.512	0.466, 0.562	<0.001
Average O3 Ambient Air Concentration	0.449	0.344, 0.583	<0.001
Average Annual PM 2.5 Ambient Air Concentration	12.9	7.97, 21.2	<0.001
Proximity to Underground Storage Facility	0.968	0.912, 1.03	0.290
¹ OR = Odds Ratio, CI = Confidence Interval

# plot histogram of deviance residuals
hist(resid(fit.logistic), breaks = 30, col = "grey", main = "Histogram of Deviance Residuals")

# create QQ-plot of deviance residuals
qqnorm(resid(fit.logistic))
qqline(resid(fit.logistic))

Fitting best logistic model

reduced.logistic.model <- step(fit.logistic, direction = "both")

## Start:  AIC=1935.63
## PNPL_SPLIT ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Deviance    AIC
## - LOWINCPCT   1   1905.6 1933.6
## - UNDER5PCT   1   1905.6 1933.6
## - LESSHSPCT   1   1905.6 1933.6
## - OVER64PCT   1   1905.7 1933.7
## - UST         1   1906.8 1934.8
## <none>            1905.6 1935.6
## - MINORPCT    1   1912.0 1940.0
## - LINGISOPCT  1   1912.7 1940.7
## - PTRAF_Q     1   1917.0 1945.0
## - PRE1960PCT  1   1923.4 1951.4
## - OZONE       1   1943.3 1971.3
## - PWDIS_Q     1   1996.5 2024.5
## - PRMP_Q      1   2026.4 2054.4
## - PM25        1   2030.1 2058.1
## - PTDF_Q      1   2125.7 2153.7
## 
## Step:  AIC=1933.63
## PNPL_SPLIT ~ MINORPCT + LESSHSPCT + LINGISOPCT + UNDER5PCT + 
##     OVER64PCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + 
##     OZONE + PM25 + UST
## 
##              Df Deviance    AIC
## - UNDER5PCT   1   1905.6 1931.6
## - LESSHSPCT   1   1905.7 1931.7
## - OVER64PCT   1   1905.7 1931.7
## - UST         1   1906.8 1932.8
## <none>            1905.6 1933.6
## + LOWINCPCT   1   1905.6 1935.6
## - MINORPCT    1   1912.1 1938.1
## - LINGISOPCT  1   1912.8 1938.8
## - PTRAF_Q     1   1917.0 1943.0
## - PRE1960PCT  1   1923.5 1949.5
## - OZONE       1   1943.3 1969.3
## - PWDIS_Q     1   1996.5 2022.5
## - PRMP_Q      1   2026.8 2052.8
## - PM25        1   2030.2 2056.2
## - PTDF_Q      1   2125.8 2151.8
## 
## Step:  AIC=1931.64
## PNPL_SPLIT ~ MINORPCT + LESSHSPCT + LINGISOPCT + OVER64PCT + 
##     PRE1960PCT + PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + 
##     PM25 + UST
## 
##              Df Deviance    AIC
## - LESSHSPCT   1   1905.7 1929.7
## - OVER64PCT   1   1905.7 1929.7
## - UST         1   1906.8 1930.8
## <none>            1905.6 1931.6
## + UNDER5PCT   1   1905.6 1933.6
## + LOWINCPCT   1   1905.6 1933.6
## - MINORPCT    1   1912.1 1936.1
## - LINGISOPCT  1   1912.8 1936.8
## - PTRAF_Q     1   1917.0 1941.0
## - PRE1960PCT  1   1923.5 1947.5
## - OZONE       1   1943.3 1967.3
## - PWDIS_Q     1   1996.6 2020.6
## - PRMP_Q      1   2026.9 2050.9
## - PM25        1   2030.4 2054.4
## - PTDF_Q      1   2125.9 2149.9
## 
## Step:  AIC=1929.65
## PNPL_SPLIT ~ MINORPCT + LINGISOPCT + OVER64PCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Deviance    AIC
## - OVER64PCT   1   1905.7 1927.7
## - UST         1   1906.8 1928.8
## <none>            1905.7 1929.7
## + LESSHSPCT   1   1905.6 1931.6
## + UNDER5PCT   1   1905.7 1931.7
## + LOWINCPCT   1   1905.7 1931.7
## - MINORPCT    1   1913.8 1935.8
## - LINGISOPCT  1   1914.1 1936.1
## - PTRAF_Q     1   1917.1 1939.1
## - PRE1960PCT  1   1923.6 1945.6
## - OZONE       1   1943.3 1965.3
## - PWDIS_Q     1   1996.8 2018.8
## - PRMP_Q      1   2027.0 2049.0
## - PM25        1   2032.0 2054.0
## - PTDF_Q      1   2127.2 2149.2
## 
## Step:  AIC=1927.71
## PNPL_SPLIT ~ MINORPCT + LINGISOPCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25 + UST
## 
##              Df Deviance    AIC
## - UST         1   1906.8 1926.8
## <none>            1905.7 1927.7
## + OVER64PCT   1   1905.7 1929.7
## + UNDER5PCT   1   1905.7 1929.7
## + LESSHSPCT   1   1905.7 1929.7
## + LOWINCPCT   1   1905.7 1929.7
## - MINORPCT    1   1914.0 1934.0
## - LINGISOPCT  1   1914.2 1934.2
## - PTRAF_Q     1   1917.1 1937.1
## - PRE1960PCT  1   1923.8 1943.8
## - OZONE       1   1943.3 1963.3
## - PWDIS_Q     1   1997.2 2017.2
## - PRMP_Q      1   2027.1 2047.1
## - PM25        1   2033.2 2053.2
## - PTDF_Q      1   2127.3 2147.3
## 
## Step:  AIC=1926.83
## PNPL_SPLIT ~ MINORPCT + LINGISOPCT + PRE1960PCT + PTRAF_Q + PWDIS_Q + 
##     PRMP_Q + PTDF_Q + OZONE + PM25
## 
##              Df Deviance    AIC
## <none>            1906.8 1926.8
## + UST         1   1905.7 1927.7
## + OVER64PCT   1   1906.8 1928.8
## + UNDER5PCT   1   1906.8 1928.8
## + LESSHSPCT   1   1906.8 1928.8
## + LOWINCPCT   1   1906.8 1928.8
## - LINGISOPCT  1   1914.9 1932.9
## - MINORPCT    1   1915.5 1933.5
## - PTRAF_Q     1   1919.4 1937.4
## - PRE1960PCT  1   1925.8 1943.8
## - OZONE       1   1943.3 1961.3
## - PWDIS_Q     1   2001.1 2019.1
## - PRMP_Q      1   2028.9 2046.9
## - PM25        1   2033.5 2051.5
## - PTDF_Q      1   2128.1 2146.1

summary(reduced.logistic.model)

## 
## Call:
## glm(formula = PNPL_SPLIT ~ MINORPCT + LINGISOPCT + PRE1960PCT + 
##     PTRAF_Q + PWDIS_Q + PRMP_Q + PTDF_Q + OZONE + PM25, family = binomial, 
##     data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.46498  -0.70946   0.06019   0.63019   2.86550  
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept) 16.76018    5.02570   3.335             0.000853 ***
## MINORPCT    -0.70857    0.24173  -2.931             0.003376 ** 
## LINGISOPCT   2.77585    0.99877   2.779             0.005448 ** 
## PRE1960PCT  -1.07665    0.25002  -4.306        0.00001660717 ***
## PTRAF_Q      0.15988    0.04552   3.512             0.000444 ***
## PWDIS_Q      0.48547    0.05267   9.217 < 0.0000000000000002 ***
## PRMP_Q      -0.62141    0.05820 -10.677 < 0.0000000000000002 ***
## PTDF_Q      -0.66862    0.04777 -13.997 < 0.0000000000000002 ***
## OZONE       -0.77337    0.13223  -5.849        0.00000000496 ***
## PM25         2.51042    0.24270  10.344 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3036.0  on 2189  degrees of freedom
## Residual deviance: 1906.8  on 2180  degrees of freedom
## AIC: 1926.8
## 
## Number of Fisher Scoring iterations: 5

reduced.logistic.model %>%
  tbl_regression(exponentiate = TRUE,
                 estimate_fun = function(x) style_sigfig(x, digits = 3),
                 pvalue_fun   = function(x) style_pvalue(x, digits = 3),
                 label  = list(MINORPCT ~ "Racial Minority %",
                               LINGISOPCT ~ "Linguistic Isolation %",
                               PRE1960PCT ~ "Houses Built Pre-1960 %",
                               PTRAF_Q ~ "Proximity to Traffic",
                               PWDIS_Q ~ "Proximity to Wastewater Discharge Facility",
                               PRMP_Q ~ "Proximity to RMP Facility",
                               PTDF_Q ~ "Proximity to Hazardous Waste Facility",
                               OZONE ~ "Average O3 Ambient Air Concentration",
                               PM25 ~ "Average Annual PM 2.5 Ambient Air Concentration"                              )) %>%
  add_global_p(keep = T) %>% 
  modify_caption(paste("Reduced logistic regression results for proximity to Superfund sites (split by median) by census block group (N = ", NOBS, ")", sep=""))

Reduced logistic regression results for proximity to Superfund sites (split by median) by census block group (N = 2190)
Characteristic	OR¹	95% CI¹	p-value
Racial Minority %	0.492	0.306, 0.790	0.003
Linguistic Isolation %	16.1	2.34, 118	0.004
Houses Built Pre-1960 %	0.341	0.208, 0.555	<0.001
Proximity to Traffic	1.17	1.07, 1.28	<0.001
Proximity to Wastewater Discharge Facility	1.62	1.47, 1.80	<0.001
Proximity to RMP Facility	0.537	0.479, 0.601	<0.001
Proximity to Hazardous Waste Facility	0.512	0.466, 0.562	<0.001
Average O3 Ambient Air Concentration	0.461	0.355, 0.596	<0.001
Average Annual PM 2.5 Ambient Air Concentration	12.3	7.72, 20.0	<0.001
¹ OR = Odds Ratio, CI = Confidence Interval

Linear and Logistic Models for Superfund Sites, Long Island, NY

Fin Mooney

2023-03-13

Descriptive Statistics

Demographic Factors

Environmental Factors

Regressions

Linear Regression

Fitting best linear model

Exponentiate Model

Linear Regression - Nassau

Percent Cange, original linear model

Logistic Regression to compute odds ratios

Fitting best logistic model