# Changing NA values to zero
df["PWDIS"][is.na(df["PWDIS"])] <- 0 

df["UST"][is.na(df["UST"])] <- 0

df["PTRAF"][is.na(df["PTRAF"])] <- 0

#PNPL Standardized Split

MEDIAN <- round(median(df$PNPL),4)
LABEL0 <- paste("PNPL Score < Median:", round(MEDIAN,2))
LABEL1 <- paste("PNPL Score >= Median:", round(MEDIAN,2))

df <- df %>% 
  mutate(PNPL_SPLIT = as.numeric(PNPL >= MEDIAN),
        PNPL_SPLIT = factor(PNPL_SPLIT,
                             levels = 0:1,
                             labels = c(LABEL0, LABEL1)))

tapply(df$PNPL, df$PNPL_SPLIT, range)

## $`PNPL Score < Median: 0.21`
## [1] 0.03104137 0.21204710
## 
## $`PNPL Score >= Median: 0.21`
## [1] 0.212644 5.137011

Descrpitive Statistics

Demographic Factors

tbl_dem <- 
  df %>% 
  dplyr::select(VULEOPCT, MINORPCT, LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT, PNPL_SPLIT) %>%
  tbl_summary(
    # The "by" variable
    by =PNPL_SPLIT,
    statistic = list(all_continuous()  ~ "{mean} ({sd})"),
    digits = list(all_continuous()  ~ c(2, 2)),
    type = list(VULEOPCT ~ "continuous",
                MINORPCT ~ "continuous",
                LOWINCPCT ~ "continuous",
                UNEMPPCT ~ "continuous",
                LESSHSPCT ~ "continuous",
                LINGISOPCT ~ "continuous",
                UNDER5PCT ~ "continuous",
                OVER64PCT ~ "continuous"),
    
    label  = list(VULEOPCT ~ "Vulerability Index %",
                  MINORPCT ~ "Racial Minority %",
                  LOWINCPCT   ~ "Low Income %",
                  UNEMPPCT ~ "Unemployed %",
                  LESSHSPCT ~ "Less than HS Ed. %",
                  LINGISOPCT ~ "Lingustic Isolation %",
                  UNDER5PCT ~ "Under Age 5 %",
                  OVER64PCT ~ "Over Age 64 %")
  ) %>%
  modify_header(
    label = "**Variable**",
    # The following adds the % to the column total label
    # <br> is the location of a line break
    all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
  ) %>%
  modify_caption("Demographic Characteristics") %>%
  bold_labels()  %>%
  # Include an "overall" column
  add_overall(
    last = FALSE,
    # The ** make it bold
    col_label = "**All census block groups**<br>N = {N}"
  ) %>% 
  add_p(
    test = list(all_continuous()  ~ "t.test",
                all_categorical() ~ "chisq.test"),
    pvalue_fun = function(x) style_pvalue(x, digits = 3)
  ) 

tbl_dem

Demographic Characteristics
Variable	All census block groups N = 2191¹	PNPL Score < Median: 0.21 N = 1094 (49.9%)¹	PNPL Score >= Median: 0.21 N = 1097 (50.1%)¹	p-value²
Vulerability Index %	0.25 (0.17)	0.22 (0.16)	0.27 (0.18)	<0.001
Racial Minority %	0.34 (0.28)	0.29 (0.25)	0.40 (0.29)	<0.001
Low Income %	0.15 (0.13)	0.15 (0.13)	0.15 (0.14)	0.818
Less than HS Ed. %	0.08 (0.09)	0.08 (0.08)	0.09 (0.10)	<0.001
Lingustic Isolation %	0.04 (0.07)	0.03 (0.06)	0.05 (0.07)	<0.001
Unemployed %	0.04 (0.05)	0.04 (0.05)	0.04 (0.05)	0.240
Under Age 5 %	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.122
Over Age 64 %	0.19 (0.11)	0.19 (0.13)	0.18 (0.09)	0.002
¹ Mean (SD)
² Welch Two Sample t-test

Demographic Factors - Stratified

tbl_dem_strat <- 
  df %>% 
  dplyr::select(VULEOPCT, MINORPCT, LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT, PNPL_SPLIT, CNTY_NAME) %>%
  mutate(CNTY_NAME = paste(CNTY_NAME,"County")) %>%
  tbl_strata(
    strata = CNTY_NAME,
    .tbl_fun =
      ~ .x %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(VULEOPCT ~ "continuous",
                    MINORPCT ~ "continuous",
                    LOWINCPCT ~ "continuous",
                    UNEMPPCT ~ "continuous",
                    LESSHSPCT ~ "continuous",
                    LINGISOPCT ~ "continuous",
                    UNDER5PCT ~ "continuous",
                    OVER64PCT ~ "continuous"),
        
        label  = list(VULEOPCT ~ "Vulerability Index %",
                      MINORPCT ~ "Racial Minority %",
                      LOWINCPCT   ~ "Low Income %",
                      UNEMPPCT ~ "Unemployed %",
                      LESSHSPCT ~ "Less than HS Ed. %",
                      LINGISOPCT ~ "Lingustic Isolation %",
                      UNDER5PCT ~ "Under Age 5 %",
                      OVER64PCT ~ "Over Age 64 %")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Demographic Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) )

tbl_dem_strat

Demographic Characteristics - Stratified by County
Variable	Nassau County County				Suffolk County County
Variable	All census block groups N = 1134¹	PNPL Score < Median: 0.21 N = 357 (31.5%)¹	PNPL Score >= Median: 0.21 N = 777 (68.5%)¹	p-value²	All census block groups N = 1057¹	PNPL Score < Median: 0.21 N = 737 (69.7%)¹	PNPL Score >= Median: 0.21 N = 320 (30.3%)¹	p-value²
Vulerability Index %	0.26 (0.18)	0.24 (0.19)	0.27 (0.18)	0.008	0.23 (0.16)	0.21 (0.15)	0.28 (0.19)	<0.001
Racial Minority %	0.39 (0.29)	0.35 (0.30)	0.40 (0.28)	0.002	0.30 (0.26)	0.26 (0.22)	0.39 (0.31)	<0.001
Low Income %	0.13 (0.13)	0.13 (0.13)	0.14 (0.13)	0.508	0.16 (0.13)	0.16 (0.13)	0.17 (0.14)	0.106
Less than HS Ed. %	0.08 (0.09)	0.07 (0.09)	0.09 (0.09)	0.007	0.09 (0.10)	0.08 (0.08)	0.11 (0.12)	<0.001
Lingustic Isolation %	0.05 (0.08)	0.04 (0.08)	0.06 (0.08)	<0.001	0.03 (0.05)	0.02 (0.05)	0.04 (0.06)	<0.001
Unemployed %	0.04 (0.05)	0.04 (0.06)	0.04 (0.05)	0.340	0.04 (0.04)	0.04 (0.04)	0.04 (0.05)	0.900
Under Age 5 %	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.589	0.05 (0.04)	0.05 (0.04)	0.05 (0.04)	0.255
Over Age 64 %	0.18 (0.09)	0.18 (0.09)	0.18 (0.09)	0.739	0.19 (0.13)	0.20 (0.14)	0.17 (0.11)	<0.001
¹ Mean (SD)
² Welch Two Sample t-test

Environmental Factors

tbl_env <- 
  df %>% 
  dplyr::select(PRE1960PCT, PTRAF, PWDIS, PRMP, PTSDF, OZONE, PM25, UST, PNPL_SPLIT) %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(PRE1960PCT ~ "continuous",
                    PTRAF ~ "continuous",
                    PWDIS ~ "continuous",
                    PRMP ~ "continuous",
                    PTSDF ~ "continuous",
                    OZONE ~ "continuous",
                    PM25 ~ "continuous",
                    UST ~ "continuous"),
        
        label  = list(PRE1960PCT ~ "% Houses Built Pre 1960",
                      PTRAF ~ "Proximity to Traffic",
                      PWDIS ~ "Proximity to Waterwater Discharge Facility",
                      PRMP   ~ "Proximity to Regulated Management Plan Facility",
                      PTSDF ~ "Proximity to Hazardous Waste Facility",
                      OZONE ~ "Average Seasonal Ozone Concentration",
                      PM25 ~ "Annual Average PM2.5 Concentration",
                      UST ~ "Proximity to Underground Storage Tank")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Environmental Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) 

tbl_env

Environmental Characteristics - Stratified by County
Variable	All census block groups N = 2191¹	PNPL Score < Median: 0.21 N = 1094 (49.9%)¹	PNPL Score >= Median: 0.21 N = 1097 (50.1%)¹	p-value²
% Houses Built Pre 1960	0.53 (0.29)	0.46 (0.27)	0.61 (0.29)	<0.001
Proximity to Traffic	504.61 (685.44)	387.41 (541.02)	621.48 (787.09)	<0.001
Proximity to Waterwater Discharge Facility	0.01 (0.19)	0.02 (0.19)	0.01 (0.19)	0.223
Proximity to Regulated Management Plan Facility	0.16 (0.21)	0.09 (0.06)	0.23 (0.27)	<0.001
Proximity to Hazardous Waste Facility	2.02 (1.74)	1.20 (1.03)	2.84 (1.90)	<0.001
Average Seasonal Ozone Concentration	43.56 (0.61)	43.51 (0.70)	43.61 (0.49)	<0.001
Annual Average PM2.5 Concentration	7.48 (0.47)	7.26 (0.44)	7.70 (0.38)	<0.001
Proximity to Underground Storage Tank	1.16 (1.98)	0.91 (1.54)	1.41 (2.31)	<0.001
¹ Mean (SD)
² Welch Two Sample t-test

Environmental Factors-Stratified

tbl_env_strat <- 
  df %>% 
  dplyr::select(PRE1960PCT, PTRAF, PWDIS, PRMP, PTSDF, OZONE, PM25, UST, PNPL_SPLIT, CNTY_NAME) %>%
  mutate(CNTY_NAME = paste(CNTY_NAME,"County")) %>%
  tbl_strata(
    strata = CNTY_NAME,
    .tbl_fun =
      ~ .x %>%
      tbl_summary(
        # The "by" variable
        by =PNPL_SPLIT,
        statistic = list(all_continuous()  ~ "{mean} ({sd})"),
        digits = list(all_continuous()  ~ c(2, 2)),
        type = list(PRE1960PCT ~ "continuous",
                    PTRAF ~ "continuous",
                    PWDIS ~ "continuous",
                    PRMP ~ "continuous",
                    PTSDF ~ "continuous",
                    OZONE ~ "continuous",
                    PM25 ~ "continuous",
                    UST ~ "continuous"),
        
        label  = list(PRE1960PCT ~ "% Houses Built Pre 1960",
                      PTRAF ~ "Proximity to Traffic",
                      PWDIS ~ "Proximity to Waterwater Discharge Facility",
                      PRMP   ~ "Proximity to Regulated Management Plan Facility",
                      PTSDF ~ "Proximity to Hazardous Waste Facility",
                      OZONE ~ "Average Seasonal Ozone Concentration",
                      PM25 ~ "Annual Average PM2.5 Concentration",
                      UST ~ "Proximity to Underground Storage Tank")
      ) %>%
      modify_header(
        label = "**Variable**",
        # The following adds the % to the column total label
        # <br> is the location of a line break
        all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
      ) %>%
      modify_caption("Environmental Characteristics - Stratified by County") %>%
      bold_labels()  %>%
      # Include an "overall" column
      add_overall(
        last = FALSE,
        # The ** make it bold
        col_label = "**All census block groups**<br>N = {N}"
      ) %>% 
      add_p(
        test = list(all_continuous()  ~ "t.test",
                    all_categorical() ~ "chisq.test"),
        pvalue_fun = function(x) style_pvalue(x, digits = 3)
      ) )

tbl_env_strat

Environmental Characteristics - Stratified by County
Variable	Nassau County County				Suffolk County County
Variable	All census block groups N = 1134¹	PNPL Score < Median: 0.21 N = 357 (31.5%)¹	PNPL Score >= Median: 0.21 N = 777 (68.5%)¹	p-value²	All census block groups N = 1057¹	PNPL Score < Median: 0.21 N = 737 (69.7%)¹	PNPL Score >= Median: 0.21 N = 320 (30.3%)¹	p-value²
% Houses Built Pre 1960	0.72 (0.21)	0.68 (0.22)	0.74 (0.20)	<0.001	0.33 (0.22)	0.35 (0.23)	0.28 (0.20)	<0.001
Proximity to Traffic	664.91 (809.44)	588.88 (729.87)	699.84 (841.64)	0.024	332.63 (462.80)	289.83 (384.43)	431.21 (595.03)	<0.001
Proximity to Waterwater Discharge Facility	0.03 (0.27)	0.06 (0.33)	0.01 (0.23)	0.020	0.00 (0.00)	0.00 (0.00)	0.00 (0.00)	0.019
Proximity to Regulated Management Plan Facility	0.23 (0.24)	0.14 (0.07)	0.27 (0.28)	<0.001	0.09 (0.12)	0.07 (0.03)	0.16 (0.19)	<0.001
Proximity to Hazardous Waste Facility	2.38 (1.72)	1.44 (0.96)	2.81 (1.81)	<0.001	1.64 (1.67)	1.08 (1.03)	2.91 (2.11)	<0.001
Average Seasonal Ozone Concentration	43.60 (0.51)	43.50 (0.55)	43.64 (0.48)	<0.001	43.53 (0.70)	43.52 (0.77)	43.55 (0.50)	0.398
Annual Average PM2.5 Concentration	7.81 (0.25)	7.62 (0.17)	7.89 (0.23)	<0.001	7.13 (0.38)	7.09 (0.42)	7.23 (0.24)	<0.001
Proximity to Underground Storage Tank	1.71 (2.48)	1.56 (2.15)	1.78 (2.62)	0.131	0.57 (0.93)	0.60 (0.99)	0.51 (0.77)	0.116
¹ Mean (SD)
² Welch Two Sample t-test

Preparing for Regression Analysis

Correlation Matrix

matrix <- subset(df, select = c(VULEOPCT, MINORPCT,LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT
                                ,PRE1960PCT, PTRAF,
                                 PWDIS, PRMP, PTSDF, OZONE, PM25, UST))

matrix.cor <- round(cor(matrix), 1)

All Variables - Heat Map

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(matrix.cor, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         # Combine with significance
         p.mat = p.mat, sig.level = 0.05, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE 
)

Demographic Variables - Heat Map

#Demographic Correlation Matrix 

matrix_dem <- subset(df, select = c(VULEOPCT, MINORPCT,LOWINCPCT, LESSHSPCT, LINGISOPCT, UNEMPPCT, UNDER5PCT, OVER64PCT))

matrix_dem.cor <- round(cor(matrix_dem), 1) 

p.mat <- cor.mtest(matrix_dem)


col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(matrix_dem.cor, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         # Combine with significance
         p.mat = p.mat, sig.level = 0.05, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE 
)

Environmental Variables - Heat Map

# Environmental Correlation Matrix
matrix_env <- subset(df, select = c(PRE1960PCT, PTRAF,
                                    PWDIS, PRMP, PTSDF, OZONE, PM25, UST))

matrix_env.cor <- round(cor(matrix_env), 1) 

p.mat <- cor.mtest(matrix_env)

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(matrix_env.cor, method="color", col=col(200),  
         type="upper", order="hclust", 
         addCoef.col = "black", # Add coefficient of correlation
         tl.col="black", tl.srt=45, #Text label color and rotation
         # Combine with significance
         p.mat = p.mat, sig.level = 0.05, insig = "blank", 
         # hide correlation coefficient on the principal diagonal
         diag=FALSE 
)

Vulnerabilty Index must be removed from the regrssion due to having a high correlation with the other variables.

Histograms

Creating PNPL Log Transformation

Showing negative values, is this okay?

#Vulnerability PCT
### Fairly normal
hist(df$VULEOPCT, main = "Distribution of VULEOPCT") #2

#MINOR PCT
### Non-normal

hist(df$MINORPCT, main = "Distribution of MINORPCT") #3

#LOW INCOME PCT
### Skew Right

hist(df$LOWINCPCT, main = "Distribution of LOWINCPCT") #4

#LESSHSPCT
### Skew Right

hist(df$LESSHSPCT, main = "Distribution of LESSHSPCT") #5

#LINGUISTIC ISOLATION
### Skew Right

hist(df$LINGISOPCT, main = "Distribution of LINGISOPCT") #6

#UNEMPPCT
### Skew left

hist(df$UNEMPPCT, main = "Distribution of UNEMPPCT") #7

#UNDER 5 PCT
### Looks normal

hist(df$UNDER5PCT, main = "Distribution of UNDER5PCT") #8

#OVER 64 PCT
### Looks normal

hist(df$OVER64PCT, main = "Distribution of OVER64PCT") #9

#PRE1960 PCT
### Skewed right

hist(df$PRE1960PCT, main = "Distribution of PRE1960PCT") #10

#PTRAF 
### Skewed left

hist(df$PTRAF, main = "Distribution of PTRAF") #11

#PWDIS
### Skewed left

hist(df$PWDIS, main = "Distribution of PWDIS") #12

#PRMP
### Skewed left

hist(df$PRMP, main = "Distribution of PRMP") #13

#PTSDF
### Skewed left

hist(df$PTSDF, main = "Distribution of PTSDF") #14

#Ozone
### Looks fairly normal

hist(df$OZONE, main = "Distribution of OZONE") #15

#PM2.5
### Slight skew right

hist(df$PM25, main = "Distribution of PM25") #16

#UST 
### Skewed left

hist(df$UST, main = "Distribution of UST") #17

Notes

Should I transform the independent variables that have an obvious skew before conducting regression?

First Try at Regression (all in, both counties)

Unadjusted Demographic Relationships

# MINORPCT

fit.MINORPCT  <- lm(PNPL_LOG ~ MINORPCT, data = df)

# LWINCPCT

fit.LWINCPCT  <- lm(PNPL_LOG ~ LOWINCPCT, data = df)

# LESHSPCT

fit.LESHSPCT  <- lm(PNPL_LOG ~ LESSHSPCT, data = df)

# LNGISPCT

fit.LNGISPCT  <- lm(PNPL_LOG ~ LINGISOPCT, data = df)

# UNDR5PCT

fit.UNDR5PCT  <- lm(PNPL_LOG ~ UNDER5PCT, data = df)

# OVR64PCT

fit.OVR64PCT  <- lm(PNPL_LOG ~ OVER64PCT, data = df)

tab_model(fit.MINORPCT, fit.LWINCPCT, fit.LESHSPCT, fit.LNGISPCT, fit.UNDR5PCT, fit.OVR64PCT)

	PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG
Predictors	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p
(Intercept)	-1.51	-1.58 – -1.45	<0.001	-1.30	-1.36 – -1.24	<0.001	-1.39	-1.44 – -1.33	<0.001	-1.40	-1.45 – -1.36	<0.001	-1.38	-1.44 – -1.32	<0.001	-1.18	-1.25 – -1.10	<0.001
MINORPCT	0.55	0.41 – 0.69	<0.001
LOWINCPCT				-0.18	-0.47 – 0.12	0.241
LESSHSPCT							0.72	0.30 – 1.13	0.001
LINGISOPCT										1.94	1.37 – 2.51	<0.001
UNDER5PCT													1.08	0.12 – 2.04	0.027
OVER64PCT																-0.78	-1.13 – -0.43	<0.001
Observations	2191			2191			2191			2191			2191			2191
R² / R² adjusted	0.026 / 0.025			0.001 / 0.000			0.005 / 0.005			0.020 / 0.019			0.002 / 0.002			0.009 / 0.008

Unadjusted Environmental Relationships

# PRE1960 PCT

fit.PRE1960  <- lm(PNPL_LOG ~ PRE1960PCT, data = df)

# PTRAF 

fit.PTRAF  <- lm(PNPL_LOG ~ PTRAF, data = df)

# PRMP

fit.PRMP  <- lm(PNPL_LOG ~ PRMP, data = df)

# PTSDF

fit.PTSDF  <- lm(PNPL_LOG ~ PTSDF, data = df)

# OZONE

fit.OZONE  <- lm(PNPL_LOG ~ OZONE, data = df)

# PM2.5

fit.PM25  <- lm(PNPL_LOG ~ PM25, data = df)

# UST

fit.UST  <- lm(PNPL_LOG ~ UST, data = df)

tab_model(fit.PRE1960, fit.PTRAF, fit.PRMP, fit.PTSDF, fit.OZONE, fit.PM25, fit.UST)

	PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG			PNPL_LOG
Predictors	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p	Estimates	CI	p
(Intercept)	-1.86	-1.94 – -1.79	<0.001	-1.44	-1.49 – -1.39	<0.001	-1.63	-1.68 – -1.59	<0.001	-1.83	-1.88 – -1.78	<0.001	-12.12	-14.91 – -9.33	<0.001	-9.70	-10.23 – -9.17	<0.001	-1.41	-1.45 – -1.36	<0.001
PRE1960PCT	1.01	0.88 – 1.14	<0.001
PTRAF				0.00	0.00 – 0.00	<0.001
PRMP							1.90	1.73 – 2.08	<0.001
PTSDF										0.25	0.23 – 0.27	<0.001
OZONE													0.25	0.18 – 0.31	<0.001
PM25																1.12	1.05 – 1.19	<0.001
UST																			0.07	0.05 – 0.09	<0.001
Observations	2191			2191			2191			2191			2191			2191			2191
R² / R² adjusted	0.099 / 0.098			0.027 / 0.027			0.172 / 0.172			0.213 / 0.212			0.026 / 0.025			0.307 / 0.306			0.021 / 0.021

Adjusted All

fit.adjusted <- lm(PNPL_LOG ~ MINORPCT + LOWINCPCT + LESSHSPCT +
                  LINGISOPCT + UNDER5PCT + OVER64PCT
                  + PRE1960PCT + PTRAF + PWDIS + PRMP + PTSDF + OZONE + PM25+ UST, data = df)

summary(fit.adjusted)

## 
## Call:
## lm(formula = PNPL_LOG ~ MINORPCT + LOWINCPCT + LESSHSPCT + LINGISOPCT + 
##     UNDER5PCT + OVER64PCT + PRE1960PCT + PTRAF + PWDIS + PRMP + 
##     PTSDF + OZONE + PM25 + UST, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0698 -0.4849 -0.1283  0.3930  2.7427 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.406e-01  1.171e+00  -0.547   0.5844    
## MINORPCT    -4.005e-01  7.480e-02  -5.354 9.51e-08 ***
## LOWINCPCT   -8.677e-02  1.319e-01  -0.658   0.5107    
## LESSHSPCT   -4.589e-02  2.135e-01  -0.215   0.8298    
## LINGISOPCT   2.696e-01  2.599e-01   1.037   0.2997    
## UNDER5PCT    2.710e-01  3.741e-01   0.724   0.4690    
## OVER64PCT   -1.329e-01  1.430e-01  -0.929   0.3528    
## PRE1960PCT  -1.394e-01  6.442e-02  -2.164   0.0305 *  
## PTRAF       -2.906e-05  2.271e-05  -1.280   0.2007    
## PWDIS       -1.843e-01  7.710e-02  -2.391   0.0169 *  
## PRMP         1.295e+00  8.096e-02  16.001  < 2e-16 ***
## PTSDF        1.730e-01  9.338e-03  18.528  < 2e-16 ***
## OZONE       -1.876e-01  3.064e-02  -6.122 1.09e-09 ***
## PM25         9.616e-01  5.085e-02  18.912  < 2e-16 ***
## UST         -1.828e-02  8.139e-03  -2.246   0.0248 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6855 on 2176 degrees of freedom
## Multiple R-squared:  0.4727, Adjusted R-squared:  0.4693 
## F-statistic: 139.3 on 14 and 2176 DF,  p-value: < 2.2e-16

fit.adjusted %>%
  tbl_regression(intercept = T,
                 estimate_fun = function(x) style_sigfig(x, digits = 3),
                 pvalue_fun   = function(x) style_pvalue(x, digits = 3),
                 label  = list(
                               MINORPCT ~ "Racial Minority %",
                               LOWINCPCT ~ "Low Income %",
                               LESSHSPCT ~ "Less than HS Ed. %",
                               LINGISOPCT ~ "Lingustic Isolation %",
                               UNDER5PCT ~ "Under Age 5 %",
                               OVER64PCT ~ "Over Age 64 %",
                               PRE1960PCT ~ "Housing Units Pre1960 %",
                               PTRAF ~ "Proximity to Traffic",
                               PWDIS ~ "Prxomity to Wastewater Discharge Facility",
                               PRMP ~ "Proximity to RMP Facility",
                               PTSDF ~ "Proximity to Hazardous Waste Facility",
                               OZONE ~ "Ozone Concentration",
                               PM25 ~ "PM 2.5 Concentration",
                               UST ~ "Proximity to Underground Storage Tank"
                               )) %>%
  add_global_p(keep = T) %>% 
  modify_caption("Adjusted Linear Regression Results for Block Group Proximity to NPl Site")

Adjusted Linear Regression Results for Block Group Proximity to NPl Site
Characteristic	Beta	95% CI¹	p-value
(Intercept)	-0.641	-2.94, 1.66	0.584
Racial Minority %	-0.401	-0.547, -0.254	<0.001
Low Income %	-0.087	-0.345, 0.172	0.511
Less than HS Ed. %	-0.046	-0.465, 0.373	0.830
Lingustic Isolation %	0.270	-0.240, 0.779	0.300
Under Age 5 %	0.271	-0.463, 1.00	0.469
Over Age 64 %	-0.133	-0.413, 0.148	0.353
Housing Units Pre1960 %	-0.139	-0.266, -0.013	0.031
Proximity to Traffic	0.000	0.000, 0.000	0.201
Prxomity to Wastewater Discharge Facility	-0.184	-0.336, -0.033	0.017
Proximity to RMP Facility	1.30	1.14, 1.45	<0.001
Proximity to Hazardous Waste Facility	0.173	0.155, 0.191	<0.001
Ozone Concentration	-0.188	-0.248, -0.127	<0.001
PM 2.5 Concentration	0.962	0.862, 1.06	<0.001
Proximity to Underground Storage Tank	-0.018	-0.034, -0.002	0.025
¹ CI = Confidence Interval

Visualize Adjusted Results

Check Normality

## [1] "The sample size is 2191 and there are 14 predictors."
## [1] "There are 156.5 observations per predictor."

Check Linearity

Not all variables present linearity. Should we change for regression?

Check Co-Linearity

All below 5!

##   MINORPCT  LOWINCPCT  LESSHSPCT LINGISOPCT  UNDER5PCT  OVER64PCT PRE1960PCT 
##   1.993136   1.433285   1.910932   1.458653   1.098889   1.195903   1.642841 
##      PTRAF      PWDIS       PRMP      PTSDF      OZONE       PM25        UST 
##   1.128826   1.014517   1.283995   1.224700   1.614204   2.606815   1.211848

DO NOT WANT TO MOVE ON WITH THE REGRESSION UNTIL I FIND OUT:

Do we need to transform the skewed independent variables?
Do we need to transform the non-linear independent variables?

Superfund Project - MPH THESIS

Fin Mooney

2023-02-19