Leukemia

Author

Agenda

  1. Leukemia and our Hypothesis

  2. Exploration

  3. Cleaning and Transform

  4. Feature Selected (ggpairs)


1. Leukemia and our Hypothesis

2. Exploration

2.1 Look into county_leukemia_128

#include

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggspatial)
library(lmtest)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(spatialreg)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: Matrix
## Loading required package: sf
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(spdep)
## 
## Attaching package: 'spdep'
## The following objects are masked from 'package:spatialreg':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
library(texreg)
## Version:  1.39.4
## Date:     2024-07-23
## Author:   Philip Leifeld (University of Manchester)
## 
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(sf)
library(ggplot2)
library(sp)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
## 
##     collapse
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load("data_n128.RData")
# Look into county_leukemia_128
# head(county_leukemia_128)
summary(county_leukemia_128)
##        id          leuk_cases             AverageAge  GreenNeighborhoodIndex
##  Min.   :  1.0   Min.   :23.00   15 and younger:186   Min.   :0.0002333     
##  1st Qu.:125.8   1st Qu.:32.00   15-50         :167   1st Qu.:0.2346412     
##  Median :250.5   Median :35.00   50 and older  :147   Median :0.4859838     
##  Mean   :250.5   Mean   :36.68                        Mean   :0.4862802     
##  3rd Qu.:375.2   3rd Qu.:41.00                        3rd Qu.:0.7251946     
##  Max.   :500.0   Max.   :52.00                        Max.   :0.9967076     
##    CarsPerHH           PopDens           PctPopGrowth     Dist2FracLoc_1  
##  Min.   :0.001174   Min.   :0.0000152   Min.   :-6.9749   Min.   : 0.000  
##  1st Qu.:0.256101   1st Qu.:0.0600216   1st Qu.:-1.4347   1st Qu.: 2.938  
##  Median :0.502197   Median :0.1753587   Median :-0.2529   Median : 4.128  
##  Mean   :0.497339   Mean   :0.2427995   Mean   :-0.4161   Mean   : 4.626  
##  3rd Qu.:0.749654   3rd Qu.:0.3783928   3rd Qu.: 0.6439   3rd Qu.: 6.410  
##  Max.   :0.999106   Max.   :0.9666323   Max.   : 4.9987   Max.   :10.000  
##  Dist2FracLoc_2   Dist2FracLoc_3   Dist2FracLoc_4   Dist2FracLoc_5  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 2.986   1st Qu.: 2.868   1st Qu.: 2.547   1st Qu.: 3.373  
##  Median : 4.690   Median : 4.476   Median : 3.938   Median : 5.260  
##  Mean   : 4.835   Mean   : 4.587   Mean   : 4.566   Mean   : 5.245  
##  3rd Qu.: 6.770   3rd Qu.: 6.374   3rd Qu.: 6.627   3rd Qu.: 7.139  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  Dist2FracLoc_6   Dist2FracLoc_7   Dist2FracLoc_8   Dist2FracLoc_9  
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.075   1st Qu.: 2.477   1st Qu.: 2.759   1st Qu.: 2.426  
##  Median : 4.846   Median : 4.010   Median : 4.278   Median : 4.217  
##  Mean   : 4.760   Mean   : 4.433   Mean   : 4.608   Mean   : 4.443  
##  3rd Qu.: 6.249   3rd Qu.: 6.161   3rd Qu.: 6.123   3rd Qu.: 6.129  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  Dist2FracLoc_10       lon               lat                    geometry  
##  Min.   : 0.000   Min.   :-99.633   Min.   :-49.7417   POLYGON      :500  
##  1st Qu.: 3.043   1st Qu.:-46.424   1st Qu.:-27.0143   epsg:4326    :  0  
##  Median : 4.639   Median :  6.654   Median : -1.4512   +proj=long...:  0  
##  Mean   : 4.771   Mean   :  2.938   Mean   : -0.7321                      
##  3rd Qu.: 6.124   3rd Qu.: 52.227   3rd Qu.: 25.4981                      
##  Max.   :10.000   Max.   : 99.923   Max.   : 49.2118
  • ID : “id”

  • Outcome (y) : *****

  • Risk Factors (x) : *****

  • Spatial Structure : *****

min_value <- min(county_leukemia_128$leuk_cases)
max_value <- max(county_leukemia_128$leuk_cases)
mean_value <- mean(county_leukemia_128$leuk_cases)

boxplot(county_leukemia_128$leuk_cases, 
        main = "Leukemia cases", 
        ylab = "Cases", 
        col = "lightblue",
        ylim = c(20,60))

text(x = 1.2, y = min_value - 5, labels = paste("Min:", round(min_value, 2)), pos = 4, col = "red")
text(x = 1.2, y = max_value + 5, labels = paste("Max:", round(max_value, 2)), pos = 4, col = "red")
text(x = 1.2, y = mean_value, labels = paste("Mean:", round(mean_value, 2)), pos = 4, col = "blue")

points(x = 1, y = mean_value, col = "blue", pch = 18, cex = 1.5)  # pch = 19 for a filled circle

2.2 Look into fracking_locations_128

head(fracking_locations_128)
## Simple feature collection with 6 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -51.85614 ymin: -29.43917 xmax: 99.02833 ymax: 38.71371
## Geodetic CRS:  WGS 84
##                      geometry FracLoc
## 1 POINT (-40.33708 -6.008472)       1
## 2   POINT (88.45395 25.42268)       2
## 3   POINT (78.33582 34.07758)       3
## 4  POINT (-51.85614 1.218802)       4
## 5  POINT (99.02833 -29.43917)       5
## 6   POINT (31.19807 38.71371)       6
summary(fracking_locations_128)
##           geometry     FracLoc     
##  POINT        :10   Min.   : 1.00  
##  epsg:4326    : 0   1st Qu.: 3.25  
##  +proj=long...: 0   Median : 5.50  
##                     Mean   : 5.50  
##                     3rd Qu.: 7.75  
##                     Max.   :10.00
  • ID : “FracLoc”

  • Spatial Structure : *****

County Maps

county_leukemia_sf <- st_as_sf(county_leukemia_128, coords = c("lon", "lat"), crs = 4326)

fracking_locations_sf <- st_as_sf(fracking_locations_128, coords = c("longitude", "latitude"), crs = 4326)
plot(st_geometry(county_leukemia_sf), main = "World Maps")

Countries Location

# Extract coordinates for fracking locations
fracking_coords <- st_coordinates(fracking_locations_sf)

# Add the coordinates as columns to the fracking_locations object
fracking_locations_sf$lon <- fracking_coords[, 1]
fracking_locations_sf$lat <- fracking_coords[, 2]

ggplot() +
  geom_sf(data = county_leukemia_sf, aes(fill = county_leukemia_sf$leuk_cases), color = "black") +
  # Plot the geometry of fracking locations using 'geom_sf'
  geom_sf(data = fracking_locations_sf, fill = "lightgreen", color = "red", alpha = 1, size = 2) + 
  # Plot the scatter points for the fracking sites +
  geom_text(data = fracking_locations_sf, aes(x = lon, y = lat, label = as.character(FracLoc)), 
            color = "Green", size = 5, vjust = 0, hjust = 0) +  # Adjust position for clarity
  ggtitle("                          Leukemia Density and Fracking Sites") +
  # Use a gradient scale to fill with darker colors for higher 'leuk_cases'
  scale_fill_gradient(low = "lightblue", high = "darkblue") +  # Light color for lower values, dark for higher
  theme_minimal() +
  theme(legend.position = "bottom",
        axis.text.x = element_blank(),
        axis.text.y = element_blank())

3. Cleaning and Transforming

4. Features Selection (ggpairs)

== Start with Linear Correlation by GG Pairs ==

Use GGpairs with Main risk factors to see linear correlation

sel_riskFactor_vars <- county_leukemia_sf[, c("leuk_cases", "GreenNeighborhoodIndex", "CarsPerHH", "PopDens", "PctPopGrowth") ]

sel_riskFactor_vars <- st_drop_geometry(sel_riskFactor_vars)

ggpairs(sel_riskFactor_vars, title = "Risk Factor") +
  theme()

  • High Linear Cor at leuk_cases–PctPopGrowth, CarsPerHH–PopDens

Try GGpair with distance to fractions.

sel_distFraction_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", "Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", "Dist2FracLoc_9", "Dist2FracLoc_10") ]

sel_distFraction_vars <- st_drop_geometry(sel_distFraction_vars)

ggpairs(sel_distFraction_vars, title = "Distance to Fractions") + theme()

# ggsave("dist2Frac.png", plot = pic_sel_distFraction_vars, width = 20, height = 20, units = "in", dpi = 300)
  • High Cor when fraction is close together.
sel_distFraction_1st_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", "Dist2FracLoc_5") ]

sel_distFraction_2nd_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", "Dist2FracLoc_9", "Dist2FracLoc_10") ]

sel_distFraction_1st_vars <- st_drop_geometry(sel_distFraction_1st_vars)

sel_distFraction_2nd_vars <- st_drop_geometry(sel_distFraction_2nd_vars)

ggpairs(sel_distFraction_1st_vars, title = "Distance to Fractions of 1st half Variables")

ggpairs(sel_distFraction_2nd_vars, title = "Distance to Fractions of 2nd half Variables")

  • all Dist2FracLoc statistically significant(p-value <= 0.001) to leuk_cases

== Start some Linear Modeling ==

Try linear modeling with Risks Factor

lm_risk_01 <- lm(leuk_cases ~ PctPopGrowth, county_leukemia_sf)  
lm_risk_02 <- lm(leuk_cases ~ AverageAge, county_leukemia_sf)

lm_risk_12 <- lm(leuk_cases ~ PctPopGrowth+ AverageAge+ GreenNeighborhoodIndex+ CarsPerHH+ PopDens, county_leukemia_sf)

lm_risk_13 <-lm(leuk_cases ~ PctPopGrowth+ GreenNeighborhoodIndex, county_leukemia_sf)

lm_risk_14 <-lm(leuk_cases ~ PctPopGrowth+ PopDens, county_leukemia_sf) 

screenreg(
  list( lm_risk_01, lm_risk_02, lm_risk_12, lm_risk_13, lm_risk_14),
  digits = 2
)
## 
## ==================================================================================
##                         Model 1     Model 2     Model 3     Model 4     Model 5   
## ----------------------------------------------------------------------------------
## (Intercept)              37.05 ***   36.47 ***   36.83 ***   36.88 ***   37.02 ***
##                          (0.27)      (0.44)      (0.76)      (0.52)      (0.40)   
## PctPopGrowth              0.89 ***                0.88 ***    0.89 ***    0.89 ***
##                          (0.15)                  (0.15)      (0.15)      (0.15)   
## AverageAge15-50                       0.60        0.56                            
##                                      (0.64)      (0.63)                           
## AverageAge50 and older                0.05       -0.07                            
##                                      (0.67)      (0.65)                           
## GreenNeighborhoodIndex                            0.39        0.37                
##                                                  (0.92)      (0.91)               
## CarsPerHH                                        -0.63                            
##                                                  (1.23)                           
## PopDens                                           0.75                    0.14    
##                                                  (1.62)                  (1.22)   
## ----------------------------------------------------------------------------------
## R^2                       0.07        0.00        0.07        0.07        0.07    
## Adj. R^2                  0.07       -0.00        0.06        0.07        0.07    
## Num. obs.               500         500         500         500         500       
## ==================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Let’s focus on the only variable that show high correlation in ggpairs() –> Model 1
  • Also curious about AverageAge –> Model 2
  • Try again with all variables –> Model 3
  • I’m extra suspicious with GreenNeighborhoodIndex –> Model 4
  • Let’s try one more to conclude all variables (assume CarsPerHH and PopDens have high pair-correlation) let’s just fo PopDens –> Model 5
  • From this I conclude on PctPopGrowth is coorelated linearly to leuk_cases (lm_risk_01), but I will keep in mind for other Risk Factor to be non-linear related.

Try linear modeling with Dist2Frac

lm_dist_a01 <- lm(leuk_cases ~ Dist2FracLoc_1  , county_leukemia_sf) 
lm_dist_a02 <- lm(leuk_cases ~ Dist2FracLoc_2  , county_leukemia_sf) 
lm_dist_a03 <- lm(leuk_cases ~ Dist2FracLoc_3  , county_leukemia_sf) 
lm_dist_a04 <- lm(leuk_cases ~ Dist2FracLoc_4  , county_leukemia_sf) 
lm_dist_a05 <- lm(leuk_cases ~ Dist2FracLoc_5  , county_leukemia_sf) 
lm_dist_a06 <- lm(leuk_cases ~ Dist2FracLoc_6  , county_leukemia_sf) 
lm_dist_a07 <- lm(leuk_cases ~ Dist2FracLoc_7  , county_leukemia_sf) 
lm_dist_a08 <- lm(leuk_cases ~ Dist2FracLoc_8  , county_leukemia_sf) 
lm_dist_a09 <- lm(leuk_cases ~ Dist2FracLoc_9  , county_leukemia_sf) 
lm_dist_a10 <- lm(leuk_cases ~ Dist2FracLoc_10 , county_leukemia_sf) 

screenreg(
  list(lm_dist_a01, lm_dist_a02, lm_dist_a03, lm_dist_a04, lm_dist_a05),
  digits = 2
)
## 
## ==========================================================================
##                 Model 1     Model 2     Model 3     Model 4     Model 5   
## --------------------------------------------------------------------------
## (Intercept)      40.13 ***   28.67 ***   28.89 ***   41.53 ***   29.03 ***
##                  (0.55)      (0.45)      (0.44)      (0.49)      (0.50)   
## Dist2FracLoc_1   -0.74 ***                                                
##                  (0.10)                                                   
## Dist2FracLoc_2                1.66 ***                                    
##                              (0.08)                                       
## Dist2FracLoc_3                            1.70 ***                        
##                                          (0.09)                           
## Dist2FracLoc_4                                       -1.06 ***            
##                                                      (0.09)               
## Dist2FracLoc_5                                                    1.46 ***
##                                                                  (0.09)   
## --------------------------------------------------------------------------
## R^2               0.09        0.45        0.44        0.20        0.37    
## Adj. R^2          0.09        0.44        0.44        0.20        0.37    
## Num. obs.       500         500         500         500         500       
## ==========================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg(
  list(lm_dist_a06, lm_dist_a07, lm_dist_a08, lm_dist_a09, lm_dist_a10),
  digits = 2
)
## 
## ===========================================================================
##                  Model 1     Model 2     Model 3     Model 4     Model 5   
## ---------------------------------------------------------------------------
## (Intercept)       28.19 ***   27.03 ***   25.76 ***   27.99 ***   25.27 ***
##                   (0.49)      (0.28)      (0.22)      (0.35)      (0.25)   
## Dist2FracLoc_6     1.78 ***                                                
##                   (0.09)                                                   
## Dist2FracLoc_7                 2.18 ***                                    
##                               (0.05)                                       
## Dist2FracLoc_8                             2.37 ***                        
##                                           (0.04)                           
## Dist2FracLoc_9                                         1.96 ***            
##                                                       (0.07)               
## Dist2FracLoc_10                                                    2.39 ***
##                                                                   (0.05)   
## ---------------------------------------------------------------------------
## R^2                0.42        0.76        0.86        0.62        0.83    
## Adj. R^2           0.42        0.76        0.86        0.62        0.83    
## Num. obs.        500         500         500         500         500       
## ===========================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • All dist2Frac indeed correlated, but some yield more R ^2 than others.

  • From location wise, let’s not use close by location to prevent Multicollinearity, and also keep model simple.

  • Pair 1,4 : lm_dist_a04 R^2 > lm_dist_a01 : Keep 4

  • Pair 2,3 : lm_dist_a02 R^2 > lm_dist_a03 : Keep 2

  • Pair 7,9 : lm_dist_a07 R^2 > lm_dist_a09 : Keep 7

  • Pair 8,10 : lm_dist_a08 R^2 > lm_dist_a10 : Keep 8 (R ^2 = 0.86)

  • Keep 5, and 6 normally.

    lm_dist_b01 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8   , county_leukemia_sf) 
    
    lm_dist_b02 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_7 + Dist2FracLoc_8   , county_leukemia_sf) # not significant eliminate 5,6 
    
    lm_dist_b03 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_7    , county_leukemia_sf) # 8 only, yield high data, try eliminate 8 to see the impact.
    
    lm_dist_b04 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4    , county_leukemia_sf) #close to 8 eliminate 7
    
    lm_dist_b05 <- lm(leuk_cases ~ Dist2FracLoc_7 + Dist2FracLoc_8    , county_leukemia_sf) #7 & 8 give high impact, let's only focus on that
    
    screenreg(
      list(lm_dist_b01, lm_dist_b02, lm_dist_a08, lm_dist_b03, lm_dist_b04, lm_dist_b05 ),
      digits = 2
    )
    ## 
    ## ======================================================================================
    ##                 Model 1     Model 2     Model 3     Model 4     Model 5     Model 6   
    ## --------------------------------------------------------------------------------------
    ## (Intercept)      19.30 ***   18.20 ***   25.76 ***   13.87 ***   20.33 ***   25.76 ***
    ##                  (1.88)      (1.27)      (0.22)      (0.69)      (1.33)      (0.22)   
    ## Dist2FracLoc_2    1.07 ***    1.01 ***                1.53 ***    2.50 ***            
    ##                  (0.21)      (0.15)                  (0.08)      (0.15)               
    ## Dist2FracLoc_4    0.77 ***    0.85 ***                1.37 ***    0.94 ***            
    ##                  (0.20)      (0.15)                  (0.07)      (0.14)               
    ## Dist2FracLoc_5   -0.19                                                                
    ##                  (0.22)                                                               
    ## Dist2FracLoc_6   -0.12                                                                
    ##                  (0.20)                                                               
    ## Dist2FracLoc_7    1.24 **     1.18 ***                2.07 ***                0.08    
    ##                  (0.40)      (0.23)                  (0.05)                  (0.12)   
    ## Dist2FracLoc_8    1.05 **     0.98 ***    2.37 ***                            2.29 ***
    ##                  (0.38)      (0.24)      (0.04)                              (0.12)   
    ## --------------------------------------------------------------------------------------
    ## R^2               0.87        0.87        0.86        0.87        0.49        0.86    
    ## Adj. R^2          0.87        0.87        0.86        0.87        0.49        0.86    
    ## Num. obs.       500         500         500         500         500         500       
    ## ======================================================================================
    ## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Let’s only use just 6 variables that we keep. –> Model 1

  • Model 1 –> not significant eliminate 5,6 –> Model 2

  • Model 2 –> compare with DistFracLoc_8 only (lm_dist_a08 == Model 3)

  • Model 2 + 3 –> Assume DistFracLoc_8 is most significant, what if I eliminate DistFracLoc_8 –> Model 4

  • Model 4 –> Get same result, but FracLoc_7 and FracLoc_8 are very close, how about we eliminate DistFracLoc_7 as well –> Model 5

  • Model 5 –> Yield bad result, can we try will only DistFracLoc_7 , and DistFracLoc_8 –> Model 6

  • Too conclude this, 8 is only parameter we need (Model3).

Try lon, and lat

lm_lon <- lm(leuk_cases ~ lon, county_leukemia_sf)  
lm_lat <- lm(leuk_cases ~ lat, county_leukemia_sf)
lm_lon_lat <- lm(leuk_cases ~ lon + lat, county_leukemia_sf)

screenreg(
  list( lm_lon, lm_lat, lm_lon_lat),
  digits = 2
)
## 
## ===============================================
##              Model 1     Model 2     Model 3   
## -----------------------------------------------
## (Intercept)   36.89 ***   36.69 ***   36.89 ***
##               (0.20)      (0.27)      (0.20)   
## lon           -0.07 ***               -0.07 ***
##               (0.00)                  (0.00)   
## lat                        0.01        0.00    
##                           (0.01)      (0.01)   
## -----------------------------------------------
## R^2            0.47        0.00        0.47    
## Adj. R^2       0.47        0.00        0.47    
## Num. obs.    500         500         500       
## ===============================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Longitude show statistic significant,

Try combine three summaries we have so far.

lm_sum_01 <- lm(leuk_cases ~PctPopGrowth + Dist2FracLoc_8 + lon, county_leukemia_sf)
lm_sum_02 <- lm(leuk_cases ~               Dist2FracLoc_8 + lon, county_leukemia_sf)
lm_sum_03 <- lm(leuk_cases ~ PctPopGrowth                 + lon, county_leukemia_sf)
lm_sum_04 <- lm(leuk_cases ~ PctPopGrowth+ Dist2FracLoc_8      , county_leukemia_sf)
## dist 8 is very important, even there is no lon it is ok.

screenreg(
  list(lm_sum_01, lm_sum_02, lm_sum_03, lm_sum_04 ),
  digits = 2
)
## 
## ==============================================================
##                 Model 1     Model 2     Model 3     Model 4   
## --------------------------------------------------------------
## (Intercept)      26.91 ***   26.38 ***   37.24 ***   26.05 ***
##                  (0.30)      (0.29)      (0.19)      (0.22)   
## PctPopGrowth      0.30 ***                0.85 ***    0.26 ***
##                  (0.06)                  (0.10)      (0.06)   
## Dist2FracLoc_8    2.15 ***    2.24 ***                2.33 ***
##                  (0.06)      (0.06)                  (0.04)   
## lon              -0.01 ***   -0.01 **    -0.07 ***            
##                  (0.00)      (0.00)      (0.00)               
## --------------------------------------------------------------
## R^2               0.87        0.87        0.54        0.87    
## Adj. R^2          0.87        0.86        0.54        0.87    
## Num. obs.       500         500         500         500       
## ==============================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Start with trying 3 Statistic Significant Variables. (Model 1)

  • Then let’s see what happened if each of these variable is missing (Model 2,3,4)

  • From Model 3, we can see that Dist2FracLoc_8 is dominating the R^2 result.

screenreg(
  list(lm_sum_01, lm_sum_02, lm_sum_04, lm_risk_01, lm_dist_a08 ,lm_lon),
  digits = 2
)
## 
## ======================================================================================
##                 Model 1     Model 2     Model 3     Model 4     Model 5     Model 6   
## --------------------------------------------------------------------------------------
## (Intercept)      26.91 ***   26.38 ***   26.05 ***   37.05 ***   25.76 ***   36.89 ***
##                  (0.30)      (0.29)      (0.22)      (0.27)      (0.22)      (0.20)   
## PctPopGrowth      0.30 ***                0.26 ***    0.89 ***                        
##                  (0.06)                  (0.06)      (0.15)                           
## Dist2FracLoc_8    2.15 ***    2.24 ***    2.33 ***                2.37 ***            
##                  (0.06)      (0.06)      (0.04)                  (0.04)               
## lon              -0.01 ***   -0.01 **                                        -0.07 ***
##                  (0.00)      (0.00)                                          (0.00)   
## --------------------------------------------------------------------------------------
## R^2               0.87        0.87        0.87        0.07        0.86        0.47    
## Adj. R^2          0.87        0.86        0.87        0.07        0.86        0.47    
## Num. obs.       500         500         500         500         500         500       
## ======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Eliminate lm_sum_03 (Previous Model 3), because we only Focus model with Dist2FracLoc_8.

  • Compare them with previous good model.

  • For balance between R^2 and variables, let keep thes two. lm_sum_04 (Model 3), lm_dist_a08 (Model5)

== Check Residual of our linear modeling ==

Try Residual of lm_sum_04 and bpTest

# Create a data frame with fitted values and residuals
plot_lm_sum_04_data <- data.frame(
  FittedValues = lm_sum_04$fitted.values,
  Residuals = resid(lm_sum_04)
)

# Create the ggplot
ggplot(plot_lm_sum_04_data, aes(x = FittedValues, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values of lm_sum_04"
  ) +
  theme_minimal()

bptest(lm_sum_04)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_sum_04
## BP = 4.3835, df = 2, p-value = 0.1117

Try Residual of lm_dist_a08 and bpTest

# Create a data frame with fitted values and residuals
plot_lm_dist_a08_data <- data.frame(
  FittedValues = lm_dist_a08$fitted.values,
  Residuals = resid(lm_dist_a08)
)

# Create the ggplot
ggplot(plot_lm_dist_a08_data, aes(x = FittedValues, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values of lm_dist_a08"
  ) +
  theme_minimal()

bptest(lm_dist_a08)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_dist_a08
## BP = 0.074982, df = 1, p-value = 0.7842
  • For both models have high p-value, so we couldn’t conclude the no heteroskedasticity, assume both models linear. (homoscedasticity)

  • Too conclude for simplicity use only lm_dist_a08.


==Using GAM and Smooth Curve assume there are some non-linear relationship==.

Try pair test of 3 main variables (PctPopGrowth, Dist2FracLoc_8, lon)

#Pair Test
gam_pair_pct_linear <- gam(leuk_cases ~   PctPopGrowth , data = county_leukemia_sf)
gam_pair_pct_smooth <- gam(leuk_cases ~ s(PctPopGrowth), data = county_leukemia_sf)

AIC(gam_pair_pct_linear)
## [1] 3185.279
AIC(gam_pair_pct_smooth)
## [1] 3184.964
summary(gam_pair_pct_linear)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ PctPopGrowth
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.0532     0.2675 138.516  < 2e-16 ***
## PctPopGrowth   0.8873     0.1456   6.094 2.21e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.0675   Deviance explained = 6.94%
## GCV =  34.08  Scale est. = 33.943    n = 500
summary(gam_pair_pct_smooth)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(PctPopGrowth)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.2596   141.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df     F  p-value    
## s(PctPopGrowth) 4.415  5.526 7.673 8.35e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0744   Deviance explained = 8.26%
## GCV = 34.062  Scale est. = 33.693    n = 500
Topics P-Value AIC R-sq (adj.) GCV & Scale. Est
Linear ( Pop Growth ) ’ *** ’ 3185.3 0.068 34.1 , 34.0
Smooth (Pop Growth ) ’ *** ’ 3185.0 0.074 34.1 , 33.7
Difference | 0.3 | | 0.006 | | 0.0 | , | 0.3 |
  • For simplicity prefer linear. Use linear of PctPopGrowth (gam_pair_pct_linear)

  • Let’s remain with our two main king’s variables.

gam_pair_ds8_linear <- gam(leuk_cases ~   Dist2FracLoc_8 , data = county_leukemia_sf)
gam_pair_ds8_smooth <- gam(leuk_cases ~ s(Dist2FracLoc_8), data = county_leukemia_sf)

AIC(gam_pair_ds8_linear)
## [1] 2229.561
AIC(gam_pair_ds8_smooth)
## [1] 2187.061
summary(gam_pair_ds8_linear)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ Dist2FracLoc_8
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    25.75797    0.21975  117.22   <2e-16 ***
## Dist2FracLoc_8  2.37102    0.04244   55.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.862   Deviance explained = 86.2%
## GCV = 5.0393  Scale est. = 5.0191    n = 500
summary(gam_pair_ds8_smooth)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(Dist2FracLoc_8)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.68400    0.09531   384.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df     F p-value    
## s(Dist2FracLoc_8) 8.567  8.949 391.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.875   Deviance explained = 87.7%
## GCV = 4.6303  Scale est. = 4.5417    n = 500
Topics P-Value AIC R-sq (adj.) GCV & Scale. Est
Linear ( Dist 8 ) ’ *** ’ 2229 0.862 5.0 , 5.0
Smooth ( Dist 8 ) ’ *** ’ 2187 0.875 4.6 , 4.5
Difference | 42 | | 0.013 | | 0.4 | , | 0.5 |
  • Non much difference for simplicity use Linear of Dist2FracLoc_8 (gam_pair_ds8_linear.)

  • Last one with lon

gam_pair_lon_linear <- gam(leuk_cases ~   lon , data = county_leukemia_sf)
gam_pair_lon_smooth <- gam(leuk_cases ~ s(lon), data = county_leukemia_sf)

AIC(gam_pair_lon_linear)
## [1] 2900.649
AIC(gam_pair_lon_smooth)
## [1] 2312.733
summary(gam_pair_lon_linear)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ lon
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.892664   0.196258  187.98   <2e-16 ***
## lon         -0.071028   0.003357  -21.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =  0.472   Deviance explained = 47.3%
## GCV = 19.287  Scale est. = 19.21     n = 500
summary(gam_pair_lon_smooth)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(lon)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.1083   338.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(lon) 6.397   7.54 344.5  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.839   Deviance explained = 84.1%
## GCV = 5.9525  Scale est. = 5.8644    n = 500
Topics P-Value AIC R-sq (adj.) GCV & Scale. Est
Linear ( lon ) ’ *** ’ 2900 0.472 19.3 , 19.2
Smooth ( lon ) ’ *** ’ 2312 0.839 6.0 , 5.9
Difference | 588 | | 0.367 | | 13.3 | , | 13.3 |
  • With drastic improvement using smooth curve of lon (gam_pair_lon_smooth)

  • Assume will using variable as PctPopGrowth + Dist2FracLoc_8 + s(lon)

Too be sure with other variables we left-off, it there are non-linear behaviors

gam_sing_01 <- gam(leuk_cases ~ s(GreenNeighborhoodIndex), data = county_leukemia_sf) #no
gam_sing_02 <- gam(leuk_cases ~ s(CarsPerHH), data = county_leukemia_sf) #no
gam_sing_03 <- gam(leuk_cases ~ s(PopDens), data = county_leukemia_sf) #no
gam_sing_04 <- gam(leuk_cases ~ s(lat), data = county_leukemia_sf) # -OK-

AIC(gam_sing_01)
## [1] 3221.156
AIC(gam_sing_02)
## [1] 3218.954
AIC(gam_sing_03)
## [1] 3221.231
AIC(gam_sing_04)
## [1] 3211.888
summary(gam_sing_01)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(GreenNeighborhoodIndex)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.2701   135.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df     F p-value
## s(GreenNeighborhoodIndex)   1      1 0.078    0.78
## 
## R-sq.(adj) =  -0.00185   Deviance explained = 0.0157%
## GCV = 36.615  Scale est. = 36.468    n = 500
summary(gam_sing_02)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(CarsPerHH)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.2681   136.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value
## s(CarsPerHH) 6.09  7.258 1.311   0.283
## 
## R-sq.(adj) =  0.0126   Deviance explained = 2.46%
## GCV = 36.461  Scale est. = 35.944    n = 500
summary(gam_sing_03)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(PopDens)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.2701   135.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value
## s(PopDens)   1      1 0.004   0.948
## 
## R-sq.(adj) =  -0.002   Deviance explained = 0.000858%
## GCV =  36.62  Scale est. = 36.474    n = 500
summary(gam_sing_04)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.6840     0.2672   137.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value  
## s(lat) 2.283  2.838 3.217  0.0192 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0191   Deviance explained = 2.35%
## GCV = 35.943  Scale est. = 35.707    n = 500
Topics P-Value AIC R-sq (adj.) GCV & Scale. Est
(GreenIndex) ’ - ’ 3221 ’ - ’ ’ - ’
CarPerHH ’ - ’ 3219 ’ - ’ ’ - ’
PopDens ’ - ’ 3221 ’ - ’ ’ - ’
(Latitude) ’ * ’ 3212 0.019 35.9 , 35.7
  • latitude is only statistically significant here, let do some more experiment with it.

  • Result give similar result of focusing on PctPopGrowth, Dist2FracLoc8 , and lon.

GAM with focus variables

weights_normal = abs(county_leukemia_sf$PctPopGrowth)

gam_foc_01 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + s(lon, lat),data= county_leukemia_sf)

gam_foc_02 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + s(lon),
                        data= county_leukemia_sf)

gam_foc_03 <- gam(leuk_cases ~                Dist2FracLoc_8 + s(lon, lat),
                        data= county_leukemia_sf)

gam_foc_04 <- gam(leuk_cases ~ PctPopGrowth                  + s(lon, lat),
                        data= county_leukemia_sf)

gam_foc_05 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8       ,
                        data= county_leukemia_sf)



AIC(gam_foc_01)
## [1] 2069.665
AIC(gam_foc_02)
## [1] 2185.837
AIC(gam_foc_03)
## [1] 2110.986
AIC(gam_foc_04)
## [1] 2080.057
AIC(gam_foc_05)
## [1] 2210.431
# summary(gam_foc_01)
# summary(gam_foc_02)
# summary(gam_foc_03)
summary(gam_foc_04)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## leuk_cases ~ PctPopGrowth + s(lon, lat)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.87095    0.08925 413.100  < 2e-16 ***
## PctPopGrowth  0.44931    0.07204   6.237 9.91e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf Ref.df     F p-value    
## s(lon,lat) 26.77  28.69 149.6  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.903   Deviance explained = 90.8%
## GCV = 3.7497  Scale est. = 3.534     n = 500
# summary(gam_foc_05)


# gam_foc_01 <- gam(leuk_cases ~ PctPopGrowth,
# county_leukemia_sf, weights = PctPopGrowth)
# 
# gam_foc_02 <- gam(leuk_cases ~ PctPopGrowth+ Dist2FracLoc_8 + s(lon),
# data = county_leukemia_sf, weights = PctPopGrowth)
Topics PctPop Dist2_8 s(lon) s(lat) AIC R-sq (adj.)
All variables Y Y Y Y 2070 0.905
Without s(lat) Y Y Y 2186 0.876
Without PctPopGrowth Y Y Y 2111 0.896
Without Dist2FracLoc_8 Y Y Y 2080 0.903
Without s(lon,lat) Y Y 2210 0.868
  • We are choosing Model Without Dist2FracLoc_8 or gam_foc_04 or Leuk_cases (PctPopGrowth + s(lon,lat).

Let’s do GAM Check for both

gam.check(gam_foc_01)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 1.888517e-06 .
## The Hessian was positive definite.
## Model rank =  32 / 32 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##              k'  edf k-index p-value    
## s(lon,lat) 29.0 25.4    0.69  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam.check(gam_foc_04)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 7 iterations.
## The RMS GCV score gradient at convergence was 1.245411e-05 .
## The Hessian was positive definite.
## Model rank =  31 / 31 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##              k'  edf k-index p-value    
## s(lon,lat) 29.0 26.8    0.67  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam_foc_01)

plot(gam_foc_04, main = "GAM Models - Leukemia Prediction")

  • After checking we see good result, so to repeat We are choosing Model Without Dist2FracLoc_8 or gam_foc_04 or Leuk_cases (PctPopGrowth + s(lon,lat).

Transition summary :

screenreg( list(lm_dist_a08, gam_foc_04), digits = 2, 
           custom.model.names = c("OLS Model", "GAM Model") )
## 
## ============================================
##                     OLS Model   GAM Model   
## --------------------------------------------
## (Intercept)          25.76 ***     36.87 ***
##                      (0.22)        (0.09)   
## Dist2FracLoc_8        2.37 ***              
##                      (0.04)                 
## PctPopGrowth                        0.45 ***
##                                    (0.07)   
## EDF: s(lon,lat)                    26.77 ***
##                                   (28.69)   
## --------------------------------------------
## R^2                   0.86          0.90    
## Adj. R^2              0.86                  
## Num. obs.           500           500       
## AIC                              2080.06    
## BIC                              2205.51    
## Log Likelihood                  -1010.26    
## Deviance                         1665.32    
## Deviance explained                  0.91    
## Dispersion                          3.53    
## GCV score                           3.75    
## Num. smooth terms                   1       
## ============================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

== Using Spatial for next result ==

Set up Spatial Weight Matrix

coords <- cbind(county_leukemia_sf$lon, county_leukemia_sf$lat)

dists <- dist(coords)  # Compute pairwise distances
knn <- knn2nb(knearneigh(coords, k = 4))  # Create a neighbor list using k=4 nearest neighbors
sp_weight <- nb2listw(knn , style="W")  # Create the spatial weights list

moran_test <- moran.test(county_leukemia_sf$leuk_cases, listw = sp_weight)
print(moran_test)
## 
##  Moran I test under randomisation
## 
## data:  county_leukemia_sf$leuk_cases  
## weights: sp_weight    
## 
## Moran I statistic standard deviate = 31.243, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.9254981482     -0.0020040080      0.0008812889
  • Super high Moran I Statistic, mean strong positive spatial autocorrelation.

    Topics P-value Moran’s I Expectation Variance
    leuk_cases(lon,lat) ’ *** ’ 0.93 -0.002 0.0009

Do the lm.morantest to find spatial correlation in my residual of models

lm.morantest(lm_dist_a08, sp_weight)
## 
##  Global Moran I for regression residuals
## 
## data:  
## model: lm(formula = leuk_cases ~ Dist2FracLoc_8, data =
## county_leukemia_sf)
## weights: sp_weight
## 
## Moran I statistic standard deviate = 16.062, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I      Expectation         Variance 
##     0.4712159465    -0.0040010647     0.0008753169
Topics P-value Moran’s I Expectation Variance
lm_dist_a08 (Only Dist2FracLoc_8) ’ *** ’ 0.47 -0.004 0.0009
  • There still some spatial cluster in my residual of this model.
moran.test(gam_foc_04$residuals , sp_weight)
## 
##  Moran I test under randomisation
## 
## data:  gam_foc_04$residuals  
## weights: sp_weight    
## 
## Moran I statistic standard deviate = 10.259, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.3024806076     -0.0020040080      0.0008808084
Topics P-value Moran’s I Expectation Variance
gam_foc_04 (PctPopGrowth + s(lon,lat)) ’ *** ’ 0.30 -0.002 0.0009
Topics P-value Moran’s I Expectation Variance
lm_dist_a08 (Only Dist2FracLoc_8) ’ *** ’ 0.47 -0.004 0.0009
gam_foc_04 (PctPopGrowth + s(lon,lat)) ’ *** ’ 0.30 -0.002 0.0009
  • There still some spatial cluster in my residual of this model.

  • To address this we have to apply spatial error model to account for spatially correlated error terms

Spatial Error Model

sperr_lm_dist_a08 = errorsarlm(leuk_cases ~ Dist2FracLoc_8, 
                     data= county_leukemia_sf, listw = sp_weight)

summary(sperr_lm_dist_a08)
## 
## Call:
## errorsarlm(formula = leuk_cases ~ Dist2FracLoc_8, data = county_leukemia_sf, 
##     listw = sp_weight)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4.86902 -1.29451  0.10411  1.26150  5.13015 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                 Estimate Std. Error z value  Pr(>|z|)
## (Intercept)    25.918449   0.439734  58.941 < 2.2e-16
## Dist2FracLoc_8  2.346697   0.084292  27.840 < 2.2e-16
## 
## Lambda: 0.60654, LR test value: 177.9, p-value: < 2.22e-16
## Asymptotic standard error: 0.036973
##     z-value: 16.405, p-value: < 2.22e-16
## Wald statistic: 269.12, p-value: < 2.22e-16
## 
## Log likelihood: -1022.832 for error model
## ML residual variance (sigma squared): 3.1715, (sigma: 1.7809)
## Number of observations: 500 
## Number of parameters estimated: 4 
## AIC: 2053.7, (AIC for lm: 2229.6)
sperr_gam_foc_04 = errorsarlm(leuk_cases ~ PctPopGrowth, 
                     data= county_leukemia_sf, listw = sp_weight)

summary(sperr_gam_foc_04)
## 
## Call:
## errorsarlm(formula = leuk_cases ~ PctPopGrowth, data = county_leukemia_sf, 
##     listw = sp_weight)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4.91820 -1.20355  0.01942  1.16151  5.13078 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error z value  Pr(>|z|)
## (Intercept)  37.716005   0.856533 44.0334 < 2.2e-16
## PctPopGrowth  0.552459   0.062766  8.8019 < 2.2e-16
## 
## Lambda: 0.91069, LR test value: 1049.7, p-value: < 2.22e-16
## Asymptotic standard error: 0.010938
##     z-value: 83.258, p-value: < 2.22e-16
## Wald statistic: 6931.9, p-value: < 2.22e-16
## 
## Log likelihood: -1064.78 for error model
## ML residual variance (sigma squared): 2.9195, (sigma: 1.7087)
## Number of observations: 500 
## Number of parameters estimated: 4 
## AIC: 2137.6, (AIC for lm: 3185.3)
screenreg( list(sperr_lm_dist_a08, sperr_gam_foc_04), digits = 2 , custom.model.names = c("sperr_lm_dist_a08", "sperr_gam_foc_04"))
## 
## ========================================================
##                      sperr_lm_dist_a08  sperr_gam_foc_04
## --------------------------------------------------------
## (Intercept)             25.92 ***          37.72 ***    
##                         (0.44)             (0.86)       
## Dist2FracLoc_8           2.35 ***                       
##                         (0.08)                          
## lambda                   0.61 ***           0.91 ***    
##                         (0.04)             (0.01)       
## PctPopGrowth                                0.55 ***    
##                                            (0.06)       
## --------------------------------------------------------
## Num. obs.              500                500           
## Parameters               4                  4           
## Log Likelihood       -1022.83           -1064.78        
## AIC (Linear model)    2229.56            3185.28        
## AIC (Spatial model)   2053.66            2137.56        
## LR test: statistic     177.90            1049.72        
## LR test: p-value         0.00               0.00        
## ========================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Let’s use this as an benchmark

Introducing Risk Factors back

sperr_risk_1 <- errorsarlm(leuk_cases ~ PctPopGrowth, 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_2 <- errorsarlm(leuk_cases ~ Dist2FracLoc_8, 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_3 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8, 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_4 <- errorsarlm(leuk_cases ~ PctPopGrowth + GreenNeighborhoodIndex,
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_5 <- errorsarlm(leuk_cases ~ PctPopGrowth + CarsPerHH,
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_6 <- errorsarlm(leuk_cases ~ PctPopGrowth + PopDens,
                     data= county_leukemia_sf, listw = sp_weight)

sperr_risk_7 <- errorsarlm(leuk_cases ~ PctPopGrowth + AverageAge,
                     data= county_leukemia_sf, listw = sp_weight) 

screenreg( list(sperr_risk_1, sperr_risk_2, sperr_risk_3, sperr_risk_4), digits = 3 , custom.model.names = c("sperr_risk_1", "sperr_risk_2", "sperr_risk_3", "sperr_risk_4"))
## 
## =================================================================================
##                         sperr_risk_1   sperr_risk_2   sperr_risk_3  sperr_risk_4 
## ---------------------------------------------------------------------------------
## (Intercept)                37.716 ***     25.918 ***    26.481 ***     38.359 ***
##                            (0.857)        (0.440)       (0.476)        (0.874)   
## PctPopGrowth                0.552 ***                    0.491 ***      0.541 ***
##                            (0.063)                      (0.059)        (0.061)   
## lambda                      0.911 ***      0.607 ***     0.661 ***      0.915 ***
##                            (0.011)        (0.037)       (0.033)        (0.010)   
## Dist2FracLoc_8                             2.347 ***     2.277 ***               
##                                           (0.084)       (0.091)                  
## GreenNeighborhoodIndex                                                 -1.343 ***
##                                                                        (0.232)   
## ---------------------------------------------------------------------------------
## Num. obs.                 500            500           500            500        
## Parameters                  4              4             5              5        
## Log Likelihood          -1064.780      -1022.832      -991.545      -1048.702    
## AIC (Linear model)       3185.279       2229.561      2210.431       3187.117    
## AIC (Spatial model)      2137.559       2053.664      1993.091       2107.405    
## LR test: statistic       1049.720        177.897       219.340       1081.712    
## LR test: p-value            0.000          0.000         0.000          0.000    
## =================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(sperr_risk_5, sperr_risk_6, sperr_risk_7), digits = 3 , custom.model.names = c("sperr_risk_5", "sperr_risk_6", "sperr_risk_7"))
## 
## ===================================================================
##                         sperr_risk_5   sperr_risk_6   sperr_risk_7 
## -------------------------------------------------------------------
## (Intercept)                37.455 ***     37.645 ***     37.186 ***
##                            (0.867)        (0.861)        (0.873)   
## PctPopGrowth                0.559 ***      0.553 ***      0.534 ***
##                            (0.062)        (0.063)        (0.061)   
## CarsPerHH                   0.587 *                                
##                            (0.255)                                 
## lambda                      0.911 ***      0.911 ***      0.915 ***
##                            (0.011)        (0.011)        (0.011)   
## PopDens                                    0.378                   
##                                           (0.335)                  
## AverageAge15-50                                           0.693 ***
##                                                          (0.163)   
## AverageAge50 and older                                    0.887 ***
##                                                          (0.167)   
## -------------------------------------------------------------------
## Num. obs.                 500            500            500        
## Parameters                  5              5              6        
## Log Likelihood          -1062.140      -1064.144      -1049.260    
## AIC (Linear model)       3187.219       3187.267       3188.236    
## AIC (Spatial model)      2134.280       2138.289       2110.520    
## LR test: statistic       1054.939       1050.978       1079.716    
## LR test: p-value            0.000          0.000          0.000    
## ===================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Green index show negative outcome.

  • PctPopGrowth, Dist2FracLoc_8, AverageAge, CarPerHH is statistically significant

  • This also show how importance Dist2FracLoc_8 is

  • Keep this sperr_risk_3 in mind

Let’s get in focus mode variables

sperr_foc_1 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge + PopDens + CarsPerHH, 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_foc_2 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge , 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_foc_3 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + lon + lat , 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_foc_4 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge + CarsPerHH, 
                     data= county_leukemia_sf, listw = sp_weight)

sperr_foc_5 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + CarsPerHH, 
                     data= county_leukemia_sf, listw = sp_weight)

screenreg( list(sperr_foc_1, sperr_foc_2, sperr_foc_3, sperr_foc_4, sperr_foc_5), digits = 2, custom.model.names = c("sperr_foc_1", "sperr_foc_2", "sperr_foc_3", "sperr_foc_4", "sperr_foc_5") )
## 
## =======================================================================================
##                         sperr_foc_1  sperr_foc_2  sperr_foc_3  sperr_foc_4  sperr_foc_5
## ---------------------------------------------------------------------------------------
## (Intercept)               26.13 ***    26.46 ***    27.87 ***    26.13 ***    26.78 ***
##                           (0.51)       (0.49)       (0.63)       (0.51)       (0.52)   
## PctPopGrowth               0.48 ***     0.47 ***     0.49 ***     0.48 ***     0.49 ***
##                           (0.05)       (0.06)       (0.06)       (0.05)       (0.06)   
## Dist2FracLoc_8             2.30 ***     2.29 ***     2.11 ***     2.30 ***     2.28 ***
##                           (0.09)       (0.09)       (0.12)       (0.09)       (0.09)   
## GreenNeighborhoodIndex    -1.30 ***    -1.31 ***    -1.21 ***    -1.28 ***    -1.17 ***
##                           (0.22)       (0.23)       (0.24)       (0.22)       (0.23)   
## AverageAge15-50            0.76 ***     0.77 ***                  0.76 ***             
##                           (0.15)       (0.16)                    (0.16)                
## AverageAge50 and older     1.07 ***     1.03 ***                  1.05 ***             
##                           (0.16)       (0.16)                    (0.16)                
## PopDens                   -0.61                                                        
##                           (0.40)                                                       
## CarsPerHH                  0.90 **                                0.61 *       0.56 *  
##                           (0.30)                                 (0.24)       (0.25)   
## lambda                     0.69 ***     0.68 ***     0.66 ***     0.69 ***     0.68 ***
##                           (0.03)       (0.03)       (0.03)       (0.03)       (0.03)   
## lon                                                 -0.01 *                            
##                                                     (0.00)                             
## lat                                                 -0.01                              
##                                                     (0.01)                             
## ---------------------------------------------------------------------------------------
## Num. obs.                500          500          500          500          500       
## Parameters                10            8            8            9            7       
## Log Likelihood          -953.01      -957.44      -976.64      -954.16      -976.65    
## AIC (Linear model)      2168.16      2167.30      2183.11      2168.49      2204.85    
## AIC (Spatial model)     1926.03      1930.88      1969.28      1926.33      1967.29    
## LR test: statistic       244.14       238.42       215.83       244.16       239.55    
## LR test: p-value           0.00         0.00         0.00         0.00         0.00    
## =======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Retry with all fraction site

sperr_dist_01 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_1  + Dist2FracLoc_2 + Dist2FracLoc_3 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,  
                     data= county_leukemia_sf, listw = sp_weight)

sperr_dist_02 <- errorsarlm(leuk_cases ~                Dist2FracLoc_1  + Dist2FracLoc_2 + Dist2FracLoc_3 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,  
                     data= county_leukemia_sf, listw = sp_weight)

sperr_dist_03 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,  
                     data= county_leukemia_sf, listw = sp_weight)

sperr_dist_04 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_1 + Dist2FracLoc_2 + Dist2FracLoc_3,  
                     data= county_leukemia_sf, listw = sp_weight)

sperr_dist_05 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6,  
                     data= county_leukemia_sf, listw = sp_weight)

sperr_dist_06 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_7,  
                     data= county_leukemia_sf, listw = sp_weight)

screenreg( list(sperr_dist_01, sperr_dist_02, sperr_dist_03), digits = 3 , custom.model.names = c("sperr_dist_01", "sperr_dist_02", "sperr_dist_03"))
## 
## ================================================================
##                      sperr_dist_01  sperr_dist_02  sperr_dist_03
## ----------------------------------------------------------------
## (Intercept)            19.948 ***      21.683 ***    26.827 *** 
##                        (4.502)         (4.388)       (0.567)    
## PctPopGrowth            0.495 ***                     0.498 *** 
##                        (0.060)                       (0.059)    
## Dist2FracLoc_1         -0.158           0.459                   
##                        (0.942)         (0.920)                  
## Dist2FracLoc_2          0.172           0.767                   
##                        (1.271)         (1.255)                  
## Dist2FracLoc_3          1.315           0.315                   
##                        (1.340)         (1.320)                  
## Dist2FracLoc_4          0.970           0.010                   
##                        (1.175)         (1.149)                  
## Dist2FracLoc_5         -0.344          -0.838                   
##                        (0.583)         (0.565)                  
## Dist2FracLoc_6         -0.696           0.061                   
##                        (0.474)         (0.446)                  
## Dist2FracLoc_7         -1.858          -1.635                   
##                        (1.875)         (1.893)                  
## Dist2FracLoc_8          3.237           1.245         3.266 *** 
##                        (2.306)         (2.317)       (0.865)    
## Dist2FracLoc_9          1.983           2.685 *      -0.075     
##                        (1.359)         (1.354)       (0.215)    
## Dist2FracLoc_10        -0.823           0.295        -0.958     
##                        (1.682)         (1.684)       (0.752)    
## lambda                  0.617 ***       0.563 ***     0.652 *** 
##                        (0.036)         (0.040)       (0.034)    
## ----------------------------------------------------------------
## Num. obs.             500             500           500         
## Parameters             14              13             7         
## Log Likelihood       -983.554       -1014.207      -990.540     
## AIC (Linear model)   2161.776        2189.429      2202.323     
## AIC (Spatial model)  1995.108        2054.414      1995.080     
## LR test: statistic    168.668         137.015       209.242     
## LR test: p-value        0.000           0.000         0.000     
## ================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(sperr_dist_04, sperr_dist_05, sperr_dist_06), digits = 3 , custom.model.names = c("sperr_dist_04", "sperr_dist_05", "sperr_dist_06"))
## 
## ================================================================
##                      sperr_dist_04  sperr_dist_05  sperr_dist_06
## ----------------------------------------------------------------
## (Intercept)            22.808 ***     24.970 ***     26.475 *** 
##                        (2.100)        (3.212)        (0.474)    
## PctPopGrowth            0.484 ***      0.489 ***      0.492 *** 
##                        (0.059)        (0.059)        (0.059)    
## Dist2FracLoc_8          1.958 ***      2.031 ***      2.115 *** 
##                        (0.156)        (0.285)        (0.252)    
## Dist2FracLoc_1          0.405                                   
##                        (0.254)                                  
## Dist2FracLoc_2          1.438                                   
##                        (0.944)                                  
## Dist2FracLoc_3         -0.805                                   
##                        (0.742)                                  
## lambda                  0.632 ***      0.656 ***      0.660 *** 
##                        (0.035)        (0.033)        (0.033)    
## Dist2FracLoc_4                         0.087                    
##                                       (0.291)                   
## Dist2FracLoc_5                         0.245                    
##                                       (0.390)                   
## Dist2FracLoc_6                         0.202                    
##                                       (0.241)                   
## Dist2FracLoc_7                                        0.169     
##                                                      (0.246)    
## ----------------------------------------------------------------
## Num. obs.             500            500            500         
## Parameters              8              8              6         
## Log Likelihood       -988.620       -990.750       -991.311     
## AIC (Linear model)   2172.959       2207.437       2210.784     
## AIC (Spatial model)  1993.239       1997.501       1994.622     
## LR test: statistic    181.720        211.936        218.162     
## LR test: p-value        0.000          0.000          0.000     
## ================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Only 8 are statistically significant..

Summarize

screenreg( list(sperr_lm_dist_a08, sperr_gam_foc_04, sperr_risk_3, sperr_foc_2), digits = 3 , custom.model.names = c("sperr_lm_dist_a08", "sperr_gam_foc_04", "sperr_risk_3", "sperr_foc_2"))
## 
## =======================================================================================
##                         sperr_lm_dist_a08  sperr_gam_foc_04  sperr_risk_3  sperr_foc_2 
## ---------------------------------------------------------------------------------------
## (Intercept)                25.918 ***         37.716 ***       26.481 ***    26.455 ***
##                            (0.440)            (0.857)          (0.476)       (0.494)   
## Dist2FracLoc_8              2.347 ***                           2.277 ***     2.293 ***
##                            (0.084)                             (0.091)       (0.090)   
## lambda                      0.607 ***          0.911 ***        0.661 ***     0.684 ***
##                            (0.037)            (0.011)          (0.033)       (0.031)   
## PctPopGrowth                                   0.552 ***        0.491 ***     0.468 ***
##                                               (0.063)          (0.059)       (0.055)   
## GreenNeighborhoodIndex                                                       -1.307 ***
##                                                                              (0.226)   
## AverageAge15-50                                                               0.766 ***
##                                                                              (0.156)   
## AverageAge50 and older                                                        1.034 ***
##                                                                              (0.162)   
## ---------------------------------------------------------------------------------------
## Num. obs.                 500                500              500           500        
## Parameters                  4                  4                5             8        
## Log Likelihood          -1022.832          -1064.780         -991.545      -957.440    
## AIC (Linear model)       2229.561           3185.279         2210.431      2167.301    
## AIC (Spatial model)      2053.664           2137.559         1993.091      1930.881    
## LR test: statistic        177.897           1049.720          219.340       238.421    
## LR test: p-value            0.000              0.000            0.000         0.000    
## =======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Model 3 have well balanced between AIC and number of Parameter. sperr_risk_3

== Model Compare between 3 methods ==

Compare model together

screenreg( list(lm_dist_a08, gam_foc_04, sperr_risk_3), digits = 3 , custom.model.names = c("lm_dist_a08", "gam_foc_04", "sperr_risk_3"))
## 
## =============================================================
##                      lm_dist_a08  gam_foc_04     sperr_risk_3
## -------------------------------------------------------------
## (Intercept)           25.758 ***     36.871 ***    26.481 ***
##                       (0.220)        (0.089)       (0.476)   
## Dist2FracLoc_8         2.371 ***                    2.277 ***
##                       (0.042)                      (0.091)   
## PctPopGrowth                          0.449 ***     0.491 ***
##                                      (0.072)       (0.059)   
## EDF: s(lon,lat)                      26.767 ***              
##                                     (28.690)                 
## lambda                                              0.661 ***
##                                                    (0.033)   
## -------------------------------------------------------------
## R^2                    0.862          0.903                  
## Adj. R^2               0.862                                 
## Num. obs.            500            500           500        
## AIC                                2080.057                  
## BIC                                2205.515                  
## Log Likelihood                    -1010.261      -991.545    
## Deviance                           1665.323                  
## Deviance explained                    0.908                  
## Dispersion                            3.534                  
## GCV score                             3.750                  
## Num. smooth terms                     1                      
## Parameters                                          5        
## AIC (Linear model)                               2210.431    
## AIC (Spatial model)                              1993.091    
## LR test: statistic                                219.340    
## LR test: p-value                                    0.000    
## =============================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
  • Comparing-wise there three models show similar fit (R^2 = 0.862 and 0.903) (AIC = 2080 and 1993).

  • Let’s select model base on logical and data experimented.

  • For my first disagreement to the models, Dist2FracLoc espescially for 8 show high correlation, but to conclude that farther away from Fraction Site, the more chance of getting leukemia is not logically explained. Reject lm_dist_a08 model and sperr_risk_3 model.

  • With GAM Model with show nicely explained plot that is totally agree with Leukemia Density.

  • From moran’s I test, this also show some geographically clustered. Moran I Statistic = 0.30. Indicated moderate positive spatial autocorrelation. Similar areas tend to have similar levels of Leukemia cases.

  • Considering environmental factor (such as GreenNeighborhoodIndex, CarsPerHH and PopDens) that might related to pollution also show insignificant to leukemia cases

  • Considering Genetic Factors, Even thorough Leukemia is not infected like flu, but by Genetic Predisposition could cause spatial gradient, this could be explaining the geographically clustered that happened in the area.

  • Additionally the leukemia cases also linearly correlated to the PctPopGrowth as well. Is back up the possibility of Genetic Predisposition theory.

  • At the center of the cluster show less number of Leukemia cases, if the Genetic Predisposition theory is correct. The separate gene should have immunity factor to leukemia.

  • To conclude, We strongly believe that the leukemia immunity’s gene was separated by Genetic Predisposition starting from the center of area near Fraction Site 8, result in less leukemia cases around the center of the cluster. And our model show the explanation according to this Generic Predisposition Theory.

Topics P-value Moran’s I Expectation Variance
gam_foc_04 (PctPopGrowth + s(lon,lat)) ’ *** ’ 0.30 -0.002 0.0009
Topics P-Value AIC R-sq (adj.) GCV & Scale. Est
(GreenIndex) ’ - ’ 3221 ’ - ’ ’ - ’
CarPerHH ’ - ’ 3219 ’ - ’ ’ - ’
PopDens ’ - ’ 3221 ’ - ’ ’ - ’
(Latitude) ’ * ’ 3212 0.019 35.9 , 35.7

5. Modeling & Experimenting

6. Evaluating Results

7. Conclusion and Insight

== Side Quest==

head(coords)
##          [,1]      [,2]
## [1,] 44.93403 -29.47574
## [2,] 89.50712 -18.58449
## [3,] 37.06287  32.51696
## [4,] 73.29633 -20.26620
## [5,] 90.39685 -12.98208
## [6,] 69.74439 -48.33941
head(fracking_coords)
##              X          Y
## [1,] -40.33708  -6.008472
## [2,]  88.45395  25.422681
## [3,]  78.33582  34.077583
## [4,] -51.85614   1.218802
## [5,]  99.02833 -29.439174
## [6,]  31.19807  38.713713
leukemia_coords <- coords

dists_btw_frac <-  spDists(leukemia_coords, fracking_coords, longlat = TRUE)

min_dists_btw_frac <- apply(dists_btw_frac, 1, min)

county_leukemia_sf$min_fracking_dist <- min_dists_btw_frac

cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases)
## [1] 0.5892939

Null Hypothesis :

Null Hypothesis (H₀): There is no relationship between the proximity to fracking sites and the number of leukemia cases.

Alternative Hypothesis (H₁): There is a relationship between the proximity to fracking sites and the number of leukemia cases.

ggplot(county_leukemia_sf, aes(x = min_dists_btw_frac, y = leuk_cases)) +
  geom_point() +
  labs(title = "Leukemia Cases vs. Distance to Fracking Sites", x = "Distance to Nearest Fracking Site", y = "Leukemia Cases")

  • I expected a negative correlation but no where to be found. (Expected Closer = more cases).
ggplot() +
  geom_sf(data = county_leukemia_sf, aes(fill = leuk_cases)) +
  geom_sf(data = fracking_locations_sf, fill = "green", color = "red", alpha = 0.5) +
  ggtitle("Leukemia Cases and Fracking Sites Proximity") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases, method = "pearson")
## [1] 0.5892939
cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases, method = "spearman")
## [1] 0.4866192
model <- lm(leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf)
summary(model)
## 
## Call:
## lm(formula = leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens + 
##     GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.9657 -3.3809  0.0416  3.2894 12.2270 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            30.397902   0.622375  48.842   <2e-16 ***
## min_fracking_dist       0.002649   0.000136  19.479   <2e-16 ***
## CarsPerHH              -0.103245   0.923447  -0.112    0.911    
## PopDens                 0.200990   1.217783   0.165    0.869    
## GreenNeighborhoodIndex -0.665608   0.688988  -0.966    0.334    
## PctPopGrowth            1.205996   0.111586  10.808   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.398 on 494 degrees of freedom
## Multiple R-squared:  0.474,  Adjusted R-squared:  0.4687 
## F-statistic: 89.05 on 5 and 494 DF,  p-value: < 2.2e-16
plot(model)

  • PctPopGrowth and min_fracking_dist is related as expected.
model <- errorsarlm(leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf, listw = sp_weight)
summary(model)
## 
## Call:errorsarlm(formula = leuk_cases ~ min_fracking_dist + CarsPerHH + 
##     PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf, 
##     listw = sp_weight)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.7666857 -1.1361530  0.0051382  1.1145987  4.3032821 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                           Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)            34.47004846  1.00708738 34.2275 < 2.2e-16
## min_fracking_dist       0.00123996  0.00024861  4.9877 6.111e-07
## CarsPerHH               0.57222610  0.31472164  1.8182   0.06903
## PopDens                -0.21721102  0.41260962 -0.5264   0.59859
## GreenNeighborhoodIndex -1.30244341  0.23169775 -5.6213 1.895e-08
## PctPopGrowth            0.56467110  0.06015880  9.3863 < 2.2e-16
## 
## Lambda: 0.89667, LR test value: 822.39, p-value: < 2.22e-16
## Asymptotic standard error: 0.012392
##     z-value: 72.361, p-value: < 2.22e-16
## Wald statistic: 5236.1, p-value: < 2.22e-16
## 
## Log likelihood: -1035.789 for error model
## ML residual variance (sigma squared): 2.6604, (sigma: 1.6311)
## Number of observations: 500 
## Number of parameters estimated: 8 
## AIC: 2087.6, (AIC for lm: 2908)
  • Nothing is significant

Test to see the trend of 10 fracking site

library(ggplot2)
library(reshape2)

# # Create a data frame with the distance variables and leukemia cases
# distance_columns <- c("Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", 
#                       "Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", 
#                       "Dist2FracLoc_9", "Dist2FracLoc_10")
# 
# # Reshape data for easier plotting
# leukemia_distances <- county_leukemia_sf[, c("leuk_cases", distance_columns)]
# 
# leukemia_distances <- data.frame(leukemia_distances)
# class(leukemia_distances)
# 
# leukemia_distances_melted <- melt(leukemia_distances, id.vars = "leuk_cases", variable.name = "Fracking_Site", value.name = "Distance")
# 
# # Plotting scatter plots for each fracking site
# ggplot(leukemia_distances_melted, aes(x = Distance, y = leuk_cases)) +
#   geom_point() +
#   facet_wrap(~Fracking_Site, scales = "free") +  # Create a separate plot for each fracking site
#   labs(title = "Scatter Plots of Leukemia Cases vs. Distance to Fracking Sites",
#        x = "Distance to Fracking Site (Euclidean)",
#        y = "Number of Leukemia Cases") +
#   theme_minimal()
library(dplyr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
## The following object is masked from 'package:texreg':
## 
##     extract
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
library(ggplot2)

# Create a data frame with the distance variables and leukemia cases
distance_columns <- c("Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", 
                      "Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", 
                      "Dist2FracLoc_9", "Dist2FracLoc_10")

# Select only the relevant columns
leukemia_distances <- county_leukemia_sf %>%
  select(leuk_cases, all_of(distance_columns))

# Reshape data using tidyr::pivot_longer (alternative to melt)
leukemia_distances_melted <- leukemia_distances %>%
  pivot_longer(cols = starts_with("Dist2FracLoc"), 
               names_to = "Fracking_Site", 
               values_to = "Distance")

# Plotting scatter plots for each fracking site
ggplot(leukemia_distances_melted, aes(x = Distance, y = leuk_cases)) +
  geom_point() +
  facet_wrap(~Fracking_Site, scales = "free") +  # Create a separate plot for each fracking site
  labs(title = "Scatter Plots of Leukemia Cases vs. Distance to Fracking Sites",
       x = "Distance to Fracking Site (Euclidean)",
       y = "Number of Leukemia Cases") +
  theme_minimal()

  • There is no negative correlation.
dists_displacement <- st_distance(county_leukemia_sf, fracking_locations_sf)

min_dists_displacement <- apply(dists_displacement, 1, min)

county_leukemia_sf$min_fracking_dist_displacement <- min_dists_displacement
  • Scale it to 1-10
# Identify the minimum and maximum values of the distance column
min_distance_sc <- min(county_leukemia_sf$min_fracking_dist_displacement, na.rm = TRUE)
max_distance_sc <- max(county_leukemia_sf$min_fracking_dist_displacement, na.rm = TRUE)

# Perform Min-Max Scaling to rescale distance to 0-10 range
county_leukemia_sf$scaled_min_fracking_dist <- 
  (county_leukemia_sf$min_fracking_dist_displacement - min_distance_sc) / 
  (max_distance_sc - min_distance_sc) * 10

# Check the result
head(county_leukemia_sf$scaled_min_fracking_dist)
## [1] 0.8270320 1.6581797 0.8910517 3.3780370 2.6941850 3.1186514
  • Scaled then ggplot()
library(ggplot2)

ggplot(county_leukemia_sf, aes(x = scaled_min_fracking_dist, y = leuk_cases)) +
  geom_point() +
  labs(title = "Leukemia Cases vs. Scaled Distance to Nearest Fracking Site",
       x = "Scaled Distance to Nearest Fracking Site (0-10)",
       y = "Number of Leukemia Cases") +
  theme_minimal()

  • Still a positive trend.

  • Do correlation checking

cor(county_leukemia_sf$scaled_min_fracking_dist, county_leukemia_sf$leuk_cases, method = "pearson")
## [1] 0.581328
cor(county_leukemia_sf$scaled_min_fracking_dist, county_leukemia_sf$leuk_cases, method = "spearman")
## [1] 0.4793031
model <- lm(leuk_cases ~ scaled_min_fracking_dist, data = county_leukemia_sf)
summary(model)
## 
## Call:
## lm(formula = leuk_cases ~ scaled_min_fracking_dist, data = county_leukemia_sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.749  -4.057  -0.331   3.647  15.102 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              31.21840    0.40720   76.67   <2e-16 ***
## scaled_min_fracking_dist  1.50760    0.09456   15.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.914 on 498 degrees of freedom
## Multiple R-squared:  0.3379, Adjusted R-squared:  0.3366 
## F-statistic: 254.2 on 1 and 498 DF,  p-value: < 2.2e-16
plot(model)