Leukemia and our Hypothesis
Exploration
Cleaning and Transform
Feature Selected (ggpairs)
#include
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggspatial)
library(lmtest)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(spatialreg)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: Matrix
## Loading required package: sf
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(spdep)
##
## Attaching package: 'spdep'
## The following objects are masked from 'package:spatialreg':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
library(texreg)
## Version: 1.39.4
## Date: 2024-07-23
## Author: Philip Leifeld (University of Manchester)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(sf)
library(ggplot2)
library(sp)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load("data_n128.RData")
# Look into county_leukemia_128
# head(county_leukemia_128)
summary(county_leukemia_128)
## id leuk_cases AverageAge GreenNeighborhoodIndex
## Min. : 1.0 Min. :23.00 15 and younger:186 Min. :0.0002333
## 1st Qu.:125.8 1st Qu.:32.00 15-50 :167 1st Qu.:0.2346412
## Median :250.5 Median :35.00 50 and older :147 Median :0.4859838
## Mean :250.5 Mean :36.68 Mean :0.4862802
## 3rd Qu.:375.2 3rd Qu.:41.00 3rd Qu.:0.7251946
## Max. :500.0 Max. :52.00 Max. :0.9967076
## CarsPerHH PopDens PctPopGrowth Dist2FracLoc_1
## Min. :0.001174 Min. :0.0000152 Min. :-6.9749 Min. : 0.000
## 1st Qu.:0.256101 1st Qu.:0.0600216 1st Qu.:-1.4347 1st Qu.: 2.938
## Median :0.502197 Median :0.1753587 Median :-0.2529 Median : 4.128
## Mean :0.497339 Mean :0.2427995 Mean :-0.4161 Mean : 4.626
## 3rd Qu.:0.749654 3rd Qu.:0.3783928 3rd Qu.: 0.6439 3rd Qu.: 6.410
## Max. :0.999106 Max. :0.9666323 Max. : 4.9987 Max. :10.000
## Dist2FracLoc_2 Dist2FracLoc_3 Dist2FracLoc_4 Dist2FracLoc_5
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.986 1st Qu.: 2.868 1st Qu.: 2.547 1st Qu.: 3.373
## Median : 4.690 Median : 4.476 Median : 3.938 Median : 5.260
## Mean : 4.835 Mean : 4.587 Mean : 4.566 Mean : 5.245
## 3rd Qu.: 6.770 3rd Qu.: 6.374 3rd Qu.: 6.627 3rd Qu.: 7.139
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## Dist2FracLoc_6 Dist2FracLoc_7 Dist2FracLoc_8 Dist2FracLoc_9
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.075 1st Qu.: 2.477 1st Qu.: 2.759 1st Qu.: 2.426
## Median : 4.846 Median : 4.010 Median : 4.278 Median : 4.217
## Mean : 4.760 Mean : 4.433 Mean : 4.608 Mean : 4.443
## 3rd Qu.: 6.249 3rd Qu.: 6.161 3rd Qu.: 6.123 3rd Qu.: 6.129
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## Dist2FracLoc_10 lon lat geometry
## Min. : 0.000 Min. :-99.633 Min. :-49.7417 POLYGON :500
## 1st Qu.: 3.043 1st Qu.:-46.424 1st Qu.:-27.0143 epsg:4326 : 0
## Median : 4.639 Median : 6.654 Median : -1.4512 +proj=long...: 0
## Mean : 4.771 Mean : 2.938 Mean : -0.7321
## 3rd Qu.: 6.124 3rd Qu.: 52.227 3rd Qu.: 25.4981
## Max. :10.000 Max. : 99.923 Max. : 49.2118
ID : “id”
Outcome (y) : *****
Risk Factors (x) : *****
Spatial Structure : *****
min_value <- min(county_leukemia_128$leuk_cases)
max_value <- max(county_leukemia_128$leuk_cases)
mean_value <- mean(county_leukemia_128$leuk_cases)
boxplot(county_leukemia_128$leuk_cases,
main = "Leukemia cases",
ylab = "Cases",
col = "lightblue",
ylim = c(20,60))
text(x = 1.2, y = min_value - 5, labels = paste("Min:", round(min_value, 2)), pos = 4, col = "red")
text(x = 1.2, y = max_value + 5, labels = paste("Max:", round(max_value, 2)), pos = 4, col = "red")
text(x = 1.2, y = mean_value, labels = paste("Mean:", round(mean_value, 2)), pos = 4, col = "blue")
points(x = 1, y = mean_value, col = "blue", pch = 18, cex = 1.5) # pch = 19 for a filled circle
head(fracking_locations_128)
## Simple feature collection with 6 features and 1 field
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -51.85614 ymin: -29.43917 xmax: 99.02833 ymax: 38.71371
## Geodetic CRS: WGS 84
## geometry FracLoc
## 1 POINT (-40.33708 -6.008472) 1
## 2 POINT (88.45395 25.42268) 2
## 3 POINT (78.33582 34.07758) 3
## 4 POINT (-51.85614 1.218802) 4
## 5 POINT (99.02833 -29.43917) 5
## 6 POINT (31.19807 38.71371) 6
summary(fracking_locations_128)
## geometry FracLoc
## POINT :10 Min. : 1.00
## epsg:4326 : 0 1st Qu.: 3.25
## +proj=long...: 0 Median : 5.50
## Mean : 5.50
## 3rd Qu.: 7.75
## Max. :10.00
ID : “FracLoc”
Spatial Structure : *****
county_leukemia_sf <- st_as_sf(county_leukemia_128, coords = c("lon", "lat"), crs = 4326)
fracking_locations_sf <- st_as_sf(fracking_locations_128, coords = c("longitude", "latitude"), crs = 4326)
plot(st_geometry(county_leukemia_sf), main = "World Maps")
# Extract coordinates for fracking locations
fracking_coords <- st_coordinates(fracking_locations_sf)
# Add the coordinates as columns to the fracking_locations object
fracking_locations_sf$lon <- fracking_coords[, 1]
fracking_locations_sf$lat <- fracking_coords[, 2]
ggplot() +
geom_sf(data = county_leukemia_sf, aes(fill = county_leukemia_sf$leuk_cases), color = "black") +
# Plot the geometry of fracking locations using 'geom_sf'
geom_sf(data = fracking_locations_sf, fill = "lightgreen", color = "red", alpha = 1, size = 2) +
# Plot the scatter points for the fracking sites +
geom_text(data = fracking_locations_sf, aes(x = lon, y = lat, label = as.character(FracLoc)),
color = "Green", size = 5, vjust = 0, hjust = 0) + # Adjust position for clarity
ggtitle(" Leukemia Density and Fracking Sites") +
# Use a gradient scale to fill with darker colors for higher 'leuk_cases'
scale_fill_gradient(low = "lightblue", high = "darkblue") + # Light color for lower values, dark for higher
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_blank(),
axis.text.y = element_blank())
sel_riskFactor_vars <- county_leukemia_sf[, c("leuk_cases", "GreenNeighborhoodIndex", "CarsPerHH", "PopDens", "PctPopGrowth") ]
sel_riskFactor_vars <- st_drop_geometry(sel_riskFactor_vars)
ggpairs(sel_riskFactor_vars, title = "Risk Factor") +
theme()
sel_distFraction_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", "Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", "Dist2FracLoc_9", "Dist2FracLoc_10") ]
sel_distFraction_vars <- st_drop_geometry(sel_distFraction_vars)
ggpairs(sel_distFraction_vars, title = "Distance to Fractions") + theme()
# ggsave("dist2Frac.png", plot = pic_sel_distFraction_vars, width = 20, height = 20, units = "in", dpi = 300)
sel_distFraction_1st_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4", "Dist2FracLoc_5") ]
sel_distFraction_2nd_vars <- county_leukemia_sf[, c("leuk_cases", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8", "Dist2FracLoc_9", "Dist2FracLoc_10") ]
sel_distFraction_1st_vars <- st_drop_geometry(sel_distFraction_1st_vars)
sel_distFraction_2nd_vars <- st_drop_geometry(sel_distFraction_2nd_vars)
ggpairs(sel_distFraction_1st_vars, title = "Distance to Fractions of 1st half Variables")
ggpairs(sel_distFraction_2nd_vars, title = "Distance to Fractions of 2nd half Variables")
lm_risk_01 <- lm(leuk_cases ~ PctPopGrowth, county_leukemia_sf)
lm_risk_02 <- lm(leuk_cases ~ AverageAge, county_leukemia_sf)
lm_risk_12 <- lm(leuk_cases ~ PctPopGrowth+ AverageAge+ GreenNeighborhoodIndex+ CarsPerHH+ PopDens, county_leukemia_sf)
lm_risk_13 <-lm(leuk_cases ~ PctPopGrowth+ GreenNeighborhoodIndex, county_leukemia_sf)
lm_risk_14 <-lm(leuk_cases ~ PctPopGrowth+ PopDens, county_leukemia_sf)
screenreg(
list( lm_risk_01, lm_risk_02, lm_risk_12, lm_risk_13, lm_risk_14),
digits = 2
)
##
## ==================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5
## ----------------------------------------------------------------------------------
## (Intercept) 37.05 *** 36.47 *** 36.83 *** 36.88 *** 37.02 ***
## (0.27) (0.44) (0.76) (0.52) (0.40)
## PctPopGrowth 0.89 *** 0.88 *** 0.89 *** 0.89 ***
## (0.15) (0.15) (0.15) (0.15)
## AverageAge15-50 0.60 0.56
## (0.64) (0.63)
## AverageAge50 and older 0.05 -0.07
## (0.67) (0.65)
## GreenNeighborhoodIndex 0.39 0.37
## (0.92) (0.91)
## CarsPerHH -0.63
## (1.23)
## PopDens 0.75 0.14
## (1.62) (1.22)
## ----------------------------------------------------------------------------------
## R^2 0.07 0.00 0.07 0.07 0.07
## Adj. R^2 0.07 -0.00 0.06 0.07 0.07
## Num. obs. 500 500 500 500 500
## ==================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
lm_dist_a01 <- lm(leuk_cases ~ Dist2FracLoc_1 , county_leukemia_sf)
lm_dist_a02 <- lm(leuk_cases ~ Dist2FracLoc_2 , county_leukemia_sf)
lm_dist_a03 <- lm(leuk_cases ~ Dist2FracLoc_3 , county_leukemia_sf)
lm_dist_a04 <- lm(leuk_cases ~ Dist2FracLoc_4 , county_leukemia_sf)
lm_dist_a05 <- lm(leuk_cases ~ Dist2FracLoc_5 , county_leukemia_sf)
lm_dist_a06 <- lm(leuk_cases ~ Dist2FracLoc_6 , county_leukemia_sf)
lm_dist_a07 <- lm(leuk_cases ~ Dist2FracLoc_7 , county_leukemia_sf)
lm_dist_a08 <- lm(leuk_cases ~ Dist2FracLoc_8 , county_leukemia_sf)
lm_dist_a09 <- lm(leuk_cases ~ Dist2FracLoc_9 , county_leukemia_sf)
lm_dist_a10 <- lm(leuk_cases ~ Dist2FracLoc_10 , county_leukemia_sf)
screenreg(
list(lm_dist_a01, lm_dist_a02, lm_dist_a03, lm_dist_a04, lm_dist_a05),
digits = 2
)
##
## ==========================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5
## --------------------------------------------------------------------------
## (Intercept) 40.13 *** 28.67 *** 28.89 *** 41.53 *** 29.03 ***
## (0.55) (0.45) (0.44) (0.49) (0.50)
## Dist2FracLoc_1 -0.74 ***
## (0.10)
## Dist2FracLoc_2 1.66 ***
## (0.08)
## Dist2FracLoc_3 1.70 ***
## (0.09)
## Dist2FracLoc_4 -1.06 ***
## (0.09)
## Dist2FracLoc_5 1.46 ***
## (0.09)
## --------------------------------------------------------------------------
## R^2 0.09 0.45 0.44 0.20 0.37
## Adj. R^2 0.09 0.44 0.44 0.20 0.37
## Num. obs. 500 500 500 500 500
## ==========================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg(
list(lm_dist_a06, lm_dist_a07, lm_dist_a08, lm_dist_a09, lm_dist_a10),
digits = 2
)
##
## ===========================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5
## ---------------------------------------------------------------------------
## (Intercept) 28.19 *** 27.03 *** 25.76 *** 27.99 *** 25.27 ***
## (0.49) (0.28) (0.22) (0.35) (0.25)
## Dist2FracLoc_6 1.78 ***
## (0.09)
## Dist2FracLoc_7 2.18 ***
## (0.05)
## Dist2FracLoc_8 2.37 ***
## (0.04)
## Dist2FracLoc_9 1.96 ***
## (0.07)
## Dist2FracLoc_10 2.39 ***
## (0.05)
## ---------------------------------------------------------------------------
## R^2 0.42 0.76 0.86 0.62 0.83
## Adj. R^2 0.42 0.76 0.86 0.62 0.83
## Num. obs. 500 500 500 500 500
## ===========================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
From location wise, let’s not use close by location to prevent Multicollinearity, and also keep model simple.
Pair 1,4 : lm_dist_a04 R^2 > lm_dist_a01 : Keep 4
Pair 2,3 : lm_dist_a02 R^2 > lm_dist_a03 : Keep 2
Pair 7,9 : lm_dist_a07 R^2 > lm_dist_a09 : Keep 7
Pair 8,10 : lm_dist_a08 R^2 > lm_dist_a10 : Keep 8 (R ^2 = 0.86)
Keep 5, and 6 normally.
lm_dist_b01 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8 , county_leukemia_sf)
lm_dist_b02 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_7 + Dist2FracLoc_8 , county_leukemia_sf) # not significant eliminate 5,6
lm_dist_b03 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 + Dist2FracLoc_7 , county_leukemia_sf) # 8 only, yield high data, try eliminate 8 to see the impact.
lm_dist_b04 <- lm(leuk_cases ~ Dist2FracLoc_2 + Dist2FracLoc_4 , county_leukemia_sf) #close to 8 eliminate 7
lm_dist_b05 <- lm(leuk_cases ~ Dist2FracLoc_7 + Dist2FracLoc_8 , county_leukemia_sf) #7 & 8 give high impact, let's only focus on that
screenreg(
list(lm_dist_b01, lm_dist_b02, lm_dist_a08, lm_dist_b03, lm_dist_b04, lm_dist_b05 ),
digits = 2
)
##
## ======================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## --------------------------------------------------------------------------------------
## (Intercept) 19.30 *** 18.20 *** 25.76 *** 13.87 *** 20.33 *** 25.76 ***
## (1.88) (1.27) (0.22) (0.69) (1.33) (0.22)
## Dist2FracLoc_2 1.07 *** 1.01 *** 1.53 *** 2.50 ***
## (0.21) (0.15) (0.08) (0.15)
## Dist2FracLoc_4 0.77 *** 0.85 *** 1.37 *** 0.94 ***
## (0.20) (0.15) (0.07) (0.14)
## Dist2FracLoc_5 -0.19
## (0.22)
## Dist2FracLoc_6 -0.12
## (0.20)
## Dist2FracLoc_7 1.24 ** 1.18 *** 2.07 *** 0.08
## (0.40) (0.23) (0.05) (0.12)
## Dist2FracLoc_8 1.05 ** 0.98 *** 2.37 *** 2.29 ***
## (0.38) (0.24) (0.04) (0.12)
## --------------------------------------------------------------------------------------
## R^2 0.87 0.87 0.86 0.87 0.49 0.86
## Adj. R^2 0.87 0.87 0.86 0.87 0.49 0.86
## Num. obs. 500 500 500 500 500 500
## ======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05Let’s only use just 6 variables that we keep. –> Model 1
Model 1 –> not significant eliminate 5,6 –> Model 2
Model 2 –> compare with DistFracLoc_8 only (lm_dist_a08 == Model 3)
Model 2 + 3 –> Assume DistFracLoc_8 is most significant, what if I eliminate DistFracLoc_8 –> Model 4
Model 4 –> Get same result, but FracLoc_7 and FracLoc_8 are very close, how about we eliminate DistFracLoc_7 as well –> Model 5
Model 5 –> Yield bad result, can we try will only DistFracLoc_7 , and DistFracLoc_8 –> Model 6
Too conclude this, 8 is only parameter we need (Model3).
lm_lon <- lm(leuk_cases ~ lon, county_leukemia_sf)
lm_lat <- lm(leuk_cases ~ lat, county_leukemia_sf)
lm_lon_lat <- lm(leuk_cases ~ lon + lat, county_leukemia_sf)
screenreg(
list( lm_lon, lm_lat, lm_lon_lat),
digits = 2
)
##
## ===============================================
## Model 1 Model 2 Model 3
## -----------------------------------------------
## (Intercept) 36.89 *** 36.69 *** 36.89 ***
## (0.20) (0.27) (0.20)
## lon -0.07 *** -0.07 ***
## (0.00) (0.00)
## lat 0.01 0.00
## (0.01) (0.01)
## -----------------------------------------------
## R^2 0.47 0.00 0.47
## Adj. R^2 0.47 0.00 0.47
## Num. obs. 500 500 500
## ===============================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
lm_sum_01 <- lm(leuk_cases ~PctPopGrowth + Dist2FracLoc_8 + lon, county_leukemia_sf)
lm_sum_02 <- lm(leuk_cases ~ Dist2FracLoc_8 + lon, county_leukemia_sf)
lm_sum_03 <- lm(leuk_cases ~ PctPopGrowth + lon, county_leukemia_sf)
lm_sum_04 <- lm(leuk_cases ~ PctPopGrowth+ Dist2FracLoc_8 , county_leukemia_sf)
## dist 8 is very important, even there is no lon it is ok.
screenreg(
list(lm_sum_01, lm_sum_02, lm_sum_03, lm_sum_04 ),
digits = 2
)
##
## ==============================================================
## Model 1 Model 2 Model 3 Model 4
## --------------------------------------------------------------
## (Intercept) 26.91 *** 26.38 *** 37.24 *** 26.05 ***
## (0.30) (0.29) (0.19) (0.22)
## PctPopGrowth 0.30 *** 0.85 *** 0.26 ***
## (0.06) (0.10) (0.06)
## Dist2FracLoc_8 2.15 *** 2.24 *** 2.33 ***
## (0.06) (0.06) (0.04)
## lon -0.01 *** -0.01 ** -0.07 ***
## (0.00) (0.00) (0.00)
## --------------------------------------------------------------
## R^2 0.87 0.87 0.54 0.87
## Adj. R^2 0.87 0.86 0.54 0.87
## Num. obs. 500 500 500 500
## ==============================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Start with trying 3 Statistic Significant Variables. (Model 1)
Then let’s see what happened if each of these variable is missing (Model 2,3,4)
From Model 3, we can see that Dist2FracLoc_8 is dominating the R^2 result.
screenreg(
list(lm_sum_01, lm_sum_02, lm_sum_04, lm_risk_01, lm_dist_a08 ,lm_lon),
digits = 2
)
##
## ======================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5 Model 6
## --------------------------------------------------------------------------------------
## (Intercept) 26.91 *** 26.38 *** 26.05 *** 37.05 *** 25.76 *** 36.89 ***
## (0.30) (0.29) (0.22) (0.27) (0.22) (0.20)
## PctPopGrowth 0.30 *** 0.26 *** 0.89 ***
## (0.06) (0.06) (0.15)
## Dist2FracLoc_8 2.15 *** 2.24 *** 2.33 *** 2.37 ***
## (0.06) (0.06) (0.04) (0.04)
## lon -0.01 *** -0.01 ** -0.07 ***
## (0.00) (0.00) (0.00)
## --------------------------------------------------------------------------------------
## R^2 0.87 0.87 0.87 0.07 0.86 0.47
## Adj. R^2 0.87 0.86 0.87 0.07 0.86 0.47
## Num. obs. 500 500 500 500 500 500
## ======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Eliminate lm_sum_03 (Previous Model 3), because we only Focus model with Dist2FracLoc_8.
Compare them with previous good model.
For balance between R^2 and variables, let keep thes two. lm_sum_04 (Model 3), lm_dist_a08 (Model5)
# Create a data frame with fitted values and residuals
plot_lm_sum_04_data <- data.frame(
FittedValues = lm_sum_04$fitted.values,
Residuals = resid(lm_sum_04)
)
# Create the ggplot
ggplot(plot_lm_sum_04_data, aes(x = FittedValues, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(
x = "Fitted Values",
y = "Residuals",
title = "Residuals vs Fitted Values of lm_sum_04"
) +
theme_minimal()
bptest(lm_sum_04)
##
## studentized Breusch-Pagan test
##
## data: lm_sum_04
## BP = 4.3835, df = 2, p-value = 0.1117
# Create a data frame with fitted values and residuals
plot_lm_dist_a08_data <- data.frame(
FittedValues = lm_dist_a08$fitted.values,
Residuals = resid(lm_dist_a08)
)
# Create the ggplot
ggplot(plot_lm_dist_a08_data, aes(x = FittedValues, y = Residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") +
labs(
x = "Fitted Values",
y = "Residuals",
title = "Residuals vs Fitted Values of lm_dist_a08"
) +
theme_minimal()
bptest(lm_dist_a08)
##
## studentized Breusch-Pagan test
##
## data: lm_dist_a08
## BP = 0.074982, df = 1, p-value = 0.7842
For both models have high p-value, so we couldn’t conclude the no heteroskedasticity, assume both models linear. (homoscedasticity)
Too conclude for simplicity use only lm_dist_a08.
#Pair Test
gam_pair_pct_linear <- gam(leuk_cases ~ PctPopGrowth , data = county_leukemia_sf)
gam_pair_pct_smooth <- gam(leuk_cases ~ s(PctPopGrowth), data = county_leukemia_sf)
AIC(gam_pair_pct_linear)
## [1] 3185.279
AIC(gam_pair_pct_smooth)
## [1] 3184.964
summary(gam_pair_pct_linear)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ PctPopGrowth
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.0532 0.2675 138.516 < 2e-16 ***
## PctPopGrowth 0.8873 0.1456 6.094 2.21e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.0675 Deviance explained = 6.94%
## GCV = 34.08 Scale est. = 33.943 n = 500
summary(gam_pair_pct_smooth)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(PctPopGrowth)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.2596 141.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(PctPopGrowth) 4.415 5.526 7.673 8.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0744 Deviance explained = 8.26%
## GCV = 34.062 Scale est. = 33.693 n = 500
| Topics | P-Value | AIC | R-sq (adj.) | GCV & Scale. Est |
|---|---|---|---|---|
| Linear ( Pop Growth ) | ’ *** ’ | 3185.3 | 0.068 | 34.1 , 34.0 |
| Smooth (Pop Growth ) | ’ *** ’ | 3185.0 | 0.074 | 34.1 , 33.7 |
| Difference | | 0.3 | | | 0.006 | | | 0.0 | , | 0.3 | |
For simplicity prefer linear. Use linear of PctPopGrowth (gam_pair_pct_linear)
Let’s remain with our two main king’s variables.
gam_pair_ds8_linear <- gam(leuk_cases ~ Dist2FracLoc_8 , data = county_leukemia_sf)
gam_pair_ds8_smooth <- gam(leuk_cases ~ s(Dist2FracLoc_8), data = county_leukemia_sf)
AIC(gam_pair_ds8_linear)
## [1] 2229.561
AIC(gam_pair_ds8_smooth)
## [1] 2187.061
summary(gam_pair_ds8_linear)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ Dist2FracLoc_8
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.75797 0.21975 117.22 <2e-16 ***
## Dist2FracLoc_8 2.37102 0.04244 55.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.862 Deviance explained = 86.2%
## GCV = 5.0393 Scale est. = 5.0191 n = 500
summary(gam_pair_ds8_smooth)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(Dist2FracLoc_8)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.68400 0.09531 384.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dist2FracLoc_8) 8.567 8.949 391.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.875 Deviance explained = 87.7%
## GCV = 4.6303 Scale est. = 4.5417 n = 500
| Topics | P-Value | AIC | R-sq (adj.) | GCV & Scale. Est |
|---|---|---|---|---|
| Linear ( Dist 8 ) | ’ *** ’ | 2229 | 0.862 | 5.0 , 5.0 |
| Smooth ( Dist 8 ) | ’ *** ’ | 2187 | 0.875 | 4.6 , 4.5 |
| Difference | | 42 | | | 0.013 | | | 0.4 | , | 0.5 | |
Non much difference for simplicity use Linear of Dist2FracLoc_8 (gam_pair_ds8_linear.)
Last one with lon
gam_pair_lon_linear <- gam(leuk_cases ~ lon , data = county_leukemia_sf)
gam_pair_lon_smooth <- gam(leuk_cases ~ s(lon), data = county_leukemia_sf)
AIC(gam_pair_lon_linear)
## [1] 2900.649
AIC(gam_pair_lon_smooth)
## [1] 2312.733
summary(gam_pair_lon_linear)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ lon
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.892664 0.196258 187.98 <2e-16 ***
## lon -0.071028 0.003357 -21.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## R-sq.(adj) = 0.472 Deviance explained = 47.3%
## GCV = 19.287 Scale est. = 19.21 n = 500
summary(gam_pair_lon_smooth)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(lon)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.1083 338.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lon) 6.397 7.54 344.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.839 Deviance explained = 84.1%
## GCV = 5.9525 Scale est. = 5.8644 n = 500
| Topics | P-Value | AIC | R-sq (adj.) | GCV & Scale. Est |
|---|---|---|---|---|
| Linear ( lon ) | ’ *** ’ | 2900 | 0.472 | 19.3 , 19.2 |
| Smooth ( lon ) | ’ *** ’ | 2312 | 0.839 | 6.0 , 5.9 |
| Difference | | 588 | | | 0.367 | | | 13.3 | , | 13.3 | |
With drastic improvement using smooth curve of lon (gam_pair_lon_smooth)
Assume will using variable as PctPopGrowth + Dist2FracLoc_8 + s(lon)
gam_sing_01 <- gam(leuk_cases ~ s(GreenNeighborhoodIndex), data = county_leukemia_sf) #no
gam_sing_02 <- gam(leuk_cases ~ s(CarsPerHH), data = county_leukemia_sf) #no
gam_sing_03 <- gam(leuk_cases ~ s(PopDens), data = county_leukemia_sf) #no
gam_sing_04 <- gam(leuk_cases ~ s(lat), data = county_leukemia_sf) # -OK-
AIC(gam_sing_01)
## [1] 3221.156
AIC(gam_sing_02)
## [1] 3218.954
AIC(gam_sing_03)
## [1] 3221.231
AIC(gam_sing_04)
## [1] 3211.888
summary(gam_sing_01)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(GreenNeighborhoodIndex)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.2701 135.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(GreenNeighborhoodIndex) 1 1 0.078 0.78
##
## R-sq.(adj) = -0.00185 Deviance explained = 0.0157%
## GCV = 36.615 Scale est. = 36.468 n = 500
summary(gam_sing_02)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(CarsPerHH)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.2681 136.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(CarsPerHH) 6.09 7.258 1.311 0.283
##
## R-sq.(adj) = 0.0126 Deviance explained = 2.46%
## GCV = 36.461 Scale est. = 35.944 n = 500
summary(gam_sing_03)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(PopDens)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.2701 135.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(PopDens) 1 1 0.004 0.948
##
## R-sq.(adj) = -0.002 Deviance explained = 0.000858%
## GCV = 36.62 Scale est. = 36.474 n = 500
summary(gam_sing_04)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ s(lat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.6840 0.2672 137.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lat) 2.283 2.838 3.217 0.0192 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0191 Deviance explained = 2.35%
## GCV = 35.943 Scale est. = 35.707 n = 500
| Topics | P-Value | AIC | R-sq (adj.) | GCV & Scale. Est |
|---|---|---|---|---|
| (GreenIndex) | ’ - ’ | 3221 | ’ - ’ | ’ - ’ |
| CarPerHH | ’ - ’ | 3219 | ’ - ’ | ’ - ’ |
| PopDens | ’ - ’ | 3221 | ’ - ’ | ’ - ’ |
| (Latitude) | ’ * ’ | 3212 | 0.019 | 35.9 , 35.7 |
latitude is only statistically significant here, let do some more experiment with it.
Result give similar result of focusing on PctPopGrowth, Dist2FracLoc8 , and lon.
weights_normal = abs(county_leukemia_sf$PctPopGrowth)
gam_foc_01 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + s(lon, lat),data= county_leukemia_sf)
gam_foc_02 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + s(lon),
data= county_leukemia_sf)
gam_foc_03 <- gam(leuk_cases ~ Dist2FracLoc_8 + s(lon, lat),
data= county_leukemia_sf)
gam_foc_04 <- gam(leuk_cases ~ PctPopGrowth + s(lon, lat),
data= county_leukemia_sf)
gam_foc_05 <- gam(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 ,
data= county_leukemia_sf)
AIC(gam_foc_01)
## [1] 2069.665
AIC(gam_foc_02)
## [1] 2185.837
AIC(gam_foc_03)
## [1] 2110.986
AIC(gam_foc_04)
## [1] 2080.057
AIC(gam_foc_05)
## [1] 2210.431
# summary(gam_foc_01)
# summary(gam_foc_02)
# summary(gam_foc_03)
summary(gam_foc_04)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## leuk_cases ~ PctPopGrowth + s(lon, lat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.87095 0.08925 413.100 < 2e-16 ***
## PctPopGrowth 0.44931 0.07204 6.237 9.91e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(lon,lat) 26.77 28.69 149.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.903 Deviance explained = 90.8%
## GCV = 3.7497 Scale est. = 3.534 n = 500
# summary(gam_foc_05)
# gam_foc_01 <- gam(leuk_cases ~ PctPopGrowth,
# county_leukemia_sf, weights = PctPopGrowth)
#
# gam_foc_02 <- gam(leuk_cases ~ PctPopGrowth+ Dist2FracLoc_8 + s(lon),
# data = county_leukemia_sf, weights = PctPopGrowth)
| Topics | PctPop | Dist2_8 | s(lon) | s(lat) | AIC | R-sq (adj.) |
|---|---|---|---|---|---|---|
| All variables | Y | Y | Y | Y | 2070 | 0.905 |
| Without s(lat) | Y | Y | Y | 2186 | 0.876 | |
| Without PctPopGrowth | Y | Y | Y | 2111 | 0.896 | |
| Without Dist2FracLoc_8 | Y | Y | Y | 2080 | 0.903 | |
| Without s(lon,lat) | Y | Y | 2210 | 0.868 |
gam.check(gam_foc_01)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 8 iterations.
## The RMS GCV score gradient at convergence was 1.888517e-06 .
## The Hessian was positive definite.
## Model rank = 32 / 32
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(lon,lat) 29.0 25.4 0.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam.check(gam_foc_04)
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 7 iterations.
## The RMS GCV score gradient at convergence was 1.245411e-05 .
## The Hessian was positive definite.
## Model rank = 31 / 31
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(lon,lat) 29.0 26.8 0.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(gam_foc_01)
plot(gam_foc_04, main = "GAM Models - Leukemia Prediction")
After checking we see good result, so to repeat We are choosing Model Without Dist2FracLoc_8 or gam_foc_04 or Leuk_cases (PctPopGrowth + s(lon,lat).
screenreg( list(lm_dist_a08, gam_foc_04), digits = 2,
custom.model.names = c("OLS Model", "GAM Model") )
##
## ============================================
## OLS Model GAM Model
## --------------------------------------------
## (Intercept) 25.76 *** 36.87 ***
## (0.22) (0.09)
## Dist2FracLoc_8 2.37 ***
## (0.04)
## PctPopGrowth 0.45 ***
## (0.07)
## EDF: s(lon,lat) 26.77 ***
## (28.69)
## --------------------------------------------
## R^2 0.86 0.90
## Adj. R^2 0.86
## Num. obs. 500 500
## AIC 2080.06
## BIC 2205.51
## Log Likelihood -1010.26
## Deviance 1665.32
## Deviance explained 0.91
## Dispersion 3.53
## GCV score 3.75
## Num. smooth terms 1
## ============================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
coords <- cbind(county_leukemia_sf$lon, county_leukemia_sf$lat)
dists <- dist(coords) # Compute pairwise distances
knn <- knn2nb(knearneigh(coords, k = 4)) # Create a neighbor list using k=4 nearest neighbors
sp_weight <- nb2listw(knn , style="W") # Create the spatial weights list
moran_test <- moran.test(county_leukemia_sf$leuk_cases, listw = sp_weight)
print(moran_test)
##
## Moran I test under randomisation
##
## data: county_leukemia_sf$leuk_cases
## weights: sp_weight
##
## Moran I statistic standard deviate = 31.243, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.9254981482 -0.0020040080 0.0008812889
Super high Moran I Statistic, mean strong positive spatial autocorrelation.
| Topics | P-value | Moran’s I | Expectation | Variance |
|---|---|---|---|---|
| leuk_cases(lon,lat) | ’ *** ’ | 0.93 | -0.002 | 0.0009 |
lm.morantest(lm_dist_a08, sp_weight)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = leuk_cases ~ Dist2FracLoc_8, data =
## county_leukemia_sf)
## weights: sp_weight
##
## Moran I statistic standard deviate = 16.062, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.4712159465 -0.0040010647 0.0008753169
| Topics | P-value | Moran’s I | Expectation | Variance |
|---|---|---|---|---|
| lm_dist_a08 (Only Dist2FracLoc_8) | ’ *** ’ | 0.47 | -0.004 | 0.0009 |
moran.test(gam_foc_04$residuals , sp_weight)
##
## Moran I test under randomisation
##
## data: gam_foc_04$residuals
## weights: sp_weight
##
## Moran I statistic standard deviate = 10.259, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.3024806076 -0.0020040080 0.0008808084
| Topics | P-value | Moran’s I | Expectation | Variance |
|---|---|---|---|---|
| gam_foc_04 (PctPopGrowth + s(lon,lat)) | ’ *** ’ | 0.30 | -0.002 | 0.0009 |
| Topics | P-value | Moran’s I | Expectation | Variance |
|---|---|---|---|---|
| lm_dist_a08 (Only Dist2FracLoc_8) | ’ *** ’ | 0.47 | -0.004 | 0.0009 |
| gam_foc_04 (PctPopGrowth + s(lon,lat)) | ’ *** ’ | 0.30 | -0.002 | 0.0009 |
There still some spatial cluster in my residual of this model.
To address this we have to apply spatial error model to account for spatially correlated error terms
sperr_lm_dist_a08 = errorsarlm(leuk_cases ~ Dist2FracLoc_8,
data= county_leukemia_sf, listw = sp_weight)
summary(sperr_lm_dist_a08)
##
## Call:
## errorsarlm(formula = leuk_cases ~ Dist2FracLoc_8, data = county_leukemia_sf,
## listw = sp_weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.86902 -1.29451 0.10411 1.26150 5.13015
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 25.918449 0.439734 58.941 < 2.2e-16
## Dist2FracLoc_8 2.346697 0.084292 27.840 < 2.2e-16
##
## Lambda: 0.60654, LR test value: 177.9, p-value: < 2.22e-16
## Asymptotic standard error: 0.036973
## z-value: 16.405, p-value: < 2.22e-16
## Wald statistic: 269.12, p-value: < 2.22e-16
##
## Log likelihood: -1022.832 for error model
## ML residual variance (sigma squared): 3.1715, (sigma: 1.7809)
## Number of observations: 500
## Number of parameters estimated: 4
## AIC: 2053.7, (AIC for lm: 2229.6)
sperr_gam_foc_04 = errorsarlm(leuk_cases ~ PctPopGrowth,
data= county_leukemia_sf, listw = sp_weight)
summary(sperr_gam_foc_04)
##
## Call:
## errorsarlm(formula = leuk_cases ~ PctPopGrowth, data = county_leukemia_sf,
## listw = sp_weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.91820 -1.20355 0.01942 1.16151 5.13078
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 37.716005 0.856533 44.0334 < 2.2e-16
## PctPopGrowth 0.552459 0.062766 8.8019 < 2.2e-16
##
## Lambda: 0.91069, LR test value: 1049.7, p-value: < 2.22e-16
## Asymptotic standard error: 0.010938
## z-value: 83.258, p-value: < 2.22e-16
## Wald statistic: 6931.9, p-value: < 2.22e-16
##
## Log likelihood: -1064.78 for error model
## ML residual variance (sigma squared): 2.9195, (sigma: 1.7087)
## Number of observations: 500
## Number of parameters estimated: 4
## AIC: 2137.6, (AIC for lm: 3185.3)
screenreg( list(sperr_lm_dist_a08, sperr_gam_foc_04), digits = 2 , custom.model.names = c("sperr_lm_dist_a08", "sperr_gam_foc_04"))
##
## ========================================================
## sperr_lm_dist_a08 sperr_gam_foc_04
## --------------------------------------------------------
## (Intercept) 25.92 *** 37.72 ***
## (0.44) (0.86)
## Dist2FracLoc_8 2.35 ***
## (0.08)
## lambda 0.61 *** 0.91 ***
## (0.04) (0.01)
## PctPopGrowth 0.55 ***
## (0.06)
## --------------------------------------------------------
## Num. obs. 500 500
## Parameters 4 4
## Log Likelihood -1022.83 -1064.78
## AIC (Linear model) 2229.56 3185.28
## AIC (Spatial model) 2053.66 2137.56
## LR test: statistic 177.90 1049.72
## LR test: p-value 0.00 0.00
## ========================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
sperr_risk_1 <- errorsarlm(leuk_cases ~ PctPopGrowth,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_2 <- errorsarlm(leuk_cases ~ Dist2FracLoc_8,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_3 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_4 <- errorsarlm(leuk_cases ~ PctPopGrowth + GreenNeighborhoodIndex,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_5 <- errorsarlm(leuk_cases ~ PctPopGrowth + CarsPerHH,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_6 <- errorsarlm(leuk_cases ~ PctPopGrowth + PopDens,
data= county_leukemia_sf, listw = sp_weight)
sperr_risk_7 <- errorsarlm(leuk_cases ~ PctPopGrowth + AverageAge,
data= county_leukemia_sf, listw = sp_weight)
screenreg( list(sperr_risk_1, sperr_risk_2, sperr_risk_3, sperr_risk_4), digits = 3 , custom.model.names = c("sperr_risk_1", "sperr_risk_2", "sperr_risk_3", "sperr_risk_4"))
##
## =================================================================================
## sperr_risk_1 sperr_risk_2 sperr_risk_3 sperr_risk_4
## ---------------------------------------------------------------------------------
## (Intercept) 37.716 *** 25.918 *** 26.481 *** 38.359 ***
## (0.857) (0.440) (0.476) (0.874)
## PctPopGrowth 0.552 *** 0.491 *** 0.541 ***
## (0.063) (0.059) (0.061)
## lambda 0.911 *** 0.607 *** 0.661 *** 0.915 ***
## (0.011) (0.037) (0.033) (0.010)
## Dist2FracLoc_8 2.347 *** 2.277 ***
## (0.084) (0.091)
## GreenNeighborhoodIndex -1.343 ***
## (0.232)
## ---------------------------------------------------------------------------------
## Num. obs. 500 500 500 500
## Parameters 4 4 5 5
## Log Likelihood -1064.780 -1022.832 -991.545 -1048.702
## AIC (Linear model) 3185.279 2229.561 2210.431 3187.117
## AIC (Spatial model) 2137.559 2053.664 1993.091 2107.405
## LR test: statistic 1049.720 177.897 219.340 1081.712
## LR test: p-value 0.000 0.000 0.000 0.000
## =================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(sperr_risk_5, sperr_risk_6, sperr_risk_7), digits = 3 , custom.model.names = c("sperr_risk_5", "sperr_risk_6", "sperr_risk_7"))
##
## ===================================================================
## sperr_risk_5 sperr_risk_6 sperr_risk_7
## -------------------------------------------------------------------
## (Intercept) 37.455 *** 37.645 *** 37.186 ***
## (0.867) (0.861) (0.873)
## PctPopGrowth 0.559 *** 0.553 *** 0.534 ***
## (0.062) (0.063) (0.061)
## CarsPerHH 0.587 *
## (0.255)
## lambda 0.911 *** 0.911 *** 0.915 ***
## (0.011) (0.011) (0.011)
## PopDens 0.378
## (0.335)
## AverageAge15-50 0.693 ***
## (0.163)
## AverageAge50 and older 0.887 ***
## (0.167)
## -------------------------------------------------------------------
## Num. obs. 500 500 500
## Parameters 5 5 6
## Log Likelihood -1062.140 -1064.144 -1049.260
## AIC (Linear model) 3187.219 3187.267 3188.236
## AIC (Spatial model) 2134.280 2138.289 2110.520
## LR test: statistic 1054.939 1050.978 1079.716
## LR test: p-value 0.000 0.000 0.000
## ===================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Green index show negative outcome.
PctPopGrowth, Dist2FracLoc_8, AverageAge, CarPerHH is statistically significant
This also show how importance Dist2FracLoc_8 is
Keep this sperr_risk_3 in mind
sperr_foc_1 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge + PopDens + CarsPerHH,
data= county_leukemia_sf, listw = sp_weight)
sperr_foc_2 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge ,
data= county_leukemia_sf, listw = sp_weight)
sperr_foc_3 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + lon + lat ,
data= county_leukemia_sf, listw = sp_weight)
sperr_foc_4 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + AverageAge + CarsPerHH,
data= county_leukemia_sf, listw = sp_weight)
sperr_foc_5 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + GreenNeighborhoodIndex + CarsPerHH,
data= county_leukemia_sf, listw = sp_weight)
screenreg( list(sperr_foc_1, sperr_foc_2, sperr_foc_3, sperr_foc_4, sperr_foc_5), digits = 2, custom.model.names = c("sperr_foc_1", "sperr_foc_2", "sperr_foc_3", "sperr_foc_4", "sperr_foc_5") )
##
## =======================================================================================
## sperr_foc_1 sperr_foc_2 sperr_foc_3 sperr_foc_4 sperr_foc_5
## ---------------------------------------------------------------------------------------
## (Intercept) 26.13 *** 26.46 *** 27.87 *** 26.13 *** 26.78 ***
## (0.51) (0.49) (0.63) (0.51) (0.52)
## PctPopGrowth 0.48 *** 0.47 *** 0.49 *** 0.48 *** 0.49 ***
## (0.05) (0.06) (0.06) (0.05) (0.06)
## Dist2FracLoc_8 2.30 *** 2.29 *** 2.11 *** 2.30 *** 2.28 ***
## (0.09) (0.09) (0.12) (0.09) (0.09)
## GreenNeighborhoodIndex -1.30 *** -1.31 *** -1.21 *** -1.28 *** -1.17 ***
## (0.22) (0.23) (0.24) (0.22) (0.23)
## AverageAge15-50 0.76 *** 0.77 *** 0.76 ***
## (0.15) (0.16) (0.16)
## AverageAge50 and older 1.07 *** 1.03 *** 1.05 ***
## (0.16) (0.16) (0.16)
## PopDens -0.61
## (0.40)
## CarsPerHH 0.90 ** 0.61 * 0.56 *
## (0.30) (0.24) (0.25)
## lambda 0.69 *** 0.68 *** 0.66 *** 0.69 *** 0.68 ***
## (0.03) (0.03) (0.03) (0.03) (0.03)
## lon -0.01 *
## (0.00)
## lat -0.01
## (0.01)
## ---------------------------------------------------------------------------------------
## Num. obs. 500 500 500 500 500
## Parameters 10 8 8 9 7
## Log Likelihood -953.01 -957.44 -976.64 -954.16 -976.65
## AIC (Linear model) 2168.16 2167.30 2183.11 2168.49 2204.85
## AIC (Spatial model) 1926.03 1930.88 1969.28 1926.33 1967.29
## LR test: statistic 244.14 238.42 215.83 244.16 239.55
## LR test: p-value 0.00 0.00 0.00 0.00 0.00
## =======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
sperr_dist_01 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_1 + Dist2FracLoc_2 + Dist2FracLoc_3 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,
data= county_leukemia_sf, listw = sp_weight)
sperr_dist_02 <- errorsarlm(leuk_cases ~ Dist2FracLoc_1 + Dist2FracLoc_2 + Dist2FracLoc_3 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6 + Dist2FracLoc_7 + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,
data= county_leukemia_sf, listw = sp_weight)
sperr_dist_03 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_9 + Dist2FracLoc_10,
data= county_leukemia_sf, listw = sp_weight)
sperr_dist_04 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_1 + Dist2FracLoc_2 + Dist2FracLoc_3,
data= county_leukemia_sf, listw = sp_weight)
sperr_dist_05 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_4 + Dist2FracLoc_5 + Dist2FracLoc_6,
data= county_leukemia_sf, listw = sp_weight)
sperr_dist_06 <- errorsarlm(leuk_cases ~ PctPopGrowth + Dist2FracLoc_8 + Dist2FracLoc_7,
data= county_leukemia_sf, listw = sp_weight)
screenreg( list(sperr_dist_01, sperr_dist_02, sperr_dist_03), digits = 3 , custom.model.names = c("sperr_dist_01", "sperr_dist_02", "sperr_dist_03"))
##
## ================================================================
## sperr_dist_01 sperr_dist_02 sperr_dist_03
## ----------------------------------------------------------------
## (Intercept) 19.948 *** 21.683 *** 26.827 ***
## (4.502) (4.388) (0.567)
## PctPopGrowth 0.495 *** 0.498 ***
## (0.060) (0.059)
## Dist2FracLoc_1 -0.158 0.459
## (0.942) (0.920)
## Dist2FracLoc_2 0.172 0.767
## (1.271) (1.255)
## Dist2FracLoc_3 1.315 0.315
## (1.340) (1.320)
## Dist2FracLoc_4 0.970 0.010
## (1.175) (1.149)
## Dist2FracLoc_5 -0.344 -0.838
## (0.583) (0.565)
## Dist2FracLoc_6 -0.696 0.061
## (0.474) (0.446)
## Dist2FracLoc_7 -1.858 -1.635
## (1.875) (1.893)
## Dist2FracLoc_8 3.237 1.245 3.266 ***
## (2.306) (2.317) (0.865)
## Dist2FracLoc_9 1.983 2.685 * -0.075
## (1.359) (1.354) (0.215)
## Dist2FracLoc_10 -0.823 0.295 -0.958
## (1.682) (1.684) (0.752)
## lambda 0.617 *** 0.563 *** 0.652 ***
## (0.036) (0.040) (0.034)
## ----------------------------------------------------------------
## Num. obs. 500 500 500
## Parameters 14 13 7
## Log Likelihood -983.554 -1014.207 -990.540
## AIC (Linear model) 2161.776 2189.429 2202.323
## AIC (Spatial model) 1995.108 2054.414 1995.080
## LR test: statistic 168.668 137.015 209.242
## LR test: p-value 0.000 0.000 0.000
## ================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(sperr_dist_04, sperr_dist_05, sperr_dist_06), digits = 3 , custom.model.names = c("sperr_dist_04", "sperr_dist_05", "sperr_dist_06"))
##
## ================================================================
## sperr_dist_04 sperr_dist_05 sperr_dist_06
## ----------------------------------------------------------------
## (Intercept) 22.808 *** 24.970 *** 26.475 ***
## (2.100) (3.212) (0.474)
## PctPopGrowth 0.484 *** 0.489 *** 0.492 ***
## (0.059) (0.059) (0.059)
## Dist2FracLoc_8 1.958 *** 2.031 *** 2.115 ***
## (0.156) (0.285) (0.252)
## Dist2FracLoc_1 0.405
## (0.254)
## Dist2FracLoc_2 1.438
## (0.944)
## Dist2FracLoc_3 -0.805
## (0.742)
## lambda 0.632 *** 0.656 *** 0.660 ***
## (0.035) (0.033) (0.033)
## Dist2FracLoc_4 0.087
## (0.291)
## Dist2FracLoc_5 0.245
## (0.390)
## Dist2FracLoc_6 0.202
## (0.241)
## Dist2FracLoc_7 0.169
## (0.246)
## ----------------------------------------------------------------
## Num. obs. 500 500 500
## Parameters 8 8 6
## Log Likelihood -988.620 -990.750 -991.311
## AIC (Linear model) 2172.959 2207.437 2210.784
## AIC (Spatial model) 1993.239 1997.501 1994.622
## LR test: statistic 181.720 211.936 218.162
## LR test: p-value 0.000 0.000 0.000
## ================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(sperr_lm_dist_a08, sperr_gam_foc_04, sperr_risk_3, sperr_foc_2), digits = 3 , custom.model.names = c("sperr_lm_dist_a08", "sperr_gam_foc_04", "sperr_risk_3", "sperr_foc_2"))
##
## =======================================================================================
## sperr_lm_dist_a08 sperr_gam_foc_04 sperr_risk_3 sperr_foc_2
## ---------------------------------------------------------------------------------------
## (Intercept) 25.918 *** 37.716 *** 26.481 *** 26.455 ***
## (0.440) (0.857) (0.476) (0.494)
## Dist2FracLoc_8 2.347 *** 2.277 *** 2.293 ***
## (0.084) (0.091) (0.090)
## lambda 0.607 *** 0.911 *** 0.661 *** 0.684 ***
## (0.037) (0.011) (0.033) (0.031)
## PctPopGrowth 0.552 *** 0.491 *** 0.468 ***
## (0.063) (0.059) (0.055)
## GreenNeighborhoodIndex -1.307 ***
## (0.226)
## AverageAge15-50 0.766 ***
## (0.156)
## AverageAge50 and older 1.034 ***
## (0.162)
## ---------------------------------------------------------------------------------------
## Num. obs. 500 500 500 500
## Parameters 4 4 5 8
## Log Likelihood -1022.832 -1064.780 -991.545 -957.440
## AIC (Linear model) 2229.561 3185.279 2210.431 2167.301
## AIC (Spatial model) 2053.664 2137.559 1993.091 1930.881
## LR test: statistic 177.897 1049.720 219.340 238.421
## LR test: p-value 0.000 0.000 0.000 0.000
## =======================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
screenreg( list(lm_dist_a08, gam_foc_04, sperr_risk_3), digits = 3 , custom.model.names = c("lm_dist_a08", "gam_foc_04", "sperr_risk_3"))
##
## =============================================================
## lm_dist_a08 gam_foc_04 sperr_risk_3
## -------------------------------------------------------------
## (Intercept) 25.758 *** 36.871 *** 26.481 ***
## (0.220) (0.089) (0.476)
## Dist2FracLoc_8 2.371 *** 2.277 ***
## (0.042) (0.091)
## PctPopGrowth 0.449 *** 0.491 ***
## (0.072) (0.059)
## EDF: s(lon,lat) 26.767 ***
## (28.690)
## lambda 0.661 ***
## (0.033)
## -------------------------------------------------------------
## R^2 0.862 0.903
## Adj. R^2 0.862
## Num. obs. 500 500 500
## AIC 2080.057
## BIC 2205.515
## Log Likelihood -1010.261 -991.545
## Deviance 1665.323
## Deviance explained 0.908
## Dispersion 3.534
## GCV score 3.750
## Num. smooth terms 1
## Parameters 5
## AIC (Linear model) 2210.431
## AIC (Spatial model) 1993.091
## LR test: statistic 219.340
## LR test: p-value 0.000
## =============================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
Comparing-wise there three models show similar fit (R^2 = 0.862 and 0.903) (AIC = 2080 and 1993).
Let’s select model base on logical and data experimented.
For my first disagreement to the models, Dist2FracLoc espescially for 8 show high correlation, but to conclude that farther away from Fraction Site, the more chance of getting leukemia is not logically explained. Reject lm_dist_a08 model and sperr_risk_3 model.
With GAM Model with show nicely explained plot that is totally agree with Leukemia Density.
From moran’s I test, this also show some geographically clustered. Moran I Statistic = 0.30. Indicated moderate positive spatial autocorrelation. Similar areas tend to have similar levels of Leukemia cases.
Considering environmental factor (such as GreenNeighborhoodIndex, CarsPerHH and PopDens) that might related to pollution also show insignificant to leukemia cases
Considering Genetic Factors, Even thorough Leukemia is not infected like flu, but by Genetic Predisposition could cause spatial gradient, this could be explaining the geographically clustered that happened in the area.
Additionally the leukemia cases also linearly correlated to the PctPopGrowth as well. Is back up the possibility of Genetic Predisposition theory.
At the center of the cluster show less number of Leukemia cases, if the Genetic Predisposition theory is correct. The separate gene should have immunity factor to leukemia.
To conclude, We strongly believe that the leukemia immunity’s gene was separated by Genetic Predisposition starting from the center of area near Fraction Site 8, result in less leukemia cases around the center of the cluster. And our model show the explanation according to this Generic Predisposition Theory.
| Topics | P-value | Moran’s I | Expectation | Variance |
|---|---|---|---|---|
| gam_foc_04 (PctPopGrowth + s(lon,lat)) | ’ *** ’ | 0.30 | -0.002 | 0.0009 |
| Topics | P-Value | AIC | R-sq (adj.) | GCV & Scale. Est |
|---|---|---|---|---|
| (GreenIndex) | ’ - ’ | 3221 | ’ - ’ | ’ - ’ |
| CarPerHH | ’ - ’ | 3219 | ’ - ’ | ’ - ’ |
| PopDens | ’ - ’ | 3221 | ’ - ’ | ’ - ’ |
| (Latitude) | ’ * ’ | 3212 | 0.019 | 35.9 , 35.7 |
head(coords)
## [,1] [,2]
## [1,] 44.93403 -29.47574
## [2,] 89.50712 -18.58449
## [3,] 37.06287 32.51696
## [4,] 73.29633 -20.26620
## [5,] 90.39685 -12.98208
## [6,] 69.74439 -48.33941
head(fracking_coords)
## X Y
## [1,] -40.33708 -6.008472
## [2,] 88.45395 25.422681
## [3,] 78.33582 34.077583
## [4,] -51.85614 1.218802
## [5,] 99.02833 -29.439174
## [6,] 31.19807 38.713713
leukemia_coords <- coords
dists_btw_frac <- spDists(leukemia_coords, fracking_coords, longlat = TRUE)
min_dists_btw_frac <- apply(dists_btw_frac, 1, min)
county_leukemia_sf$min_fracking_dist <- min_dists_btw_frac
cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases)
## [1] 0.5892939
Null Hypothesis (H₀): There is no relationship between the proximity to fracking sites and the number of leukemia cases.
Alternative Hypothesis (H₁): There is a relationship between the proximity to fracking sites and the number of leukemia cases.
ggplot(county_leukemia_sf, aes(x = min_dists_btw_frac, y = leuk_cases)) +
geom_point() +
labs(title = "Leukemia Cases vs. Distance to Fracking Sites", x = "Distance to Nearest Fracking Site", y = "Leukemia Cases")
ggplot() +
geom_sf(data = county_leukemia_sf, aes(fill = leuk_cases)) +
geom_sf(data = fracking_locations_sf, fill = "green", color = "red", alpha = 0.5) +
ggtitle("Leukemia Cases and Fracking Sites Proximity") +
scale_fill_gradient(low = "lightblue", high = "darkblue")
cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases, method = "pearson")
## [1] 0.5892939
cor(county_leukemia_sf$min_fracking_dist, county_leukemia_sf$leuk_cases, method = "spearman")
## [1] 0.4866192
model <- lm(leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf)
summary(model)
##
## Call:
## lm(formula = leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens +
## GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9657 -3.3809 0.0416 3.2894 12.2270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.397902 0.622375 48.842 <2e-16 ***
## min_fracking_dist 0.002649 0.000136 19.479 <2e-16 ***
## CarsPerHH -0.103245 0.923447 -0.112 0.911
## PopDens 0.200990 1.217783 0.165 0.869
## GreenNeighborhoodIndex -0.665608 0.688988 -0.966 0.334
## PctPopGrowth 1.205996 0.111586 10.808 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.398 on 494 degrees of freedom
## Multiple R-squared: 0.474, Adjusted R-squared: 0.4687
## F-statistic: 89.05 on 5 and 494 DF, p-value: < 2.2e-16
plot(model)
model <- errorsarlm(leuk_cases ~ min_fracking_dist + CarsPerHH + PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf, listw = sp_weight)
summary(model)
##
## Call:errorsarlm(formula = leuk_cases ~ min_fracking_dist + CarsPerHH +
## PopDens + GreenNeighborhoodIndex + PctPopGrowth, data = county_leukemia_sf,
## listw = sp_weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7666857 -1.1361530 0.0051382 1.1145987 4.3032821
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 34.47004846 1.00708738 34.2275 < 2.2e-16
## min_fracking_dist 0.00123996 0.00024861 4.9877 6.111e-07
## CarsPerHH 0.57222610 0.31472164 1.8182 0.06903
## PopDens -0.21721102 0.41260962 -0.5264 0.59859
## GreenNeighborhoodIndex -1.30244341 0.23169775 -5.6213 1.895e-08
## PctPopGrowth 0.56467110 0.06015880 9.3863 < 2.2e-16
##
## Lambda: 0.89667, LR test value: 822.39, p-value: < 2.22e-16
## Asymptotic standard error: 0.012392
## z-value: 72.361, p-value: < 2.22e-16
## Wald statistic: 5236.1, p-value: < 2.22e-16
##
## Log likelihood: -1035.789 for error model
## ML residual variance (sigma squared): 2.6604, (sigma: 1.6311)
## Number of observations: 500
## Number of parameters estimated: 8
## AIC: 2087.6, (AIC for lm: 2908)
library(ggplot2)
library(reshape2)
# # Create a data frame with the distance variables and leukemia cases
# distance_columns <- c("Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4",
# "Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8",
# "Dist2FracLoc_9", "Dist2FracLoc_10")
#
# # Reshape data for easier plotting
# leukemia_distances <- county_leukemia_sf[, c("leuk_cases", distance_columns)]
#
# leukemia_distances <- data.frame(leukemia_distances)
# class(leukemia_distances)
#
# leukemia_distances_melted <- melt(leukemia_distances, id.vars = "leuk_cases", variable.name = "Fracking_Site", value.name = "Distance")
#
# # Plotting scatter plots for each fracking site
# ggplot(leukemia_distances_melted, aes(x = Distance, y = leuk_cases)) +
# geom_point() +
# facet_wrap(~Fracking_Site, scales = "free") + # Create a separate plot for each fracking site
# labs(title = "Scatter Plots of Leukemia Cases vs. Distance to Fracking Sites",
# x = "Distance to Fracking Site (Euclidean)",
# y = "Number of Leukemia Cases") +
# theme_minimal()
library(dplyr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following object is masked from 'package:texreg':
##
## extract
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(ggplot2)
# Create a data frame with the distance variables and leukemia cases
distance_columns <- c("Dist2FracLoc_1", "Dist2FracLoc_2", "Dist2FracLoc_3", "Dist2FracLoc_4",
"Dist2FracLoc_5", "Dist2FracLoc_6", "Dist2FracLoc_7", "Dist2FracLoc_8",
"Dist2FracLoc_9", "Dist2FracLoc_10")
# Select only the relevant columns
leukemia_distances <- county_leukemia_sf %>%
select(leuk_cases, all_of(distance_columns))
# Reshape data using tidyr::pivot_longer (alternative to melt)
leukemia_distances_melted <- leukemia_distances %>%
pivot_longer(cols = starts_with("Dist2FracLoc"),
names_to = "Fracking_Site",
values_to = "Distance")
# Plotting scatter plots for each fracking site
ggplot(leukemia_distances_melted, aes(x = Distance, y = leuk_cases)) +
geom_point() +
facet_wrap(~Fracking_Site, scales = "free") + # Create a separate plot for each fracking site
labs(title = "Scatter Plots of Leukemia Cases vs. Distance to Fracking Sites",
x = "Distance to Fracking Site (Euclidean)",
y = "Number of Leukemia Cases") +
theme_minimal()
dists_displacement <- st_distance(county_leukemia_sf, fracking_locations_sf)
min_dists_displacement <- apply(dists_displacement, 1, min)
county_leukemia_sf$min_fracking_dist_displacement <- min_dists_displacement
# Identify the minimum and maximum values of the distance column
min_distance_sc <- min(county_leukemia_sf$min_fracking_dist_displacement, na.rm = TRUE)
max_distance_sc <- max(county_leukemia_sf$min_fracking_dist_displacement, na.rm = TRUE)
# Perform Min-Max Scaling to rescale distance to 0-10 range
county_leukemia_sf$scaled_min_fracking_dist <-
(county_leukemia_sf$min_fracking_dist_displacement - min_distance_sc) /
(max_distance_sc - min_distance_sc) * 10
# Check the result
head(county_leukemia_sf$scaled_min_fracking_dist)
## [1] 0.8270320 1.6581797 0.8910517 3.3780370 2.6941850 3.1186514
library(ggplot2)
ggplot(county_leukemia_sf, aes(x = scaled_min_fracking_dist, y = leuk_cases)) +
geom_point() +
labs(title = "Leukemia Cases vs. Scaled Distance to Nearest Fracking Site",
x = "Scaled Distance to Nearest Fracking Site (0-10)",
y = "Number of Leukemia Cases") +
theme_minimal()
Still a positive trend.
Do correlation checking
cor(county_leukemia_sf$scaled_min_fracking_dist, county_leukemia_sf$leuk_cases, method = "pearson")
## [1] 0.581328
cor(county_leukemia_sf$scaled_min_fracking_dist, county_leukemia_sf$leuk_cases, method = "spearman")
## [1] 0.4793031
model <- lm(leuk_cases ~ scaled_min_fracking_dist, data = county_leukemia_sf)
summary(model)
##
## Call:
## lm(formula = leuk_cases ~ scaled_min_fracking_dist, data = county_leukemia_sf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.749 -4.057 -0.331 3.647 15.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.21840 0.40720 76.67 <2e-16 ***
## scaled_min_fracking_dist 1.50760 0.09456 15.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.914 on 498 degrees of freedom
## Multiple R-squared: 0.3379, Adjusted R-squared: 0.3366
## F-statistic: 254.2 on 1 and 498 DF, p-value: < 2.2e-16
plot(model)