# Load the data
air_quality <- read.csv("C:/Users/Shamp/Downloads/Air_Quality 2024NYC.csv")
# Remove message and Binary_Var columns
cols_to_remove <- c("Message", "Binary_Var")
existing_cols <- cols_to_remove[cols_to_remove %in% names(air_quality)]
if (length(existing_cols) > 0) {
air_quality <- air_quality[, !names(air_quality) %in% existing_cols]
}
# Create a binary variable (e.g., Data.Value > 25)
air_quality <- air_quality %>%
mutate(Binary_Var = ifelse(Data.Value > 25, 1, 0))
# Ensure categorical variables are factors
air_quality <- air_quality %>%
mutate(Indicator.ID = as.factor(Indicator.ID),
`Geo.Type.Name` = as.factor(`Geo.Type.Name`),
`Geo.Join.ID` = as.factor(`Geo.Join.ID`))
# Create a subset and remove missing values
subset_data <- air_quality %>%
dplyr::select(Binary_Var, Data.Value, Indicator.ID, `Geo.Type.Name`, `Geo.Join.ID`) %>%
na.omit()
# Print summary to verify dataset
summary(subset_data)
## Binary_Var Data.Value Indicator.ID Geo.Type.Name
## Min. :0.0000 Min. : 0.00 365 :5922 Borough : 850
## 1st Qu.:0.0000 1st Qu.: 8.90 375 :5922 CD :6490
## Median :0.0000 Median : 15.20 386 :2115 Citywide: 170
## Mean :0.2879 Mean : 21.41 643 : 321 UHF34 :3366
## 3rd Qu.:1.0000 3rd Qu.: 26.70 644 : 321 UHF42 :7140
## Max. :1.0000 Max. :424.70 645 : 321
## (Other):3094
## Geo.Join.ID
## 101 : 379
## 102 : 379
## 103 : 379
## 104 : 379
## 201 : 379
## 202 : 379
## (Other):15742
# Fit logistic regression models
model1 <- glm(Binary_Var ~ Data.Value, data = subset_data, family = binomial)
model2 <- glm(Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name, data = subset_data, family = binomial)
model3 <- glm(Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name + Geo.Join.ID, data = subset_data, family = binomial)
# Compare models
AIC(model1, model2, model3)
## df AIC
## model1 2 4.001003
## model2 26 52.000721
## model3 97 194.000732
## df BIC
## model1 2 19.59903
## model2 26 254.77512
## model3 97 950.50524
## Analysis of Deviance Table
##
## Model 1: Binary_Var ~ Data.Value
## Model 2: Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name
## Model 3: Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name + Geo.Join.ID
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 18014 0.00100308
## 2 17990 0.00072086 24 2.8222e-04 1
## 3 17919 0.00073180 71 -1.0939e-05
##
## Call:
## glm(formula = Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name +
## Geo.Join.ID, family = binomial, data = subset_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.990e+03 3.563e+04 -0.112 0.911
## Data.Value 2.175e+02 6.369e+02 0.342 0.733
## Indicator.ID375 -1.360e+03 4.749e+03 -0.286 0.775
## Indicator.ID386 -1.361e+03 4.756e+03 -0.286 0.775
## Indicator.ID639 -1.427e+03 7.573e+03 -0.188 0.851
## Indicator.ID640 -1.376e+03 7.170e+04 -0.019 0.985
## Indicator.ID641 1.531e+03 2.697e+04 0.057 0.955
## Indicator.ID642 -1.358e+03 2.500e+04 -0.054 0.957
## Indicator.ID643 -1.455e+03 1.087e+04 -0.134 0.894
## Indicator.ID644 -1.429e+03 1.159e+05 -0.012 0.990
## Indicator.ID645 1.367e+03 1.036e+04 0.132 0.895
## Indicator.ID646 2.697e+03 1.535e+04 0.176 0.861
## Indicator.ID647 2.879e+03 1.500e+04 0.192 0.848
## Indicator.ID648 -1.400e+03 2.619e+04 -0.053 0.957
## Indicator.ID650 -1.249e+03 2.549e+04 -0.049 0.961
## Indicator.ID651 -1.380e+03 4.918e+03 -0.281 0.779
## Indicator.ID652 1.425e+03 3.704e+04 0.038 0.969
## Indicator.ID653 -1.360e+03 2.302e+04 -0.059 0.953
## Indicator.ID655 -1.323e+03 3.391e+04 -0.039 0.969
## Indicator.ID657 -1.358e+03 3.494e+04 -0.039 0.969
## Indicator.ID659 -1.409e+03 5.004e+04 -0.028 0.978
## Indicator.ID661 -2.058e+02 1.818e+04 -0.011 0.991
## Geo.Type.NameCD -5.576e+09 6.297e+14 0.000 1.000
## Geo.Type.NameCitywide -6.870e+01 1.880e+05 0.000 1.000
## Geo.Type.NameUHF34 -5.576e+09 6.297e+14 0.000 1.000
## Geo.Type.NameUHF42 -5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID2 -4.208e+01 3.751e+04 -0.001 0.999
## Geo.Join.ID3 -1.091e+02 4.213e+04 -0.003 0.998
## Geo.Join.ID4 -1.080e+02 4.109e+04 -0.003 0.998
## Geo.Join.ID5 3.051e+02 4.205e+04 0.007 0.994
## Geo.Join.ID101 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID102 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID103 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID104 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID105 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID106 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID107 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID108 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID109 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID110 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID111 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID112 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID201 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID202 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID203 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID204 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID205 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID206 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID207 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID208 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID209 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID210 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID211 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID212 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID301 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID302 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID303 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID304 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID305 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID306 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID307 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID308 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID309 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID310 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID311 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID312 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID313 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID314 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID315 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID316 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID317 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID318 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID401 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID402 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID403 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID404 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID405 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID406 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID407 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID408 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID409 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID410 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID411 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID412 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID413 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID414 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID501 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID502 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID503 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID504 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID305307 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID306308 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID309310 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID404406 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID501502 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID503504 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID105106107 5.576e+09 6.297e+14 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.1627e+04 on 18015 degrees of freedom
## Residual deviance: 7.3180e-04 on 17919 degrees of freedom
## AIC: 194
##
## Number of Fisher Scoring iterations: 25
## [1] TRUE
##
## Call:
## glm(formula = Binary_Var ~ Data.Value + Indicator.ID + Geo.Type.Name +
## Geo.Join.ID, family = binomial, data = subset_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.990e+03 3.563e+04 -0.112 0.911
## Data.Value 2.175e+02 6.369e+02 0.342 0.733
## Indicator.ID375 -1.360e+03 4.749e+03 -0.286 0.775
## Indicator.ID386 -1.361e+03 4.756e+03 -0.286 0.775
## Indicator.ID639 -1.427e+03 7.573e+03 -0.188 0.851
## Indicator.ID640 -1.376e+03 7.170e+04 -0.019 0.985
## Indicator.ID641 1.531e+03 2.697e+04 0.057 0.955
## Indicator.ID642 -1.358e+03 2.500e+04 -0.054 0.957
## Indicator.ID643 -1.455e+03 1.087e+04 -0.134 0.894
## Indicator.ID644 -1.429e+03 1.159e+05 -0.012 0.990
## Indicator.ID645 1.367e+03 1.036e+04 0.132 0.895
## Indicator.ID646 2.697e+03 1.535e+04 0.176 0.861
## Indicator.ID647 2.879e+03 1.500e+04 0.192 0.848
## Indicator.ID648 -1.400e+03 2.619e+04 -0.053 0.957
## Indicator.ID650 -1.249e+03 2.549e+04 -0.049 0.961
## Indicator.ID651 -1.380e+03 4.918e+03 -0.281 0.779
## Indicator.ID652 1.425e+03 3.704e+04 0.038 0.969
## Indicator.ID653 -1.360e+03 2.302e+04 -0.059 0.953
## Indicator.ID655 -1.323e+03 3.391e+04 -0.039 0.969
## Indicator.ID657 -1.358e+03 3.494e+04 -0.039 0.969
## Indicator.ID659 -1.409e+03 5.004e+04 -0.028 0.978
## Indicator.ID661 -2.058e+02 1.818e+04 -0.011 0.991
## Geo.Type.NameCD -5.576e+09 6.297e+14 0.000 1.000
## Geo.Type.NameCitywide -6.870e+01 1.880e+05 0.000 1.000
## Geo.Type.NameUHF34 -5.576e+09 6.297e+14 0.000 1.000
## Geo.Type.NameUHF42 -5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID2 -4.208e+01 3.751e+04 -0.001 0.999
## Geo.Join.ID3 -1.091e+02 4.213e+04 -0.003 0.998
## Geo.Join.ID4 -1.080e+02 4.109e+04 -0.003 0.998
## Geo.Join.ID5 3.051e+02 4.205e+04 0.007 0.994
## Geo.Join.ID101 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID102 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID103 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID104 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID105 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID106 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID107 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID108 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID109 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID110 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID111 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID112 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID201 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID202 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID203 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID204 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID205 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID206 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID207 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID208 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID209 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID210 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID211 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID212 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID301 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID302 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID303 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID304 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID305 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID306 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID307 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID308 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID309 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID310 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID311 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID312 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID313 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID314 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID315 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID316 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID317 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID318 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID401 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID402 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID403 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID404 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID405 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID406 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID407 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID408 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID409 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID410 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID411 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID412 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID413 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID414 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID501 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID502 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID503 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID504 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID305307 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID306308 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID309310 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID404406 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID501502 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID503504 5.576e+09 6.297e+14 0.000 1.000
## Geo.Join.ID105106107 5.576e+09 6.297e+14 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.1627e+04 on 18015 degrees of freedom
## Residual deviance: 7.3180e-04 on 17919 degrees of freedom
## AIC: 194
##
## Number of Fisher Scoring iterations: 25
# Simulate the model using clarify
model3_sim <- clarify::sim(model3, n = 1000)
# Average Marginal Effects (AMEs)
ame_data_value <- sim_ame(model3_sim, var = "Data.Value")
summary(ame_data_value)
## Estimate 2.5 % 97.5 %
## E[dY/d(Data.Value)] 4.42e-06 0.00e+00 0.00e+00
## Estimate 2.5 % 97.5 %
## E[Y(365)] 4.24e-01 1.67e-04 1.00e+00
## E[Y(375)] 2.88e-01 2.22e-16 1.00e+00
## E[Y(386)] 2.88e-01 2.22e-16 1.00e+00
## E[Y(639)] 2.81e-01 2.22e-16 1.00e+00
## E[Y(640)] 2.87e-01 2.22e-16 1.00e+00
## E[Y(641)] 6.00e-01 2.22e-16 1.00e+00
## E[Y(642)] 2.88e-01 2.22e-16 1.00e+00
## E[Y(643)] 2.78e-01 2.22e-16 1.00e+00
## E[Y(644)] 2.80e-01 2.22e-16 1.00e+00
## E[Y(645)] 5.75e-01 2.22e-16 1.00e+00
## E[Y(646)] 9.13e-01 2.22e-16 1.00e+00
## E[Y(647)] 9.33e-01 2.22e-16 1.00e+00
## E[Y(648)] 2.83e-01 2.22e-16 1.00e+00
## E[Y(650)] 2.98e-01 2.22e-16 1.00e+00
## E[Y(651)] 2.87e-01 2.22e-16 1.00e+00
## E[Y(652)] 5.83e-01 2.22e-16 1.00e+00
## E[Y(653)] 2.88e-01 2.22e-16 1.00e+00
## E[Y(655)] 2.92e-01 2.22e-16 1.00e+00
## E[Y(657)] 2.88e-01 2.22e-16 1.00e+00
## E[Y(659)] 2.82e-01 2.22e-16 1.00e+00
## E[Y(661)] 4.03e-01 2.22e-16 1.00e+00
# Generate predicted probabilities at set values using sim_setx()
predictions <- sim_setx(model3_sim, x = list(Data.Value = c(10, 25, 50)))
# Summary of predicted probabilities
summary(predictions)
## Estimate 2.5 % 97.5 %
## Data.Value = 10 2.22e-16 2.22e-16 1.00e+00
## Data.Value = 25 1.00e+00 2.22e-16 1.00e+00
## Data.Value = 50 1.00e+00 2.22e-16 1.00e+00
Interpretation of the Results
The analysis begins with the calculation of Average Marginal Effects (AMEs), which measures the average change in the probability of Binary_Var = 1, signifying a high level of air pollutant, for a one-unit increase in Data.Value, representing the pollutant concentration. The AME estimate is extremely small, at 4.42e-06, suggesting that a one-unit increase in Data.Value leads to only a marginal increase of 0.00000442 in the probability of the pollutant level exceeding the high threshold. This effect is very close to zero that it indicates little to no effect of Data.Value on the outcome across the entire range of data. Additionally, the 95% confidence interval for this effect is [0, 0], which further confirms the lack of statistical significance. This means that the relationship between small changes in Data.Value and the likelihood of exceeding the danger threshold might be non-existent or too weak to detect reliably. Other potential reasons for this outcome could be co-linearity with other predictors in the model or the possibility that the binary outcome does not respond to minor fluctuations in Data.Value.
In the second part of the analysis, we examined the predicted probabilities of Binary_Var = 1 (pollutant level exceeding the threshold) at three specific values of Data.Value (10, 25, and 50). At Data.Value = 10, the predicted probability is essentially 0 (2.22e-16), indicating that at this low concentration of the pollutant, the occurrence of exceeding the dangerous threshold is highly unlikely. However, as Data.Value increases to 25 and 50, the predicted probabilities jump to 1.00, suggesting that at these higher concentrations, the outcome of the pollutant level being above the threshold is almost certain. The confidence intervals for the probabilities show a wide range at Data.Value = 10, indicating high uncertainty at low values, but a narrow range at Data.Value = 25 and 50, indicating high confidence in the predicted probabilities at these levels.
Overall, the results show that while minor increases in Data.Value appear to have little effect on the probability of Binary_Var = 1 (exceeding the dangerous threshold) when viewed across the entire data set, the predicted probabilities suggest that once Data.Value reaches certain critical thresholds (25 and 50), the outcome of exceeding the threshold becomes almost inevitable. This suggests a potential threshold effect where small increases in pollutant concentration lead to significant changes in the likelihood of dangerous levels at higher values, even though the marginal effects across the entire range are minimal.