DATA 698 FINAL PROJECT

# Setup
knitr::opts_chunk$set(cache   = TRUE, autodep = TRUE, warning = FALSE, message = FALSE)

library(tidycensus)
library(tigris)
library(sf)
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(readr)
library(ggplot2)
library(classInt)
library(viridis)
library(osmdata)
library(spdep)
library(spatialreg)
library(scales)
library(knitr)
library(kableExtra)
library(osmextract)
options(tigris_use_cache = TRUE)
options(scipen = 999)

Connect to the United States Census API

# Census API Key
tidycensus::census_api_key(Sys.getenv("CENSUS_API_KEY"), install = FALSE)
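
The key is read from the CENSUS_API_KEY environment variable. If it is not already stored, one way to set it up (run once interactively, not while knitting; the key string below is a placeholder, not a real key) is:

# Writes CENSUS_API_KEY to .Renviron so Sys.getenv("CENSUS_API_KEY") finds it
# in later sessions; replace the placeholder with a real Census Bureau key
tidycensus::census_api_key("YOUR_CENSUS_API_KEY", install = TRUE, overwrite = TRUE)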

Load Data

We use NYC census tracts as the neighborhood unit so the analysis runs reproducibly from public sources, with no manual files. Settlement proxies come from ACS 5-year estimates (place of birth, B05006, and ancestry, B04006; the language table B16001 is considered but ultimately excluded), and mosques are pulled from OpenStreetMap.

# Base settings
year <- 2022
state <- "NY"
nyc_counties <- c("New York","Kings","Queens","Bronx","Richmond")

# Census tract population (and geometry)
tract_pop <- get_acs(
  geography = "tract",
  variables = c(total_pop = "B01003_001"),
  state     = state,
  county    = nyc_counties,
  year      = year,
  survey    = "acs5",
  geometry  = TRUE,
  output    = "wide"
)

tract_pop <- tract_pop |>
  rename(total_pop = total_popE) |>
  st_transform(2263) |>
  st_make_valid()

# ACS metadata
vars_b05006 <- load_variables(year, "acs5", cache = TRUE) |>
  filter(str_detect(name, "^B05006"))

vars_b16001 <- load_variables(year, "acs5", cache = TRUE) |>
  filter(str_detect(name, "^B16001"))

vars_b04006 <- load_variables(year, "acs5", cache = TRUE) |>
  filter(str_detect(name, "^B04006"))

# Muslim-majority country list
mm_countries <- c(
  "Afghanistan","Algeria","Azerbaijan","Bahrain","Bangladesh","Brunei","Burkina Faso",
  "Chad","Comoros","Djibouti","Egypt","Gambia","Guinea","Guinea-Bissau","Indonesia",
  "Iran","Iraq","Jordan","Kazakhstan","Kuwait","Kyrgyzstan","Lebanon","Libya","Malaysia",
  "Maldives","Mali","Mauritania","Morocco","Niger","Nigeria","Oman","Pakistan","Palestine",
  "Qatar","Saudi Arabia","Senegal","Sierra Leone","Somalia","Sudan","Syria","Tajikistan",
  "Tunisia","Turkey","Turkmenistan","United Arab Emirates","Uzbekistan","Western Sahara","Yemen"
)

mm_regex <- regex(
  paste0("\\b(", paste(mm_countries, collapse = "|"), ")\\b"),
  ignore_case = TRUE
)

# ACS variable selections
pick_b05006 <- vars_b05006 |>
  filter(str_detect(label, mm_regex))

pick_b16001 <- vars_b16001 |>
  filter(
    str_detect(
      label,
      regex("Urdu:|Bengali:|Arabic:|African languages:", ignore_case = TRUE)
    )
  )

pick_b04006 <- vars_b04006 |>
  filter(str_detect(label, regex("Arab", ignore_case = TRUE)))

# Helper to pull ACS variables and compute shares
pull_table <- function(var_rows, label, batch_size = 40) {
  v <- unique(var_rows$name)
  if (length(v) == 0) {
    message("No variables found for label = ", label)
    return(tibble())
  }
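  # Request variables in batches of `batch_size`; tryCatch() in get_one() lets a
  # failing batch be skipped rather than aborting the whole pull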
  chunks <- split(v, ceiling(seq_along(v) / batch_size))
  get_one <- function(vars_vec) {
    tryCatch(
      get_acs(
        geography   = "tract",
        state       = state,
        county      = nyc_counties,
        year        = year,
        survey      = "acs5",
        variables   = vars_vec,
        geometry    = FALSE,
        cache_table = TRUE
      ),
      error = function(e) {
        message("Skipping a chunk that failed (", length(vars_vec),
                " vars). Error: ", conditionMessage(e))
        return(tibble())
      }
    )
  }
  raw <- map_dfr(chunks, get_one)
  if (nrow(raw) == 0) return(tibble())
  raw |>
    left_join(
      tract_pop |>
        st_drop_geometry() |>
        select(GEOID, total_pop),
      by = "GEOID"
    ) |>
    mutate(
      share = estimate / pmax(total_pop, 1),
      group = label
    ) |>
    select(GEOID, variable, estimate, moe, total_pop, share, group)
}

# Pull proxies
acs_b05006 <- pull_table(pick_b05006, "birthplace")
acs_b04006 <- pull_table(pick_b04006, "ancestry")
acs_b16001 <- NULL  # language excluded due to suppressed NA estimates

# Validate distributions
summary(acs_b05006$share)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.001365 0.000000 0.390987
summary(acs_b04006$share)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.000000 0.002724 0.000000 0.565217
# Bind birthplace + ancestry proxies 
acs_proxy <- dplyr::bind_rows(
  purrr::keep(
    list(acs_b05006, acs_b04006),
    ~ !is.null(.x) && nrow(.x) > 0
  )
)

if (is.null(acs_proxy) || nrow(acs_proxy) == 0) {
  stop("No ACS proxy rows available. Check earlier selection logic.")
}

# Pivot wide: one row per tract, one column per proxy variable
proxy_wide <- acs_proxy |>
  tidyr::unite(var_id, group, variable, remove = FALSE) |>
  dplyr::select(GEOID, var_id, share) |>
  tidyr::pivot_wider(
    names_from  = var_id,
    values_from = share,
    values_fill = 0
  )

# Z-score each proxy column
zcols <- setdiff(names(proxy_wide), "GEOID")
proxy_z <- proxy_wide
proxy_z[zcols] <- lapply(proxy_wide[zcols], function(x) as.numeric(scale(x)))

# Compute PCI as average of standardized proxies
proxy_z$PCI <- rowMeans(proxy_z[zcols], na.rm = TRUE)

# Join PCI back to tract geometry
pci <- tract_pop |>
  dplyr::select(GEOID, geometry) |>
  dplyr::left_join(
    proxy_z |> dplyr::select(GEOID, PCI),
    by = "GEOID"
  )

summary(proxy_z$PCI)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.18582 -0.16087 -0.09050  0.00000  0.05829  3.71303
head(pci)
## Simple feature collection with 6 features and 2 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 1007383 ymin: 239639.8 xmax: 1026519 ymax: 268162.4
## Projected CRS: NAD83 / New York Long Island (ftUS)
##         GEOID         PCI                       geometry
## 1 36005013500 -0.18581591 POLYGON ((1010519 240631.4,...
## 2 36005009200  0.03438263 POLYGON ((1023619 242996.9,...
## 3 36005005400 -0.07722129 POLYGON ((1016383 241620.3,...
## 4 36005036501 -0.03178067 POLYGON ((1015150 247218.6,...
## 5 36005044902 -0.16844511 POLYGON ((1019811 267212.7,...
## 6 36005017500  0.04407802 POLYGON ((1007579 241875.2,...
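
For reference, the index computed above can be written as follows: for tract $i$ with proxy shares $s_{i1}, \dots, s_{iK}$ (the selected birthplace and ancestry variables),

$$z_{ik} = \frac{s_{ik} - \bar{s}_k}{\operatorname{sd}(s_k)}, \qquad \mathrm{PCI}_i = \frac{1}{K} \sum_{k=1}^{K} z_{ik},$$

where $\bar{s}_k$ and $\operatorname{sd}(s_k)$ are taken across all NYC tracts, so PCI measures how far a tract sits above or below the citywide average on the standardized proxies.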

Mosques (OpenStreetMap)

nyc_bbox <- tract_pop |>
  st_transform(4326) |>
  st_bbox()

# Query 1: all places of worship
q_pow <- opq(bbox = nyc_bbox) |>
  add_osm_feature(key = "amenity", value = "place_of_worship")

osm_pow_raw <- osmdata_sf(q_pow)

# Query 2: all buildings explicitly tagged as mosques
q_bldg_mosque <- opq(bbox = nyc_bbox) |>
  add_osm_feature(key = "building", value = "mosque")

osm_bldg_raw <- osmdata_sf(q_bldg_mosque)

filter_mosque_sf <- function(x) {
  if (is.null(x) || nrow(x) == 0) return(NULL)
  
  # Assumes the OSM return carries religion, name, and denomination tags;
  # NA tag values propagate through str_detect() and drop out in filter()
  # unless another condition matches
  x |>
    mutate(
      religion = tolower(as.character(religion)),
      name     = tolower(as.character(name)),
      denom    = tolower(as.character(denomination))
    ) |>
    filter(
      religion == "muslim" |
        str_detect(name, "mosque|masjid|islamic") |
        str_detect(denom, "muslim|sunni|shia")
    )
}

pow_pts    <- filter_mosque_sf(osm_pow_raw$osm_points)
pow_polys  <- filter_mosque_sf(osm_pow_raw$osm_polygons)
pow_mpolys <- filter_mosque_sf(osm_pow_raw$osm_multipolygons)

# For building=mosque, accept ALL features without extra filtering
bldg_pts    <- osm_bldg_raw$osm_points
bldg_polys  <- osm_bldg_raw$osm_polygons
bldg_mpolys <- osm_bldg_raw$osm_multipolygons

poly_to_centroids <- function(x) {
  if (is.null(x) || nrow(x) == 0) return(NULL)
  st_centroid(x)
}

pow_poly_centroids   <- poly_to_centroids(pow_polys)
pow_mpoly_centroids  <- poly_to_centroids(pow_mpolys)
bldg_poly_centroids  <- poly_to_centroids(bldg_polys)
bldg_mpoly_centroids <- poly_to_centroids(bldg_mpolys)

mosque_all <- list(
  pow_pts,
  pow_poly_centroids,
  pow_mpoly_centroids,
  bldg_pts,
  bldg_poly_centroids,
  bldg_mpoly_centroids
) |>
  purrr::keep(~ !is.null(.x) && nrow(.x) > 0) |>
  dplyr::bind_rows()

if (is.null(mosque_all) || nrow(mosque_all) == 0) {
  stop("No mosque features returned from OSM. Check queries or bbox.")
}

# Reproject to the tract CRS and drop duplicate geometries
# (features returned by both the place_of_worship and building=mosque queries)
mosque_all <- mosque_all |>
  st_transform(2263) |>
  distinct(geometry, .keep_all = TRUE)

# The bbox query also returns features outside NYC (e.g., in New Jersey);
# the inner st_within join keeps only points that fall inside NYC tracts
mosque_tract <- st_join(
  mosque_all,
  tract_pop |> select(GEOID, geometry),
  join = st_within,
  left = FALSE
)

mosque_counts <- mosque_tract |>
  st_drop_geometry() |>
  count(GEOID, name = "mosque_count")

mosque_density <- tract_pop |>
  st_drop_geometry() |>
  select(GEOID, total_pop) |>
  left_join(mosque_counts, by = "GEOID") |>
  mutate(
    mosque_count = ifelse(is.na(mosque_count), 0L, mosque_count),
    mosque_density_per_10k = ifelse(
      total_pop > 0,
      mosque_count / (total_pop / 10000),
      NA_real_
    )
  )

pci_mosques <- pci |>
  left_join(
    mosque_density |>
      select(GEOID, mosque_count, mosque_density_per_10k),
    by = "GEOID"
  )

nrow(mosque_all)
## [1] 324
summary(pci_mosques$mosque_density_per_10k)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   0.0000   0.0000   0.0000   0.4313   0.0000 156.2500       85
head(unique(tolower(mosque_all$name)), 30)
##  [1] "islamic society of bayridge"                             
##  [2] "mosque of american mohammedan society"                   
##  [3] "al-mahdi foundation"                                     
##  [4] "muslim community center of union county (mccuc)"         
##  [5] "islamic center of passaic county"                        
##  [6] "staten island islamic masjid"                            
##  [7] "muslim majlis of staten island"                          
##  [8] "noor al-islam society"                                   
##  [9] "masjid al-ihsan"                                         
## [10] "dar-ul-islah"                                            
## [11] "masjid at-taqea"                                         
## [12] "masjid mission center"                                   
## [13] "united american muslim association"                      
## [14] "ibaadu'rrahman central mosque"                           
## [15] "masjid al taufiq"                                        
## [16] "north bronx islamic center"                              
## [17] "masjid nur al islam"                                     
## [18] "greenpoint islamic center"                               
## [19] "masjid al-salam islamic education and information center"
## [20] "masjid manhattan"                                        
## [21] NA                                                        
## [22] "masjid ar-rahman"                                        
## [23] "masjid dar-ul furqan"                                    
## [24] "masjidul allah"                                          
## [25] "masjid salam"                                            
## [26] "islamic center at nyu"                                   
## [27] "unionport jame masjid"                                   
## [28] "masjid al imam al albani"                                
## [29] "sheepshead bay muslim center"                            
## [30] "baitul jannah"
# Tracts with zero population produced NA densities above; treat them as 0
# mosques per 10,000 residents so those tracts remain in the correlation below
pci_mosques <- pci_mosques |>
  mutate(
    mosque_density_per_10k = ifelse(
      is.na(mosque_density_per_10k),
      0,
      mosque_density_per_10k
    )
  )
cor.test(
  pci_mosques$PCI,
  pci_mosques$mosque_density_per_10k,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  pci_mosques$PCI and pci_mosques$mosque_density_per_10k
## S = 1914541729, p-value = 0.00001972
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.08835235
ggplot(pci_mosques,
       aes(x = PCI, y = mosque_density_per_10k)) +
  geom_point(alpha = 0.4, size = 1) +
  geom_smooth(method = "loess", se = FALSE, color = "red", linewidth = 1) +
  scale_y_continuous("Mosque density per 10,000 residents") +
  scale_x_continuous("Proxy Concentration Index (PCI)") +
  theme_minimal()

Halal Businesses (OpenStreetMap)

# 1) Query amenities: restaurants and fast food
q_halal_amenity <- opq(bbox = nyc_bbox) |>
  add_osm_feature(key = "amenity", value = c("restaurant", "fast_food"))

osm_halal_amenity <- osmdata_sf(q_halal_amenity)

amen_pts   <- osm_halal_amenity$osm_points
amen_polys <- osm_halal_amenity$osm_polygons

# 2) Query shops: supermarkets, convenience, butchers
q_halal_shop <- opq(bbox = nyc_bbox) |>
  add_osm_feature(key = "shop", value = c("supermarket", "convenience", "butcher"))

osm_halal_shop <- osmdata_sf(q_halal_shop)

shop_pts   <- osm_halal_shop$osm_points
shop_polys <- osm_halal_shop$osm_polygons

# 3) Filtering function: look for halal cues in name or cuisine
filter_halal_sf <- function(x) {
  if (is.null(x) || nrow(x) == 0) return(NULL)
  
  x |>
    dplyr::mutate(
      name_lc    = tolower(as.character(name)),
      cuisine_lc = tolower(as.character(cuisine))
    ) |>
    dplyr::filter(
      stringr::str_detect(name_lc, "halal|islamic|muslim|zabiha|zabihah") |
        stringr::str_detect(cuisine_lc, "halal")
    )
}

amen_pts_f   <- filter_halal_sf(amen_pts)
amen_polys_f <- filter_halal_sf(amen_polys)
shop_pts_f   <- filter_halal_sf(shop_pts)
shop_polys_f <- filter_halal_sf(shop_polys)

# 4) Convert polygons to centroids (reuse poly_to_centroids() defined above)

amen_poly_centroids  <- poly_to_centroids(amen_polys_f)
shop_poly_centroids  <- poly_to_centroids(shop_polys_f)

# 5) Combine all halal candidate locations as sf
halal_list <- list(
  amen_pts_f,
  amen_poly_centroids,
  shop_pts_f,
  shop_poly_centroids
) |>
  purrr::keep(~ !is.null(.x) && nrow(.x) > 0)

if (length(halal_list) > 0) {
  # Find common columns across all sf objects
  # (point and polygon layers carry different OSM tag columns, so rbind() needs a shared set)
  common_cols <- Reduce(intersect, lapply(halal_list, names))
  
  # Subset each to the common columns, then row-bind
  halal_all <- do.call(
    rbind,
    lapply(halal_list, function(x) x[, common_cols, drop = FALSE])
  )
  
  # Transform to tract CRS
  halal_all <- halal_all |>
    sf::st_transform(2263)
  
  # Drop duplicate geometries
  geom_txt <- sf::st_as_text(sf::st_geometry(halal_all))
  halal_all <- halal_all[!duplicated(geom_txt), ]
  
  # 6) Assign each halal business to a tract
  halal_tract <- sf::st_join(
    halal_all,
    tract_pop |> dplyr::select(GEOID, geometry),
    join = sf::st_within,
    left = FALSE
  )
  
  # 7) Count per tract
  halal_counts <- halal_tract |>
    sf::st_drop_geometry() |>
    dplyr::count(GEOID, name = "halal_count")
  
} else {
  message("No halal-related features returned from OSM using current filters.")
  halal_counts <- dplyr::tibble(GEOID = character(), halal_count = integer())
}

# 8) Construct halal density per 10K residents for all tracts
halal_density <- tract_pop |>
  sf::st_drop_geometry() |>
  dplyr::select(GEOID, total_pop) |>
  dplyr::left_join(halal_counts, by = "GEOID") |>
  dplyr::mutate(
    halal_count = dplyr::coalesce(halal_count, 0L),
    halal_density_per_10k = dplyr::if_else(
      total_pop > 0,
      halal_count / (total_pop / 10000),
      0
    )
  )

# 9) Join halal density onto PCI + mosque object
pci_halal <- pci_mosques |>
  dplyr::left_join(
    halal_density |>
      dplyr::select(GEOID, halal_count, halal_density_per_10k),
    by = "GEOID"
  )
nrow(halal_all)
## [1] 145
summary(pci_halal$halal_density_per_10k)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0000   0.2728   0.0000 263.1579
head(unique(tolower(halal_all$name)), 40)
##  [1] "no pork halal kitchen"               "halalbee's"                         
##  [3] "halal dynasty"                       "alnoor halal deli"                  
##  [5] "just halal"                          "shah's halal food"                  
##  [7] "cajun halal platters"                "halal chicks"                       
##  [9] "halal bitez"                         "crafted halal"                      
## [11] "khalil halal chinese"                "urban halal grill"                  
## [13] "halal munchies"                      "barakah's halal"                    
## [15] "medo halal food"                     "the halal guys"                     
## [17] "halal"                               "un halal"                           
## [19] "halal mix"                           "numidia halal food"                 
## [21] "hillside express halal food"         "tanjawi halal sandwich shop"        
## [23] "halal boyz"                          "mr. chang halal chinese"            
## [25] "bakhter halal food cart"             "halal city"                         
## [27] "halal bros"                          "express halal food"                 
## [29] "bellrose halal meat"                 "no 1 halal kitchen"                 
## [31] "halal xpress asian fusion"           "mia halal food"                     
## [33] "halal grill eats"                    "hooda halal"                        
## [35] "halal cart"                          "sammy's halal food"                 
## [37] "halal food"                          "halal guys"                         
## [39] "sammy’s halal"                       "zinger halal express chicken burger"
cor.test(
  pci_halal$PCI,
  pci_halal$halal_density_per_10k,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  pci_halal$PCI and pci_halal$halal_density_per_10k
## S = 1956829080, p-value = 0.0009921
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.06821637
ggplot(pci) +
  geom_sf(aes(fill = PCI), color = "grey85", linewidth = 0.05) +
  scale_fill_viridis_c(
    option    = "plasma",
    direction = 1,
    name      = "PCI (relative)\nHigher = stronger proxy"
  ) +
  labs(
    title    = "Proxy Concentration Index (PCI) Across NYC Census Tracts",
    subtitle = "Higher PCI values indicate neighborhoods with stronger Muslim settlement proxies",
    caption  = "Source: 2018–2022 ACS (birthplace + ancestry proxies)"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text        = element_blank(),
    axis.title       = element_blank(),
    plot.title       = element_text(face = "bold", size = 13),
    legend.position  = "right",
    legend.title     = element_text(size = 10),
    legend.text      = element_text(size = 9)
  )

mosque_xy <- mosque_all |>
  st_transform(2263)

ggplot() +
  geom_sf(data = pci, aes(fill = PCI), color = "grey85", linewidth = 0.05) +
  geom_sf(
    data  = mosque_xy,
    color = "deepskyblue3",
    size  = 1.2,
    alpha = 0.8
  ) +
  scale_fill_viridis_c(
    option    = "plasma",
    direction = 1,
    name      = "PCI"
  ) +
  labs(
    title    = "Mosque Locations Overlaid on the Proxy Concentration Index (PCI)",
    subtitle = "Mosques tend to cluster in neighborhoods where PCI is higher",
    caption  = "Mosques from OpenStreetMap; PCI from ACS proxies"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text        = element_blank(),
    axis.title       = element_blank(),
    plot.title       = element_text(face = "bold", size = 13),
    legend.position  = "right",
    legend.title     = element_text(size = 10),
    legend.text      = element_text(size = 9)
  )

halal_xy <- halal_all |>
  st_transform(2263)

ggplot() +
  geom_sf(data = pci, aes(fill = PCI), color = "grey85", linewidth = 0.05) +
  geom_sf(
    data  = halal_xy,
    color = "goldenrod2",
    size  = 1.1,
    alpha = 0.7
  ) +
  scale_fill_viridis_c(
    option    = "plasma",
    direction = 1,
    name      = "PCI"
  ) +
  labs(
    title    = "Halal Businesses Overlaid on the Proxy Concentration Index (PCI)",
    subtitle = "Halal food and retail clusters appear in some—but not all—high-PCI neighborhoods",
    caption  = "Halal-related establishments from OpenStreetMap; PCI from ACS proxies"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text        = element_blank(),
    axis.title       = element_blank(),
    plot.title       = element_text(face = "bold", size = 13),
    legend.position  = "right",
    legend.title     = element_text(size = 10),
    legend.text      = element_text(size = 9)
  )