Data Importing and Loading Packages:

require(vegan)
## Loading required package: vegan
## Warning: package 'vegan' was built under R version 4.4.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.4.3
## Loading required package: lattice
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
require(sf)
## Loading required package: sf
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
require(mapview)
## Loading required package: mapview
## Warning: package 'mapview' was built under R version 4.4.3
require(magrittr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(dplyr)
library(ggplot2)

#Reading soil data for each borough, ensuring a uniform corrdinate reference system
bronxsoil<-st_read("NY005bronx/spatial/soilmu_a_ny005.shp")
## Reading layer `soilmu_a_ny005' from data source 
##   `C:\Users\Kami\Downloads\soil R project\NY005bronx\spatial\soilmu_a_ny005.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1407 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.9344 ymin: 40.7854 xmax: -73.74844 ymax: 40.91754
## Geodetic CRS:  WGS 84
bronxsoil_crs<-st_transform(bronxsoil,4326)
view(bronxsoil_crs)


queenssoil<-st_read("NY081queens/spatial/soilmu_a_ny081.shp")
## Reading layer `soilmu_a_ny081' from data source 
##   `C:\Users\Kami\Downloads\soil R project\NY081queens\spatial\soilmu_a_ny081.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2540 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.04372 ymin: 40.50893 xmax: -73.70021 ymax: 40.81217
## Geodetic CRS:  WGS 84
queenssoil_crs<-st_transform(queenssoil,4326)

manhattansoil<-st_read("NY061newyork/spatial/soilmu_a_ny061.shp")
## Reading layer `soilmu_a_ny061' from data source 
##   `C:\Users\Kami\Downloads\soil R project\NY061newyork\spatial\soilmu_a_ny061.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 636 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.03539 ymin: 40.68086 xmax: -73.90672 ymax: 40.88193
## Geodetic CRS:  WGS 84
manhattansoil_crs<-st_transform(manhattansoil,4326)

statensoil<-st_read("NY085richmond/spatial/soilmu_a_ny085.shp")
## Reading layer `soilmu_a_ny085' from data source 
##   `C:\Users\Kami\Downloads\soil R project\NY085richmond\spatial\soilmu_a_ny085.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2364 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25897 ymin: 40.47664 xmax: -74.03463 ymax: 40.6515
## Geodetic CRS:  WGS 84
statensoil_crs<-st_transform(statensoil,4326)

brooklynsoil<-st_read("NY047kings/spatial/soilmu_a_ny047.shp")
## Reading layer `soilmu_a_ny047' from data source 
##   `C:\Users\Kami\Downloads\soil R project\NY047kings\spatial\soilmu_a_ny047.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1106 features and 4 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.05669 ymin: 40.55034 xmax: -73.83292 ymax: 40.73936
## Geodetic CRS:  WGS 84
brooklynsoil_crs<-st_transform(brooklynsoil,4326)

# vector of fips code and borough names
borough_names <- c(
  "NY005" = "Bronx",
  "NY047" = "Brooklyn",
  "NY061" = "Manhattan",
  "NY081" = "Queens",
  "NY085" = "Staten Island"
)

#renaming the fips code to common borough names
bronxsoil_crs <- bronxsoil_crs %>%
  mutate(AREASYMBOL = recode(AREASYMBOL, !!!borough_names))
queenssoil_crs <- queenssoil_crs %>%
  mutate(AREASYMBOL = recode(AREASYMBOL, !!!borough_names))
manhattansoil_crs <- manhattansoil_crs %>%
  mutate(AREASYMBOL = recode(AREASYMBOL, !!!borough_names))
statensoil_crs <- statensoil_crs %>%
  mutate(AREASYMBOL = recode(AREASYMBOL, !!!borough_names))
brooklynsoil_crs <- brooklynsoil_crs %>%
  mutate(AREASYMBOL = recode(AREASYMBOL, !!!borough_names))

# making them validd
bronx_valid <- st_make_valid(bronxsoil_crs)
queens_valid <- st_make_valid(queenssoil_crs)
manhattan_valid <- st_make_valid(manhattansoil_crs)
staten_valid <- st_make_valid(statensoil_crs)
brooklyn_valid <- st_make_valid(brooklynsoil_crs)

# combining data using rbind function
bronx_queens <- rbind(bronx_valid, queens_valid, manhattan_valid, staten_valid, brooklyn_valid)


#Examples of how the soil classes look in Queens and Bronx
plot(bronxsoil_crs["MUSYM"], main= "Soil Classes in the Bronx")

plot(queenssoil_crs["MUSYM"], main= "Soil Classes in Queens")

#visualizing combined data
plot(bronx_queens["MUSYM"])

Calculating Diversity Indices by Borough

# calculating area of each soil class and total soil area by borough 
soilarea_boro <- bronx_queens %>%
  mutate(soil_area = st_area(.)) %>% # core area calculation
  st_drop_geometry() %>%
  group_by(AREASYMBOL, MUSYM) %>%
  summarize(total_soil_area = sum(soil_area, na.rm = TRUE)) %>% # this too
  ungroup()
## `summarise()` has grouped output by 'AREASYMBOL'. You can override using the
## `.groups` argument.
# calculating Shannon-wiener index for each borough
shannon_borough<- soilarea_boro %>%
  group_by(AREASYMBOL) %>%
  summarize(shannon_diversity = diversity(total_soil_area, index = "shannon"))

# showing table/tibble
print("Shannon-Wiener Diversity Index per Borough:")
## [1] "Shannon-Wiener Diversity Index per Borough:"
print(shannon_borough)
## # A tibble: 5 × 2
##   AREASYMBOL    shannon_diversity
##   <chr>                     <dbl>
## 1 Bronx                      2.99
## 2 Brooklyn                   2.94
## 3 Manhattan                  2.74
## 4 Queens                     3.20
## 5 Staten Island              2.79
# making Choropleth map
borough_divmap <- bronx_queens %>%
  left_join(shannon_borough, by = "AREASYMBOL")


ggplot(borough_divmap) +
  geom_sf(aes(fill = shannon_diversity)) +
  scale_fill_viridis_c(option = "viridis", name = "Shannon Diversity") +
  labs(title = "Shannon-Wiener Soil Diversity Index in NYC Boroughs") +
  theme_minimal()

# simpsons index calculation
simpson_borough <- soilarea_boro %>%
  group_by(AREASYMBOL) %>%
  summarize(simpson_diversity = diversity(total_soil_area, index = "simpson"))

# showing table/tibble for simpson
print("Simpson's Diversity Index per Borough:")
## [1] "Simpson's Diversity Index per Borough:"
print(simpson_borough)
## # A tibble: 5 × 2
##   AREASYMBOL    simpson_diversity
##   <chr>                     <dbl>
## 1 Bronx                     0.883
## 2 Brooklyn                  0.899
## 3 Manhattan                 0.874
## 4 Queens                    0.903
## 5 Staten Island             0.789
# 
borough_divmap_simpson <- borough_divmap %>%
  left_join(simpson_borough, by = "AREASYMBOL")

ggplot(borough_divmap_simpson) +
  geom_sf(aes(fill = simpson_diversity)) +
  scale_fill_viridis_c(option = "cividis", name = "Simpson Diversity") +
  labs(title = "Simpson Soil Diversity Index in NYC Boroughs") +
  theme_minimal()

## Visualizing Simpson’s and Shannon-Wiener Soil Diversity Index per Census Block

# uploading census blocks
census_blks <- "C:/Users/Kami/Downloads/soil R project/nycb2020wi_25a/nycb2020wi_25a/nycb2020wi.shp" 
census_blocks <- st_read(census_blks)
## Reading layer `nycb2020wi' from data source 
##   `C:\Users\Kami\Downloads\soil R project\nycb2020wi_25a\nycb2020wi_25a\nycb2020wi.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 37984 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 912287.1 ymin: 113279.3 xmax: 1067383 ymax: 273617.8
## Projected CRS: NAD83 / New York Long Island (ftUS)
blocks_transform <- st_transform(census_blocks, crs = st_crs(bronx_queens)) # uniform crs
blocksrvalid<-st_make_valid(blocks_transform)

# spatial join
soil_block <- st_join(blocksrvalid, bronx_queens, join = st_intersects)

soil_areablock <- soil_block %>%
  mutate(soil_area = st_area(.)) %>%
  st_drop_geometry() %>%
  group_by(GEOID, AREASYMBOL, MUSYM) %>% # Keep AREASYMBOL here
  summarize(tot_soilblock = sum(soil_area, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'GEOID', 'AREASYMBOL'. You can override
## using the `.groups` argument.
# shannons diversity per census block
shannon_block <- soil_areablock %>%
  group_by(GEOID, AREASYMBOL) %>% # Group by borough as well
  summarize(shannon_diversity = diversity(tot_soilblock, index = "shannon")) %>%
  rename(Borough = AREASYMBOL) # Rename for clarity
## `summarise()` has grouped output by 'GEOID'. You can override using the
## `.groups` argument.
# simpsons diversity per census block
simpson_block <- soil_areablock %>%
  group_by(GEOID, AREASYMBOL) %>% # Group by borough as well
  summarize(simpson_diversity = diversity(tot_soilblock, index = "simpson")) %>%
  rename(Borough = AREASYMBOL) # Rename for clarity
## `summarise()` has grouped output by 'GEOID'. You can override using the
## `.groups` argument.
# joining indices to census blocks
census_diversity_block <- blocks_transform %>%
  left_join(shannon_block, by = "GEOID") %>%
  left_join(simpson_block, by = "GEOID") %>%
  replace_na(list(shannon_diversity = 0, simpson_diversity = 0)) # Handle blocks with no soil data
## Warning in sf_column %in% names(g): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 21 of `x` matches multiple rows in `y`.
## ℹ Row 15324 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Making choropleth map of Shannon per cb
ggplot(census_diversity_block) +
  geom_sf(aes(fill = shannon_diversity), color = NA) +
  scale_fill_viridis_c(option = "viridis", name = "Shannon Diversity") +
  labs(title = "Shannon-Wiener Soil Diversity Index per Census Block") +
  theme_minimal()

# Making choropleth map of Simpson per cb
ggplot(census_diversity_block) +
  geom_sf(aes(fill = simpson_diversity), color = NA) + 
  scale_fill_viridis_c(option = "viridis", name = "Simpson Diversity") +
  labs(title = "Simpson Soil Diversity Index per Census Block") +
  theme_minimal()

#just to see the max and min

print("Shannon's Diversity Index")
## [1] "Shannon's Diversity Index"
summary(census_diversity_block$shannon_diversity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.6931  0.5326  0.6931  3.2916
print("Simpson's Diversity Index")
## [1] "Simpson's Diversity Index"
summary(census_diversity_block$simpson_diversity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.5000  0.3416  0.5000  0.9568

Comparing Indices and Finding the Number of Soil Types in each borough

diversity_longg <- shannon_borough %>%
  rename(Shannon = shannon_diversity) %>%
  left_join(simpson_borough %>% rename(Simpson = simpson_diversity), by = "AREASYMBOL") %>%
  pivot_longer(cols = c(Shannon, Simpson), names_to = "Index", values_to = "Value")
# barplot
ggplot(diversity_longg, aes(x = AREASYMBOL, y = Value, fill = Index)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Comparison of Soil Diversity Indices of NYC Boroughs",
       x = "AREASYMBOL", y = "Index Value") +
  theme_minimal() +
  scale_fill_viridis_d()

#finding num of unique soils per boro
unique_soils <- bronx_queens %>%
  group_by(AREASYMBOL) %>%
  summarize(unique_soil_count = n_distinct(MUSYM))

# showing results
print("Number of Unique Soil Classes per Borough:")
## [1] "Number of Unique Soil Classes per Borough:"
print(unique_soils)
## Simple feature collection with 5 features and 2 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25897 ymin: 40.47664 xmax: -73.70021 ymax: 40.91754
## Geodetic CRS:  WGS 84
## # A tibble: 5 × 3
##   AREASYMBOL    unique_soil_count                                       geometry
##   <chr>                     <int>                                  <POLYGON [°]>
## 1 Bronx                       126 ((-73.86299 40.90147, -73.86632 40.90242, -73…
## 2 Brooklyn                    117 ((-73.83532 40.60548, -73.83423 40.60677, -73…
## 3 Manhattan                    69 ((-73.9071 40.87624, -73.90738 40.87631, -73.…
## 4 Queens                      145 ((-73.72447 40.72449, -73.72346 40.72481, -73…
## 5 Staten Island               138 ((-74.15793 40.64365, -74.16142 40.64403, -74…
# calculating area of each soil class overall
soilarea_overall <- bronx_queens %>%
  mutate(soil_area = st_area(.)) %>%
  st_drop_geometry() %>%
  group_by(MUSYM) %>%
  summarize(total_area = sum(soil_area, na.rm = TRUE)) %>%
  ungroup()

# calculating the total soil area in all nyc
total_area_summary <- soilarea_overall %>%
  summarize(grnd_total_area = sum(total_area, na.rm = TRUE))

# getting the grand total area value
grnd_total_area <- total_area_summary$grnd_total_area

# Calculate percent coverage and keeping top 10
percent_coverall <- soilarea_overall %>%
  mutate(percent_cover = (total_area / grnd_total_area) * 100) %>%
  arrange(desc(percent_cover)) %>% # Arrange by percent coverage in descending order
  top_n(10, percent_cover) # Select the top 10

# resulting table
print("Percent Coverage (in meters squared) of Top 10 Soil Classes in NYC:")
## [1] "Percent Coverage (in meters squared) of Top 10 Soil Classes in NYC:"
print(percent_coverall)
## # A tibble: 10 × 3
##    MUSYM total_area percent_cover
##    <chr>      [m^2]           [1]
##  1 W     336509647.         29.4 
##  2 UFA   109008249.          9.52
##  3 UGB    72478070.          6.33
##  4 UGA    69461946.          6.07
##  5 UtA    60007200.          5.24
##  6 UoA    43492637.          3.80
##  7 UtB    36476694.          3.19
##  8 UmA    34470520.          3.01
##  9 UGBl   26854370.          2.35
## 10 Nag2   23149300.          2.02

Preparing Parks Data

# soil data into census blocks
soil_block <- st_join(blocksrvalid, bronx_queens, join = st_intersects)

# upload park data
parks_upload <- "C:/Users/Kami/Downloads/soil R project/Parks Properties_20250514/geo_export_1e52c339-338f-41fa-bbc2-2acaf4b9e25f.shp"
parks <- st_read(parks_upload)
## Reading layer `geo_export_1e52c339-338f-41fa-bbc2-2acaf4b9e25f' from data source `C:\Users\Kami\Downloads\soil R project\Parks Properties_20250514\geo_export_1e52c339-338f-41fa-bbc2-2acaf4b9e25f.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2054 features and 34 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25609 ymin: 40.49449 xmax: -73.70905 ymax: 40.91133
## Geodetic CRS:  WGS 84
parks_transform <- st_transform(parks, crs = st_crs(blocks_transform)) # uniform crs

# park geometries are valid
parks_valid <- st_make_valid(parks_transform)
parks_centroids <- st_centroid(parks_valid)
## Warning: st_centroid assumes attributes are constant over geometries

Bivariate map of Shannon’s Diversity and Parks Density

#spatial join wasnt working so i resorted to this
intersection <- st_intersects(blocksrvalid, parks_centroids)

parkslist <- lengths(intersection)

parkblocks <- tibble(
    GEOID = blocksrvalid$GEOID,
    n_parks = parkslist
)
   

# joining park count to the census div data

census_divparks <- census_diversity_block %>%
  left_join(parkblocks, by = "GEOID") %>%
  replace_na(list(n_parks = 0)) 


# creating categories for shannon diversity using quantiles
census_divparks <- census_divparks %>%
  mutate(shannon_cat = case_when(
    shannon_diversity <= quantile(shannon_diversity, 1/3, na.rm = TRUE) ~ "Low",
    shannon_diversity <= quantile(shannon_diversity, 2/3, na.rm = TRUE) ~ "Medium",
    TRUE ~ "High"
  ))

# creating categories of park density using quantiles again
census_divparks <- census_divparks %>%
  mutate(parks_cat = case_when(
    n_parks <= quantile(n_parks, 1/3, na.rm = TRUE) ~ "Low",
    n_parks <= quantile(n_parks, 2/3, na.rm = TRUE) ~ "Medium",
    TRUE ~ "High"
  ))

# combining/creating a bivariate categories
census_divparks <- census_divparks %>%
  mutate(bivariate_cat = interaction(shannon_cat, parks_cat, sep = "-"))

bivariate_colors_yg <- c(
  "Low-Low" = "#ffffcc",
  "Medium-Low" = "#d9f0a3", 
  "High-Low" = "#addd8e",  
  "Low-Medium" = "#f7fcb9",
  "Medium-Medium" = "#d9f0a3", 
  "High-Medium" = "#74c476", 
  "Low-High" = "#addd8e",  
  "Medium-High" = "#41ab5d", 
  "High-High" = "#238443"  
)

# final bivariate map
print(ggplot() +
  geom_sf(data = census_divparks, aes(fill = bivariate_cat), color = NA) +
  scale_fill_manual(values = bivariate_colors_yg, name = "Shannon to Parks") +
  labs(
    title = "Bivariate Map of Shannon Diversity and Park Density in NYC per Census Block",
    fill = "Shannon - Parks (Low-Med-High)"
  ) +
  theme_minimal())