Looking at census data from 2022.
Picked 2022 because:
Why is this data important?
REVISIT ME:
#CENSUS_API_KEY is in environment
nyc_counties <- c("New York", "Kings", "Queens", "Bronx", "Richmond")
# Entire 2022 5-Year ACS variable dictionary
acs_vars_2022 <- load_variables(2022, "acs5", cache = TRUE)
# MISC.
# Exploring different potential features
feature_exploration <- acs_vars_2022 %>%
filter(
str_detect(str_to_lower(label), "public transportation|bicycle|walked|worked from home") |
str_detect(str_to_lower(concept), "means of transportation to work|travel time")
)
# Comment Me out before final knit
#View(feature_exploration)
# Define all API variables explicitly
vars <- c(
# Base Demographics
no_car = "B08201_002",
med_income = "B19013_001",
med_age = "B01002_001",
total_pop = "B01003_001",
# Transportation Base & Features
tot_workers = "B08301_001",
transit = "B08301_010",
walk = "B08301_019",
wfh = "B08301_021",
# Travel Time Features
commute_60 = "B08303_012",
commute_90 = "B08303_013",
# Education Base & Features
tot_pop_25 = "B15003_001",
bachelors = "B15003_022",
# Occupation Base & The 5 Major Buckets
tot_employ = "C24010_001",
mngmt_job = "C24010_002",
service_job = "C24010_014",
sales_job = "C24010_021",
const_job = "C24010_030",
trans_job = "C24010_034",
# Continuous Age Spectrum (Table B01001 - Detailed Age)
# 18 to 24
m_18_19="B01001_007", m_20="B01001_008", m_21="B01001_009", m_22_24="B01001_010",
f_18_19="B01001_031", f_20="B01001_032", f_21="B01001_033", f_22_24="B01001_034",
# 25 to 34
m_25_29="B01001_011", m_30_34="B01001_012", f_25_29="B01001_035", f_30_34="B01001_036",
# 35 to 44
m_35_39="B01001_013", m_40_44="B01001_014", f_35_39="B01001_037", f_40_44="B01001_038",
# 45 to 64
m_45_49="B01001_015", m_50_54="B01001_016", m_55_59="B01001_017", m_60_61="B01001_018", m_62_64="B01001_019",
f_45_49="B01001_039", f_50_54="B01001_040", f_55_59="B01001_041", f_60_61="B01001_042", f_62_64="B01001_043",
# 65 Plus
m_65_66="B01001_020", m_67_69="B01001_021", m_70_74="B01001_022", m_75_79="B01001_023", m_80_84="B01001_024", m_85_up="B01001_025",
f_65_66="B01001_044", f_67_69="B01001_045", f_70_74="B01001_046", f_75_79="B01001_047", f_80_84="B01001_048", f_85_up="B01001_049",
# Commercial vs. Residential Proxy (Housing)
tot_housing = "B25002_001",
vacant_unit = "B25002_003",
tot_occup = "B25003_001",
renter_unit = "B25003_003",
# Economic Barrier Spectrum (Poverty)
tot_pov_base = "B17001_001",
pov_below = "B17001_002"
)
# Getting from census data
nyc_tracts <- get_acs(
geography = "tract",
variables = vars,
state = "NY",
county = nyc_counties,
geometry = TRUE,
year = 2022,
output = "wide"
)
## Getting data from the 2018-2022 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## | | | 0% | |= | 1% | |= | 2% | |== | 3% | |=== | 4% | |==== | 5% | |==== | 6% | |===== | 7% | |===== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======== | 11% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 16% | |============ | 17% | |============= | 18% | |============= | 19% | |============== | 20% | |=============== | 21% | |=============== | 22% | |================ | 23% | |================= | 24% | |================== | 25% | |================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |===================== | 29% | |===================== | 30% | |====================== | 31% | |====================== | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |========================= | 35% | |========================= | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 39% | |============================ | 40% | |============================= | 41% | |============================= | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 47% | |================================== | 48% | |================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |===================================== | 52% | |===================================== | 53% | |====================================== | 54% | |====================================== | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |========================================= | 58% | |========================================= | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================= | 64% | |============================================= | 65% | |============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 70% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 83% | |=========================================================== | 84% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 90% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 93% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 97% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 100%
nyc_tracts <- nyc_tracts %>%
# Explicitly rename every variable
select(
GEOID, NAME, geometry,
# Base Demographics
no_car = no_carE,
med_income = med_incomeE,
med_age = med_ageE,
total_pop = total_popE,
tot_workers = tot_workersE,
tot_pop_25 = tot_pop_25E,
tot_employ = tot_employE,
# Transportation & Travel Time
transit = transitE,
walk = walkE,
wfh = wfhE,
commute_60 = commute_60E,
commute_90 = commute_90E,
# Education & Occupation
bachelors = bachelorsE,
mngmt_job = mngmt_jobE,
service_job = service_jobE,
sales_job = sales_jobE,
const_job = const_jobE,
trans_job = trans_jobE,
# Age - Males
m_18_19 = m_18_19E, m_20 = m_20E, m_21 = m_21E, m_22_24 = m_22_24E,
m_25_29 = m_25_29E, m_30_34 = m_30_34E,
m_35_39 = m_35_39E, m_40_44 = m_40_44E,
m_45_49 = m_45_49E, m_50_54 = m_50_54E, m_55_59 = m_55_59E, m_60_61 = m_60_61E, m_62_64 = m_62_64E,
m_65_66 = m_65_66E, m_67_69 = m_67_69E, m_70_74 = m_70_74E, m_75_79 = m_75_79E, m_80_84 = m_80_84E, m_85_up = m_85_upE,
# Age - Females
f_18_19 = f_18_19E, f_20 = f_20E, f_21 = f_21E, f_22_24 = f_22_24E,
f_25_29 = f_25_29E, f_30_34 = f_30_34E,
f_35_39 = f_35_39E, f_40_44 = f_40_44E,
f_45_49 = f_45_49E, f_50_54 = f_50_54E, f_55_59 = f_55_59E, f_60_61 = f_60_61E, f_62_64 = f_62_64E,
f_65_66 = f_65_66E, f_67_69 = f_67_69E, f_70_74 = f_70_74E, f_75_79 = f_75_79E, f_80_84 = f_80_84E, f_85_up = f_85_upE,
# Housing & Poverty
tot_housing = tot_housingE,
vacant_unit = vacant_unitE,
tot_occup = tot_occupE,
renter_unit = renter_unitE,
tot_pov_base = tot_pov_baseE,
pov_below = pov_belowE
) %>%
# Feature Engineering: Convert raw counts to percentages
mutate(
pct_no_car = (no_car / total_pop) * 100,
pct_transit = (transit / tot_workers) * 100,
pct_walk = (walk / tot_workers) * 100,
pct_wfh = (wfh / tot_workers) * 100,
pct_commute_60p = ((commute_60 + commute_90) / tot_workers) * 100,
pct_bachelors = (bachelors / tot_pop_25) * 100,
pct_mngmt = (mngmt_job / tot_employ) * 100,
pct_service = (service_job / tot_employ) * 100,
pct_sales = (sales_job / tot_employ) * 100,
pct_const = (const_job / tot_employ) * 100,
pct_trans = (trans_job / tot_employ) * 100,
area_sq_miles = as.numeric(st_area(geometry)) / 2589988,
pop_density = total_pop / area_sq_miles,
# Continuous Age Spectrum
pct_age_18_24 = ((m_18_19 + m_20 + m_21 + m_22_24 + f_18_19 + f_20 + f_21 + f_22_24) / total_pop) * 100,
pct_age_25_34 = ((m_25_29 + m_30_34 + f_25_29 + f_30_34) / total_pop) * 100,
pct_age_35_44 = ((m_35_39 + m_40_44 + f_35_39 + f_40_44) / total_pop) * 100,
pct_age_45_64 = ((m_45_49 + m_50_54 + m_55_59 + m_60_61 + m_62_64 + f_45_49 + f_50_54 + f_55_59 + f_60_61 + f_62_64) / total_pop) * 100,
pct_age_65_plus = ((m_65_66 + m_67_69 + m_70_74 + m_75_79 + m_80_84 + m_85_up + f_65_66 + f_67_69 + f_70_74 + f_75_79 + f_80_84 + f_85_up) / total_pop) * 100,
# Commercial/Residential Proxy
pct_vacant = (vacant_unit / tot_housing) * 100,
pct_renter = (renter_unit / tot_occup) * 100,
# Economic Barrier
pct_poverty = (pov_below / tot_pov_base) * 100
) %>%
# Explicitly drop the raw counts so they don't leak into the model
select(
-no_car, -transit, -walk, -wfh, -commute_60, -commute_90, -bachelors,
-mngmt_job, -service_job, -sales_job, -const_job, -trans_job,
-tot_workers, -tot_pop_25, -tot_employ,
-m_18_19, -m_20, -m_21, -m_22_24, -m_25_29, -m_30_34, -m_35_39, -m_40_44,
-m_45_49, -m_50_54, -m_55_59, -m_60_61, -m_62_64, -m_65_66, -m_67_69, -m_70_74, -m_75_79, -m_80_84, -m_85_up,
-f_18_19, -f_20, -f_21, -f_22_24, -f_25_29, -f_30_34, -f_35_39, -f_40_44,
-f_45_49, -f_50_54, -f_55_59, -f_60_61, -f_62_64, -f_65_66, -f_67_69, -f_70_74, -f_75_79, -f_80_84, -f_85_up,
-tot_housing, -vacant_unit, -tot_occup, -renter_unit, -tot_pov_base, -pov_below
)
## Checking for total NAs and NaNs in each column
colSums(is.na(nyc_tracts %>% st_drop_geometry())) %>%
kable(caption = "# of NAs and NaNs in each column")
| x | |
|---|---|
| GEOID | 0 |
| NAME | 0 |
| med_income | 126 |
| med_age | 88 |
| total_pop | 0 |
| pct_no_car | 85 |
| pct_transit | 90 |
| pct_walk | 90 |
| pct_wfh | 90 |
| pct_commute_60p | 90 |
| pct_bachelors | 85 |
| pct_mngmt | 90 |
| pct_service | 90 |
| pct_sales | 90 |
| pct_const | 90 |
| pct_trans | 90 |
| area_sq_miles | 0 |
| pop_density | 3 |
| pct_age_18_24 | 85 |
| pct_age_25_34 | 85 |
| pct_age_35_44 | 85 |
| pct_age_45_64 | 85 |
| pct_age_65_plus | 85 |
| pct_vacant | 93 |
| pct_renter | 96 |
| pct_poverty | 90 |
## Percentage of missing data
# (is it a low enough percent to be negligible)
missing_summary <- nyc_tracts %>%
st_drop_geometry() %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "Feature", values_to = "Missing_Count") %>%
mutate(
Total_Rows = nrow(nyc_tracts),
Pct_Missing = (Missing_Count / Total_Rows) * 100,
# Flag if the missing data is below 5%
Is_Negligible_Under_5Pct = Pct_Missing < 5
) %>%
# Filter to only show features that actually have missing data
filter(Missing_Count > 0) %>%
arrange(desc(Pct_Missing))
# Output Percentage of Missing Data
kable(missing_summary, digits = 2, caption = "Missing Data Percentage by Feature")
| Feature | Missing_Count | Total_Rows | Pct_Missing | Is_Negligible_Under_5Pct |
|---|---|---|---|---|
| med_income | 126 | 2327 | 5.41 | FALSE |
| pct_renter | 96 | 2327 | 4.13 | TRUE |
| pct_vacant | 93 | 2327 | 4.00 | TRUE |
| pct_transit | 90 | 2327 | 3.87 | TRUE |
| pct_walk | 90 | 2327 | 3.87 | TRUE |
| pct_wfh | 90 | 2327 | 3.87 | TRUE |
| pct_commute_60p | 90 | 2327 | 3.87 | TRUE |
| pct_mngmt | 90 | 2327 | 3.87 | TRUE |
| pct_service | 90 | 2327 | 3.87 | TRUE |
| pct_sales | 90 | 2327 | 3.87 | TRUE |
| pct_const | 90 | 2327 | 3.87 | TRUE |
| pct_trans | 90 | 2327 | 3.87 | TRUE |
| pct_poverty | 90 | 2327 | 3.87 | TRUE |
| med_age | 88 | 2327 | 3.78 | TRUE |
| pct_no_car | 85 | 2327 | 3.65 | TRUE |
| pct_bachelors | 85 | 2327 | 3.65 | TRUE |
| pct_age_18_24 | 85 | 2327 | 3.65 | TRUE |
| pct_age_25_34 | 85 | 2327 | 3.65 | TRUE |
| pct_age_35_44 | 85 | 2327 | 3.65 | TRUE |
| pct_age_45_64 | 85 | 2327 | 3.65 | TRUE |
| pct_age_65_plus | 85 | 2327 | 3.65 | TRUE |
| pop_density | 3 | 2327 | 0.13 | TRUE |
## Map for missing income data
missing_map_data <- nyc_tracts %>%
mutate(missing_income_flag = ifelse(is.na(med_income), "Missing Data", "Valid Data"))
ggplot(missing_map_data) +
geom_sf(aes(fill = missing_income_flag), color = "white", size = 0.1) +
scale_fill_manual(values = c("Valid Data" = "#e0e0e0", "Missing Data" = "#d73027")) +
theme_void() +
labs(title = "Map of Missing Census Income Data",
fill = "Status")
## Drop tracts with less than 50 people or 0 workers
# There were a ton of tracts with a small number of or 0 people, screwing with denominators and also making for super volatile percentages
nyc_tracts_clean <- nyc_tracts %>%
filter(total_pop >= 50, !is.na(pct_transit))
## Imputing
# There were also many tracts where income or age was missing, but it wasn't due to a denominator issue
## RETURN HERE
## Just doing simple median imputation
## More robust method would be better
## Checking for MNAR and stuff would be better
# Right now
nyc_tracts_clean <- nyc_tracts_clean %>%
# Extract the borough from the NAME string (e.g., "Queens County")
mutate(borough = str_extract(NAME, "(?<=; ).*(?= County)")) %>%
# Group by Borough so the imputation is localized
group_by(borough) %>%
mutate(
med_income = ifelse(is.na(med_income),
median(med_income, na.rm = TRUE),
med_income),
med_age = ifelse(is.na(med_age),
median(med_age, na.rm = TRUE),
med_age)
) %>%
ungroup() %>%
# Drop the temporary borough column
select(-borough)
print(paste("Original tracts:", nrow(nyc_tracts)))
## [1] "Original tracts: 2327"
print(paste("Cleaned tracts of NAs:", nrow(nyc_tracts_clean)))
## [1] "Cleaned tracts of NAs: 2226"
# Save
census_save_path <- file.path(projPath, "data", "nyc_census_tracts.rds")
saveRDS(nyc_tracts_clean, census_save_path)
print(paste("Census features and geometry saved to:", census_save_path))
## [1] "Census features and geometry saved to: C:/Users/cdube/src/DATA_622_Project/data/nyc_census_tracts.rds"
# Peak
head(nyc_tracts_clean)
## Simple feature collection with 6 features and 26 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.91641 ymin: 40.82439 xmax: -73.84726 ymax: 40.90264
## Geodetic CRS: NAD83
## # A tibble: 6 × 27
## GEOID NAME med_income med_age total_pop pct_no_car pct_transit pct_walk
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 36005013500 Cens… 27602 36.5 3125 31.8 48.6 14.5
## 2 36005009200 Cens… 56208 33.9 5959 14.1 38.6 8.07
## 3 36005005400 Cens… 48750 30.5 5715 20.6 64.7 6.20
## 4 36005036501 Cens… 41287 31.8 4196 23.1 60.3 6.61
## 5 36005044902 Cens… 88264 32.4 2146 12.1 37.8 2.43
## 6 36005017500 Cens… 26463 33.9 6116 39.5 69.1 8.69
## # ℹ 19 more variables: pct_wfh <dbl>, pct_commute_60p <dbl>,
## # pct_bachelors <dbl>, pct_mngmt <dbl>, pct_service <dbl>, pct_sales <dbl>,
## # pct_const <dbl>, pct_trans <dbl>, area_sq_miles <dbl>, pop_density <dbl>,
## # pct_age_18_24 <dbl>, pct_age_25_34 <dbl>, pct_age_35_44 <dbl>,
## # pct_age_45_64 <dbl>, pct_age_65_plus <dbl>, pct_vacant <dbl>,
## # pct_renter <dbl>, pct_poverty <dbl>, geometry <MULTIPOLYGON [°]>
#Zip file from below url is downloaded and extracted into repo
#taxi_zones_url <- "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
# Read the shapefile
taxi_zones_sf <- st_read(file.path(projPath, 'taxi_zones', 'taxi_zones', "taxi_zones.shp")) %>%
st_transform(4326) # Transform to standard Lat/Long (WGS84)
## Reading layer `taxi_zones' from data source
## `C:\Users\cdube\src\DATA_622_Project\taxi_zones\taxi_zones\taxi_zones.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120121.9 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
# Lookup table with names and boroughs is downloaded into repo
#zone_lookup_url <- "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
zone_lookup <- read_csv(file.path(projPath, 'taxi_zone_lookup.csv'))
## Rows: 265 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Borough, Zone, service_zone
## dbl (1): LocationID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Join
taxi_zones <- taxi_zones_sf %>%
left_join(zone_lookup, by = c("LocationID" = "LocationID"))
# Peak: Does this map look right
plot(st_geometry(taxi_zones))
TLC Data is from the NYC Taxi and Limousine Commission.
High volume for hire vehicles = Uber/Lyft.
REVISIT ME:
## Aggregate all vehicular trips for October 2023
# Combine Yellow, Green, FHV, and High-Volume FHV (Uber/Lyft)
# See "all" the car-based 'Last-Mile' choices
# URL definitions for source data
tlc_urls <- c(
yellow = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet",
green = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-10.parquet",
#fhv = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2023-10.parquet",
fhvhv = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-10.parquet"
)
# Function to read and count trips by Taxi Zone
# PULocationID can be used to get the # of trips that started from (were picked up in) a zone
count_zone_trips <- function(url) {
read_parquet(url, col_select = any_of(c("PULocationID"))) %>%
count(PULocationID, name = "trips") %>%
filter(!is.na(PULocationID))
}
# Process the sources and sum them up
# Getting number of trips that started per pick up location
# This 'map_dfr' loop downloads each, counts by zone, and stacks them.
total_zone_counts <- tlc_urls %>%
map_dfr(count_zone_trips) %>%
group_by(PULocationID) %>%
summarize(total_vehicular_trips = sum(trips))
The question here is how much of each Census Tract sits inside which Taxi Zone?
This uses EPSG:2263 which uses the units of “feet”, and can be used to calculate the square footage of an area in NY more precisely.
# Use Coordinate Reference System (CRS)
# We'll use EPSG:2263 (NAD83 / New York Long Island) because it uses feet,
# which is much better for calculating area in NYC than Lat/Long.
nyc_tracts_proj <- st_transform(nyc_tracts, 2263)
taxi_zones_proj <- st_transform(taxi_zones, 2263)
# Intersect regions
# This creates a new set of shapes where tracts and zones overlap
intersection <- st_intersection(nyc_tracts_proj, taxi_zones_proj) %>%
mutate(intersect_area = st_area(.))
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
# Calculate the weights
# For every unique Census Tract,
# What % of Tract X's area is inside Taxi Zone Y?
tract_weights <- intersection %>%
as_tibble() %>%
group_by(GEOID) %>%
mutate(total_tract_area = sum(intersect_area)) %>%
ungroup() %>%
mutate(weight = as.numeric(intersect_area / total_tract_area)) %>%
select(GEOID, LocationID, zone, weight)
# Peak Crosswalk
head(tract_weights)
## # A tibble: 6 × 4
## GEOID LocationID zone weight
## <chr> <dbl> <chr> <dbl>
## 1 36081071600 2 Jamaica Bay 0.000628
## 2 36047070203 2 Jamaica Bay 1
## 3 36081107201 2 Jamaica Bay 0.0124
## 4 36081100801 2 Jamaica Bay 0.00106
## 5 36081107202 2 Jamaica Bay 0.989
## 6 36005037000 3 Allerton/Pelham Gardens 0.0000606
# 'Spatial Weights' / Crosswalk
# issue is that taxi zones are larger than census tracts, and we obvs want to use the data together
#
# WHY: Taxi Zones are larger than Census Tracts. We use the weights from Phase 1
# to distribute these trips into our Census Tracts (GEOIDs).
# Assumes 'tract_weights' object exists from your earlier step.
tract_vehicular_estimates <- tract_weights %>%
left_join(total_zone_counts, by = c("LocationID" = "PULocationID")) %>%
mutate(estimated_trips = total_vehicular_trips * weight) %>%
group_by(GEOID) %>%
summarize(total_uber_taxi_trips = sum(estimated_trips, na.rm = TRUE))
# Save the 'Vehicular' side of the Master Table
save_path_tlc <- file.path(projPath, "data", "processed_tlc_counts.rds")
saveRDS(tract_vehicular_estimates, save_path_tlc)
print(paste("Vehicular trip estimates for October 2023 saved to:", save_path_tlc))
## [1] "Vehicular trip estimates for October 2023 saved to: C:/Users/cdube/src/DATA_622_Project/data/processed_tlc_counts.rds"
# Map Station IDs to Census Tracts
# file locations
cb_file1 <- file.path(projPath, "citibike", "202310-citibike-tripdata_1.csv")
cb_file2 <- file.path(projPath, "citibike", "202310-citibike-tripdata_2.csv")
cb_file3 <- file.path(projPath, "citibike", "202310-citibike-tripdata_3.csv")
cb_file4 <- file.path(projPath, "citibike", "202310-citibike-tripdata_4.csv")
cb_files <- c(cb_file1, cb_file2, cb_file3, cb_file4)
# Station-to-Tract Lookup Table
# unique coordinates for every station found in files
all_stations <- cb_files %>%
map_dfr(~read_csv(.x,
col_select = c(start_station_id, start_lat, start_lng),
# Force the station ID to be a character, and coords to be numeric
col_types = cols(
start_station_id = col_character(),
start_lat = col_double(),
start_lng = col_double()
))) %>%
filter(!is.na(start_lat)) %>%
distinct(start_station_id, .keep_all = TRUE)
# Spatial Work: Convert Stations to Points
stations_sf <- st_as_sf(all_stations, coords = c("start_lng", "start_lat"), crs = 4326) %>%
st_transform(2263) # Project to NYC Feet (EPSG:2263)
# Ensure nyc_tracts (from Step 1) matches the projection
nyc_tracts_proj <- st_transform(nyc_tracts, 2263)
# Find which Tract Polygon physically contains each Station Point
station_tract_lookup <- st_join(stations_sf, nyc_tracts_proj, join = st_intersects) %>%
as_tibble() %>%
select(start_station_id, GEOID)
# Count the number of unique stations in each Census Tract
tract_station_counts <- station_tract_lookup %>%
group_by(GEOID) %>%
summarize(total_stations = n()) %>%
filter(!is.na(GEOID))
# Count Trips per Tract
tract_bike_counts <- cb_files %>%
map_dfr(~read_csv(.x,
col_select = c(start_station_id),
col_types = cols(start_station_id = col_character())
)) %>%
left_join(station_tract_lookup, by = "start_station_id") %>%
group_by(GEOID) %>%
summarize(total_bike_trips = n()) %>%
filter(!is.na(GEOID)) %>%
# NEW: Attach the station capacity feature to the trip counts
left_join(tract_station_counts, by = "GEOID")
# Save the final "Label" feature for the Machine Learning model
save_path <- file.path(projPath, "data", "processed_bike_counts.rds")
saveRDS(tract_bike_counts, save_path)
print(paste("Success! Bike counts saved to:", save_path))
## [1] "Success! Bike counts saved to: C:/Users/cdube/src/DATA_622_Project/data/processed_bike_counts.rds"
Source is https://data.ny.gov/Transportation/MTA-Subway-Entrances-and-Exits-2024/i9wp-a4ja/about_data.
File name is “MTA_Subway_Entrances_and_Exits__2024.csv”
## Distance to Subway
# Load csv
subway_csv_path <- file.path(projPath, "MTA_Subway_Entrances_and_Exits__2024.csv")
subway_raw <- read_csv(subway_csv_path) %>%
clean_names() # otherwise this had spaces in the column names
## Rows: 2120 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Division, Line, Borough, Stop Name, Constituent Station Name, GTFS...
## dbl (4): Complex ID, Station ID, Entrance Latitude, Entrance Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Transform into a Spatial Object (sf)
# 'entrance_longitude' is X and 'entrance_latitude' is Y
# 4326 is the standard GPS system (WGS84)
subway_entrances_sf <- subway_raw %>%
filter(!is.na(entrance_latitude) & !is.na(entrance_longitude)) %>%
st_as_sf(coords = c("entrance_longitude", "entrance_latitude"), crs = 4326) %>%
st_transform(2263) # Convert to NYC Feet to match our Census Tracts
# Centroids
# Find the center point of every Census Tract
tract_centers <- st_centroid(nyc_tracts_proj)
## Warning: st_centroid assumes attributes are constant over geometries
# Nearest Entrance for every Tract
# Find the closest subway for each tract
nearest_indices <- st_nearest_feature(tract_centers, subway_entrances_sf)
# Distance in Miles
# (Distance in feet / 5280)
dist_to_subway_feet <- st_distance(tract_centers, subway_entrances_sf[nearest_indices, ], by_element = TRUE)
subway_dist_lookup <- nyc_tracts_proj %>%
as_tibble() %>%
mutate(dist_subway_miles = as.numeric(dist_to_subway_feet) / 5280) %>%
select(GEOID, dist_subway_miles)
# Save the Subway Distance
saveRDS(subway_dist_lookup, file.path(projPath, "data", "subway_dist_lookup.rds"))
print("Subway distance feature successfully calculated from MTA data!")
## [1] "Subway distance feature successfully calculated from MTA data!"
## Big Join
# Load components
census_features <- readRDS(file.path(projPath, "data", "nyc_census_tracts.rds"))
bike_counts <- readRDS(file.path(projPath, "data", "processed_bike_counts.rds"))
tlc_counts <- readRDS(file.path(projPath, "data", "processed_tlc_counts.rds"))
subway_dist <- readRDS(file.path(projPath, "data", "subway_dist_lookup.rds"))
unique_weights <- tract_weights %>%
group_by(GEOID) %>%
slice_max(weight, n = 1, with_ties = FALSE) %>%
ungroup() %>%
select(GEOID, LocationID, zone)
master_table <- census_features %>%
st_drop_geometry() %>%
left_join(unique_weights, by = "GEOID") %>%
left_join(bike_counts, by = "GEOID") %>%
left_join(tlc_counts, by = "GEOID") %>%
left_join(subway_dist, by = "GEOID")
# Clean up NAs; create the NEW Predictors
master_table <- master_table %>%
mutate(
total_bike_trips = replace_na(total_bike_trips, 0),
total_uber_taxi_trips = replace_na(total_uber_taxi_trips, 0),
total_stations = replace_na(total_stations, 0),
stations_per_sq_mile = total_stations / area_sq_miles,
stations_per_1000_pop = (total_stations / total_pop) * 1000,
# NEW PREDICTORS: Normalize raw trips by population
uber_trips_per_capita = total_uber_taxi_trips / total_pop,
bike_trips_per_capita = total_bike_trips / total_pop
) %>%
# Filter out tracts with 0 population to avoid "Divided by Zero" errors
filter(total_pop > 0)
# Save joined data set, overwriting the original file
saveRDS(master_table, file.path(projPath, "data", "model_input.rds"))
print("Master Join Complete. The model_input.rds file is ready for modeling.")
## [1] "Master Join Complete. The model_input.rds file is ready for modeling."
### The Data Audit (Sanity Check)
# Load the Master Table
master_table <- readRDS(file.path(projPath, "data", "model_input.rds"))
# Join with Taxi Zone names for readability
audit_table <- master_table %>%
select(
zone,
GEOID,
pct_commute_60p, # The new target!
uber_trips_per_capita, # The new predictors
bike_trips_per_capita,
dist_subway_miles
)
# Top 10 "Transit Deserts" (Highest % of 60+ min commutes)
top_worst_commutes <- audit_table %>%
arrange(desc(pct_commute_60p)) %>%
head(10)
# Top 10 "Well-Connected" Neighborhoods (Lowest % of 60+ min commutes)
top_best_commutes <- audit_table %>%
arrange(pct_commute_60p) %>%
head(10)
# quick summary
print("--- TOP 10 WORST COMMUTE TRACTS (60m+) ---")
## [1] "--- TOP 10 WORST COMMUTE TRACTS (60m+) ---"
print(top_worst_commutes)
## # A tibble: 10 × 6
## zone GEOID pct_commute_60p uber_trips_per_capita bike_trips_per_capita
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Coney Isla… 3604… 83.3 33.0 0
## 2 Pelham Bay… 3600… 66.7 104. 0
## 3 Jamaica Es… 3608… 66.0 22.4 0
## 4 Brownsville 3604… 60.7 50.3 0
## 5 Canarsie 3604… 59.8 46.9 0
## 6 Canarsie 3604… 57.4 41.0 0
## 7 Glendale 3608… 57.4 26.8 0
## 8 Hollis 3608… 56.2 11.5 0
## 9 Hollis 3608… 55.8 15.3 0
## 10 Bronxdale 3600… 55.6 20.3 0
## # ℹ 1 more variable: dist_subway_miles <dbl>
print("--- TOP 10 BEST COMMUTE TRACTS ---")
## [1] "--- TOP 10 BEST COMMUTE TRACTS ---"
print(top_best_commutes)
## # A tibble: 10 × 6
## zone GEOID pct_commute_60p uber_trips_per_capita bike_trips_per_capita
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Garment Di… 3606… 0 1304. 223.
## 2 Lenox Hill… 3606… 0 29.5 0
## 3 Red Hook 3604… 0 424. 20.7
## 4 Douglaston 3608… 0 118. 0
## 5 Midtown Ce… 3606… 0 1742. 31.0
## 6 Van Nest/M… 3600… 0 144. 0
## 7 Sunnyside 3608… 0 2127. 26.5
## 8 Lincoln Sq… 3606… 0 58.5 0
## 9 Chinatown 3606… 0 52.8 3.65
## 10 Garment Di… 3606… 0 852. 72.4
## # ℹ 1 more variable: dist_subway_miles <dbl>
It’s late in the evening, so I just did a quick AI pass at this with a linear model for proof of concept.
# 1. Prepare and Standardize the Data
ml_data <- readRDS(file.path(projPath, "data", "model_input.rds")) %>%
select(
GEOID,
# NEW TARGET: % of workers with 60+ minute commutes
pct_commute_60p,
# NEW PREDICTORS: Transit Dependency
uber_trips_per_capita, bike_trips_per_capita,
# Scaled Infrastructure Controls
stations_per_sq_mile, stations_per_1000_pop, dist_subway_miles,
# Economic & Housing Controls
med_income, pct_poverty, pct_vacant, pct_renter,
# Demographic Behaviors
pop_density, pct_no_car, pct_transit, pct_bachelors,
# Continuous Age Spectrum
pct_age_18_24, pct_age_25_34, pct_age_35_44, pct_age_45_64, pct_age_65_plus
) %>%
drop_na() %>%
# Standardize all predictors
mutate(across(
.cols = -c(pct_commute_60p, GEOID),
.fns = ~ as.numeric(scale(.))
))
# 2. Fit the "Kitchen Sink" Model
lm_fit_standardized <- lm(pct_commute_60p ~ . - GEOID, data = ml_data)
# 3. Fit the Simple Model
lm_fit_simple <- lm(pct_commute_60p ~ dist_subway_miles, data = ml_data)
print("--- SIMPLE MVP MODEL ---")
## [1] "--- SIMPLE MVP MODEL ---"
summary(lm_fit_simple)
##
## Call:
## lm(formula = pct_commute_60p ~ dist_subway_miles, data = ml_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.006 -9.161 0.552 8.958 59.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.2319 0.2578 97.86 <2e-16 ***
## dist_subway_miles 2.8980 0.2579 11.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 2218 degrees of freedom
## Multiple R-squared: 0.05387, Adjusted R-squared: 0.05344
## F-statistic: 126.3 on 1 and 2218 DF, p-value: < 2.2e-16
print("--- FULL STANDARDIZED MODEL ---")
## [1] "--- FULL STANDARDIZED MODEL ---"
summary(lm_fit_standardized)
##
## Call:
## lm(formula = pct_commute_60p ~ . - GEOID, data = ml_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.024 -5.630 -0.238 5.591 50.943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.23193 0.18165 138.908 < 2e-16 ***
## uber_trips_per_capita 0.86127 0.29306 2.939 0.003328 **
## bike_trips_per_capita -0.04663 0.26667 -0.175 0.861209
## stations_per_sq_mile -2.04659 0.23431 -8.735 < 2e-16 ***
## stations_per_1000_pop -1.18672 0.25530 -4.648 3.55e-06 ***
## dist_subway_miles 1.09628 0.23266 4.712 2.61e-06 ***
## med_income -2.73341 0.31573 -8.657 < 2e-16 ***
## pct_poverty -1.50410 0.29983 -5.017 5.68e-07 ***
## pct_vacant -0.78284 0.21229 -3.688 0.000232 ***
## pct_renter -0.80008 0.38145 -2.097 0.036064 *
## pop_density -1.27917 0.23955 -5.340 1.03e-07 ***
## pct_no_car -2.95583 0.35875 -8.239 2.94e-16 ***
## pct_transit 4.51840 0.26316 17.170 < 2e-16 ***
## pct_bachelors -2.11300 0.29983 -7.047 2.43e-12 ***
## pct_age_18_24 0.06349 0.22445 0.283 0.777299
## pct_age_25_34 -0.95281 0.30154 -3.160 0.001600 **
## pct_age_35_44 -0.91914 0.26678 -3.445 0.000581 ***
## pct_age_45_64 0.22478 0.24646 0.912 0.361840
## pct_age_65_plus 0.24323 0.28761 0.846 0.397827
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.559 on 2201 degrees of freedom
## Multiple R-squared: 0.534, Adjusted R-squared: 0.5302
## F-statistic: 140.1 on 18 and 2201 DF, p-value: < 2.2e-16
# 1. Load the geocoding package
pacman::p_load(tidygeocoder, sf, dplyr)
# 2. Build the list of stations based on the MTA's 2025 CE Document
ibx_stations <- tibble::tribble(
~station_name, ~address,
"Brooklyn Army Terminal", "2nd Ave & 61st St, Brooklyn, NY",
"4th Ave", "5th Ave & 61st St, Brooklyn, NY",
"8th Ave", "8th Ave & 61st St, Brooklyn, NY",
"New Utrecht Ave", "14th Ave & 61st St, Brooklyn, NY",
"McDonald Ave", "McDonald Ave & Avenue I, Brooklyn, NY",
"East 16th St", "East 16th St & Avenue H, Brooklyn, NY",
"Flatbush-Nostrand Ave", "Nostrand Ave & Avenue H, Brooklyn, NY",
"Utica Ave", "Utica Ave & Farragut Rd, Brooklyn, NY",
"Remsen Ave", "Remsen Ave & Farragut Rd, Brooklyn, NY",
"Linden Blvd", "Linden Blvd & Junius St, Brooklyn, NY",
"Livonia Ave", "Livonia Ave & Junius St, Brooklyn, NY",
"Sutter Ave", "Sutter Ave & Junius St, Brooklyn, NY",
"Atlantic Ave", "Atlantic Ave & East New York Ave, Brooklyn, NY",
"Wilson Ave", "Wilson Ave & Moffat St, Brooklyn, NY",
"Myrtle Ave", "Cypress Hills St & 78th Ave, Queens, NY",
"Metropolitan Ave", "Metropolitan Ave & 69th St, Queens, NY",
"Eliot Ave", "Eliot Ave & 69th St, Queens, NY",
"Grand Ave", "Grand Ave & 69th St, Queens, NY",
"Roosevelt Ave", "Roosevelt Ave & 74th St, Queens, NY"
)
# 3. Geocode the addresses into Lat/Long coordinates
ibx_geocoded <- ibx_stations %>%
geocode(address, method = 'osm', lat = latitude, long = longitude) %>%
drop_na() %>% # Drop any that the open-source map couldn't perfectly match
# Convert the GPS coordinates into an active spatial (sf) object
st_as_sf(coords = c("longitude", "latitude"), crs = 4326) %>%
# Convert to NYC's specific projection (feet) to draw a perfect 0.5 mile circle
st_transform(2263) %>%
st_buffer(dist = 2640) %>% # 2640 feet = 0.5 miles (The 10-minute walkshed)
st_transform(4326) # Convert back to standard GPS
## Passing 19 addresses to the Nominatim single address geocoder
## Query completed in: 19.1 seconds
# 4. Find all Census Tracts that fall inside these new station walksheds
# We use st_transform to mathematically force the stations to match the census map's CRS
ibx_tracts_sf <- st_filter(
nyc_tracts,
st_transform(ibx_geocoded, st_crs(nyc_tracts)),
.predicate = st_intersects
)
# Extract just the 11-digit IDs as a clean list
ibx_geoid_list <- ibx_tracts_sf$GEOID
# 5. Create the "Alternative Universe" Dataset
# Find the Z-score that mathematically represents "right next to a subway entrance"
best_transit_z_score <- min(ml_data$dist_subway_miles)
ibx_scenario_data <- ml_data %>%
mutate(
# Inject that "perfect" transit score into the IBX tracts
dist_subway_miles = ifelse(GEOID %in% ibx_geoid_list, best_transit_z_score, dist_subway_miles)
)
# 6. Run the Predictions, Calculate Relief, AND Flag Station Locations
final_map_data <- ml_data %>%
mutate(
predicted_commute_baseline = predict(lm_fit_standardized, newdata = ml_data),
predicted_commute_ibx = predict(lm_fit_standardized, newdata = ibx_scenario_data),
# Calculate the Drop in commute times (Baseline - IBX)
ibx_commute_relief = predicted_commute_baseline - predicted_commute_ibx,
is_ibx_location = ifelse(GEOID %in% ibx_geoid_list, 1, 0)
)
# Save the final data so your Shiny App can map it instantly!
saveRDS(final_map_data, file.path(projPath, "data", "model_input.rds"))
pacman::p_load(broom)
lm_results <- tidy(lm_fit_standardized) %>%
filter(term != "(Intercept)")
ggplot(lm_results, aes(x = estimate, y = reorder(term, estimate))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
geom_point(size = 3, color = "darkblue") +
geom_errorbarh(aes(xmin = estimate - std.error, xmax = estimate + std.error), height = 0.2) +
theme_minimal() +
labs(title = "Feature Importance: What drives Extreme Commutes (60m+)?",
subtitle = "Variables pushed to the RIGHT increase commute times; to the LEFT decrease them.",
x = "Effect on 60m+ Commutes (per 1 Std. Dev. increase)",
y = "Feature")
## Warning: `geom_errorbarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.
The “Hybrid” Workflow (The Pro Move) Since you are using GitHub, here is how you should structured your work:
data_prep.Rmd: Use this to join the Census data, TLC data, and Citi Bike data. At the end of this file, use write_rds(final_data, “data/model_input.rds”).
app.R: This file should be very “slim.” It just reads model_input.rds, runs the ML prediction, and shows the map.
Analogy: data_prep.Rmd is the kitchen where you chop the vegetables; app.R is the dining table where the customer sees the finished meal. You don’t want the customer sitting in the middle of the kitchen!