Looking at census data from 2022.
Picked 2022 because:
Why is this data important?
REVISIT ME:
#CENSUS_API_KEY is in environment
nyc_counties <- c("New York", "Kings", "Queens", "Bronx", "Richmond")
# Entire 2022 5-Year ACS variable dictionary
acs_vars_2022 <- load_variables(2022, "acs5", cache = TRUE)
# MISC.
# Exploring different potential features
feature_exploration <- acs_vars_2022 %>%
filter(
str_detect(str_to_lower(label), "public transportation|bicycle|walked|worked from home") |
str_detect(str_to_lower(concept), "means of transportation to work|travel time")
)
# Comment Me out before final knit
#View(feature_exploration)
# Define all API variables explicitly
vars <- c(
# Base Demographics
no_car = "B08201_002",
med_income = "B19013_001",
med_age = "B01002_001",
total_pop = "B01003_001",
# Transportation Base & Features
tot_workers = "B08301_001",
transit = "B08301_010",
walk = "B08301_019",
wfh = "B08301_021",
# Travel Time Features
commute_60 = "B08303_012",
commute_90 = "B08303_013",
# Education Base & Features
tot_pop_25 = "B15003_001",
bachelors = "B15003_022",
# Occupation Base & The 5 Major Buckets
tot_employ = "C24010_001",
mngmt_job = "C24010_002",
service_job = "C24010_014",
sales_job = "C24010_021",
const_job = "C24010_030",
trans_job = "C24010_034"
)
# Getting from census data
nyc_tracts <- get_acs(
geography = "tract",
variables = vars,
state = "NY",
county = nyc_counties,
geometry = TRUE,
year = 2022,
output = "wide"
)
## Getting data from the 2018-2022 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## | | | 0% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 7% | |====== | 8% | |====== | 9% | |======= | 10% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 16% | |============ | 17% | |============= | 18% | |============= | 19% | |============== | 20% | |=============== | 21% | |=============== | 22% | |================ | 23% | |================= | 24% | |================== | 25% | |================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |===================== | 29% | |===================== | 30% | |====================== | 31% | |====================== | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |========================= | 35% | |========================= | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 39% | |============================ | 40% | |============================= | 41% | |============================= | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 47% | |================================== | 48% | |================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |===================================== | 52% | |===================================== | 53% | |====================================== | 54% | |====================================== | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |========================================= | 58% | |========================================= | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================= | 64% | |============================================= | 65% | |============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 70% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 83% | |=========================================================== | 84% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 90% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 93% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 97% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 100%
nyc_tracts <- nyc_tracts %>%
select(
GEOID,
NAME,
geometry,
no_car = no_carE,
med_income = med_incomeE,
med_age = med_ageE,
total_pop = total_popE,
tot_workers = tot_workersE,
tot_pop_25 = tot_pop_25E,
tot_employ = tot_employE,
transit = transitE,
walk = walkE,
wfh = wfhE,
commute_60 = commute_60E,
commute_90 = commute_90E,
bachelors = bachelorsE,
mngmt_job = mngmt_jobE,
service_job = service_jobE,
sales_job = sales_jobE,
const_job = const_jobE,
trans_job = trans_jobE
) %>%
# Feature Engineering: Convert raw counts to percentages
mutate(
pct_no_car = (no_car / total_pop) * 100,
pct_transit = (transit / tot_workers) * 100,
pct_walk = (walk / tot_workers) * 100,
pct_wfh = (wfh / tot_workers) * 100,
# Percentage of Commuters Traveling 60+ Minutes
pct_commute_60p = ((commute_60 + commute_90) / tot_workers) * 100,
pct_bachelors = (bachelors / tot_pop_25) * 100,
# Occupation Percentages
pct_mngmt = (mngmt_job / tot_employ) * 100,
pct_service = (service_job / tot_employ) * 100,
pct_sales = (sales_job / tot_employ) * 100,
pct_const = (const_job / tot_employ) * 100,
pct_trans = (trans_job / tot_employ) * 100,
# Population Density (people per square mile)
area_sq_miles = as.numeric(st_area(geometry)) / 2589988, # st_area() returns square meters, converted
pop_density = total_pop / area_sq_miles
) %>%
# Drop the raw counts because I don't want to accidentally put them in the ML model
select(-no_car, -transit, -walk, -wfh, -commute_60, -commute_90, -bachelors,
-mngmt_job, -service_job, -sales_job, -const_job, -trans_job,
-tot_workers, -tot_pop_25, -tot_employ)
## Checking for total NAs and NaNs in each column
colSums(is.na(nyc_tracts %>% st_drop_geometry())) %>%
kable(caption = "# of NAs and NaNs in each column")
| x | |
|---|---|
| GEOID | 0 |
| NAME | 0 |
| med_income | 126 |
| med_age | 88 |
| total_pop | 0 |
| pct_no_car | 85 |
| pct_transit | 90 |
| pct_walk | 90 |
| pct_wfh | 90 |
| pct_commute_60p | 90 |
| pct_bachelors | 85 |
| pct_mngmt | 90 |
| pct_service | 90 |
| pct_sales | 90 |
| pct_const | 90 |
| pct_trans | 90 |
| area_sq_miles | 0 |
| pop_density | 3 |
## Percentage of missing data
# (is it a low enough percent to be negligible)
missing_summary <- nyc_tracts %>%
st_drop_geometry() %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "Feature", values_to = "Missing_Count") %>%
mutate(
Total_Rows = nrow(nyc_tracts),
Pct_Missing = (Missing_Count / Total_Rows) * 100,
# Flag if the missing data is below 5%
Is_Negligible_Under_5Pct = Pct_Missing < 5
) %>%
# Filter to only show features that actually have missing data
filter(Missing_Count > 0) %>%
arrange(desc(Pct_Missing))
# Output Percentage of Missing Data
kable(missing_summary, digits = 2, caption = "Missing Data Percentage by Feature")
| Feature | Missing_Count | Total_Rows | Pct_Missing | Is_Negligible_Under_5Pct |
|---|---|---|---|---|
| med_income | 126 | 2327 | 5.41 | FALSE |
| pct_transit | 90 | 2327 | 3.87 | TRUE |
| pct_walk | 90 | 2327 | 3.87 | TRUE |
| pct_wfh | 90 | 2327 | 3.87 | TRUE |
| pct_commute_60p | 90 | 2327 | 3.87 | TRUE |
| pct_mngmt | 90 | 2327 | 3.87 | TRUE |
| pct_service | 90 | 2327 | 3.87 | TRUE |
| pct_sales | 90 | 2327 | 3.87 | TRUE |
| pct_const | 90 | 2327 | 3.87 | TRUE |
| pct_trans | 90 | 2327 | 3.87 | TRUE |
| med_age | 88 | 2327 | 3.78 | TRUE |
| pct_no_car | 85 | 2327 | 3.65 | TRUE |
| pct_bachelors | 85 | 2327 | 3.65 | TRUE |
| pop_density | 3 | 2327 | 0.13 | TRUE |
## Map for missing income data
missing_map_data <- nyc_tracts %>%
mutate(missing_income_flag = ifelse(is.na(med_income), "Missing Data", "Valid Data"))
ggplot(missing_map_data) +
geom_sf(aes(fill = missing_income_flag), color = "white", size = 0.1) +
scale_fill_manual(values = c("Valid Data" = "#e0e0e0", "Missing Data" = "#d73027")) +
theme_void() +
labs(title = "Map of Missing Census Income Data",
fill = "Status")
## Drop tracts with less than 50 people or 0 workers
# There were a ton of tracts with a small number of or 0 people, screwing with denominators and also making for super volatile percentages
nyc_tracts_clean <- nyc_tracts %>%
filter(total_pop >= 50, !is.na(pct_transit))
## Imputing
# There were also many tracts where income or age was missing, but it wasn't due to a denominator issue
## RETURN HERE
## Just doing simple median imputation
## More robust method would be better
## Checking for MNAR and stuff would be better
# Right now
nyc_tracts_clean <- nyc_tracts_clean %>%
# Extract the borough from the NAME string (e.g., "Queens County")
mutate(borough = str_extract(NAME, "(?<=; ).*(?= County)")) %>%
# Group by Borough so the imputation is localized
group_by(borough) %>%
mutate(
med_income = ifelse(is.na(med_income),
median(med_income, na.rm = TRUE),
med_income),
med_age = ifelse(is.na(med_age),
median(med_age, na.rm = TRUE),
med_age)
) %>%
ungroup() %>%
# Drop the temporary borough column
select(-borough)
print(paste("Original tracts:", nrow(nyc_tracts)))
## [1] "Original tracts: 2327"
print(paste("Cleaned tracts of NAs:", nrow(nyc_tracts_clean)))
## [1] "Cleaned tracts of NAs: 2226"
# Save
census_save_path <- file.path(projPath, "data", "nyc_census_tracts.rds")
saveRDS(nyc_tracts_clean, census_save_path)
print(paste("Census features and geometry saved to:", census_save_path))
## [1] "Census features and geometry saved to: C:/Users/cdube/src/DATA_622_Project/data/nyc_census_tracts.rds"
# Peak
head(nyc_tracts_clean)
## Simple feature collection with 6 features and 18 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.91641 ymin: 40.82439 xmax: -73.84726 ymax: 40.90264
## Geodetic CRS: NAD83
## # A tibble: 6 × 19
## GEOID NAME med_income med_age total_pop pct_no_car pct_transit pct_walk
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 36005013500 Cens… 27602 36.5 3125 31.8 48.6 14.5
## 2 36005009200 Cens… 56208 33.9 5959 14.1 38.6 8.07
## 3 36005005400 Cens… 48750 30.5 5715 20.6 64.7 6.20
## 4 36005036501 Cens… 41287 31.8 4196 23.1 60.3 6.61
## 5 36005044902 Cens… 88264 32.4 2146 12.1 37.8 2.43
## 6 36005017500 Cens… 26463 33.9 6116 39.5 69.1 8.69
## # ℹ 11 more variables: pct_wfh <dbl>, pct_commute_60p <dbl>,
## # pct_bachelors <dbl>, pct_mngmt <dbl>, pct_service <dbl>, pct_sales <dbl>,
## # pct_const <dbl>, pct_trans <dbl>, area_sq_miles <dbl>, pop_density <dbl>,
## # geometry <MULTIPOLYGON [°]>
#Zip file from below url is downloaded and extracted into repo
#taxi_zones_url <- "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
# Read the shapefile
taxi_zones_sf <- st_read(file.path(projPath, 'taxi_zones', 'taxi_zones', "taxi_zones.shp")) %>%
st_transform(4326) # Transform to standard Lat/Long (WGS84)
## Reading layer `taxi_zones' from data source
## `C:\Users\cdube\src\DATA_622_Project\taxi_zones\taxi_zones\taxi_zones.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 263 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120121.9 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
# Lookup table with names and boroughs is downloaded into repo
#zone_lookup_url <- "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
zone_lookup <- read_csv(file.path(projPath, 'taxi_zone_lookup.csv'))
## Rows: 265 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Borough, Zone, service_zone
## dbl (1): LocationID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Join
taxi_zones <- taxi_zones_sf %>%
left_join(zone_lookup, by = c("LocationID" = "LocationID"))
# Peak: Does this map look right
plot(st_geometry(taxi_zones))
TLC Data is from the NYC Taxi and Limousine Commission.
High volume for hire vehicles = Uber/Lyft.
REVISIT ME:
## Aggregate all vehicular trips for October 2023
# Combine Yellow, Green, FHV, and High-Volume FHV (Uber/Lyft)
# See "all" the car-based 'Last-Mile' choices
# URL definitions for source data
tlc_urls <- c(
yellow = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-10.parquet",
green = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-10.parquet",
#fhv = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2023-10.parquet",
fhvhv = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2023-10.parquet"
)
# Function to read and count trips by Taxi Zone
# PULocationID can be used to get the # of trips that started from (were picked up in) a zone
count_zone_trips <- function(url) {
read_parquet(url, col_select = any_of(c("PULocationID"))) %>%
count(PULocationID, name = "trips") %>%
filter(!is.na(PULocationID))
}
# Process the sources and sum them up
# Getting number of trips that started per pick up location
# This 'map_dfr' loop downloads each, counts by zone, and stacks them.
total_zone_counts <- tlc_urls %>%
map_dfr(count_zone_trips) %>%
group_by(PULocationID) %>%
summarize(total_vehicular_trips = sum(trips))
The question here is how much of each Census Tract sits inside which Taxi Zone?
This uses EPSG:2263 which uses the units of “feet”, and can be used to calculate the square footage of an area in NY more precisely.
# Use Coordinate Reference System (CRS)
# We'll use EPSG:2263 (NAD83 / New York Long Island) because it uses feet,
# which is much better for calculating area in NYC than Lat/Long.
nyc_tracts_proj <- st_transform(nyc_tracts, 2263)
taxi_zones_proj <- st_transform(taxi_zones, 2263)
# Intersect regions
# This creates a new set of shapes where tracts and zones overlap
intersection <- st_intersection(nyc_tracts_proj, taxi_zones_proj) %>%
mutate(intersect_area = st_area(.))
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
# Calculate the weights
# For every unique Census Tract,
# What % of Tract X's area is inside Taxi Zone Y?
tract_weights <- intersection %>%
as_tibble() %>%
group_by(GEOID) %>%
mutate(total_tract_area = sum(intersect_area)) %>%
ungroup() %>%
mutate(weight = as.numeric(intersect_area / total_tract_area)) %>%
select(GEOID, LocationID, zone, weight)
# Peak Crosswalk
head(tract_weights)
## # A tibble: 6 × 4
## GEOID LocationID zone weight
## <chr> <dbl> <chr> <dbl>
## 1 36081071600 2 Jamaica Bay 0.000628
## 2 36047070203 2 Jamaica Bay 1
## 3 36081107201 2 Jamaica Bay 0.0124
## 4 36081100801 2 Jamaica Bay 0.00106
## 5 36081107202 2 Jamaica Bay 0.989
## 6 36005037000 3 Allerton/Pelham Gardens 0.0000606
# 'Spatial Weights' / Crosswalk
# issue is that taxi zones are larger than census tracts, and we obvs want to use the data together
#
# WHY: Taxi Zones are larger than Census Tracts. We use the weights from Phase 1
# to distribute these trips into our Census Tracts (GEOIDs).
# Assumes 'tract_weights' object exists from your earlier step.
tract_vehicular_estimates <- tract_weights %>%
left_join(total_zone_counts, by = c("LocationID" = "PULocationID")) %>%
mutate(estimated_trips = total_vehicular_trips * weight) %>%
group_by(GEOID) %>%
summarize(total_uber_taxi_trips = sum(estimated_trips, na.rm = TRUE))
# Save the 'Vehicular' side of the Master Table
save_path_tlc <- file.path(projPath, "data", "processed_tlc_counts.rds")
saveRDS(tract_vehicular_estimates, save_path_tlc)
print(paste("Vehicular trip estimates for October 2023 saved to:", save_path_tlc))
## [1] "Vehicular trip estimates for October 2023 saved to: C:/Users/cdube/src/DATA_622_Project/data/processed_tlc_counts.rds"
# Map Station IDs to Census Tracts
# file locations
cb_file1 <- file.path(projPath, "citibike", "202310-citibike-tripdata_1.csv")
cb_file2 <- file.path(projPath, "citibike", "202310-citibike-tripdata_2.csv")
cb_file3 <- file.path(projPath, "citibike", "202310-citibike-tripdata_3.csv")
cb_file4 <- file.path(projPath, "citibike", "202310-citibike-tripdata_4.csv")
cb_files <- c(cb_file1, cb_file2, cb_file3, cb_file4)
# Station-to-Tract Lookup Table
# unique coordinates for every station found in files
all_stations <- cb_files %>%
map_dfr(~read_csv(.x,
col_select = c(start_station_id, start_lat, start_lng),
# Force the station ID to be a character, and coords to be numeric
col_types = cols(
start_station_id = col_character(),
start_lat = col_double(),
start_lng = col_double()
))) %>%
filter(!is.na(start_lat)) %>%
distinct(start_station_id, .keep_all = TRUE)
# Spatial Work: Convert Stations to Points
stations_sf <- st_as_sf(all_stations, coords = c("start_lng", "start_lat"), crs = 4326) %>%
st_transform(2263) # Project to NYC Feet (EPSG:2263)
# Ensure nyc_tracts (from Step 1) matches the projection
nyc_tracts_proj <- st_transform(nyc_tracts, 2263)
# Find which Tract Polygon physically contains each Station Point
station_tract_lookup <- st_join(stations_sf, nyc_tracts_proj, join = st_intersects) %>%
as_tibble() %>%
select(start_station_id, GEOID)
# 4. Final Aggregation: Count Trips per Tract
# WHY: This uses a fast table join on IDs to count millions of rows in seconds.
tract_bike_counts <- cb_files %>%
map_dfr(~read_csv(.x,
col_select = c(start_station_id),
col_types = cols(start_station_id = col_character()) # Added here too!
)) %>%
left_join(station_tract_lookup, by = "start_station_id") %>%
group_by(GEOID) %>%
summarize(total_bike_trips = n()) %>%
filter(!is.na(GEOID))
# 5. Save the final "Label" feature for the Machine Learning model
save_path <- file.path(projPath, "data", "processed_bike_counts.rds")
saveRDS(tract_bike_counts, save_path)
print(paste("Success! Bike counts saved to:", save_path))
## [1] "Success! Bike counts saved to: C:/Users/cdube/src/DATA_622_Project/data/processed_bike_counts.rds"
Source is https://data.ny.gov/Transportation/MTA-Subway-Entrances-and-Exits-2024/i9wp-a4ja/about_data.
File name is “MTA_Subway_Entrances_and_Exits__2024.csv”
## Distance to Subway
# Load csv
subway_csv_path <- file.path(projPath, "MTA_Subway_Entrances_and_Exits__2024.csv")
subway_raw <- read_csv(subway_csv_path) %>%
clean_names() # otherwise this had spaces in the column names
## Rows: 2120 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Division, Line, Borough, Stop Name, Constituent Station Name, GTFS...
## dbl (4): Complex ID, Station ID, Entrance Latitude, Entrance Longitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Transform into a Spatial Object (sf)
# 'entrance_longitude' is X and 'entrance_latitude' is Y
# 4326 is the standard GPS system (WGS84)
subway_entrances_sf <- subway_raw %>%
filter(!is.na(entrance_latitude) & !is.na(entrance_longitude)) %>%
st_as_sf(coords = c("entrance_longitude", "entrance_latitude"), crs = 4326) %>%
st_transform(2263) # Convert to NYC Feet to match our Census Tracts
# Centroids
# Find the center point of every Census Tract
tract_centers <- st_centroid(nyc_tracts_proj)
## Warning: st_centroid assumes attributes are constant over geometries
# Nearest Entrance for every Tract
# Find the closest subway for each tract
nearest_indices <- st_nearest_feature(tract_centers, subway_entrances_sf)
# Distance in Miles
# (Distance in feet / 5280)
dist_to_subway_feet <- st_distance(tract_centers, subway_entrances_sf[nearest_indices, ], by_element = TRUE)
subway_dist_lookup <- nyc_tracts_proj %>%
as_tibble() %>%
mutate(dist_subway_miles = as.numeric(dist_to_subway_feet) / 5280) %>%
select(GEOID, dist_subway_miles)
# Save the Subway Distance
saveRDS(subway_dist_lookup, file.path(projPath, "data", "subway_dist_lookup.rds"))
print("Subway distance feature successfully calculated from MTA data!")
## [1] "Subway distance feature successfully calculated from MTA data!"
## Big Join
# Load components
census_features <- readRDS(file.path(projPath, "data", "nyc_census_tracts.rds"))
bike_counts <- readRDS(file.path(projPath, "data", "processed_bike_counts.rds"))
tlc_counts <- readRDS(file.path(projPath, "data", "processed_tlc_counts.rds"))
subway_dist <- readRDS(file.path(projPath, "data", "subway_dist_lookup.rds"))
# RECENTLY ADDED
# Before joining, force each Tract to belong to only one Taxi Zone, the one it overlaps most.
# I was seeing repeating values of income and distance for different zones in the final data set back when I was not doing this.
unique_weights <- tract_weights %>%
group_by(GEOID) %>%
slice_max(weight, n = 1, with_ties = FALSE) %>%
ungroup() %>%
select(GEOID, LocationID, zone) # Keep the Zone name and ID for joining
# Join everything to the Census Tracts
# left_join to not lose any tracts that might have had 0 trips
master_table <- census_features %>%
st_drop_geometry() %>% # drops the maps, keeps ALL features
# unique Neighborhood/LocationID mapping
left_join(unique_weights, by = "GEOID") %>%
# Bike counts (already at Tract level)
left_join(bike_counts, by = "GEOID") %>%
# Uber/Taxi counts
left_join(tlc_counts, by = "GEOID") %>%
# Subway Distance (already at Tract level)
left_join(subway_dist, by = "GEOID")
# Clean up NAs; create the Target Variable
# If a tract has no recorded trips, replace NA with 0
master_table <- master_table %>%
mutate(
total_bike_trips = replace_na(total_bike_trips, 0),
total_uber_taxi_trips = replace_na(total_uber_taxi_trips, 0),
# CALCULATE THE TARGET: % of Last-Mile trips taken by Bike
# Formula: Bike / (Bike + Uber)
bike_index = total_bike_trips / (total_bike_trips + total_uber_taxi_trips)
) %>%
# Filter out tracts with 0 total trips to avoid "Divided by Zero" errors
filter(!is.na(bike_index))
# Save joined data set
saveRDS(master_table, file.path(projPath, "data", "model_input.rds"))
print("Master Join Complete! The model_input.rds file is ready for Model Phase.")
## [1] "Master Join Complete! The model_input.rds file is ready for Model Phase."
### Phase 2.5: The Data Audit (Sanity Check)
# 1. Load the Master Table we just created
master_table <- readRDS(file.path(projPath, "data", "model_input.rds"))
# 2. Join with Taxi Zone names for readability
# We'll use the tract_weights crosswalk to get neighborhood names back
audit_table <- master_table %>%
select(zone, GEOID, bike_index, total_bike_trips, total_uber_taxi_trips, med_income, dist_subway_miles)
# 3. Top 10 "Bike-Heavy" Neighborhoods (Active)
top_bike <- audit_table %>%
arrange(desc(bike_index)) %>%
head(50)
# 4. Top 10 "Uber-Heavy" Neighborhoods (Vehicular)
top_uber <- audit_table %>%
arrange(bike_index) %>%
head(50)
# 5. Print a quick summary
print("--- TOP 10 BIKE-HEAVY TRACTS ---")
## [1] "--- TOP 10 BIKE-HEAVY TRACTS ---"
print(top_bike)
## # A tibble: 50 × 7
## zone GEOID bike_index total_bike_trips total_uber_taxi_trips med_income
## <chr> <chr> <dbl> <int> <dbl> <dbl>
## 1 Columbia … 3604… 0.390 7840 12250. 145417
## 2 Randalls … 3606… 0.374 3447 5781. 112969
## 3 Stuy Town… 3606… 0.277 21350 55802. 142975
## 4 Hudson Sq 3606… 0.276 36404 95636. 246250
## 5 Battery P… 3606… 0.219 34144 121618. 250001
## 6 Bloomingd… 3606… 0.219 10638 37894. 113379
## 7 DUMBO/Vin… 3604… 0.208 20761 79157. 250001
## 8 World Tra… 3606… 0.189 23224 99747. 199576
## 9 Stuy Town… 3606… 0.187 12168 52846. 202632
## 10 Gowanus 3604… 0.153 7486 41594. 134167
## # ℹ 40 more rows
## # ℹ 1 more variable: dist_subway_miles <dbl>
print("--- TOP 10 UBER-HEAVY TRACTS ---")
## [1] "--- TOP 10 UBER-HEAVY TRACTS ---"
print(top_uber)
## # A tibble: 50 × 7
## zone GEOID bike_index total_bike_trips total_uber_taxi_trips med_income
## <chr> <chr> <dbl> <int> <dbl> <dbl>
## 1 Westchest… 3600… 0 0 44590. 56208
## 2 West Farm… 3600… 0 0 44826. 48750
## 3 East Trem… 3600… 0 0 80595. 41287
## 4 Woodlawn/… 3600… 0 0 60533. 88264
## 5 Soundview… 3600… 0 0 87212 53166
## 6 East Trem… 3600… 0 0 75865. 32174
## 7 Williamsb… 3600… 0 0 87768. 82039
## 8 Eastchest… 3600… 0 0 51946. 27753
## 9 Williamsb… 3600… 0 0 87771 59758
## 10 Soundview… 3600… 0 0 47388. 29657
## # ℹ 40 more rows
## # ℹ 1 more variable: dist_subway_miles <dbl>
It’s late in the evening, so I just did a quick AI pass at this with a linear model for proof of concept.
# 1. Prepare the Data
# We filter for only the columns the model should "learn" from
ml_data <- readRDS(file.path(projPath, "data", "model_input.rds")) %>%
# Select the target AND features
select(
bike_index, med_income, med_age, pct_no_car,
dist_subway_miles, pop_density, pct_transit,
pct_walk, pct_wfh, pct_commute_60p, pct_bachelors,
pct_mngmt, pct_service, pct_sales, pct_const, pct_trans
) %>%
drop_na()
# 2. Fit the Linear Model
# Formula: bike_index is explained by (~) all other variables (.)
lm_fit <- lm(bike_index ~ ., data = ml_data)
# 3. View the "Coefficients"
# This is the heart of the model
summary(lm_fit)
##
## Call:
## lm(formula = bike_index ~ ., data = ml_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06324 -0.00745 -0.00144 0.00353 0.36007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.595e-02 5.555e-03 -4.672 3.16e-06 ***
## med_income 1.863e-07 1.693e-08 11.002 < 2e-16 ***
## med_age -1.559e-04 7.095e-05 -2.197 0.0281 *
## pct_no_car 6.043e-04 6.740e-05 8.966 < 2e-16 ***
## dist_subway_miles 1.654e-03 7.645e-04 2.163 0.0306 *
## pop_density -1.099e-07 1.584e-08 -6.936 5.27e-12 ***
## pct_transit 3.477e-04 4.848e-05 7.172 1.00e-12 ***
## pct_walk 3.933e-04 7.308e-05 5.382 8.16e-08 ***
## pct_wfh 8.840e-05 8.277e-05 1.068 0.2856
## pct_commute_60p -3.420e-04 5.393e-05 -6.342 2.74e-10 ***
## pct_bachelors 1.632e-04 6.693e-05 2.439 0.0148 *
## pct_mngmt 8.086e-05 6.689e-05 1.209 0.2269
## pct_service -7.666e-05 2.149e-04 -0.357 0.7214
## pct_sales 1.718e-04 2.081e-04 0.825 0.4092
## pct_const -1.344e-04 1.101e-04 -1.220 0.2225
## pct_trans 1.254e-04 1.041e-04 1.205 0.2284
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02169 on 2210 degrees of freedom
## Multiple R-squared: 0.3538, Adjusted R-squared: 0.3494
## F-statistic: 80.67 on 15 and 2210 DF, p-value: < 2.2e-16
###
# Load broom for tidy model outputs
pacman::p_load(broom, dotwhisker)
# Turn the model results into a clean table
lm_results <- tidy(lm_fit) %>%
filter(term != "(Intercept)") # Remove the intercept for better scaling
# Plot the "Pull" of each feature
ggplot(lm_results, aes(x = estimate, y = reorder(term, estimate))) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
geom_point(size = 3, color = "darkblue") +
geom_errorbarh(aes(xmin = estimate - std.error, xmax = estimate + std.error), height = 0.2) +
theme_minimal() +
labs(title = "Linear Model: What drives the 'Bike Index'?",
subtitle = "Points to the right increase biking; points to the left decrease it.",
x = "Estimate (Coefficient)",
y = "Feature")
## Warning: `geom_errorbarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.
The “Hybrid” Workflow (The Pro Move) Since you are using GitHub, here is how you should structured your work:
data_prep.Rmd: Use this to join the Census data, TLC data, and Citi Bike data. At the end of this file, use write_rds(final_data, “data/model_input.rds”).
app.R: This file should be very “slim.” It just reads model_input.rds, runs the ML prediction, and shows the map.
Analogy: data_prep.Rmd is the kitchen where you chop the vegetables; app.R is the dining table where the customer sees the finished meal. You don’t want the customer sitting in the middle of the kitchen!