This analysis presents a data-driven framework for expanding Brookline’s BlueBikes network from 14 to 28 stations. Using established transportation planning methodologies from NACTO and ITDP, combined with community input and statistical analysis, we identify optimal locations across three strategic objectives:
Our recommendations integrate:
Our approach draws on established bike share planning frameworks:
# Load required packages
library(data.table)
library(geosphere) # For distance calculations
library(ggplot2) # For visualization
library(leaflet) # For interactive maps
library(DT) # For interactive tables
library(plotly) # For interactive plots
library(dplyr) # For data manipulation
library(lubridate) # For date manipulation
library(scales) # For nice formatting
library(tidycensus) # For census data
library(dbscan) # For clustering
library(sf) # For spatial data handling
library(jsonlite) # For reading JSON files
library(leaflet.extras) # For heatmaps# Utility function: Check if a point is inside a polygon (ray casting algorithm)
point_in_polygon <- function(point_lng, point_lat, polygon) {
polygon_lngs <- polygon$lng
polygon_lats <- polygon$lat
n <- nrow(polygon)
inside <- FALSE
j <- n
for (i in 1:n) {
if (((polygon_lats[i] > point_lat) != (polygon_lats[j] > point_lat)) &&
(point_lng < polygon_lngs[i] + (polygon_lngs[j] - polygon_lngs[i]) *
(point_lat - polygon_lats[i]) / (polygon_lats[j] - polygon_lats[i]))) {
inside <- !inside
}
j <- i
}
return(inside)
}
# Utility function: Normalize values to 0-1 scale
normalize_column <- function(x) {
if(max(x, na.rm = TRUE) == min(x, na.rm = TRUE)) return(rep(1, length(x)))
return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}
# Utility function: Identify statistical outliers using IQR method
identify_outliers <- function(x, multiplier = 1.5) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - multiplier * iqr
upper <- q3 + multiplier * iqr
return(x < lower | x > upper)
}# Load Brookline boundary from GeoJSON
create_brookline_boundary <- function() {
blinejson <- readLines("~/data_science/data/brookline_geo.json")
boundary <- as.numeric(unlist(jsonlite::fromJSON(paste(blinejson))))
.lat <- boundary[346:length(boundary)]
.lon <- boundary[2:345]
boundary_points <- data.table(
point_id = 1:345,
lat = .lat,
lng = .lon
)
return(boundary_points)
}
brookline_boundary <- create_brookline_boundary()
brookline_boundary <- rbind(
data.table(unique(brookline_boundary[,list(lat,lng)])),
data.table(unique(brookline_boundary[,list(lat,lng)]))[1]
)
brookline_boundary <- brookline_boundary[,list(point_id = 1:.N, lat, lng)]
# Create sf polygon for spatial operations
brookline_poly <- st_polygon(list(cbind(brookline_boundary$lng, brookline_boundary$lat)))
brookline_poly_sf <- st_sfc(brookline_poly, crs = 4326)# Get census block group data with geometry
brookline_bg <- get_acs(
geography = "block group",
variables = c(
"total_housing" = "B25001_001",
"renter_occupied" = "B25003_003",
"population" = "B01003_001"
),
state = "MA",
county = "Norfolk",
year = 2022,
geometry = TRUE
)
# Process census data
brookline_bg <- as.data.table(brookline_bg)
brookline_bg_geom <- brookline_bg$geometry
# Convert to sf and transform CRS
brookline_bg_sf_all <- st_as_sf(brookline_bg, geometry = brookline_bg_geom)
brookline_bg_sf_all <- st_transform(brookline_bg_sf_all, 4326)
# Filter to Brookline boundary
brookline_poly_sf_transformed <- st_transform(brookline_poly_sf, st_crs(brookline_bg_sf_all))
intersects <- st_intersects(brookline_bg_sf_all, brookline_poly_sf_transformed, sparse = FALSE)
brookline_bg_sf <- brookline_bg_sf_all[intersects[,1],]
# Convert to wide format
brookline_bg <- as.data.table(brookline_bg_sf)
brookline_bg_wide <- dcast(brookline_bg, GEOID + NAME ~ variable, value.var = "estimate")
brookline_bg_wide$geometry <- brookline_bg_sf$geometry[match(brookline_bg_wide$GEOID, brookline_bg$GEOID)]
# Calculate derived metrics
brookline_bg_wide[, `:=`(
housing_density = as.numeric(total_housing / (st_area(geometry) / 1e6)),
renter_ratio = renter_occupied / total_housing,
population_density = as.numeric(population / (st_area(geometry) / 1e6))
)]
brookline_bg_sf <- st_as_sf(brookline_bg_wide, geometry = brookline_bg_wide$geometry)## Census block groups in Brookline: 44
## Total population: 62698
## Total housing units: 28535
DT::datatable(
brookline_stations[order(-total_activity), .(
Station = station_name,
Starts = format(starts_count, big.mark = ","),
Stops = format(stops_count, big.mark = ","),
`Total Activity` = format(total_activity, big.mark = ","),
`Net Flow` = net_flow,
`Imbalance Ratio` = round(imbalance_ratio, 2)
)],
caption = "Brookline Station Metrics (2024)",
options = list(
pageLength = 14,
dom = 'Bfrtip',
buttons = c('csv', 'excel')
),
rownames = FALSE
)Understanding the distribution of flow imbalance helps us identify which stations experience capacity stress. We use statistical methods rather than arbitrary thresholds.
# Box plot of imbalance ratios
ggplot(brookline_stations[is.finite(imbalance_ratio)],
aes(x = "", y = imbalance_ratio)) +
geom_boxplot(fill = "steelblue", alpha = 0.7) +
geom_jitter(width = 0.1, alpha = 0.5, size = 3) +
geom_hline(yintercept = upper_threshold, linetype = "dashed", color = "red") +
annotate("text", x = 1.3, y = upper_threshold,
label = paste("Outlier threshold:", round(upper_threshold, 2)),
color = "red", hjust = 0) +
labs(
title = "Distribution of Station Flow Imbalance Ratios",
subtitle = "Stations above the red line may experience capacity stress",
y = "Imbalance Ratio (max/min of starts vs stops)",
x = ""
) +
theme_minimal() +
theme(axis.text.x = element_blank())DT::datatable(
brookline_routes[1:20, .(
`Start Station` = start_station,
`End Station` = end_station,
`Trip Count` = format(trip_count, big.mark = ","),
`Distance (m)` = round(route_distance),
`Intensity` = round(intensity, 4)
)],
caption = "Top 20 Routes To/From Brookline Stations",
options = list(pageLength = 10),
rownames = FALSE
)Following the NACTO Bike Share Planning Guide approach to rebalancing demand analysis, we identify stations experiencing capacity stress through:
Thresholds are determined by the data distribution, not arbitrary cutoffs.
DT::datatable(
brookline_stations[order(-capacity_stress_score), .(
Station = station_name,
`Total Activity` = format(total_activity, big.mark = ","),
`Imbalance Ratio` = round(imbalance_ratio, 2),
`Peak Concentration` = paste0(round(peak_concentration * 100, 1), "%"),
`Stress Score` = round(capacity_stress_score, 3),
`Stress Level` = stress_level
)],
caption = "Station Capacity Stress Analysis",
options = list(pageLength = 14),
rownames = FALSE
) %>%
formatStyle(
'Stress Level',
backgroundColor = styleEqual(
c('High', 'Moderate', 'Low'),
c('#ffcccc', '#fff3cd', '#ccffcc')
),
fontWeight = styleEqual(c('High', 'Moderate', 'Low'), c('bold', 'normal', 'normal'))
)# Separate stations by stress level
high_stress_stations <- brookline_stations[stress_level == "High"]
moderate_stress_stations <- brookline_stations[stress_level == "Moderate"]
low_stress_stations <- brookline_stations[stress_level == "Low"]
# Map of stressed stations and relief recommendations
obj1_map <- leaflet() %>%
addTiles() %>%
addPolylines(
data = brookline_boundary,
lng = ~lng, lat = ~lat,
color = "darkgreen", weight = 3, opacity = 1
) %>%
# High stress stations (red)
addCircleMarkers(
data = high_stress_stations,
lng = ~lng, lat = ~lat,
radius = ~sqrt(total_activity) / 8,
color = "red",
fillOpacity = 0.7,
popup = ~paste0(
"<b>", station_name, "</b><br>",
"<span style='color:red; font-weight:bold'>High Stress</span><br>",
"Stress Score: ", round(capacity_stress_score, 3), "<br>",
"Total Activity: ", format(total_activity, big.mark = ",")
)
) %>%
# Moderate stress stations (yellow/orange)
addCircleMarkers(
data = moderate_stress_stations,
lng = ~lng, lat = ~lat,
radius = ~sqrt(total_activity) / 8,
color = "#FFA500",
fillOpacity = 0.6,
popup = ~paste0(
"<b>", station_name, "</b><br>",
"<span style='color:#FFA500; font-weight:bold'>Moderate Stress</span><br>",
"Stress Score: ", round(capacity_stress_score, 3), "<br>",
"Total Activity: ", format(total_activity, big.mark = ",")
)
) %>%
# Low stress stations (blue)
addCircleMarkers(
data = low_stress_stations,
lng = ~lng, lat = ~lat,
radius = 6,
color = "blue",
fillOpacity = 0.5,
popup = ~paste0(
"<b>", station_name, "</b><br>",
"Low Stress<br>",
"Stress Score: ", round(capacity_stress_score, 3)
)
)
# Add relief recommendations if any
if (nrow(obj1_recommendations) > 0) {
obj1_map <- obj1_map %>%
addCircleMarkers(
data = obj1_recommendations,
lng = ~lng, lat = ~lat,
radius = 10,
color = "yellow",
stroke = TRUE,
weight = 2,
fillOpacity = 0.8,
popup = ~paste0(
"<b>Relief Station #", rank, "</b><br>",
"Near: ", source_station, "<br>",
"Score: ", round(relief_score, 3)
)
)
}
obj1_map <- obj1_map %>%
addLegend(
position = "bottomright",
colors = c("red", "#FFA500", "blue", "yellow"),
labels = c("High Stress", "Moderate Stress", "Low Stress", "Relief Recommendation"),
title = "Objective 1: Capacity Relief"
) %>%
setView(lng = mean(brookline_stations$lng), lat = mean(brookline_stations$lat), zoom = 14)
obj1_mapDemand intensification focuses on meeting unmet demand within and around the existing network. Unlike Objective 3 (expansion), this objective prioritizes areas where people are already riding OR explicitly asking for stations—even if those areas are near existing stations.
Key insight: High-demand areas may need MULTIPLE stations within close proximity. Times Square in NYC has 4+ stations within 300m because demand justifies density. Similarly, if Coolidge Corner is overwhelmed, adding a station 150m away is valid intensification.
Criteria:
Critical distinction from Objective 3: Intensification stays WITHIN/NEAR the existing network to strengthen it. Expansion goes BEYOND the network to extend it.
# Map showing demand density and intensification recommendations
obj2_map <- leaflet() %>%
addTiles() %>%
addPolylines(
data = brookline_boundary,
lng = ~lng, lat = ~lat,
color = "darkgreen", weight = 3, opacity = 1
)
# Add community voting areas if available
if (exists("community_votes_sf") && !is.null(community_votes_sf)) {
vote_pal <- colorNumeric("YlOrRd", community_votes_sf$VoteCount)
obj2_map <- obj2_map %>%
addPolygons(
data = community_votes_sf,
fillColor = ~vote_pal(VoteCount),
weight = 1,
opacity = 0.5,
color = "grey",
fillOpacity = 0.4,
popup = ~paste0("Votes: ", VoteCount)
)
}
obj2_map <- obj2_map %>%
# Existing stations
addCircleMarkers(
data = brookline_stations,
lng = ~lng, lat = ~lat,
radius = 6,
color = "blue",
fillOpacity = 0.7,
popup = ~station_name
) %>%
# Intensification recommendations
addCircleMarkers(
data = obj2_recommendations,
lng = ~lng, lat = ~lat,
radius = 10,
color = "purple",
fillOpacity = 0.8,
popup = ~paste0(
"<b>Intensification #", rank, "</b><br>",
"Demand Score: ", round(demand_score, 3), "<br>",
"Trip Density: ", trip_density, "<br>",
"Community Votes: ", community_score, "<br>",
"Distance to Station: ", round(dist_to_station), "m"
)
) %>%
addLegend(
position = "bottomright",
colors = c("blue", "purple"),
labels = c("Existing Station", "Intensification Recommendation"),
title = "Objective 2: Demand Intensification"
) %>%
setView(lng = mean(brookline_stations$lng), lat = mean(brookline_stations$lat), zoom = 14)
obj2_mapFollowing NACTO Equity and Access Principles and last-mile connectivity best practices, we identify locations to extend the network footprint, particularly:
Key focus areas based on community feedback: - South Brookline (Putterham, Larz Anderson) - Chestnut Hill / Reservoir area - Western Brookline near Route 9
# Map showing service coverage and expansion recommendations
obj3_map <- leaflet() %>%
addTiles() %>%
addPolylines(
data = brookline_boundary,
lng = ~lng, lat = ~lat,
color = "darkgreen", weight = 3, opacity = 1
) %>%
# Existing stations
addCircleMarkers(
data = brookline_stations,
lng = ~lng, lat = ~lat,
radius = 6,
color = "blue",
fillOpacity = 0.7,
popup = ~station_name
) %>%
# T stops
addCircleMarkers(
data = mbta_stops,
lng = ~stop_lon, lat = ~stop_lat,
radius = 8,
color = "green",
fillOpacity = 0.7,
popup = ~paste0("<b>", stop_name, "</b><br>", line)
)
# Add expansion recommendations if any
if (nrow(obj3_recommendations) > 0) {
obj3_map <- obj3_map %>%
addCircleMarkers(
data = obj3_recommendations,
lng = ~lng, lat = ~lat,
radius = 10,
color = "red",
fillOpacity = 0.8,
popup = ~paste0(
"<b>Expansion #", rank, "</b><br>",
"Near T Stop: ", nearest_t_stop, "<br>",
"T Distance: ", round(t_proximity), "m<br>",
"Expansion Score: ", round(expansion_score, 3)
)
)
}
obj3_map <- obj3_map %>%
addLegend(
position = "bottomright",
colors = c("blue", "green", "red"),
labels = c("Existing Station", "T Stop", "Expansion Recommendation"),
title = "Objective 3: Network Expansion"
) %>%
setView(lng = -71.13, lat = 42.32, zoom = 13)
obj3_mapFollowing best practices from NACTO, ITDP, and successful systems like Citi Bike (NYC), Bay Wheels (SF), and Bluebikes (Boston), our unified recommendation process applies a convergence analysis that identifies locations where multiple planning objectives align.
We identify “golden locations” where recommendations from different objectives cluster together, indicating multi-dimensional value:
We look for locations where recommendations from different objectives are within 400m of each other—close enough to potentially serve the same area with a single, optimally-placed station.
Our algorithm guarantees balanced representation from each objective while identifying convergence opportunities:
This prevents the scenario where one objective dominates simply because its candidates are geographically dispersed.
Our final ranking prioritizes:
Important: We do NOT apply a blanket “distance bonus” because that would unfairly favor expansion over intensification. Each objective already handles distance appropriately within its own methodology.
The map below shows recommended siting zones (not exact points) where new stations would be effective. Each circle represents an area where Engineering can identify the optimal specific location based on site conditions, utilities, ADA compliance, and sidewalk space.
| Color | Type | Description |
|---|---|---|
| Gold | Convergence Zone | Serves multiple planning objectives - highest value |
| Orange | Capacity Relief | Relieves stressed existing stations |
| Purple | Demand Intensification | Fills gaps in high-demand areas |
| Red | Network Expansion | Extends service to underserved areas |
Click any zone for details. Rank shown in table below.
Use the table below to filter and sort recommendations. Click on a row to see details.
# Create enhanced table
DT::datatable(
final_recommendations[, .(
`#` = Rank,
Type = `Location Type`,
Objectives = `Objectives`,
`Obj #` = `# Objectives`,
Score = `Final Score`,
`Dist to Existing` = `Dist to Existing`,
`Nearest T` = `Nearest T Stop`,
Lat = Latitude,
Lng = Longitude
)],
caption = "Top 20 Recommended Station Zones (ranked by score)",
options = list(
pageLength = 20,
dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
columnDefs = list(
list(className = 'dt-center', targets = c(0, 3, 4))
)
),
rownames = FALSE,
filter = 'top',
selection = 'single'
) %>%
formatStyle(
'Obj #',
backgroundColor = styleInterval(
c(1, 2),
c('#e8f5e9', '#c8e6c9', '#a5d6a7')
),
fontWeight = styleInterval(c(1), c('normal', 'bold'))
) %>%
formatStyle(
'Type',
backgroundColor = styleEqual(
c('Convergence Zone'),
c('#fff9c4')
),
fontWeight = styleEqual(c('Convergence Zone'), c('bold'))
) %>%
formatStyle(
'#',
fontWeight = 'bold'
)How stable are our Objective 2 (demand intensification) rankings? We test different weighting schemes to ensure recommendations are robust.
Interpretation: Locations that rank consistently high across all scenarios are robust recommendations. Large rank swings indicate sensitivity to weighting choices.
This analysis provides a rigorous, multi-criteria framework for expanding Brookline’s BlueBikes network. Our approach follows established bikeshare planning best practices from NACTO, ITDP, and successful systems like Citi Bike (NYC) and Bay Wheels (SF).
Key Innovations:
Three-Objective Framework: Rather than a single optimization, we separately analyzed capacity relief, demand intensification, and network expansion—then identified where these objectives converge.
Convergence Analysis: Locations where multiple objectives align represent the highest-value investments. These “golden locations” satisfy multiple planning criteria simultaneously.
Data-Driven Thresholds: All thresholds (stress levels, gap distances, outlier identification) are derived from the data distribution itself, not arbitrary cutoffs.
Community Integration: 889 community votes directly influence our demand scoring, ensuring recommendations reflect resident priorities.
Summary:
The convergence zones represent our highest-confidence recommendations—locations where operational needs (capacity relief), demonstrated demand, and network expansion goals all align.
Analysis generated: 2025-12-12