0) Packages & Data


18.6 — How Long? (Durations by client)

# Add trip duration in seconds and in minutes.
# Subtracting two date-times (edate - sdate) yields a difftime whose units are
# chosen automatically (secs, mins, hours or days) from the data, so calling
# as.numeric() on it is not guaranteed to give seconds. Requesting
# units = "secs" explicitly makes duration_sec (and duration_min) reliable.
# NOTE(review): assumes sdate/edate are date-time (POSIXct) columns - confirm.
Trips <- Trips |>
  mutate(
    duration_sec = as.numeric(difftime(edate, sdate, units = "secs")),
    duration_min = duration_sec / 60
  )

# Keep only rides with a positive duration that does not exceed the 99th
# percentile, so a handful of extreme durations do not dominate the plot.
# Rows with a missing duration are dropped by filter() as well.
q_hi <- quantile(Trips$duration_min, probs = 0.99, na.rm = TRUE)
Trips_f <- filter(Trips, duration_min > 0 & duration_min <= q_hi)

# One box per client type; outlier points are drawn faintly and the fill
# legend is hidden because it would only repeat the x axis.
ggplot(
  data = Trips_f,
  mapping = aes(x = client, y = duration_min, fill = client)
) +
  geom_boxplot(outlier.alpha = 0.15) +
  ggtitle("Bike Rental Duration by Client Type (trimmed at 99th percentile)") +
  xlab("Client Type") +
  ylab("Duration (minutes)") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

Notes: Casual users typically ride longer than regulars; trimming keeps the box focused on typical rides.


18.7 — When Are Bikes Used?

# Derive calendar fields from each trip's start time (sdate).
Trips <- mutate(
  Trips,
  # day of the year
  yday = lubridate::yday(sdate),
  # abbreviated weekday label, with the week starting on Sunday
  wday = lubridate::wday(sdate, label = TRUE, abbr = TRUE, week_start = 7),
  # hour of the day
  hour = lubridate::hour(sdate),
  # minute within the hour
  minute = lubridate::minute(sdate)
)
# Density of checkouts over the day of the year
ggplot(data = Trips) +
  geom_density(mapping = aes(x = yday), fill = "gray85", color = NA) +
  ggtitle("Checkouts by Day of Year (Q4 2014)") +
  xlab("Day of Year (1–365)") +
  ylab("Density") +
  theme_minimal(base_size = 12)

# Number of checkouts on each weekday
ggplot(data = Trips) +
  geom_bar(mapping = aes(x = wday), fill = "steelblue") +
  ggtitle("Checkouts by Day of Week") +
  xlab("Weekday") +
  ylab("Trips") +
  theme_minimal(base_size = 12)

# Density of checkouts over the hour of the day, with an axis tick every 2 hours
ggplot(data = Trips) +
  geom_density(mapping = aes(x = hour), fill = "gray85", color = NA) +
  scale_x_continuous(breaks = seq(from = 0, to = 24, by = 2)) +
  ggtitle("Checkouts by Hour of Day (Density)") +
  xlab("Hour (0–24)") +
  ylab("Density") +
  theme_minimal(base_size = 12)

# Density of checkouts over the minute within the hour
ggplot(data = Trips) +
  geom_density(mapping = aes(x = minute), fill = "gray85", color = NA) +
  ggtitle("Checkouts by Minute within Hour") +
  xlab("Minute (0–59)") +
  ylab("Density") +
  theme_minimal(base_size = 12)

# Tally trips for every (client, weekday, hour) combination present in the data
Trips_counts <- count(Trips, client, wday, hour, name = "n")

# One panel per weekday and one line per client type; each panel gets its own
# y scale so quieter days remain readable.
ggplot(
  data = Trips_counts,
  mapping = aes(x = hour, y = n, color = client)
) +
  geom_line(linewidth = 0.9) +
  facet_wrap(vars(wday), ncol = 4, scales = "free_y") +
  scale_x_continuous(breaks = seq(from = 0, to = 24, by = 3)) +
  ggtitle("Trips by Hour, Weekday, and Client Type") +
  xlab("Hour of Day") +
  ylab("Trips") +
  labs(color = "Client") +
  theme_minimal(base_size = 12)

Notes: Regulars peak at commute hours (around 8 a.m. and 5 p.m.). Casuals show broader daytime and weekend use.


18.8 — How Far? (Distances)

#' Great-circle distance between two points (haversine formula)
#'
#' Vectorised over all four arguments, following R's usual recycling rules.
#' Local variables use ASCII names only: non-ASCII identifiers are
#' locale-dependent in R and can fail to parse outside UTF-8 locales.
#'
#' @param lat1,lon1 Latitude and longitude of the first point(s), in decimal
#'   degrees.
#' @param lat2,lon2 Latitude and longitude of the second point(s), in decimal
#'   degrees.
#' @return Numeric vector of distances in meters, computed on a sphere of
#'   radius 6,371,000 m.
haversine <- function(lat1, lon1, lat2, lon2) {
  earth_radius_m <- 6371000
  to_rad <- function(deg) deg * pi / 180

  phi1 <- to_rad(lat1)
  phi2 <- to_rad(lat2)
  d_phi <- phi2 - phi1
  d_lambda <- to_rad(lon2) - to_rad(lon1)

  a <- sin(d_phi / 2)^2 + cos(phi1) * cos(phi2) * sin(d_lambda / 2)^2

  # Clamp at 1 so rounding error cannot push sqrt(a) outside asin()'s domain.
  2 * earth_radius_m * asin(pmin(1, sqrt(a)))
}

# Build every ordered (start, end) station pair and its straight-line distance.
# Simple holds the start-station view of Stations; Simple2 is the same table
# relabelled as the end station so the two can be crossed without name clashes.
Simple <- dplyr::select(Stations, sstation = name, lat, long)
Simple2 <- dplyr::rename(Simple, estation = sstation, lat2 = lat, long2 = long)

# merge() with by = NULL returns the Cartesian product of the two tables.
StationPairs <- merge(x = Simple, y = Simple2, by = NULL)

# haversine() returns meters; divide by 1000 for kilometers.
PairDistances <- dplyr::transmute(
  StationPairs,
  sstation,
  estation,
  distance_km = haversine(lat, long, lat2, long2) / 1000
)

# Histogram of distances between all station pairs, in 0.25 km bins
ggplot(data = PairDistances) +
  geom_histogram(
    mapping = aes(x = distance_km),
    binwidth = 0.25,
    fill = "gray80",
    color = "white"
  ) +
  ggtitle("Station-to-Station Straight-Line Distances") +
  xlab("Distance (km)") +
  ylab("Count") +
  theme_minimal(base_size = 12)

# Attach to each trip the straight-line distance of its (start, end) station
# pair. The inner join drops trips whose pair has no match in PairDistances.
Trips_w_dist <- dplyr::inner_join(
  Trips,
  PairDistances,
  by = c("sstation", "estation")
)

# Overlay the ride-distance density (blue line) on the density-scaled
# histogram of all station-pair distances (gray bars).
ggplot() +
  geom_histogram(
    data = PairDistances,
    mapping = aes(x = distance_km, y = after_stat(density)),
    binwidth = 0.25,
    fill = "gray85",
    color = NA
  ) +
  geom_density(
    data = Trips_w_dist,
    mapping = aes(x = distance_km),
    color = "blue",
    linewidth = 1
  ) +
  ggtitle(
    "Ride Distances vs Station Distances",
    subtitle = "Gray: all station pairs | Blue: rides (straight-line start→end)"
  ) +
  xlab("Distance (km)") +
  ylab("Density") +
  theme_minimal(base_size = 12)

Notes: Most rides cluster within ~2–3 km; rides are generally shorter than the distances between many of the possible station pairs.


18.9 — Static Station Map (PDF-safe)

# Static scatter of station locations.
# coord_quickmap() sets the aspect ratio so that a unit of longitude and a
# unit of latitude cover approximately the same ground distance near the
# plotted area; on plain Cartesian axes the map is stretched to the panel shape.
ggplot(Stations, aes(x = long, y = lat)) +
  geom_point(color = "red", size = 1.5) +
  coord_quickmap() +
  labs(title = "Capital Bikeshare Station Locations",
       x = "Longitude", y = "Latitude") +
  theme_minimal(base_size = 12)

Notes: Dense downtown cluster with several outlying stations.


Summary