Project background:


Setting up my environment & data:

library(tidyverse)
library(RColorBrewer)
library(scales)
library(here)
tripdata_summary_weather <- read_csv(here("processed_data/r", "tripdata_summary_weather.csv"))
tripdata_summary_daily <- read_csv(here("processed_data/r", "tripdata_summary_daily.csv"))
tripdata_time_of_day <- read_csv(here("processed_data/r", "tripdata_time_of_day.csv"))
tripdata_stations <- read_csv(here("processed_data/r", "tripdata_stations.csv"))
tripdata_summary <- read_csv(here("processed_data/r", "tripdata_summary.csv"))
station_names <- read_csv(here("processed_data/r", "station_names.csv"))


Business tasks:


Presentation objectives:


The data story:


      Bike trips trend
tripdata_summary %>%
  ggplot(aes(x = s_year_month, y = ride_count, color = member_casual)) +
  theme_grey(base_size = 12) +
  geom_smooth(se = FALSE, size = 1.1) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %y") +
  scale_color_manual(values=c("purple","gold")) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Month-Year", y = "Ride Count", color  = "Member Type") +
  theme(legend.position = "bottom")

head(tripdata_summary)
## # A tibble: 6 x 4
##   member_casual s_year_month mean_ride_length ride_count
##   <chr>         <date>                  <dbl>      <dbl>
## 1 casual        2020-04-01               72.5      23553
## 2 casual        2020-05-01               50.6      86635
## 3 casual        2020-06-01               51.3     154186
## 4 casual        2020-07-01               59.3     267977
## 5 casual        2020-08-01               44.5     280498
## 6 casual        2020-09-01               38.5     214102



      Total trip count of riders
tripdata_summary %>% 
  group_by(member_casual) %>% 
  summarise(total_ride = sum(ride_count)) %>% 
  ggplot(aes(x = member_casual, y = total_ride, fill = member_casual)) +
  theme_grey(base_size = 12) +
  theme(axis.title.x=element_blank()) +
  geom_col(width = 0.50, show.legend = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values=c("purple", "gold")) +
  stat_summary(fun = "sum", aes(label = format(..y.., big.mark = ",")),
               geom = "text", vjust = 2, size = 3.5) +
  ylab("Ride Count") +
  scale_x_discrete(labels = c('Casual', 'Member'))



      Seasonal trips trend
tripdata_summary_weather %>% 
  ggplot(aes(x = s_year_month, y = ride_count)) +
  theme_dark(base_size = 12) +
  geom_smooth(color = "gray", se = FALSE, size = 1) +
  geom_point(aes(color = avg_temp), size = 3) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %y") +
  scale_y_continuous(labels = scales::comma) +
  scale_color_distiller(name = "Temp (C)", palette = "RdBu") +
  labs(x = "Month-Year", y = "Ride Count")

head(tripdata_summary_weather)
## # A tibble: 6 x 4
##   s_year_month avg_temp mean_rl ride_count
##   <date>          <dbl>   <dbl>      <dbl>
## 1 2020-04-01         10      36      84607
## 2 2020-05-01         18      33     199674
## 3 2020-06-01         23      33     341897
## 4 2020-07-01         26      38     548485
## 5 2020-08-01         25      30     604203
## 6 2020-09-01         19      25     497656



      Average ride length
tripdata_summary %>%
  ggplot(aes(x = s_year_month, y = mean_ride_length, color = member_casual)) +
  theme_grey(base_size = 12) +
  geom_smooth(se = FALSE, size = 1.1, show.legend = FALSE) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %y") +
  scale_color_manual(values=c("purple","gold")) +
  ylim(0, 80) +
  labs(x = "Month-Year", y = "Average Ride Length (Min)")



      Rides per day of week
tripdata_summary_daily%>%
  ggplot(aes(x = factor(day_of_week,level = c('Monday', 'Tuesday', 'Wednesday',
                                              'Thursday', 'Friday', 'Saturday', 'Sunday')),
             y = ride_count, fill = member_casual)) +
  theme_grey(base_size = 12) +
  geom_col(width = 0.50, show.legend = FALSE) +
  facet_wrap(~member_casual) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values=c("purple", "gold")) +
  labs(x = "Day of Week", y = "Ride Count") +
  scale_x_discrete(labels = c('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))

head(tripdata_summary_daily)
## # A tibble: 6 x 3
##   member_casual day_of_week ride_count
##   <chr>         <chr>            <dbl>
## 1 casual        Sunday          518410
## 2 casual        Monday          292885
## 3 casual        Tuesday         281572
## 4 casual        Wednesday       286462
## 5 casual        Thursday        302551
## 6 casual        Friday          391052



      Rides per time of day
tripdata_time_of_day%>%
  ggplot(aes(x = started_at_time, y = ride_count, color = member_casual)) +
  theme_grey(base_size = 12) +
  geom_smooth(se = FALSE, size = 1.1, show.legend = FALSE) +
  scale_y_continuous(labels = scales::comma, breaks=seq(0,140000,20000)) +
  scale_color_manual(values=c("purple", "gold")) +
  labs(x = "Time of Day", y = "Ride Count") +
  scale_x_time(breaks = scales::breaks_width("2 hours"),
               labels = label_time(format = "%H:%M", tz = "UTC"))

head(tripdata_time_of_day)
## # A tibble: 6 x 3
##   member_casual started_at_time ride_count
##   <chr>         <time>               <dbl>
## 1 casual        00:00                14507
## 2 casual        00:30                25044
## 3 casual        01:00                20887
## 4 casual        01:30                17087
## 5 casual        02:00                14243
## 6 casual        02:30                10399



      Stations traveled
station_names %>% 
  summarise(total_stations = n_distinct(start_station_name))
## # A tibble: 1 x 1
##   total_stations
##            <int>
## 1            763
tripdata_stations %>%   
ggplot(aes(x = factor(stations, level = c('start_station', 'end_station')), 
             y = stations_covered, fill = member_casual)) +
  theme_grey(base_size = 12) +
  theme(axis.title.x=element_blank()) +
  geom_col(width = 0.50, position = "dodge", show.legend = FALSE) +
  ylim(0, 800) +
  scale_fill_manual(values=c("purple", "gold")) +
  ylab("Stations Covered") +
  scale_x_discrete(labels = c('Start Station', 'End Station')) +
  geom_text(aes(label = stations_covered),
            position = position_dodge(width = 0.5), vjust = 2, size = 3.5)

head(station_names)
## # A tibble: 6 x 1
##   start_station_name                 
##   <chr>                              
## 1 Eckhart Park                       
## 2 Drake Ave & Fullerton Ave          
## 3 McClurg Ct & Erie St               
## 4 California Ave & Division St       
## 5 Rush St & Hubbard St               
## 6 Mies van der Rohe Way & Chicago Ave



Key takeaways:

  1. Bike usage has a seasonal trend.
    • We can prepare our marketing strategies during the ‘Fall’ and ‘Winter’ season.
    • In the colder months, we can gather unused bikes for maintenance in preparation for peak season.
    • We can also pilot test and implement software upgrades in downtrend season.
    • At the start of peak season, we can deploy marketing agents and reach-out to Casual riders for potential Member conversion sale.


  1. Casual riders are more active as the day approaches weekend.
    • In a timing perspective, we can increase the pace of marketing agents reach-out during Friday, Saturday & Sunday.


  1. Optimize the bike allocation and review the station usage.
    • Transfer bikes from low usage stations to in-demand stations to maximize revenue.
    • Review the schedule of bike transfers to stations.
    • We can propose to drop low usage stations to reduce cost.


Appendix:

  1. Bike trips data provided by Motivate International Inc.
  2. Chicago, Illinois weather data provided by Visual Crossing.


Data workflow summary:

      Data cleaning


      Data manipulation


      Data archival for project closure


Thank you. You have reached the end of content.