Cyclistic Bike-Share Case Study

Setting up my environment

Setting up my R environment by loading the “tidyverse” package.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Importing the Datasets

Importing the datasets from 2019 and 2020. These datasets will be used for data cleaning and analysis.

dv2019 <- read_csv("C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/Divvy_Trips_2019_Q1/Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): start_time, end_time, from_station_name, to_station_name, usertype,...
## dbl (6): trip_id, bikeid, from_station_id, to_station_id, birthyear, day_of_...
## num (1): tripduration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dv2020 <- read_csv("C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/Divvy_Trips_2020_Q1/Divvy_Trips_2020_Q1.csv") 
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 426887 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, started_at, ended_at, start_station_name, ...
## dbl  (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Column Verification

Checking the column names of the datasets to identify differences in variable names between the datasets before combining them.

colnames(dv2019)
##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"        
## [13] "ride_length"       "day_of_week"
colnames(dv2020)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"      "ride_length"        "day_of_week"

Aligning Data Structures

Renaming column names in 2019 dataset to match 2020 dataset structure to ensures consistency between the datasets and allows them to be combined later.

dv2019 <- dv2019 %>% 
  rename(ride_id = trip_id, 
         rideable_type = bikeid, 
         started_at = start_time, 
         ended_at = end_time, 
         start_station_name = from_station_name,
         start_station_id = from_station_id, 
         end_station_name = to_station_name, 
         end_station_id = to_station_id, 
         member_casual = usertype)

Checking Data Types

Checking the data type of each column in the data frame to check for inconsistencies.

str(dv2019)
## spc_tbl_ [365,069 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
##  $ started_at        : chr [1:365069] "1/1/2019 0:04" "1/1/2019 0:08" "1/1/2019 0:13" "1/1/2019 0:13" ...
##  $ ended_at          : chr [1:365069] "1/1/2019 0:11" "1/1/2019 0:15" "1/1/2019 0:27" "1/1/2019 0:43" ...
##  $ rideable_type     : num [1:365069] 2167 4386 1524 252 1170 ...
##  $ tripduration      : num [1:365069] 390 441 829 1783 364 ...
##  $ start_station_id  : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ member_casual     : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender            : chr [1:365069] "Male" "Female" "Female" "Male" ...
##  $ birthyear         : num [1:365069] 1989 1990 1994 1993 1994 ...
##  $ ride_length       : chr [1:365069] "0:06:30" "0:07:21" "0:13:49" "0:29:43" ...
##  $ day_of_week       : num [1:365069] 3 3 3 3 3 3 3 3 3 3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_character(),
##   ..   end_time = col_character(),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double(),
##   ..   ride_length = col_character(),
##   ..   day_of_week = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(dv2020)
## spc_tbl_ [426,887 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:426887] "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
##  $ rideable_type     : chr [1:426887] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : chr [1:426887] "1/21/2020 20:06" "1/30/2020 14:22" "1/9/2020 19:29" "1/6/2020 16:17" ...
##  $ ended_at          : chr [1:426887] "1/21/2020 20:14" "1/30/2020 14:26" "1/9/2020 19:32" "1/6/2020 16:25" ...
##  $ start_station_name: chr [1:426887] "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
##  $ start_station_id  : num [1:426887] 239 234 296 51 66 212 96 96 212 38 ...
##  $ end_station_name  : chr [1:426887] "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
##  $ end_station_id    : num [1:426887] 326 318 117 24 212 96 212 212 96 100 ...
##  $ start_lat         : num [1:426887] 42 42 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:426887] -87.7 -87.7 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:426887] 42 42 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:426887] -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:426887] "member" "member" "member" "member" ...
##  $ ride_length       : 'hms' num [1:426887] 00:07:31 00:03:43 00:02:51 00:08:49 ...
##   ..- attr(*, "units")= chr "secs"
##  $ day_of_week       : num [1:426887] 3 5 5 2 5 6 6 6 6 6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_character(),
##   ..   ended_at = col_character(),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character(),
##   ..   ride_length = col_time(format = ""),
##   ..   day_of_week = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Changing Data Types

The data types of selected variables for both datasets are adjusted to ensure consistency.

dv2020 <- dv2020 %>% 
  mutate(ride_length = as.difftime(ride_length))

dv2019 <- dv2019 %>% 
  mutate(ride_id = as.character(ride_id),
         rideable_type = as.character(rideable_type),
         ride_length = as.difftime(ride_length))

Merging datasets

Combining both datasets into one data frame.

all_trips <- bind_rows(dv2019,dv2020)

Removing Unnecessary Columns

Removing colummns that are not requird for the analysis.

all_trips <- all_trips %>% 
  select(-c(start_lat,start_lng,
            end_lat,end_lng,
            birthyear,gender,
            tripduration))

Inspecting the Dataset

Examining the structure and basic characteristics of the dataset-checking column names, number of rows, dimensions, previewing the first and last few rows, and reviewing the data structure and summary statistics.

colnames(all_trips)
##  [1] "ride_id"            "started_at"         "ended_at"          
##  [4] "rideable_type"      "start_station_id"   "start_station_name"
##  [7] "end_station_id"     "end_station_name"   "member_casual"     
## [10] "ride_length"        "day_of_week"
nrow(all_trips)
## [1] 791956
dim(all_trips)
## [1] 791956     11
head(all_trips)
## # A tibble: 6 × 11
##   ride_id  started_at ended_at rideable_type start_station_id start_station_name
##   <chr>    <chr>      <chr>    <chr>                    <dbl> <chr>             
## 1 21742443 1/1/2019 … 1/1/201… 2167                       199 Wabash Ave & Gran…
## 2 21742444 1/1/2019 … 1/1/201… 4386                        44 State St & Randol…
## 3 21742445 1/1/2019 … 1/1/201… 1524                        15 Racine Ave & 18th…
## 4 21742446 1/1/2019 … 1/1/201… 252                        123 California Ave & …
## 5 21742447 1/1/2019 … 1/1/201… 1170                       173 Mies van der Rohe…
## 6 21742448 1/1/2019 … 1/1/201… 2437                        98 LaSalle St & Wash…
## # ℹ 5 more variables: end_station_id <dbl>, end_station_name <chr>,
## #   member_casual <chr>, ride_length <time>, day_of_week <dbl>
tail(all_trips)
## # A tibble: 6 × 11
##   ride_id  started_at ended_at rideable_type start_station_id start_station_name
##   <chr>    <chr>      <chr>    <chr>                    <dbl> <chr>             
## 1 6F4D221… 3/10/2020… 3/10/20… docked_bike                675 HQ QR             
## 2 ADDAA33… 3/10/2020… 3/10/20… docked_bike                675 HQ QR             
## 3 82B10FA… 3/7/2020 … 3/7/202… docked_bike                161 Rush St & Superio…
## 4 AA0D5AA… 3/1/2020 … 3/1/202… docked_bike                141 Clark St & Lincol…
## 5 3296360… 3/7/2020 … 3/7/202… docked_bike                672 Franklin St & Ill…
## 6 064EC76… 3/8/2020 … 3/8/202… docked_bike                110 Dearborn St & Eri…
## # ℹ 5 more variables: end_station_id <dbl>, end_station_name <chr>,
## #   member_casual <chr>, ride_length <time>, day_of_week <dbl>
str(all_trips)
## tibble [791,956 × 11] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
##  $ started_at        : chr [1:791956] "1/1/2019 0:04" "1/1/2019 0:08" "1/1/2019 0:13" "1/1/2019 0:13" ...
##  $ ended_at          : chr [1:791956] "1/1/2019 0:11" "1/1/2019 0:15" "1/1/2019 0:27" "1/1/2019 0:43" ...
##  $ rideable_type     : chr [1:791956] "2167" "4386" "1524" "252" ...
##  $ start_station_id  : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
##  $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ end_station_id    : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
##  $ end_station_name  : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ member_casual     : chr [1:791956] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ ride_length       : 'hms' num [1:791956] 00:06:30 00:07:21 00:13:49 00:29:43 ...
##   ..- attr(*, "units")= chr "secs"
##  $ day_of_week       : num [1:791956] 3 3 3 3 3 3 3 3 3 3 ...
summary(all_trips)
##    ride_id           started_at          ended_at         rideable_type     
##  Length:791956      Length:791956      Length:791956      Length:791956     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  start_station_id start_station_name end_station_id  end_station_name  
##  Min.   :  2.0    Length:791956      Min.   :  2.0   Length:791956     
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   Class :character  
##  Median :174.0    Mode  :character   Median :174.0   Mode  :character  
##  Mean   :204.4                       Mean   :204.4                     
##  3rd Qu.:291.0                       3rd Qu.:291.0                     
##  Max.   :675.0                       Max.   :675.0                     
##                                      NA's   :1                         
##  member_casual       ride_length               day_of_week  
##  Length:791956      Min.   :00:00:00.000000   Min.   :1.00  
##  Class :character   1st Qu.:00:05:28.000000   1st Qu.:3.00  
##  Mode  :character   Median :00:08:57.000000   Median :4.00  
##                     Mean   :00:14:22.601495   Mean   :3.99  
##                     3rd Qu.:00:15:09.000000   3rd Qu.:5.00  
##                     Max.   :99:16:27.000000   Max.   :7.00  
##                     NA's   :405

Checking User Type Distribution

Counting the number of observations of each user type which should only have two categories: member and casual.

table(all_trips$member_casual)
## 
##     casual   Customer     member Subscriber 
##      48480      23163     378407     341906

Standardizing User Type Labels

Changing the user type labels to make the categories consistent.

all_trips<-all_trips %>% 
  mutate(member_casual = recode(member_casual,
         "Subscriber" = "member",
         "Customer" = "casual"))

Removing Bad Data

Removing data consisting test rides and negative ride length (incorrect data).

all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]

Ride Length Statistics by User Type

Calculating mean, median, maximum and minimum value of ride length according to user types.

rl_mean_member_casual <- all_trips_v2 %>% 
  aggregate(ride_length ~ member_casual,FUN = mean)
rl_mean_member_casual
##   member_casual    ride_length
## 1        casual 2679.6586 secs
## 2        member  696.0927 secs
rl_median_member_casual <- all_trips_v2 %>% 
  aggregate(ride_length ~ member_casual,FUN = median)
rl_median_member_casual
##   member_casual ride_length
## 1        casual        1390
## 2        member         508
rl_max_member_casual <- all_trips_v2 %>% 
  aggregate(ride_length ~ member_casual,FUN = max)
rl_max_member_casual
##   member_casual ride_length
## 1        casual 357387 secs
## 2        member 347120 secs
rl_min_member_casual <- all_trips_v2 %>% 
  aggregate(ride_length ~ member_casual,FUN = min)
rl_min_member_casual
##   member_casual ride_length
## 1        casual      2 secs
## 2        member      1 secs

Converting Day Numbers to Names

Changing numeric codes of day of the week (1-7) to actual day names.

all_trips_v2 <- all_trips_v2 %>% 
  mutate(started_at = mdy_hm(started_at)) %>% 
  mutate(day_of_week = recode(day_of_week,
        "1" = "Sunday",
        "2" = "Monday",
        "3" = "Tuesday",
        "4" = "Wednesday",
        "5" = "Thursday",
        "6" = "Friday",
        "7" = "Saturday"))
head(all_trips_v2$day_of_week)
## [1] "Tuesday" "Tuesday" "Tuesday" "Tuesday" "Tuesday" "Tuesday"

Average Ride Length by User Type and Day of the Week

Calculating mean ride length for each combination of user type and day of the week.

aggregate(all_trips_v2$ride_length ~
          all_trips_v2$member_casual + all_trips_v2$day_of_week,
          FUN = mean)
##    all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1                      casual                   Friday           2679.4308 secs
## 2                      member                   Friday            688.3539 secs
## 3                      casual                   Monday           2227.5281 secs
## 4                      member                   Monday            671.8463 secs
## 5                      casual                 Saturday           2578.1526 secs
## 6                      member                 Saturday            761.6883 secs
## 7                      casual                   Sunday           2898.9683 secs
## 8                      member                   Sunday            780.7767 secs
## 9                      casual                 Thursday           2658.6866 secs
## 10                     member                 Thursday            675.2459 secs
## 11                     casual                  Tuesday           2384.9871 secs
## 12                     member                  Tuesday            685.3928 secs
## 13                     casual                Wednesday           2953.6225 secs
## 14                     member                Wednesday            684.2431 secs

Converting Ride Length to Numeric

Converting to numeric allows calculations like mean, median and create charts for analysis.

all_trips_v2$ride_length <- as.numeric(all_trips_v2$ride_length, 
                                       units = "secs")

Comparative Analysis: Ride Volume vs. Trip Duration

These visualizations contrast the total number of rides with the average ride duration for each user segment. The data reveals a significant inverse relationship:

  • Annual Members demonstrate high-frequency usage, outdoing casual riders in total number of trips by 10x. This indicates a high level of loyalty and daily reliance on the service.
  • While Casual Riders has lower total number of rides, these users exhibit a 300% higher average trip duration compared to members. This suggests that casual usage is primarily focused on leisure or recreational activities rather than quick commutes.
all_trips_v2 %>% 
  drop_na(member_casual) %>% 
  group_by(member_casual) %>% 
  summarise(number_of_rides = n(),
            average_duration = round(mean(ride_length,na.rm=TRUE))) %>% 
  arrange(member_casual) %>% 
  ggplot(aes(x = member_casual,y = average_duration,fill = member_casual))+
         geom_col()+
         geom_text(aes(label = average_duration),vjust = 1.5,color = "black")+
         labs(title = "Average Duration Vs. Usertype", 
              subtitle = "Casual Riders have higher average tripduration than Annual Members by 3.8x",
              x = "User Type",
              y = "Average Duration (in seconds)")+
         scale_fill_manual(name = "User Type",
                           values = c("casual"="pink","member"="yellow"),
                           labels = c("Casual Riders","Annual Members"))

all_trips_v2 %>% 
  drop_na(member_casual) %>% 
  group_by(member_casual) %>% 
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length,na.rm = TRUE)) %>% 
  arrange(member_casual) %>% 
  ggplot(aes(x = member_casual,y = number_of_rides,fill = member_casual))+
    geom_col()+
    geom_text(aes(label = scales::comma(number_of_rides)),vjust = 1.5,color = "black")+
    labs(title = "Total Rides by User Type",
         subtitle = "Annual Members have significantly higher number of rides than Casual Riders by 10.6x.",
         x = "User Type",
         y = "Number of Rides")+
    scale_fill_manual(name="User Type",
                    values=c("casual"="lightblue","member"="lightgreen"),
                    labels=c("Casual Riders","Annual Members"))

Analysis of Total Number of Trips by User Type and Weekday

The data reveals a distinct difference in usage patterns between Annual Members and Casual Riders.

  • Annual Members exhibit a consistent, high number of usage during the workweek (Monday–Friday). This strongly suggests that members utilize Cyclistic for daily function purposes, such as daily commuting to and from work.
  • Casual Riders demonstrate a significant rise in total number of trips during the weekend. This indicates that casual usage is mainly leisure-driven, likely for tourism, or entertainment.
all_trips_v2 %>% 
  mutate(weekday = wday(started_at,label = TRUE)) %>%
  drop_na(weekday,member_casual) %>% 
  group_by(member_casual,weekday) %>% 
  summarise(number_of_rides = n(),
            average_duration = mean(ride_length,na.rm = TRUE),.groups = "drop") %>%
  arrange(member_casual,weekday) %>% 
  ggplot(aes(x = weekday,y = number_of_rides,fill = member_casual))+
    geom_col(position = "dodge")+
    labs(title = "Weekly Usage Pattern: Member vs. Casual",
         subtitle = "Members peak during the work week; Casual riders surge on weekends",
         x = "Day of the Week",
         y = "Number of Rides")+
    scale_fill_manual(name = "User Type",
                      values = c("casual"="blue","member"="green"),
                      labels = c("Casual Riders","Annual Members") )

Strategic Recommendations

Based on the analysis of Cyclistic’s 2019 and 2020 trip data, I recommend the following strategies to drive Casual-to-Member conversions:

  • Start a “Weekend Member” pass. Since the data shows Casual Riders thrive on weekends and has 3x longer average usage duration, offering a membership that focuses on weekend leisure benefits could connect with users who are not ready for a full annual commitment.

  • Gamification️. Collaborate with the marketing team to build “Member-Only Scenic Routes” within the app. Offer digital rewards or points for completing specific scenic paths. These points could eventually be redeemed for a discount on a full Annual Membership.

Exporting Files for Further Use (on Tableau)

Exporting the average duration and total number of rides grouping it by week day, user types and time of the day.

counts <- all_trips_v2 %>% 
  drop_na() %>% 
  mutate(hour_of_day = hour(started_at)) %>% 
  group_by(day_of_week,member_casual,hour_of_day) %>% 
  summarise(mean_ride_length = mean(ride_length,na.rm=TRUE),
            number_of_rides = n(),.groups = "drop")

write.csv(counts,"C:/Users/asus/OneDrive/Documents/GOOGLE CERTIFICATE DATA ANALYST/avg_ride_length.csv")