As a Data Analyst working on the marketing analyst team at Cyclistic, a bike-share company in Chicago, The director of marketing believes the company’s future success depends on maximizing the number of annual memberships.
A clear goal has been set: Design marketing strategies aimed at converting casual riders into annual members. In order to do that, however, the team needs to better understand how annual members and casual riders use Cyclistic bikes differently.
The major interest is analyzing the Cyclistic historical bike trip data to identify trends. With these insights and recommendations approved by the Cyclistic Executives, a new marketing strategy to convert casual riders into annual members will be designed.
Question to answer that will guide the future marketing program: How do annual members and casual riders use Cyclistic Bikes differently?
I used Cyclistic’s historical 2019_Q1 and 2020_Q1 trip data sets to analyse and identify trends. For the purposes of this case study, the data sets are appropriate and will enable me to answer the business questions. The data has been made available by Motivate International Inc. under this license.
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
colnames(trip_q1_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(trip_q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
str(trip_q1_2019)
## spc_tbl_ [365,069 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
## $ started_at : POSIXct[1:365069], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ ended_at : POSIXct[1:365069], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ rideable_type : num [1:365069] 2167 4386 1524 252 1170 ...
## $ tripduration : num [1:365069] 390 441 829 1783 364 ...
## $ start_station_id : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ member_casual : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:365069] "Male" "Female" "Female" "Male" ...
## $ birthyear : num [1:365069] 1989 1990 1994 1993 1994 ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_datetime(format = ""),
## .. end_time = col_datetime(format = ""),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(trip_q1_2020)
## spc_tbl_ [426,887 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:426887] "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
## $ rideable_type : chr [1:426887] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : POSIXct[1:426887], format: "2020-01-21 20:06:59" "2020-01-30 14:22:39" ...
## $ ended_at : POSIXct[1:426887], format: "2020-01-21 20:14:30" "2020-01-30 14:26:22" ...
## $ start_station_name: chr [1:426887] "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
## $ start_station_id : num [1:426887] 239 234 296 51 66 212 96 96 212 38 ...
## $ end_station_name : chr [1:426887] "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
## $ end_station_id : num [1:426887] 326 318 117 24 212 96 212 212 96 100 ...
## $ start_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ start_lng : num [1:426887] -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ end_lng : num [1:426887] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:426887] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
ride_id and rideable_type to
character so that they can stack correctlytrip_q1_2019 <- mutate(trip_q1_2019, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
all_trips <- bind_rows(trip_q1_2019,trip_q1_2020)
all_trips <- all_trips %>%
select(-c(start_lat,start_lng,end_lat,end_lng,birthyear,gender, "tripduration"))
colnames(all_trips)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
nrow(all_trips)
## [1] 791956
dim(all_trips)
## [1] 791956 9
head(all_trips)
## # A tibble: 6 × 9
## ride_id started_at ended_at rideable_type start_station_id
## <chr> <dttm> <dttm> <chr> <dbl>
## 1 217424… 2019-01-01 00:04:37 2019-01-01 00:11:07 2167 199
## 2 217424… 2019-01-01 00:08:13 2019-01-01 00:15:34 4386 44
## 3 217424… 2019-01-01 00:13:23 2019-01-01 00:27:12 1524 15
## 4 217424… 2019-01-01 00:13:45 2019-01-01 00:43:28 252 123
## 5 217424… 2019-01-01 00:14:52 2019-01-01 00:20:56 1170 173
## 6 217424… 2019-01-01 00:15:33 2019-01-01 00:19:09 2437 98
## # ℹ 4 more variables: start_station_name <chr>, end_station_id <dbl>,
## # end_station_name <chr>, member_casual <chr>
tail(all_trips)
## # A tibble: 6 × 9
## ride_id started_at ended_at rideable_type start_station_id
## <chr> <dttm> <dttm> <chr> <dbl>
## 1 6F4D22… 2020-03-10 10:40:27 2020-03-10 10:40:29 docked_bike 675
## 2 ADDAA3… 2020-03-10 10:40:06 2020-03-10 10:40:07 docked_bike 675
## 3 82B10F… 2020-03-07 15:25:55 2020-03-07 16:14:03 docked_bike 161
## 4 AA0D5A… 2020-03-01 13:12:38 2020-03-01 13:38:29 docked_bike 141
## 5 329636… 2020-03-07 18:02:45 2020-03-07 18:13:18 docked_bike 672
## 6 064EC7… 2020-03-08 13:03:57 2020-03-08 13:32:27 docked_bike 110
## # ℹ 4 more variables: start_station_name <chr>, end_station_id <dbl>,
## # end_station_name <chr>, member_casual <chr>
str(all_trips)
## tibble [791,956 × 9] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
## $ started_at : POSIXct[1:791956], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ ended_at : POSIXct[1:791956], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ rideable_type : chr [1:791956] "2167" "4386" "1524" "252" ...
## $ start_station_id : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ member_casual : chr [1:791956] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
summary(all_trips)
## ride_id started_at
## Length:791956 Min. :2019-01-01 00:04:37.00
## Class :character 1st Qu.:2019-02-28 17:04:04.75
## Mode :character Median :2020-01-07 12:48:50.50
## Mean :2019-09-01 11:58:08.35
## 3rd Qu.:2020-02-19 19:31:54.75
## Max. :2020-03-31 23:51:34.00
##
## ended_at rideable_type start_station_id
## Min. :2019-01-01 00:11:07.00 Length:791956 Min. : 2.0
## 1st Qu.:2019-02-28 17:15:58.75 Class :character 1st Qu.: 77.0
## Median :2020-01-07 13:02:50.00 Mode :character Median :174.0
## Mean :2019-09-01 12:17:52.17 Mean :204.4
## 3rd Qu.:2020-02-19 19:51:54.50 3rd Qu.:291.0
## Max. :2020-05-19 20:10:34.00 Max. :675.0
##
## start_station_name end_station_id end_station_name member_casual
## Length:791956 Min. : 2.0 Length:791956 Length:791956
## Class :character 1st Qu.: 77.0 Class :character Class :character
## Mode :character Median :174.0 Mode :character Mode :character
## Mean :204.4
## 3rd Qu.:291.0
## Max. :675.0
## NA's :1
In the member_casual column, there are two names for
members(‘member’ and ‘Subscribers’) and two names for casual
riders(‘casual’ and ‘Customer’). We need to consolidate that from four
to two labels.
The data can only be aggregated by rider level, which is too granular. We will want to add some additional columns of data–such as Day, Month, Year–that provide additional opportunities to aggregate the data.
We will want to add the length of ride since the 2020_Q1 data did
not have the tripduration column. We will add “ride_length”
to the entire data frame for consistency.
There are some rides where trip duration shows up as negative, including several hundred rides where Cyclistic took bikes out of circulation for Quality Control reasons. We will want to delete these rides.
member_casual column, replace “Subscriber” with
“member” and “Customer” with “casual”.Before 2020, Cyclistic used different labels for these two types of riders…we will want to make our data frame consistent with their current nomenclature.
N.B.: “Level” is a special property of a column that is retained even if a subset does not contain any values from a specific level
Begin by seeing how many observations fall under each user type
table(all_trips$member_casual)
##
## casual Customer member Subscriber
## 48480 23163 378407 341906
Reassign to the desired values (we will go with the current 2020 labels)
all_trips <- all_trips %>%
mutate(member_casual = recode(member_casual
,"Subscriber"="member"
,"Customer"="casual"))
Check to see if the proper number of observations were reassigned
table(all_trips$member_casual)
##
## casual member
## 71643 720313
This will allow us to aggregate ride data for each day, month or year…Before completing these operations we could only aggregate at the ride level
all_trips$date <- as.Date(all_trips$started_at)# the default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date),"%m")
all_trips$day <- format(as.Date(all_trips$date),"%d")
all_trips$year <- format(as.Date(all_trips$date),"%Y")
all_trips$day_of_the_week <- format(as.Date(all_trips$date),"%A")
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)
str(all_trips)
## tibble [791,956 × 15] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
## $ started_at : POSIXct[1:791956], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ ended_at : POSIXct[1:791956], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ rideable_type : chr [1:791956] "2167" "4386" "1524" "252" ...
## $ start_station_id : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
## $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ end_station_id : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
## $ end_station_name : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ member_casual : chr [1:791956] "member" "member" "member" "member" ...
## $ date : Date[1:791956], format: "2019-01-01" "2019-01-01" ...
## $ month : chr [1:791956] "01" "01" "01" "01" ...
## $ day : chr [1:791956] "01" "01" "01" "01" ...
## $ year : chr [1:791956] "2019" "2019" "2019" "2019" ...
## $ day_of_the_week : chr [1:791956] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
## $ ride_length : 'difftime' num [1:791956] 390 441 829 1783 ...
## ..- attr(*, "units")= chr "secs"
ride_length from factor to numeric so we
can run calculations on the datais.factor(all_trips$ride_length)
## [1] FALSE
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE
The data frame includes a few hundred entries where bikes were taken out of dock by Cyclistic or ride_length was negative. We will create a new version of the data frame (v2) since data is being removed
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length<0),]
ride_length (measured in
seconds)summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 331 539 1189 912 10632022
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 5372.7839
## 2 member 795.2523
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 1393
## 2 member 508
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 10632022
## 2 member 6096428
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 2
## 2 member 1
aggregate(all_trips_v2,ride_length ~ member_casual + day_of_the_week, FUN = mean)
## member_casual day_of_the_week ride_length
## 1 casual Friday 6090.7373
## 2 member Friday 796.7338
## 3 casual Monday 4752.0504
## 4 member Monday 822.3112
## 5 casual Saturday 4950.7708
## 6 member Saturday 974.0730
## 7 casual Sunday 5061.3044
## 8 member Sunday 972.9383
## 9 casual Thursday 8451.6669
## 10 member Thursday 707.2093
## 11 casual Tuesday 4561.8039
## 12 member Tuesday 769.4416
## 13 casual Wednesday 4480.3724
## 14 member Wednesday 711.9838
all_trips_v2$day_of_the_week <- ordered(all_trips_v2$day_of_the_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
aggregate(all_trips_v2,ride_length ~ member_casual + day_of_the_week, FUN = mean)
## member_casual day_of_the_week ride_length
## 1 casual Sunday 5061.3044
## 2 member Sunday 972.9383
## 3 casual Monday 4752.0504
## 4 member Monday 822.3112
## 5 casual Tuesday 4561.8039
## 6 member Tuesday 769.4416
## 7 casual Wednesday 4480.3724
## 8 member Wednesday 711.9838
## 9 casual Thursday 8451.6669
## 10 member Thursday 707.2093
## 11 casual Friday 6090.7373
## 12 member Friday 796.7338
## 13 casual Saturday 4950.7708
## 14 member Saturday 974.0730
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual,weekday) %>% #groups by user type and weekday
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>% #calculates the number of rides and average duration
arrange(member_casual,weekday)#sorts
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = member_casual, y = number_of_rides, fill = weekday )) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent) + # Format y-axis as percentages
labs(
title = "Usage Pattern by Day of the Week",
x = "Day of the Week",
y = "Number of Rides",
fill = "Rider Type"
) +
theme_minimal()
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
This shows the usage patterns of casual and member riders by day of the week. Member riders tend to gravitate towards weekdays while casual riders gravitate towards weekends.
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x= weekday, y= average_duration, fill = member_casual))+
geom_col(position = "dodge") +
labs(
title = "Ride Length Habits",
x = "Day of the Week",
y = "Average Ride Length",
fill = "Rider Type"
) +
theme_minimal()
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
This shows the average ride length of each rider type. Casual riders tend to have longer rides than Member riders on average.
all_trips_v2 that we will
visualize in Microsoft Power BI to create a dashboard.Cyclistic Bike-Share Analysis Dashboard
Key Insights & Data Driven Recommendations