1. Ask
Business Task:
How do annual members and casual riders use Cyclistic bikes
differently?
Goal:
Discover usage patterns and provide insights that can help convert
casual riders into annual members.
2. Prepare Load libraries
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # For working with dates
library(janitor) # For cleaning column names
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Read both CSV files Data sources:
Divvy Q1 2019 (Divvy_Trips_2019_Q1.csv)
Divvy Q1 2020 (Divvy_Trips_2020_Q1.csv)
q1_2019 <- read_csv("Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): from_station_name, to_station_name, usertype, gender
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## dttm (2): start_time, end_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q1_2020 <- read_csv("Divvy_Trips_2020_Q1.csv")
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Verified column names across both datasets
colnames(q1_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
glimpse(q1_2019)
## Rows: 365,069
## Columns: 12
## $ trip_id <dbl> 21742443, 21742444, 21742445, 21742446, 21742447, 21…
## $ start_time <dttm> 2019-01-01 00:04:37, 2019-01-01 00:08:13, 2019-01-0…
## $ end_time <dttm> 2019-01-01 00:11:07, 2019-01-01 00:15:34, 2019-01-0…
## $ bikeid <dbl> 2167, 4386, 1524, 252, 1170, 2437, 2708, 2796, 6205,…
## $ tripduration <dbl> 390, 441, 829, 1783, 364, 216, 177, 100, 1727, 336, …
## $ from_station_id <dbl> 199, 44, 15, 123, 173, 98, 98, 211, 150, 268, 299, 2…
## $ from_station_name <chr> "Wabash Ave & Grand Ave", "State St & Randolph St", …
## $ to_station_id <dbl> 84, 624, 644, 176, 35, 49, 49, 142, 148, 141, 295, 4…
## $ to_station_name <chr> "Milwaukee Ave & Grand Ave", "Dearborn St & Van Bure…
## $ usertype <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe…
## $ gender <chr> "Male", "Female", "Female", "Male", "Male", "Female"…
## $ birthyear <dbl> 1989, 1990, 1994, 1993, 1994, 1983, 1984, 1990, 1995…
glimpse(q1_2020)
## Rows: 426,887
## Columns: 13
## $ ride_id <chr> "EACB19130B0CDA4A", "8FED874C809DC021", "789F3C21E4…
## $ rideable_type <chr> "docked_bike", "docked_bike", "docked_bike", "docke…
## $ started_at <dttm> 2020-01-21 20:06:59, 2020-01-30 14:22:39, 2020-01-…
## $ ended_at <dttm> 2020-01-21 20:14:30, 2020-01-30 14:26:22, 2020-01-…
## $ start_station_name <chr> "Western Ave & Leland Ave", "Clark St & Montrose Av…
## $ start_station_id <dbl> 239, 234, 296, 51, 66, 212, 96, 96, 212, 38, 117, 1…
## $ end_station_name <chr> "Clark St & Leland Ave", "Southport Ave & Irving Pa…
## $ end_station_id <dbl> 326, 318, 117, 24, 212, 96, 212, 212, 96, 100, 632,…
## $ start_lat <dbl> 41.9665, 41.9616, 41.9401, 41.8846, 41.8856, 41.889…
## $ start_lng <dbl> -87.6884, -87.6660, -87.6455, -87.6319, -87.6418, -…
## $ end_lat <dbl> 41.9671, 41.9542, 41.9402, 41.8918, 41.8899, 41.884…
## $ end_lng <dbl> -87.6674, -87.6644, -87.6530, -87.6206, -87.6343, -…
## $ member_casual <chr> "member", "member", "member", "member", "member", "…
str(q1_2019)
## spc_tbl_ [365,069 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
## $ start_time : POSIXct[1:365069], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ end_time : POSIXct[1:365069], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ bikeid : num [1:365069] 2167 4386 1524 252 1170 ...
## $ tripduration : num [1:365069] 390 441 829 1783 364 ...
## $ from_station_id : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
## $ from_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ to_station_id : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
## $ to_station_name : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ usertype : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:365069] "Male" "Female" "Female" "Male" ...
## $ birthyear : num [1:365069] 1989 1990 1994 1993 1994 ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_datetime(format = ""),
## .. end_time = col_datetime(format = ""),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(q1_2020)
## spc_tbl_ [426,887 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:426887] "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
## $ rideable_type : chr [1:426887] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : POSIXct[1:426887], format: "2020-01-21 20:06:59" "2020-01-30 14:22:39" ...
## $ ended_at : POSIXct[1:426887], format: "2020-01-21 20:14:30" "2020-01-30 14:26:22" ...
## $ start_station_name: chr [1:426887] "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
## $ start_station_id : num [1:426887] 239 234 296 51 66 212 96 96 212 38 ...
## $ end_station_name : chr [1:426887] "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
## $ end_station_id : num [1:426887] 326 318 117 24 212 96 212 212 96 100 ...
## $ start_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ start_lng : num [1:426887] -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:426887] 42 42 41.9 41.9 41.9 ...
## $ end_lng : num [1:426887] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:426887] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
3. Process The 2019 dataset uses different column names than 2020. We rename 2019 columns so both datasets match.
# Rename columns in q1_2019 to match q1_2020
q1_2019_clean <- q1_2019 %>%
rename(
ride_id = trip_id,
started_at = start_time,
ended_at = end_time,
rideable_type = bikeid,
start_station_id = from_station_id,
start_station_name = from_station_name,
end_station_id = to_station_id,
end_station_name = to_station_name,
member_casual = usertype
)
# Drop extra cols not in 2020 (gender, birth_year, trip_duration)
q1_2019_clean <- q1_2019_clean %>%
select(ride_id, rideable_type, started_at, ended_at,
start_station_name, start_station_id, end_station_name, end_station_id, member_casual)
# Drop extra columns from 2020 dataset that are not in 2019
q1_2020_clean <- q1_2020 %>%
select(-start_lat, -start_lng, -end_lat, -end_lng)
Created ride_length and day_of_week We calculate ride duration in minutes using difftime() We use lubridate::wday() to extract the day name (Sunday, Monday, etc.)
# To add ride_length and day of the week for 2019
# For 2019
q1_2019_clean <- q1_2019_clean %>%
mutate(
started_at = as.POSIXct(started_at, format="%Y-%m-%d %H:%M:%S"),
ended_at = as.POSIXct(ended_at, format="%Y-%m-%d %H:%M:%S"),
ride_length = as.numeric(difftime(ended_at, started_at, units="mins")),
day_of_week = as.numeric(format(started_at, "%u")) %% 7 + 1
)
# For 2020
q1_2020 <- q1_2020 %>%
mutate(
started_at = as.POSIXct(started_at, format="%Y-%m-%d %H:%M:%S"),
ended_at = as.POSIXct(ended_at, format="%Y-%m-%d %H:%M:%S"),
ride_length = as.numeric(difftime(ended_at, started_at, units="mins")),
day_of_week = as.numeric(format(started_at, "%u")) %% 7 + 1
)
Remove invalid rides
# Remove invalid rides
q1_2019_clean <- q1_2019_clean %>%
filter(ride_length > 0,
!is.na(start_station_name),
!is.na(end_station_name))
# Check for invalid date formats
summary(q1_2019_clean$started_at)
## Min. 1st Qu. Median
## "2019-01-01 00:04:37" "2019-01-23 05:26:54" "2019-02-25 07:52:56"
## Mean 3rd Qu. Max.
## "2019-02-19 21:43:15" "2019-03-17 16:52:47" "2019-03-31 23:53:48"
summary(q1_2020_clean$started_at)
## Min. 1st Qu. Median
## "2020-01-01 00:04:44" "2020-01-24 14:03:26" "2020-02-17 05:01:27"
## Mean 3rd Qu. Max.
## "2020-02-14 01:23:18" "2020-03-05 15:08:13" "2020-03-31 23:51:34"
# Confirm they are datetime objects
class(q1_2019_clean$started_at)
## [1] "POSIXct" "POSIXt"
class(q1_2020_clean$started_at)
## [1] "POSIXct" "POSIXt"
# Make ride_id a character in both datasets
q1_2019_clean <- q1_2019_clean %>%
mutate(ride_id = as.character(ride_id))
q1_2020_clean <- q1_2020_clean %>%
mutate(ride_id = as.character(ride_id))
# Ensure both are character
q1_2019_clean$rideable_type <- as.character(q1_2019_clean$rideable_type)
q1_2020_clean$rideable_type <- as.character(q1_2020_clean$rideable_type)
4. Analyze #### Merge Datasets
# Combine datasets
all_trips <- bind_rows(q1_2019_clean, q1_2020_clean)
#Standardize user types: change "Subscribers" to "member", "Customer" to "casual"
all_trips <- all_trips %>%
mutate(member_casual = recode(member_casual,
"Subscriber" = "member",
"Customer" = "casual"))
summary statistics
# Summary by rider type
all_trips %>%
group_by(member_casual) %>%
summarise(
count_rides = n(),
avg_ride = mean(ride_length, na.rm=TRUE),
median_ride = median(ride_length, na.rm=TRUE),
max_ride = max(ride_length, na.rm=TRUE),
min_ride = min(ride_length, na.rm=TRUE)
)
## # A tibble: 2 × 6
## member_casual count_rides avg_ride median_ride max_ride min_ride
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 casual 71643 61.9 23.4 177200. 1.02
## 2 member 720313 13.9 8.35 101607. 1.02
Distribution across days of the week
all_trips %>%
group_by(member_casual, day_of_week) %>%
summarise(num_rides = n(),
avg_duration = mean(ride_length, na.rm=TRUE))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 4
## # Groups: member_casual [2]
## member_casual day_of_week num_rides avg_duration
## <chr> <dbl> <int> <dbl>
## 1 casual 1 3766 41.6
## 2 casual 2 1892 44.5
## 3 casual 3 2728 40.5
## 4 casual 4 2489 52.0
## 5 casual 5 2920 134.
## 6 casual 6 3375 59.9
## 7 casual 7 5993 60.3
## 8 casual NA 48480 NaN
## 9 member 1 24233 16.8
## 10 member 2 48507 14.6
## 11 member 3 58277 14.4
## 12 member 4 57925 12.1
## 13 member 5 63983 12.0
## 14 member 6 59672 13.9
## 15 member 7 29309 17.0
## 16 member NA 378407 NaN
# Ride counts by month, keeps Jan-Dec order
all_trips %>%
mutate(month = format(started_at, "%B")) %>%
group_by(member_casual, month) %>%
summarise(
number_of_rides = n(),
avg_duration = mean(ride_length, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(match(month, month.name))
## # A tibble: 6 × 4
## member_casual month number_of_rides avg_duration
## <chr> <chr> <int> <dbl>
## 1 casual January 12387 47.3
## 2 member January 234769 15.7
## 3 casual February 15508 146.
## 4 member February 220263 13.4
## 5 casual March 43748 52.3
## 6 member March 265281 13.0
n_distinct(all_trips$ride_id)
## [1] 791956
Unique station usage
# Top 10 most popular start stations
all_trips %>%
group_by(start_station_name) %>%
summarise(rides = n()) %>%
arrange(desc(rides)) %>%
head(10)
## # A tibble: 10 × 2
## start_station_name rides
## <chr> <int>
## 1 Canal St & Adams St 14155
## 2 Clinton St & Washington Blvd 13640
## 3 Clinton St & Madison St 13362
## 4 Columbus Dr & Randolph St 9080
## 5 Kingsbury St & Kinzie St 9021
## 6 Canal St & Madison St 8208
## 7 Michigan Ave & Washington St 7525
## 8 Franklin St & Monroe St 7227
## 9 Larrabee St & Kingsbury St 6708
## 10 Clinton St & Lake St 6674
# Check for unusually long rides (> 1 day = 1440 minutes)
all_trips %>%
filter(ride_length > 1440) %>%
summarise(outliers = n())
## # A tibble: 1 × 1
## outliers
## <int>
## 1 192
Insights:
Casual riders take longer trips (higher average ride length) Members ride more often and mostly on weekdays. Casual riders ride more on weekends and in warmer months.
5. Share ##Rides by Day of Week
#Rides by Day of Week (Bar Chart)
all_trips %>%
group_by(member_casual, day_of_week) %>%
summarise(rides = n(), .groups = "drop") %>%
ggplot(aes(x = day_of_week, y = rides, fill = member_casual)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Casual Riders Use Bikes More on Weekends",
subtitle = "Members ride consistently on weekdays",
x = "Day of Week",
y = "Number of Rides",
fill = "Rider Type"
) +
theme_minimal() +
theme(plot.title = element_text(face="bold", size=14)) +
theme(
panel.grid = element_blank(), # removes all grid lines
axis.line = element_line(color = "black") # keeps axis lines
)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Ride Length Distribution (Boxplot)
#Ride Length Distribution (Boxplot)
all_trips %>%
ggplot(aes(x = member_casual, y = ride_length, fill = member_casual)) +
geom_boxplot(outlier.colour = "red", outlier.alpha = 0.3) +
coord_cartesian(ylim = c(0, 60)) +
labs(
title = "Casual Riders Take Longer Trips",
subtitle = "Members have shorter, more consistent ride times",
x = "Rider Type",
y = "Ride Length (minutes)"
) +
theme_minimal() +
theme(plot.title = element_text(face="bold", size=14)) + theme_minimal() +
theme(
panel.grid = element_blank(), # removes all grid lines
axis.line = element_line(color = "black") # keeps axis lines
)
## Warning: Removed 426887 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Monthly Usage Trends
#Monthly Usage Trends (Line Chart)
all_trips %>%
mutate(month = format(started_at, "%b")) %>%
group_by(member_casual, month) %>%
summarise(rides = n(), .groups = "drop") %>%
ggplot(aes(x = month, y = rides, color = member_casual, group = member_casual)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
labs(
title = "Casual Riders Peak in Summer, Members Stay Steady Year-round",
subtitle = "Seasonal differences show opportunity for targeted campaigns",
x = "Month",
y = "Number of Rides",
color = "Rider Type"
) +
theme_minimal() +
theme(plot.title = element_text(face="bold", size=14)) +
theme(
panel.grid = element_blank(), # removes all grid lines
axis.line = element_line(color = "black") # keeps axis lines
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
6. Act
##Conclusion The analysis highlights clear behavioral differences between Cyclistic’s annual members and casual riders. Members ride more frequently, on shorter weekday trips, consistent with commuting or daily travel. Casual riders take longer rides, mainly on weekends, suggesting recreational use. These insights reveal opportunities to tailor marketing strategies that encourage casual riders to convert to members.
Recommendations: 1. Target casual riders with membership offers * Promote discounted annual plans during peak recreational seasons (spring/summer). * Emphasize the cost savings of membership for those who already ride frequently.