The purpose of this report is to analyze how annual members and casual riders use Cyclistic bikes differently and identify strategies to encourage casual riders to become members. Since this report is intended for the Director of Marketing, the focus is on actionable recommendations the marketing team can implement.
Data used in this analysis was provided under license by Motivate International Inc. (https://divvybikes.com/data-license-agreement)
I downloaded all the data and consolidated it into a single location for processing.
Because the data collection methods have changed over time, I standardized and cleaned the datasets to ensure consistency, including:
# Data input and manipulation
library(readr) # Reading CSV and other delimited files
library(dplyr) # Data wrangling: filtering, summarizing, joining, etc.
library(purrr) # Functional programming: working with lists and iteration
library(tidyr) # Data tidying: reshaping, pivoting, etc.
library(lubridate) # Date-time parsing and manipulation
# Visualization
library(ggplot2) # Data visualization (core graphics library)
library(patchwork) # Combine multiple ggplot2 plots into one layout
library(scales) # Enhancing scales in plots (e.g., formatting axes)
# Reporting
library(knitr) # Knit dynamic reports with R Markdown
library(rmarkdown) # Render R Markdown documents to HTML, PDF, Word, etc.
stations_2015 <- read.csv("Divvy_Stations_2015.csv")
stations_2016q3 <- read.csv("Divvy_Stations_2016_Q3.csv")
stations_2016q4 <- read.csv("Divvy_Stations_2016_Q4.csv")
# Start of function for divvy riders
coalesce_column <- function(df, possible_names) {
existing <- possible_names[possible_names %in% names(df)]
if (length(existing) == 0) return(NULL)
return(existing[1])
}
# Map of standard column names to possible variations
col_map <- list(
trip_id = c("trip_id", "ride_id", "01 - rental details rental id"),
start_time = c("start_time", "starttime", "started_at", "01 - rental details local start time"),
end_time = c("end_time", "stop_time", "stoptime", "ended_at", "01 - rental details local end time"),
bike_id = c("bike_id", "bikeid", "01 rental details bike id", "01 - rental details bike id"),
trip_duration = c("trip_duration", "tripduration", "01 rental details duration in seconds uncapped", "01 - rental details duration in seconds uncapped"),
from_station_id = c("from_station_id", "03 rental start station id", "start_station_id", "03 - rental start station id"),
from_station_name = c("from_station_name", "03 rental start station name", "start_station_name", "03 - rental start station name"),
to_station_id = c("to_station_id", "02 rental end station id", "end_station_id", "02 - rental end station id"),
to_station_name = c("to_station_name", "02 rental end station name", "end_station_name", "02 - rental end station name"),
user_type = c("user_type", "usertype", "user type", "member_casual"),
gender = c("gender", "member gender", "sex"),
birth_year = c("05 - member details member birthday year", "birth_year", "birthyear", "05 member details member birthday year"),
start_lng = c("start_lng", "start_lngitude", "start_lngt", "start long", "start lon"),
start_lat = c("start_lat", "start_latitude", "start_latd", "start lat"),
end_lng = c("end_lng", "end_lngitude", "end_lngt", "end long", "end lon"),
end_lat = c("end_lat", "end_latitude", "end_latd", "end lat"),
rideable_type = c("rideable_type")
)
# Main cleaning function
read_and_clean <- function(file) {
df <- read_csv(file, col_types = cols(.default = col_guess()))
df <- df %>% rename_with(tolower)
rename_list <- list()
for (new_name in names(col_map)) {
old_name <- coalesce_column(df, tolower(col_map[[new_name]]))
if (!is.null(old_name)) {
rename_list[[new_name]] <- old_name
}
}
df <- df %>% rename(!!!rename_list)
# Ensure consistent column types only if they exist
df <- df %>% mutate(
from_station_id = if ("from_station_id" %in% names(df)) as.character(from_station_id) else NULL,
to_station_id = if ("to_station_id" %in% names(df)) as.character(to_station_id) else NULL,
trip_id = if ("trip_id" %in% names(df)) as.character(trip_id) else NULL,
bike_id = if ("bike_id" %in% names(df)) as.character(bike_id) else NULL,
start_time = parse_date_time(start_time, orders = c("ymd HMS", "mdy HMS", "dmy HMS", "ymd HM", "mdy HM", "dmy HM")),
end_time = parse_date_time(end_time, orders = c("ymd HMS", "mdy HMS", "dmy HMS", "ymd HM", "mdy HM", "dmy HM")),
birth_year = if ("birth_year" %in% names(df)) as.integer(birth_year) else NULL,
start_lng = if ("start_lng" %in% names(df)) as.numeric(start_lng) else NULL,
start_lat = if ("start_lat" %in% names(df)) as.numeric(start_lat) else NULL,
end_lng = if ("end_lng" %in% names(df)) as.numeric(end_lng) else NULL,
end_lat = if ("end_lat" %in% names(df)) as.numeric(end_lat) else NULL
)
return(df)
}
# Run the full load across all CSVs
divvy_rides <- list.files("all_rides/", pattern = "*.csv", full.names = TRUE) %>%
map_df(read_and_clean)
# End of function for divvy riders
divvy_rides <- divvy_rides %>%
mutate(
trip_duration = as.numeric(difftime(end_time, start_time, units = "secs"))
)
# Find duplicated trip IDs
duplicate_ride_ids <- divvy_rides %>%
group_by(trip_id) %>%
filter(n() > 1) %>%
arrange(trip_id)
# After making sure that all the duplicates are actually the same rides, it's time to delete them
divvy_rides <- divvy_rides %>%
group_by(trip_id) %>%
filter(n() == 1) %>%
ungroup()
# Select the 1000 shortest valid trips (excluding 0-second trips)
shortest_1000 <- divvy_rides %>%
filter(trip_duration > 0) %>%
arrange(trip_duration) %>%
slice(1:1000)
# Select the 1000 longest trips
longest_1000 <- divvy_rides %>%
arrange(desc(trip_duration)) %>%
slice(1:1000)
I had to unfortunately delete one type since I had no tools to check either it should land in the subscribers or casual users.
# Check what unique values exist in the user_type column
unique(divvy_rides$user_type)
# Count the number of occurrences of each user type
user_type_counts <- divvy_rides %>%
group_by(user_type) %>%
summarise(count = n()) %>%
arrange(desc(count))
print(user_type_counts)
# Standardize user_type values:
# - Convert "Subscriber" to "member"
# - Convert "Customer" to "casual"
# - Remove entries labeled as "Dependent" (unclear categorization)
divvy_rides <- divvy_rides %>%
mutate(user_type = case_when(
user_type == "Subscriber" ~ "member",
user_type == "Customer" ~ "casual",
TRUE ~ user_type
)) %>%
filter(user_type != "Dependent")
# Remove rows where from_station_name is empty or NA
filtered_divvy <- divvy_rides %>%
filter(from_station_name != "", !is.na(from_station_name))
This report provides clear evidence of the differences in how members and casual riders use Cyclistic bikes, with actionable insights to guide marketing strategies aimed at converting casual riders to members. By targeting the most active age groups, leveraging popular stations, and aligning campaigns with seasonal peaks, Cyclistic can increase membership rates and strengthen long-term rider engagement.
Based on the analysis, I recommend the following strategies for the marketing team:
Prioritize marketing campaigns for individuals aged 18–35, with a particular emphasis on the 20–30 age bracket, as they represent the largest user segment.
Given the current gender distribution, initial marketing efforts may focus on men to increase membership conversions efficiently while simultaneously exploring strategies to engage women riders in the future.
Concentrate marketing resources and promotional activities at the most popular starting and ending stations used by casual riders, including: Streeter Dr & Grand Ave Michigan Ave & Oak St Lake Shore Dr & North Blvd Lake Shore Dr & Monroe St Streeter Dr & Illinois St Wells St & Concord Ln
Highlight the advantages of membership for shorter, routine commutes to attract casual riders who currently take longer, occasional trips.
Launch membership promotions in late spring and early summer to align with peak casual usage, capturing interest when ridership is highest.