Ask Phase

The purpose of this report is to analyze how annual members and casual riders use Cyclistic bikes differently and identify strategies to encourage casual riders to become members. Since this report is intended for the Director of Marketing, the focus is on actionable recommendations the marketing team can implement.

Data used in this analysis was provided under license by Motivate International Inc. (https://divvybikes.com/data-license-agreement)

Prepare and Process Phase

I downloaded all the data and consolidated it into a single location for processing.

Because the data collection methods have changed over time, I standardized and cleaned the datasets to ensure consistency, as described in the steps below.

Here is the list of libraries I’ve used:

Click to view: libraries used
# Data input and manipulation
library(readr)       # Reading CSV and other delimited files
library(dplyr)       # Data wrangling: filtering, summarizing, joining, etc.
library(purrr)       # Functional programming: working with lists and iteration
library(tidyr)       # Data tidying: reshaping, pivoting, etc.
library(lubridate)   # Date-time parsing and manipulation

# Visualization
library(ggplot2)     # Data visualization (core graphics library)
library(patchwork)   # Combine multiple ggplot2 plots into one layout
library(scales)      # Enhancing scales in plots (e.g., formatting axes)

# Reporting
library(knitr)       # Knit dynamic reports with R Markdown
library(rmarkdown)   # Render R Markdown documents to HTML, PDF, Word, etc.

First, I loaded the station data into separate data frames:

Click to view: station data loading code
# Station reference tables: one data frame per published station snapshot
stations_2015 <- read.csv(file = "Divvy_Stations_2015.csv")
stations_2016q3 <- read.csv(file = "Divvy_Stations_2016_Q3.csv")
stations_2016q4 <- read.csv(file = "Divvy_Stations_2016_Q4.csv")

Second, I combined all the bike ride data into a single data frame, making sure all columns have consistent names and value types:

Click to view: Function to load and clean Divvy data
# Start of function for divvy riders 
# Pick the column name to use for one standard field.
#
# Args:
#   df: a data frame whose column names are searched.
#   possible_names: candidate column names, in order of preference.
#
# Returns:
#   The first element of `possible_names` that is a column of `df`,
#   or NULL when none of the candidates is present.
coalesce_column <- function(df, possible_names) {
  first_hit <- which(possible_names %in% names(df))[1]
  if (is.na(first_hit)) {
    return(NULL)
  }
  possible_names[first_hit]
}

# Lookup table: standard column name -> every spelling seen in the raw files.
# Within each vector the order matters: the first spelling found in a file
# is the one that gets renamed to the standard name.
col_map <- list(
  trip_id = c(
    "trip_id",
    "ride_id",
    "01 - rental details rental id"
  ),
  start_time = c(
    "start_time",
    "starttime",
    "started_at",
    "01 - rental details local start time"
  ),
  end_time = c(
    "end_time",
    "stop_time",
    "stoptime",
    "ended_at",
    "01 - rental details local end time"
  ),
  bike_id = c(
    "bike_id",
    "bikeid",
    "01 rental details bike id",
    "01 - rental details bike id"
  ),
  trip_duration = c(
    "trip_duration",
    "tripduration",
    "01 rental details duration in seconds uncapped",
    "01 - rental details duration in seconds uncapped"
  ),
  from_station_id = c(
    "from_station_id",
    "03 rental start station id",
    "start_station_id",
    "03 - rental start station id"
  ),
  from_station_name = c(
    "from_station_name",
    "03 rental start station name",
    "start_station_name",
    "03 - rental start station name"
  ),
  to_station_id = c(
    "to_station_id",
    "02 rental end station id",
    "end_station_id",
    "02 - rental end station id"
  ),
  to_station_name = c(
    "to_station_name",
    "02 rental end station name",
    "end_station_name",
    "02 - rental end station name"
  ),
  user_type = c("user_type", "usertype", "user type", "member_casual"),
  gender = c("gender", "member gender", "sex"),
  birth_year = c(
    "05 - member details member birthday year",
    "birth_year",
    "birthyear",
    "05 member details member birthday year"
  ),
  start_lng = c(
    "start_lng", "start_lngitude", "start_lngt", "start long", "start lon"
  ),
  start_lat = c("start_lat", "start_latitude", "start_latd", "start lat"),
  end_lng = c("end_lng", "end_lngitude", "end_lngt", "end long", "end lon"),
  end_lat = c("end_lat", "end_latitude", "end_latd", "end lat"),
  rideable_type = c("rideable_type")
)

# Main cleaning function
#
# Reads one trip CSV and normalises it to the shared schema so that the
# per-file results can be row-bound together.
#
# Args:
#   file: path to a single trip CSV.
#
# Returns:
#   A tibble whose column names are lower-cased and, where a known variant
#   from `col_map` is present, renamed to the standard name. IDs are coerced
#   to character, timestamps to date-times, birth year to integer and
#   coordinates to numeric. Columns missing from the file are left absent
#   rather than causing an error.
read_and_clean <- function(file) {
  timestamp_orders <- c(
    "ymd HMS", "mdy HMS", "dmy HMS", "ymd HM", "mdy HM", "dmy HM"
  )

  # read_csv() may already have guessed a date-time column. Re-parsing such a
  # column only round-trips it through text (and can lose values), so parse
  # only what is not a date-time yet.
  parse_ride_time <- function(x) {
    if (inherits(x, "POSIXt")) {
      return(x)
    }
    parse_date_time(x, orders = timestamp_orders)
  }

  df <- read_csv(file, col_types = cols(.default = col_guess())) %>%
    rename_with(tolower)

  # Build new_name = "old_name" pairs for every standard column that has a
  # recognised spelling in this file.
  rename_list <- list()
  for (new_name in names(col_map)) {
    old_name <- coalesce_column(df, tolower(col_map[[new_name]]))
    if (!is.null(old_name)) {
      rename_list[[new_name]] <- old_name
    }
  }

  df <- df %>% rename(!!!rename_list)

  # Ensure consistent column types; any_of() silently skips columns that do
  # not exist in this file (including start_time / end_time).
  df <- df %>% mutate(
    across(
      any_of(c("from_station_id", "to_station_id", "trip_id", "bike_id")),
      as.character
    ),
    across(any_of(c("start_time", "end_time")), parse_ride_time),
    across(any_of("birth_year"), as.integer),
    across(
      any_of(c("start_lng", "start_lat", "end_lng", "end_lat")),
      as.numeric
    )
  )

  df
}

# Run the full load across all CSVs.
# `pattern` is a regular expression (not a shell glob), so match the literal
# ".csv" extension at the end of the file name.
divvy_rides <- list.files("all_rides/", pattern = "\\.csv$", full.names = TRUE) %>%
  map_df(read_and_clean)

# End of function for divvy riders

Then I calculated the trip duration for the rides where it was missing:

Click to view: code for calculating trip duration
# Fill in trip_duration (seconds) only where it is missing.
# Recorded durations are kept as they are; the end - start difference is used
# only for rides that have no recorded value (or when the column does not
# exist at all).
divvy_rides <- divvy_rides %>%
  mutate(
    elapsed_secs = as.numeric(difftime(end_time, start_time, units = "secs")),
    trip_duration = if ("trip_duration" %in% names(divvy_rides)) {
      coalesce(as.numeric(trip_duration), elapsed_secs)
    } else {
      elapsed_secs
    }
  ) %>%
  select(-elapsed_secs)

Next, I checked for duplicate ride IDs and then removed the duplicates:

Click to view: code for handling duplicate ride IDs
# Find duplicated trip IDs: every copy of each repeated ID, sorted so the
# copies sit next to each other for manual inspection. Rows without an ID
# are not considered duplicates of one another.
duplicate_ride_ids <- divvy_rides %>%
  filter(!is.na(trip_id)) %>%
  filter(trip_id %in% trip_id[duplicated(trip_id)]) %>%
  arrange(trip_id)

# After making sure that all the duplicates are actually the same rides,
# drop the repeated copies but keep the first occurrence of each ride, so the
# ride itself is not lost. duplicated() also avoids grouping by a near-unique
# key, which is very slow on large tables.
divvy_rides <- divvy_rides %>%
  filter(is.na(trip_id) | !duplicated(trip_id))

Next, I checked for outliers in trip duration.

Click to view code for trip duration extremes
# The 1000 shortest trips with a strictly positive duration
# (0-second and negative trips are excluded)
shortest_1000 <- divvy_rides %>%
  filter(trip_duration > 0) %>%
  arrange(trip_duration) %>%
  slice_head(n = 1000)

# The 1000 longest trips, longest first
longest_1000 <- divvy_rides %>%
  arrange(desc(trip_duration)) %>%
  slice_head(n = 1000)

Next, I corrected the user types so that they all fall into two categories:

Unfortunately, I had to delete one type, since I had no way to check whether it should be counted with the subscribers or the casual users.

Click to view: code for inspecting and cleaning user types
# Inspect which raw labels occur in the user_type column
unique(divvy_rides$user_type)

# Tally rides per label, most frequent first
user_type_counts <- divvy_rides %>%
  count(user_type, name = "count", sort = TRUE)

print(user_type_counts)

# Standardize user_type values:
# - "Subscriber" becomes "member"
# - "Customer" becomes "casual"
# - every other label is left as it is
# - rows labeled "Dependent" are removed (unclear categorization); because
#   filter() only keeps rows where the condition is TRUE, rows with a missing
#   user_type are dropped by this step as well
divvy_rides <- divvy_rides %>%
  mutate(
    user_type = replace(user_type, user_type %in% "Subscriber", "member"),
    user_type = replace(user_type, user_type %in% "Customer", "casual")
  ) %>%
  filter(user_type != "Dependent")

Lastly, I filtered out rides with empty or missing station names:

Click to view: code for filtering out empty or missing station names
# Keep only rides whose start station name is present and non-empty
filtered_divvy <- divvy_rides %>%
  filter(!is.na(from_station_name) & from_station_name != "")

Analyze and Share Phase

With the data prepared, I created a series of visualizations to explore differences in behavior between members and casual riders.

Top 5 Starting Stations by User Type

To identify where rides begin most frequently, I analyzed the top 5 starting stations for each user type.

Observation: Streeter Dr & Grand Ave is the most popular starting location, with casual riders using it significantly more than members. This reflects the location’s popularity among tourists, who are more likely to use casual passes.

Click to view the R code used to generate the Top 5 Starting Stations plot
# Load necessary libraries
library(ggplot2)
library(dplyr)

# Rides per start station and user type, ignoring rides that have no start
# station name. Built here so the chunk does not rely on an object created
# elsewhere.
start_counts <- divvy_rides %>%
  filter(!is.na(from_station_name), from_station_name != "") %>%
  count(from_station_name, user_type)

# The five stations with the most rides overall (both user types combined).
# with_ties = FALSE guarantees exactly five stations even when totals tie.
top5_data <- start_counts %>%
  group_by(from_station_name) %>%
  summarise(n = sum(n)) %>%
  slice_max(n, n = 5, with_ties = FALSE)

filtered_start_counts <- start_counts %>%
  filter(from_station_name %in% top5_data$from_station_name)

# Build the plot; stations are ordered by their total rides (FUN = sum) so
# the bar order matches the ranking used to pick the top five.
p <- ggplot(
  filtered_start_counts,
  aes(x = reorder(from_station_name, -n, FUN = sum), y = n, fill = user_type)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("casual" = "green", "member" = "blue")) +  # Set custom colors
  labs(
    title = "Top 5 Starting Stations by User Type",
    x = "Station Name",
    y = "Number of Rides",
    fill = "User Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save the plot
ggsave("top5_start_plot.png", plot = p, width = 8, height = 5)

Top 5 Ending Stations by User Type

Next, I examined where rides most frequently end.

Observation: Similar patterns were found, with casual riders ending trips more often at popular tourist spots, while members use a wider range of end stations.

Click to see the R code used to generate the Top 5 Finish Stations plot
library(dplyr)
library(ggplot2)

# Remove empty or NA end station names
filtered_divvy <- divvy_rides %>%
  filter(to_station_name != "", !is.na(to_station_name))

# Get the top 5 most popular end stations. Stations are ranked by name, the
# same key that is plotted, so a station whose ID differs between source
# files is counted once and exactly five stations reach the chart.
top_ends <- filtered_divvy %>%
  count(to_station_name, sort = TRUE) %>%
  slice_head(n = 5)

# Filter original data to only include the top 5
top_end_data <- filtered_divvy %>%
  filter(to_station_name %in% top_ends$to_station_name)

# Count rides per station per user_type
end_counts <- top_end_data %>%
  count(to_station_name, user_type)

# Create plot; stations are ordered by total rides (FUN = sum) to match the
# ranking above
end_plot <- ggplot(
  end_counts,
  aes(x = reorder(to_station_name, -n, FUN = sum), y = n, fill = user_type)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Top 5 End Stations by User Type",
    x = "Station Name",
    y = "Number of Rides",
    fill = "User Type"
  ) +
  scale_fill_manual(values = c("casual" = "green", "member" = "blue")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Save plot to file
ggsave("top5_end_stations_plot.png", plot = end_plot, width = 8, height = 5)

Total Number of Rides by User Type

To compare usage volume, I analyzed the total number of rides by each user type.

Observation: Members account for a significantly higher number of rides than casual users, indicating strong engagement among members despite casual riders dominating certain popular stations.

Click to view the code used to generate this visualization
library(ggplot2)
library(dplyr)

# Number of rides recorded for each user type (one row per user_type)
user_counts <- divvy_rides %>%
  group_by(user_type) %>%
  summarise(n = n(), .groups = "drop")

# One bar per user type; the legend is suppressed because the x axis already
# names the groups
user_type_plot <- ggplot(
  user_counts,
  aes(x = user_type, y = n, fill = user_type)
) +
  geom_col(width = 0.6, show.legend = FALSE) +
  scale_fill_manual(values = c("member" = "blue", "casual" = "green")) +
  labs(
    title = "Number of Rides by User Type",
    x = "User Type",
    y = "Number of Rides"
  ) +
  theme_minimal()

# Write the chart to a PNG file
ggsave(
  "user_type_plot.png",
  plot = user_type_plot,
  width = 6,
  height = 4,
  dpi = 300
)

Average Trip Duration by User Type

I analyzed average trip duration across user types to understand differences in usage patterns.

Observation: Casual riders tend to take longer trips on average compared to members, who typically use bikes for shorter, more routine commutes.

Click to view code used to generate this plot
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Mean trip duration (seconds) per user type, using only trips with a
# strictly positive duration
avg_duration <- divvy_rides %>%
  filter(trip_duration > 0) %>%
  group_by(user_type) %>%
  summarise(
    avg_trip_duration = mean(trip_duration, na.rm = TRUE),
    .groups = "drop"
  )

# Bar chart of the averages, one bar per user type
duration_plot <- ggplot(
  avg_duration,
  aes(x = user_type, y = avg_trip_duration, fill = user_type)
) +
  geom_col() +
  labs(
    title = "Average Trip Duration by User Type",
    x = "User Type",
    y = "Average Duration (seconds)",
    fill = "User Type"
  ) +
  scale_fill_manual(values = c("casual" = "green", "member" = "blue")) +
  theme_minimal()

# Write the chart to a PNG file
ggsave(
  "avg_trip_duration_plot.png",
  plot = duration_plot,
  width = 8,
  height = 5
)

Monthly Rides by User Type (2019–2023)

To examine seasonal and long-term trends, I analyzed monthly rides by user type from 2019 to 2023.

Observation: Ridership shows clear seasonal trends, with peaks in the summer months for both user types. Casual ridership is more variable and seasonally dependent, while member usage is more consistent year-round.

Monthly Rides by User Type
Monthly Rides by User Type
Click to view: code used to generate this plot
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Ensure start_time is in POSIXct format
divvy_rides <- divvy_rides %>%
  mutate(start_time = as.POSIXct(start_time))

# Keep rides that start in 2019-2023 and tag each with the first day of its
# month. The bounds are POSIXct values (not Date objects) so the comparison
# is date-time against date-time, and the upper bound is exclusive so rides
# during the whole of Dec 31, 2023 are included.
# NOTE(review): the bounds use UTC, which assumes start_time was parsed with
# the default UTC time zone - confirm if a time zone is set elsewhere.
filtered_by_date <- divvy_rides %>%
  filter(
    start_time >= as.POSIXct("2019-01-01", tz = "UTC"),
    start_time < as.POSIXct("2024-01-01", tz = "UTC")
  ) %>%
  mutate(month = as.Date(floor_date(start_time, unit = "month")))

# Count rides per user type per month
monthly_counts <- filtered_by_date %>%
  group_by(month, user_type) %>%
  summarise(rides = n(), .groups = "drop")

# Create the line plot with x-axis showing every 3 months
monthly_plot <- ggplot(monthly_counts, aes(x = month, y = rides, color = user_type)) +
  geom_line(size = 1.2) +
  scale_color_manual(values = c("member" = "blue", "casual" = "green")) +
  scale_x_date(
    breaks = seq(as.Date("2019-01-01"), as.Date("2023-12-01"), by = "3 months"),
    date_labels = "%b\n%Y"
  ) +
  labs(
    title = "Monthly Rides by User Type (2019–2023)",
    x = "Month",
    y = "Number of Rides",
    color = "User Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

# Save the updated plot
ggsave("monthly_rides_by_user_type_2019_2023.png", plot = monthly_plot, width = 10, height = 5, dpi = 300)

Bike Types Used by User Type

I examined which bike types are used by each user type to understand preferences and inform operational planning.

Observation: Both user groups use similar bike types, but the distribution can provide insight for bike allocation and targeted marketing around new bike types or models.

Click to see the R code used for this visualization
# Load necessary libraries
library(dplyr)
library(ggplot2)

# Rides per (user type, bike type), skipping rides with no bike type recorded
bike_type_data <- divvy_rides %>%
  filter(!is.na(rideable_type) & rideable_type != "") %>%
  count(user_type, rideable_type, name = "count")

# Grouped bar chart: one cluster per bike type, one bar per user type
bike_plot <- ggplot(
  bike_type_data,
  aes(x = rideable_type, y = count, fill = user_type)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("casual" = "green", "member" = "blue")) +
  labs(
    title = "Bike Types Used by Different User Types",
    x = "Bike Type",
    y = "Number of Rides",
    fill = "User Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

# Write the chart to a PNG file
ggsave(
  "bike_type_by_user_type.png",
  plot = bike_plot,
  width = 8,
  height = 5,
  dpi = 300
)

Gender Distribution by User Type

I analyzed the gender distribution among riders by user type.

Observation: The majority of rides are taken by men across both user types, suggesting opportunities for targeted marketing toward women to increase engagement.

Show Code
# Rides per (user type, gender), skipping rides with no gender recorded
gender_data <- divvy_rides %>%
  filter(!is.na(gender) & gender != "") %>%
  count(user_type, gender, name = "count")

# Grouped bar chart: one cluster per gender, one bar per user type
gender_plot <- ggplot(
  gender_data,
  aes(x = gender, y = count, fill = user_type)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("casual" = "green", "member" = "blue")) +
  labs(
    title = "Gender Distribution by User Type",
    x = "Gender",
    y = "Number of Rides",
    fill = "User Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

# Write the chart to a PNG file
ggsave(
  "gender_by_user_type.png",
  plot = gender_plot,
  width = 8,
  height = 5,
  dpi = 300
)

Age at Time of Ride by User Type

Lastly, I examined age distribution at the time of rides to determine which age groups dominate usage.

Observation: Riders between the ages of 18 and 35 make up the majority of users, particularly in the 20–30 age range, indicating a key demographic for membership conversion efforts. Age at Time of Ride by User Type

Show Code
library(dplyr)
library(ggplot2)
library(lubridate)

# Step 1: Compute age at time of ride.
# The ride year is taken straight from start_time with year(); re-parsing
# the column with ymd_hms() is unnecessary and can turn rows into NA (which
# the age filter below would then silently drop).
# NOTE(review): assumes start_time is already a date-time, as produced by the
# loading step - confirm if this chunk is run on differently prepared data.
age_data <- divvy_rides %>%
  filter(!is.na(birth_year), birth_year != "") %>%
  mutate(
    birth_year = as.numeric(birth_year),
    ride_year = year(start_time),
    age = ride_year - birth_year
  ) %>%
  # Keep plausible ages only; rows where age is NA are dropped here too
  filter(age >= 10, age <= 100)

# Step 2: Count rows (rides) per age per user type
age_distribution <- age_data %>%
  group_by(age, user_type) %>%
  summarise(user_count = n(), .groups = "drop")

# Step 3: Create the line plot
age_plot <- ggplot(age_distribution, aes(x = age, y = user_count, color = user_type)) +
  geom_line(size = 1.2) +
  labs(
    title = "Age at Time of Ride by User Type",
    x = "Age",
    y = "Number of Users",
    color = "User Type"
  ) +
  scale_color_manual(values = c("casual" = "green", "member" = "blue")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

# Step 4: Save the plot
ggsave("age_at_ride_time_plot.png", age_plot, width = 8, height = 5, dpi = 300)

Conclusion

This report provides clear evidence of the differences in how members and casual riders use Cyclistic bikes, with actionable insights to guide marketing strategies aimed at converting casual riders to members. By targeting the most active age groups, leveraging popular stations, and aligning campaigns with seasonal peaks, Cyclistic can increase membership rates and strengthen long-term rider engagement.

Recommendations

Based on the analysis, I recommend the following strategies for the marketing team:

1. Focus on the 18–35 Age Group:

Prioritize marketing campaigns for individuals aged 18–35, with a particular emphasis on the 20–30 age bracket, as they represent the largest user segment.

2. Target Male Riders:

Given the current gender distribution, initial marketing efforts may focus on men to increase membership conversions efficiently while simultaneously exploring strategies to engage women riders in the future.

3. Leverage High-Traffic Stations:

Concentrate marketing resources and promotional activities at the most popular starting and ending stations used by casual riders, including: Streeter Dr & Grand Ave; Michigan Ave & Oak St; Lake Shore Dr & North Blvd; Lake Shore Dr & Monroe St; Streeter Dr & Illinois St; and Wells St & Concord Ln.

4. Emphasize Convenience for Commuters:

Highlight the advantages of membership for shorter, routine commutes to attract casual riders who currently take longer, occasional trips.

5. Seasonal Campaign Timing:

Launch membership promotions in late spring and early summer to align with peak casual usage, capturing interest when ridership is highest.