1.What is the problem you are trying to solve? The difference in use between casual riders and annual (member)riders.
2.What is the stakeholder’s expectation? Design marketing strategies aimed at converting casual riders into annual members. The team needs to better understand how annual members and casual riders differ, why casual riders would buy a membership, and how digital media could affect their marketing tactics.
3.How can your insights drive business decisions? The results of this analysis will be used to design a new marketing strategy to convert casual riders to annual members.
Identify business task: Analyze the differences in bike usage between casual riders and annual members to develop insights that will inform a marketing strategy aimed at converting casual riders into annual members.
Consider key stakeholders Director of Marketing (Lily Moreno), Cyclistic marketing analytics team and Cyclistic executive team
The Cyclistic case study focuses on analyzing Divvy bike-share data to understand usage patterns between member and casual riders. The data for this analysis was sourced from the Divvy Trip Data repository, covering the year 2020.
The following datasets were downloaded and used for this analysis: -
202004-divvy-tripdata.zip -
202005-divvy-tripdata.zip -
202006-divvy-tripdata.zip -
202007-divvy-tripdata.zip -
202008-divvy-tripdata.zip -
202009-divvy-tripdata.zip -
202010-divvy-tripdata.zip -
202011-divvy-tripdata.zip -
202012-divvy-tripdata.zip -
Divvy_Trips_2020_Q1.zip
These datasets contain ride details for different months of 2020.
The workflow involves the following stages: 1. Data Preparation: Combining all monthly files into a single table in BigQuery. 2. Data Cleaning: Removing invalid or missing data, ensuring uniform data types. 3. Data Analysis: Exploring patterns in ride data and generating insights. 4. Visualization: Presenting the findings using graphical plots in R.
The datasets were uploaded to BigQuery, and the following SQL query was used to combine them into a unified table:
SELECT * FROM `project.dataset.202004`
UNION ALL
SELECT * FROM `project.dataset.202005`
UNION ALL
SELECT * FROM `project.dataset.202006`
-- Repeat for other months
#install important packages
library(tidyverse)
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(DT)
Load uncleaned dataset
Data <- read_csv("data_trip.csv")
## Rows: 42140 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyr)
library(dplyr)
# 1. Check Ride ID Length Consistency
ride_id_length <- Data %>%
mutate(ride_id_length = nchar(ride_id)) %>%
group_by(ride_id_length) %>%
summarize(count = n()) %>%
arrange(ride_id_length)
# Create an interactive DataTable with proper alignment
datatable(
ride_id_length,
colnames = c("Rider ID Length", "Count"), # Custom column names
caption = htmltools::tags$caption(
style = 'caption-side: bottom; text-align: center; font-size: 14px; color: grey;',
"Interactive Table: Rider ID Length Summary"
), # Add a caption at the bottom
options = list(
pageLength = 5, # Default number of rows per page
autoWidth = TRUE, # Automatically adjust column width
dom = 'Bfrtip', # Add buttons (copy, CSV, etc.)
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), # Export buttons
className = 'hover', # Highlight rows on hover
columnDefs = list(
list(className = 'dt-center', targets = "_all"), # Center-align all columns
list(width = '50%', targets = 0), # Set specific width for the first column
list(width = '50%', targets = 1) # Set specific width for the second column
),
initComplete = JS( # JavaScript for better theme
"function(settings, json) {
$(this.api().table().container()).css({'font-family': 'Arial', 'font-size': '14px'});
}"
)
),
extensions = c('Buttons', 'Responsive') # Add export buttons and responsiveness
)
–the ride_id is consistent with 16 characters.
rideable_type <- Data %>%
group_by(rideable_type) %>%
summarize(total = n())
–there are two types of bike: electric and docked
invalid_ride_duration <- Data %>%
mutate(ride_duration = as.numeric(difftime(ended_at, started_at, units = "mins"))) %>%
filter(ride_duration <= 1 | ride_duration >= 1440) %>%
select(ride_id, started_at, ended_at, ride_duration)
– check if the ride time is less than a minute or longer than a day – the end time is behind the start time – TIMESTAMP is in YYYY-MM-DD hh:mm:ss UTC format
# Count Null Values for Start/End Station Names
station_nulls_summary <- Data %>%
summarize(
start_station_nulls = sum(is.na(start_station_name)),
end_station_nulls = sum(is.na(end_station_name))
)
– a total of 1229 start_station_name with null values are determined – a total of 1049 start_station_name with null values are determined
# Check Null Values in Station IDs
station_id_nulls_summary <- Data %>%
filter(is.na(start_station_id) | is.na(end_station_id)) %>%
summarize(null_station_ids = n())
– 1690 null values are observed
membership_type_summary <- Data %>%
group_by(member_casual) %>%
summarize(membership_count = n())
– only two types: member and causal – total membership count : 25543 – total Casual count: 16597
data_trip <- Data %>%
# Create 'month_name' from 'started_at'
mutate(month_name = format(as.POSIXct(started_at, format = "%Y-%m-%d %H:%M:%S"), "%B"),
# Create 'day_name' from 'started_at'
day_name = format(as.POSIXct(started_at, format = "%Y-%m-%d %H:%M:%S"), "%A"),
# Calculate 'ride_length' in minutes
ride_length =round(as.numeric(difftime(as.POSIXct(ended_at, format = "%Y-%m-%d %H:%M:%S"),
as.POSIXct(started_at, format = "%Y-%m-%d %H:%M:%S"),
units = "mins")))) %>%
# 2. Filter Rows Based on Conditions
filter(
ride_length > 1 & ride_length < 1440, # Ride length must be between 1 minute and 24 hours
!is.na(ride_id), # No null values in ride_id
!is.na(rideable_type), # No null values in rideable_type
!is.na(started_at), # No null values in started_at
!is.na(ended_at), # No null values in ended_at
!is.na(start_station_name), # No null values in start_station_name
!is.na(start_station_id), # No null values in start_station_id
!is.na(end_station_name), # No null values in end_station_name
!is.na(end_station_id), # No null values in end_station_id
!is.na(start_lat), # No null values in start_lat
!is.na(start_lng), # No null values in start_lng
!is.na(end_lat), # No null values in end_lat
!is.na(end_lng), # No null values in end_lng
!is.na(member_casual) # No null values in member_casual
)
# Remove duplicates
data_trip <- data_trip[!duplicated(data_trip), ]
# Check for duplicate rows in the Cyclistic dataset
duplicate_rows <- data_trip[duplicated(data_trip), ]
# Count the number of duplicate rows
num_duplicates <- nrow(duplicate_rows)
# Output results
cat("Number of duplicate rows in the dataset:", num_duplicates, "\n")
## Number of duplicate rows in the dataset: 0
if (num_duplicates > 0) {
cat("Here are the duplicate rows:\n")
print(duplicate_rows)
} else {
cat("No duplicate rows found in the dataset.\n")
}
## No duplicate rows found in the dataset.
# Check for total null values in the entire dataset
sum(is.na(data_trip))
## [1] 0
# Check null values for each column
colSums(is.na(data_trip))
## ride_id rideable_type started_at ended_at
## 0 0 0 0
## start_station_name start_station_id end_station_name end_station_id
## 0 0 0 0
## start_lat start_lng end_lat end_lng
## 0 0 0 0
## member_casual month_name day_name ride_length
## 0 0 0 0
#total number of rides by each bike type and by member type
rides_per_bike_type <- data_trip %>%
group_by(rideable_type,member_casual) %>%
summarise(total_rides= n(),.groups = "drop")
This bar chart compares the total number of rides for different bike
types (rideable_type) between members and casual riders. It
highlights trends in bike usage based on membership status.
library(ggplot2)
ggplot(rides_per_bike_type, aes(x = rideable_type, y = total_rides, fill = member_casual)) +
geom_bar(stat = "identity", position = "dodge") + # Bar plot with dodged bars
labs(title = "Total Rides by Bike Type and Membership Type",
x = "Bike Type",
y = "Total Rides") +
theme_minimal() +
scale_fill_manual(values = c("member" = "blue", "casual" = "orange")) # Customize colors
Figure 1: Total rides by bike type for members and casual riders.
These insights suggest that docked bikes are favored by regular users, while electric bikes appeal equally to both membership types.
The following table summarizes the total number of rides for each month, grouped by membership type:
library(dplyr)
rides_per_month <- data_trip %>%
group_by(month_name, member_casual) %>%
summarise(total_rides = n(),.groups = "drop")
The faceted bar chart below visualizes the total rides for each
month, grouped by membership type (member and
casual). Each panel focuses on one membership type,
allowing for clearer identification of trends.
Ride Volume: Members significantly outnumber casual riders in total ride counts, reflecting their regular and committed usage.
Purpose of Usage: The difference in ride patterns suggests that casual riders tend to ride for leisure, while members primarily use the service for daily commuting, such as traveling to school or work.
This analysis shows the total number of rides for each day of the week, split by member and casual rider types
total_rides_by_day <-data_trip %>%
group_by(day_name,member_casual) %>%
summarise(total_ride=n(),.groups = "drop")
The following plot shows the total number of rides for each day of the week, split by membership type (casual and member). The bars are grouped side-by-side to compare the number of rides between casual and member riders for each day.
This visualization compares the total rides taken by casual riders and annual members across each day of the week. It provides valuable insights into rider behavior patterns.
This analysis categorizes the time of day into four periods: Night, Morning, Afternoon, and Evening. We then group the data by hour of the day, membership type (casual or member), and time of day. The total number of rides for each category is calculated, and the results are summarized. A bar plot is generated to visualize how ride counts vary by time of day and membership type.
# Data preparation:Extract the hour of day from the start_time
data_trip$started_at <- as.POSIXct(data_trip$started_at, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
data_trip$hour_of_day <- format(data_trip$started_at, "%H")
data_trip$hour_of_day <- as.numeric(data_trip$hour_of_day)#Convert to numeric for easier analysis
# Categorize time of day
data_trip$time_of_day <- case_when(
data_trip$hour_of_day >= 0 & data_trip$hour_of_day < 6 ~ "Night",
data_trip$hour_of_day >= 6 & data_trip$hour_of_day < 12 ~"Morning",
data_trip$hour_of_day >= 12 & data_trip$hour_of_day < 18 ~ "Afternoon",
data_trip$hour_of_day >= 18 & data_trip$hour_of_day <= 23 ~ "Evening"
)
# Group by hour of day and member/casual, then count the number of rides
ride_counts_by_time <- data_trip %>%
group_by(hour_of_day, member_casual,time_of_day) %>%
summarise(total_rides = n(), .groups = "drop") %>%
arrange(hour_of_day, member_casual)
From this analysis, you can gain insights into which times of the day are most popular for both casual and member riders.
This section calculates and visualizes the average ride length by membership type (casual vs member).
#average ride length by each member/casual
average_ride_length <- data_trip %>%
group_by(member_casual) %>%
summarise(avg_ride_length = mean(ride_length, na.rm = TRUE))
#Average ride length per day of week
average_ride_length_per_day <- data_trip %>%
group_by(day_name, member_casual) %>%
summarise(avg_ride_length_per_day = mean(ride_length, na.rm = TRUE),.groups = "drop")
# Reusable function for filtering, grouping, and sorting by station
get_station_data <- function(data, rider_type, station_type) {
data %>%
filter(member_casual == rider_type) %>%
group_by(!!sym(station_type)) %>%
summarise(total_rides = n(), .groups = "drop") %>%
arrange(desc(total_rides))
}
# Get data for casual riders (starting station)
start_station_casual <- get_station_data(data_trip, "casual",
"start_station_name")
# Create a combined data frame for visualization
combined_data <- data.frame(
member_casual = c("member", "casual"),
total_rides = c(sum(data_trip$member_casual == "member"), sum(data_trip$member_casual == "casual")),
avg_ride_length = c(mean(data_trip$ride_length[data_trip$member_casual == "member"], na.rm = TRUE),
mean(data_trip$ride_length[data_trip$member_casual == "casual"], na.rm = TRUE))
)
# Plot the data
ggplot(combined_data, aes(x = member_casual)) +
geom_col(aes(y = total_rides), fill = "skyblue", alpha = 0.7) +
geom_line(aes(y = avg_ride_length * 1000, group = 1), color = "tomato", size = 1) +
geom_point(aes(y = avg_ride_length * 1000), color = "tomato", size = 3) +
scale_y_continuous(
name = "Total Rides",
sec.axis = sec_axis(~ . / 1000, name = "Avg Ride Length (minutes)")
) +
labs(
title = "Comparison of Total Rides and Average Ride Length",
x = "Membership Type"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.