1.What is the problem you are trying to solve? The difference in use between casual riders and annual (member)riders.
2.What is the stakeholder’s expectation? Design marketing strategies aimed at converting casual riders into annual members. The team needs to better understand how annual members and casual riders differ, why casual riders would buy a membership, and how digital media could affect their marketing tactics.
3.How can your insights drive business decisions? The results of this analysis will be used to design a new marketing strategy to convert casual riders to annual members.
Identify business task: Analyze the differences in bike usage between casual riders and annual members to develop insights that will inform a marketing strategy aimed at converting casual riders into annual members.
Consider key stakeholders Director of Marketing (Lily Moreno), Cyclistic marketing analytics team and Cyclistic executive team
library(tidyverse)
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(bigrquery)
install.packages("DT")
library(DT)
I did prepare and process steps in BigQuery.
query_ride_id <- "
SELECT LENGTH(ride_id) as rider_id_length, COUNT(*) AS count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
GROUP BY LENGTH(ride_id)
"
# Execute query
ride_id_length <- bq_table_download(bq_project_query(project_id, query_ride_id))
# View results
# Create an interactive DataTable with proper alignment
datatable(
ride_id_length,
colnames = c("Rider ID Length", "Count"), # Custom column names
caption = htmltools::tags$caption(
style = 'caption-side: bottom; text-align: center; font-size: 14px; color: grey;',
"Interactive Table: Rider ID Length Summary"
), # Add a caption at the bottom
options = list(
pageLength = 5, # Default number of rows per page
autoWidth = TRUE, # Automatically adjust column width
dom = 'Bfrtip', # Add buttons (copy, CSV, etc.)
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), # Export buttons
className = 'hover', # Highlight rows on hover
columnDefs = list(
list(className = 'dt-center', targets = "_all"), # Center-align all columns
list(width = '50%', targets = 0), # Set specific width for the first column
list(width = '50%', targets = 1) # Set specific width for the second column
),
initComplete = JS( # JavaScript for better theme
"function(settings, json) {
$(this.api().table().container()).css({'font-family': 'Arial', 'font-size': '14px'});
}"
)
),
extensions = c('Buttons', 'Responsive') # Add export buttons and responsiveness
)
–the ride_id is consistent with 16 characters.
query_rideable_type <- "
SELECT rideable_type, COUNT(*) AS count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
GROUP BY rideable_type
"
# Execute query
rideable_type <- bq_table_download(bq_project_query(project_id, query_rideable_type))
–there are two types of bike: electric and docked
query_ride_duration <- "
SELECT ride_id, started_at, ended_at,
TIMESTAMP_DIFF(ended_at, started_at, MINUTE) AS duration_minutes
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
WHERE
TIMESTAMP_DIFF(ended_at, started_at, MINUTE) <= 1 OR
TIMESTAMP_DIFF(ended_at, started_at, MINUTE) >= 1440
"
# Execute query
ride_duration <- bq_table_download(bq_project_query(project_id, query_ride_duration))
– check if the ride time is less than a minute or longer than a day – the end time is behind the start time – TIMESTAMP is in YYYY-MM-DD hh:mm:ss UTC format
query_start_station <- "
SELECT start_station_name, COUNT(*) AS start_station_count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
GROUP BY start_station_name
"
# Execute query
start_station <- bq_table_download(bq_project_query(project_id, query_start_station))
– a total of 82958 start_station_name with null values are determined
query_end_station <- "
SELECT end_station_name, COUNT(*) AS end_station_count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
GROUP BY end_station_name
"
# Execute query
end_station <-bq_table_download(bq_project_query(project_id, query_end_station))
– 97645 null values observed – the string lengths of station id are inconsistent. however, it will be ignored
query_lat_lng <- "
SELECT COUNT(*) AS null_count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
WHERE start_lat IS NULL OR start_lng IS NULL OR
end_lat IS NULL OR end_lng IS NULL
"
# Execute query
lat_lng_nulls <- bq_table_download(bq_project_query(project_id, query_lat_lng))
– 4145 null values are observed
query_membership <- "
SELECT member_casual, COUNT(*) AS membership_count
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
GROUP BY member_casual
"
# Execute query
membership_types <- bq_table_download(bq_project_query(project_id, query_membership))
– only two types: member and causal – total membership count : 2073615 – total Casual count: 1336495
query_null_values <- "
SELECT
COUNTIF(ride_id IS NULL) AS null_count_ride_id,
COUNTIF(rideable_type IS NULL) AS null_count_rideable_type,
COUNTIF(started_at IS NULL) AS null_count_started_at,
COUNTIF(ended_at IS NULL) AS null_count_ended_at,
COUNTIF(start_station_name IS NULL) AS null_count_start_station_name,
COUNTIF(start_station_id IS NULL) AS null_count_start_station_id,
COUNTIF(end_station_name IS NULL) AS null_count_end_station_name,
COUNTIF(end_station_id IS NULL) AS null_count_end_station_id,
COUNTIF(start_lat IS NULL) AS null_count_start_lat,
COUNTIF(start_lng IS NULL) AS null_count_start_lng,
COUNTIF(end_lat IS NULL) AS null_count_end_lat,
COUNTIF(end_lng IS NULL) AS null_count_end_lng,
COUNTIF(member_casual IS NULL) AS null_count_member_casual
FROM `astute-curve-435613-t7.Data_trip.Data_trip2020`
"
# Execute the query and download the results
null_values <-bq_table_download(bq_project_query(project_id, query_null_values))
# Explicitly set project ID
project_id <- "astute-curve-435613-t7"
# SQL Query for creating the cleaned dataset
query_with_cast <- "
CREATE OR REPLACE TABLE `astute-curve-435613-t7.Data_trip.Cleaned_Data` AS (
SELECT
t.*,
FORMAT_TIMESTAMP('%B', t.started_at) AS month_name,
TIMESTAMP_DIFF(t.ended_at, t.started_at, MINUTE) AS ride_length,
FORMAT_TIMESTAMP('%A', t.started_at) AS day_name,
SAFE_CAST(t.ride_id AS STRING) AS fixed_ride_id,
SAFE_CAST(t.rideable_type AS STRING) AS fixed_rideable_type,
SAFE_CAST(t.started_at AS TIMESTAMP) AS fixed_started_at,
SAFE_CAST(t.ended_at AS TIMESTAMP) AS fixed_ended_at,
SAFE_CAST(t.start_station_name AS STRING) AS fixed_start_station_name,
SAFE_CAST(t.start_station_id AS INT64) AS fixed_start_station_id,
SAFE_CAST(t.end_station_name AS STRING) AS fixed_end_station_name,
SAFE_CAST(t.end_station_id AS INT64) AS fixed_end_station_id,
SAFE_CAST(t.start_lat AS FLOAT64) AS fixed_start_lat,
SAFE_CAST(t.start_lng AS FLOAT64) AS fixed_start_lng,
SAFE_CAST(t.end_lat AS FLOAT64) AS fixed_end_lat,
SAFE_CAST(t.end_lng AS FLOAT64) AS fixed_end_lng,
SAFE_CAST(t.member_casual AS STRING) AS fixed_member_casual
FROM
`astute-curve-435613-t7.Data_trip.Data_trip2020` t
WHERE
TIMESTAMP_DIFF(t.ended_at, t.started_at, MINUTE) BETWEEN 1 AND 1440 AND
t.ride_id IS NOT NULL
AND t.rideable_type IS NOT NULL
AND t.started_at IS NOT NULL
AND t.ended_at IS NOT NULL
AND t.start_station_name IS NOT NULL
AND t.start_station_id IS NOT NULL
AND t.end_station_name IS NOT NULL
AND t.end_station_id IS NOT NULL
AND t.start_lat IS NOT NULL
AND t.start_lng IS NOT NULL
AND t.end_lat IS NOT NULL
AND t.end_lng IS NOT NULL
AND t.member_casual IS NOT NULL
)
"
# Execute the query
bq_project_query(project_id, query_with_cast)
## <bq_table> astute-curve-435613-t7.Data_trip.Cleaned_Data
After These Steps(Prepare,process),I did Analyze,visualization and Act steps in Rstudio
## Rows: 500 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, e...
## dbl (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, en...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#total number of rides by each bike type and by member type
rides_per_bike_type <- data_trip %>%
group_by(rideable_type,member_casual) %>%
summarise(total_rides= n(),.groups = "drop")
This bar chart compares the total number of rides for different bike
types (rideable_type) between members and casual riders. It
highlights trends in bike usage based on membership status.
library(ggplot2)
ggplot(rides_per_bike_type, aes(x = rideable_type, y = total_rides, fill = member_casual)) +
geom_bar(stat = "identity", position = "dodge") + # Bar plot with dodged bars
labs(title = "Total Rides by Bike Type and Membership Type",
x = "Bike Type",
y = "Total Rides") +
theme_minimal() +
scale_fill_manual(values = c("member" = "blue", "casual" = "orange")) # Customize colors
Figure 1: Total rides by bike type for members and casual riders.
These insights suggest that docked bikes are favored by regular users, while electric bikes appeal equally to both membership types.
The following table summarizes the total number of rides for each month, grouped by membership type:
library(dplyr)
rides_per_month <- data_trip %>%
group_by(month_name, member_casual) %>%
summarise(total_rides = n(),.groups = "drop")
The faceted bar chart below visualizes the total rides for each
month, grouped by membership type (member and
casual). Each panel focuses on one membership type,
allowing for clearer identification of trends.
Ride Volume: Members significantly outnumber casual riders in total ride counts, reflecting their regular and committed usage.
Purpose of Usage: The difference in ride patterns suggests that casual riders tend to ride for leisure, while members primarily use the service for daily commuting, such as traveling to school or work.
This analysis shows the total number of rides for each day of the week, split by member and casual rider types
total_rides_by_day <-data_trip %>%
group_by(day_name,member_casual) %>%
summarise(total_ride=n(),.groups = "drop")
The following plot shows the total number of rides for each day of the week, split by membership type (casual and member). The bars are grouped side-by-side to compare the number of rides between casual and member riders for each day.
This analysis categorizes the time of day into four periods: Night, Morning, Afternoon, and Evening. We then group the data by hour of the day, membership type (casual or member), and time of day. The total number of rides for each category is calculated, and the results are summarized. A bar plot is generated to visualize how ride counts vary by time of day and membership type.
# Data preparation:Extract the hour of day from the start_time
library(dplyr)
data_trip$started_at <- as.POSIXct(data_trip$started_at, format = "%m/%d/%Y %H:%M:%S")
data_trip$hour_of_day <- format(data_trip$started_at, "%H")
data_trip$hour_of_day <- as.numeric(data_trip$hour_of_day)#Convert to numeric for easier analysis
# Categorize time of day
data_trip$time_of_day <- case_when(
data_trip$hour_of_day >= 0 & data_trip$hour_of_day < 6 ~ "Night",
data_trip$hour_of_day >= 6 & data_trip$hour_of_day < 12 ~"Morning",
data_trip$hour_of_day >= 12 & data_trip$hour_of_day < 18 ~ "Afternoon",
data_trip$hour_of_day >= 18 & data_trip$hour_of_day <= 23 ~ "Evening"
)
# Group by hour of day and member/casual, then count the number of rides
ride_counts_by_time <- data_trip %>%
group_by(hour_of_day, member_casual,time_of_day) %>%
summarise(total_rides = n(), .groups = "drop") %>%
arrange(hour_of_day, member_casual)
From this analysis, you can gain insights into which times of the day are most popular for both casual and member riders.
This section calculates and visualizes the average ride length by membership type (casual vs member).
#average ride length by each member/casual
average_ride_length <- data_trip %>%
group_by(member_casual) %>%
summarise(avg_ride_length = mean(ride_length, na.rm = TRUE))
#Average ride length per day of week
average_ride_length_per_day <- data_trip %>%
group_by(day_name, member_casual) %>%
summarise(avg_ride_length_per_day = mean(ride_length, na.rm = TRUE),.groups = "drop")
# Reusable function for filtering, grouping, and sorting by station
get_station_data <- function(data, rider_type, station_type) {
data %>%
filter(member_casual == rider_type) %>%
group_by(!!sym(station_type)) %>%
summarise(total_rides = n(), .groups = "drop") %>%
arrange(desc(total_rides))
}
# Get data for casual riders (starting station)
start_station_casual <- get_station_data(data_trip, "casual",
"start_station_name")
# Create a combined data frame for visualization
combined_data <- data.frame(
member_casual = c("member", "casual"),
total_rides = c(sum(data_trip$member_casual == "member"), sum(data_trip$member_casual == "casual")),
avg_ride_length = c(mean(data_trip$ride_length[data_trip$member_casual == "member"], na.rm = TRUE),
mean(data_trip$ride_length[data_trip$member_casual == "casual"], na.rm = TRUE))
)
# Plot the data
ggplot(combined_data, aes(x = member_casual)) +
geom_bar(aes(y = total_rides), stat = "identity", fill = "skyblue", alpha = 0.7) +
geom_point(aes(y = avg_ride_length * 10), color = "red", size = 4) +
scale_y_continuous(
name = "Total Rides",
sec.axis = sec_axis(~ . / 10, name = "Avg Ride Length (scaled)")
) +
labs(title = "Total Rides and Average Ride Length by Membership Type") +
theme_minimal()