This report analyzes bike ride data from multiple CSV files. The analysis includes data cleaning, summary statistics, and visualizations of ride patterns based on membership type.
library(readr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ stringr 1.5.2
## ✔ forcats 1.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(lubridate)
Cyclistic’s historical bike trip data, provided publicly by Motivate International Inc., was used for this analysis.
• Data Link: Divvy Trip Data (Public Dataset)
• License: The data is made available under the Divvy Data License Agreement.
• Timeframe: The most recent 12 months of ride data (2024/10 to 2025/9) were downloaded to ensure recent and relevant insights.
Point R at the data directory and read all monthly CSV files into a single data frame.
# Directory that holds the monthly trip CSV exports.
data_dir <- "~/my R files/case study cvc"

# Resolve full paths instead of calling setwd(): changing the working
# directory is a session-wide side effect that silently affects every later
# relative path in the session.
files <- list.files(path = data_dir, pattern = "\\.csv$", full.names = TRUE)

# Fail fast with a clear message; otherwise an empty file list would produce
# an empty data frame and a confusing error much further down.
if (length(files) == 0) {
  stop("No CSV files found in ", data_dir, call. = FALSE)
}

# Read every file with base read.csv() and stack the results row-wise.
full_data <- files %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows()

# Preview the first rows of the combined data.
head(full_data)
## ride_id rideable_type started_at
## 1 4422E707103AA4FF electric_bike 2024-10-14 03:26:04.083
## 2 19DB722B44CBE82F electric_bike 2024-10-13 19:33:38.926
## 3 20AE2509FD68C939 electric_bike 2024-10-13 23:40:48.522
## 4 D0F17580AB9515A9 electric_bike 2024-10-14 02:13:41.602
## 5 A114A483941288D1 electric_bike 2024-10-13 19:26:41.383
## 6 97833F00E6A67DC6 electric_bike 2024-10-14 06:35:06.130
## ended_at start_station_name start_station_id end_station_name
## 1 2024-10-14 03:32:56.535
## 2 2024-10-13 19:39:04.490
## 3 2024-10-13 23:48:02.339
## 4 2024-10-14 02:25:40.057
## 5 2024-10-13 19:28:18.560
## 6 2024-10-14 06:42:10.776
## end_station_id start_lat start_lng end_lat end_lng member_casual
## 1 41.96 -87.65 41.98 -87.67 member
## 2 41.98 -87.67 41.97 -87.66 member
## 3 41.97 -87.66 41.95 -87.65 member
## 4 41.95 -87.65 41.96 -87.65 member
## 5 41.98 -87.67 41.98 -87.67 member
## 6 41.88 -87.65 41.89 -87.64 member
# List the column names of the combined data frame.
colnames(full_data)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
# Inspect the row count and the type of every column; the timestamp columns
# are still character at this point and are parsed further below.
str(full_data)
## 'data.frame': 5539521 obs. of 13 variables:
## $ ride_id : chr "4422E707103AA4FF" "19DB722B44CBE82F" "20AE2509FD68C939" "D0F17580AB9515A9" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2024-10-14 03:26:04.083" "2024-10-13 19:33:38.926" "2024-10-13 23:40:48.522" "2024-10-14 02:13:41.602" ...
## $ ended_at : chr "2024-10-14 03:32:56.535" "2024-10-13 19:39:04.490" "2024-10-13 23:48:02.339" "2024-10-14 02:25:40.057" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 42 42 42 42 42 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num 42 42 42 42 42 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
Check for duplicates, missing values, and invalid ride times.
# Count ride_id values that repeat an earlier row (0 means every id is unique).
sum(duplicated(full_data$ride_id))
## [1] 0
# Count ride_id values that are missing or blank.
sum(is.na(full_data$ride_id) | full_data$ride_id == "")
## [1] 0
# Store the character length of each ride_id in a helper column so the
# length distribution can be inspected below.
full_data <- full_data %>%
mutate(length_id = nchar(ride_id))
table(full_data$length_id) # Frequency of each distinct ride_id length
##
## 16
## 5539521
# Longest and shortest ride_id; equal values mean all ids share one length.
max(full_data$length_id)
## [1] 16
min(full_data$length_id)
## [1] 16
# Count start timestamps that are missing or blank (still character here).
sum(is.na(full_data$started_at) | full_data$started_at == "")
## [1] 0
# Count end timestamps that are missing or blank (still character here).
sum(is.na(full_data$ended_at) | full_data$ended_at == "")
## [1] 0
# Parse the timestamps, derive the ride duration, drop invalid rides, and add
# the calendar columns used by the summaries and plots below.
#
# NOTE(review): ymd_hms() parses into UTC by default; the source timestamps
# are presumably local wall-clock time. Weekday/month labels are unaffected,
# but durations spanning a DST change may be off by an hour -- confirm whether
# that matters for this analysis.
full_data <- full_data %>%
  mutate(
    started_at = ymd_hms(started_at),
    ended_at = ymd_hms(ended_at),
    # Pin the unit explicitly: `ended_at - started_at` lets difftime pick
    # secs/mins/hours/days from the smallest difference in the data, so the
    # unit of ride_length would otherwise depend on the rows loaded.
    ride_length = difftime(ended_at, started_at, units = "secs")
  ) %>%
  # Keep only rides that do not end before they start; rows whose timestamps
  # failed to parse (NA) are dropped by this comparison as well.
  filter(ended_at >= started_at) %>%
  mutate(
    month_name = month(started_at, label = TRUE, abbr = FALSE),
    # week_start = 7 numbers the days with Sunday as day 1.
    weekday_num = wday(started_at, week_start = 7),
    weekday_name = wday(started_at, label = TRUE, abbr = FALSE)
  )
Define a mode function and calculate summary statistics.
#' Most frequent value of a vector
#'
#' Ties are resolved in favour of the value that appears first in `x`.
#'
#' @param x A vector (atomic or factor).
#' @param na.rm If `TRUE`, missing values are removed before counting. With
#'   the default `FALSE`, `NA` is counted like any other value.
#' @return A length-one vector of the same type as `x`: the most frequent
#'   value, or a typed `NA` when there is nothing to count.
get_mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  # Indexing with NA yields a single missing value of x's own type (factor
  # levels included), so callers such as summarise() always get length one.
  if (length(x) == 0L) {
    return(x[NA_integer_])
  }
  distinct_values <- unique(x)
  # match() maps each element to the position of its distinct value, and
  # tabulate() counts how often each position occurs.
  counts <- tabulate(match(x, distinct_values))
  distinct_values[which.max(counts)]
}
# Overall ride-length extremes and mean (kept as difftime values) together
# with the weekday on which the most rides started.
summary_stats <- summarise(
  full_data,
  min_ride_length = min(ride_length, na.rm = TRUE),
  max_ride_length = max(ride_length, na.rm = TRUE),
  avg_ride_length = mean(ride_length, na.rm = TRUE),
  mode_weekday = get_mode(weekday_name)
)

# Print the one-row summary.
summary_stats
## min_ride_length max_ride_length avg_ride_length mode_weekday
## 1 0.046 secs 94494.01 secs 963.5568 secs Saturday
# Same summary as above, split by membership type and reported in
# human-friendly units (seconds / hours / minutes).
summary_by_member <- full_data %>%
  # Convert the duration to plain seconds once instead of once per statistic.
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(member_casual) %>%
  summarise(
    min_ride_length_seconds = min(ride_secs, na.rm = TRUE),
    max_ride_length_hours = max(ride_secs, na.rm = TRUE) / 3600,
    avg_ride_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    mode_weekday = get_mode(weekday_name),
    .groups = "drop"
  )

# Print the per-membership summary.
summary_by_member
## # A tibble: 2 × 5
## member_casual min_ride_length_seconds max_ride_length_hours
## <chr> <dbl> <dbl>
## 1 casual 0.0460 26.2
## 2 member 0.0780 25.0
## # ℹ 2 more variables: avg_ride_length_minutes <dbl>, mode_weekday <ord>
# Ride counts per membership type, one panel per weekday.
ggplot(full_data, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  facet_wrap(vars(weekday_name)) +
  labs(
    title = "Daily View",
    subtitle = "Membership type vs Number of Rides",
    y = "Number of Rides",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  # Tilt the x-axis labels so they stay readable inside the small panels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Ride counts per membership type, one panel per calendar month.
ggplot(full_data, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  facet_wrap(vars(month_name)) +
  labs(
    title = "Monthly View",
    subtitle = "Membership type vs Number of Rides",
    y = "Number of Rides",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  # Tilt the x-axis labels so they stay readable inside the small panels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Ride-length statistics for every weekday / membership-type combination.
Daily_data <- full_data %>%
  # Convert the duration to plain seconds once instead of once per statistic.
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(weekday_name, member_casual) %>%
  summarise(
    min_ride_Daily_length_seconds = min(ride_secs, na.rm = TRUE),
    max_ride_Daily_length_hours = max(ride_secs, na.rm = TRUE) / 3600,
    avg_ride_Daily_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    .groups = "drop"
  )
# Average ride length per weekday, with the two membership types side by side.
ggplot(
  Daily_data,
  aes(
    x = weekday_name,
    y = avg_ride_Daily_length_minutes,
    fill = member_casual
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Weekly View ",
    subtitle = "Membership type vs Average Ride length ",
    y = "Average Ride length in minutes",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Average ride length in minutes for every month / membership-type combination.
Monthly_data <- full_data %>%
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(month_name, member_casual) %>%
  summarise(
    avg_Monthly_ride_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    .groups = "drop"
  )
# Average ride length per month, with the two membership types side by side.
ggplot(
  Monthly_data,
  aes(
    x = month_name,
    y = avg_Monthly_ride_length_minutes,
    fill = member_casual
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Monthly View ",
    subtitle = "Membership type vs Average Ride length ",
    y = "Average Ride length in minutes",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total time spent riding per membership type, expressed in days.
Total_data <- full_data %>%
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(member_casual) %>%
  summarise(
    # 86400 = 60 * 60 * 24 seconds in one day.
    Total_ride_length_Days = sum(ride_secs, na.rm = TRUE) / 86400,
    .groups = "drop"
  )
# Total ride time (in days) per membership type.
#
# The y-axis deliberately starts at zero. The previous
# coord_cartesian(ylim = c(25000, NA)) cut off the base of the bars, so their
# visible heights no longer reflected the totals (the gap between groups
# looked far larger than it is) and any bar below 25000 would have vanished.
ggplot(
  Total_data,
  aes(x = member_casual, y = Total_ride_length_Days, fill = member_casual)
) +
  geom_col() +
  labs(
    title = "Total View ",
    subtitle = "Membership type vs Total Ride length ",
    y = "Total Ride length in days",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))