Introduction

This report analyzes bike ride data from multiple CSV files. The analysis includes data cleaning, summary statistics, and visualizations of ride patterns based on membership type.

Load Libraries

library(readr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ stringr   1.5.2
## ✔ forcats   1.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(lubridate)

Data

Data Sources

Cyclistic’s historical bike trip data, made publicly available by Motivate International Inc., was used for this analysis.
• Data link: Divvy Trip Data (public dataset)
• License: the data is made available under the Divvy Data License Agreement.
• Timeframe: the most recent 12 months of ride data (2024/10 to 2025/9) were downloaded to ensure recent and relevant insights.

Load and Merge Data

Set the working directory and read all CSV files into a single data frame.

# Folder holding the trip CSV exports. It is kept in a variable and passed to
# list.files() so the session's working directory is never changed as a side
# effect of loading the data.
data_dir <- "~/my R files/case study cvc"
files <- list.files(path = data_dir, pattern = "\\.csv$", full.names = TRUE)
# Fail early with a clear message rather than binding an empty list into an
# empty table that only breaks further down.
if (length(files) == 0) {
  stop("No CSV files found in ", data_dir, call. = FALSE)
}
# Read every file with identical settings and stack the rows into one table.
full_data <- files %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows()

Explore the data

# Preview the first rows of the merged data.
head(full_data)
##            ride_id rideable_type              started_at
## 1 4422E707103AA4FF electric_bike 2024-10-14 03:26:04.083
## 2 19DB722B44CBE82F electric_bike 2024-10-13 19:33:38.926
## 3 20AE2509FD68C939 electric_bike 2024-10-13 23:40:48.522
## 4 D0F17580AB9515A9 electric_bike 2024-10-14 02:13:41.602
## 5 A114A483941288D1 electric_bike 2024-10-13 19:26:41.383
## 6 97833F00E6A67DC6 electric_bike 2024-10-14 06:35:06.130
##                  ended_at start_station_name start_station_id end_station_name
## 1 2024-10-14 03:32:56.535                                                     
## 2 2024-10-13 19:39:04.490                                                     
## 3 2024-10-13 23:48:02.339                                                     
## 4 2024-10-14 02:25:40.057                                                     
## 5 2024-10-13 19:28:18.560                                                     
## 6 2024-10-14 06:42:10.776                                                     
##   end_station_id start_lat start_lng end_lat end_lng member_casual
## 1                    41.96    -87.65   41.98  -87.67        member
## 2                    41.98    -87.67   41.97  -87.66        member
## 3                    41.97    -87.66   41.95  -87.65        member
## 4                    41.95    -87.65   41.96  -87.65        member
## 5                    41.98    -87.67   41.98  -87.67        member
## 6                    41.88    -87.65   41.89  -87.64        member
# List the column names of the merged data.
colnames(full_data)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# Show the dimensions plus each column's type and first few values.
str(full_data)
## 'data.frame':    5539521 obs. of  13 variables:
##  $ ride_id           : chr  "4422E707103AA4FF" "19DB722B44CBE82F" "20AE2509FD68C939" "D0F17580AB9515A9" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2024-10-14 03:26:04.083" "2024-10-13 19:33:38.926" "2024-10-13 23:40:48.522" "2024-10-14 02:13:41.602" ...
##  $ ended_at          : chr  "2024-10-14 03:32:56.535" "2024-10-13 19:39:04.490" "2024-10-13 23:48:02.339" "2024-10-14 02:25:40.057" ...
##  $ start_station_name: chr  "" "" "" "" ...
##  $ start_station_id  : chr  "" "" "" "" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  42 42 42 42 42 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  42 42 42 42 42 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

Data Cleaning

Check for duplicates, missing values, and invalid ride times.

Duplicate ride IDs

# Count rows whose ride_id repeats an earlier row; 0 means every ID is unique.
sum(duplicated(full_data$ride_id))
## [1] 0

Missing ride IDs

# Count rows where ride_id is NA or an empty string.
sum(is.na(full_data$ride_id) | full_data$ride_id == "")
## [1] 0

Length of ride IDs

# Store each ride ID's character count, then tabulate how many rows fall
# under each distinct length.
full_data <- mutate(full_data, length_id = nchar(ride_id))
table(full_data$length_id)
## 
##      16 
## 5539521
# Longest ride_id, in characters.
max(full_data$length_id) 
## [1] 16
# Shortest ride_id, in characters.
min(full_data$length_id)
## [1] 16

Missing started_at or ended_at

# Count rows whose start timestamp is NA or an empty string.
sum(is.na(full_data$started_at) | full_data$started_at == "")
## [1] 0
# Count rows whose end timestamp is NA or an empty string.
sum(is.na(full_data$ended_at) | full_data$ended_at == "")
## [1] 0

Convert to datetime

# Parse both timestamp columns from text into date-times in a single step.
full_data <- full_data %>%
  mutate(across(c(started_at, ended_at), ymd_hms))

Calculations

Ride length

# Ride duration as a difftime pinned to seconds. Plain subtraction lets R
# pick the unit from the data, so the column could silently come back in
# minutes or hours on a different extract.
full_data <- full_data %>%
  mutate(ride_length = difftime(ended_at, started_at, units = "secs"))

Remove invalid rides

# Keep only rides that end at or after the moment they start; rows where the
# comparison evaluates to NA are dropped as well.
full_data <- filter(full_data, ended_at >= started_at)

Extract month and weekday

# Derive calendar fields from the ride's start time. week_start is passed to
# both wday() calls so the numeric index and the ordering of the weekday
# labels always agree (Sunday first), whatever the lubridate.week.start
# option happens to be set to.
full_data <- full_data %>%
  mutate(
    month_name = month(started_at, label = TRUE, abbr = FALSE),
    weekday_num = wday(started_at, week_start = 7),
    weekday_name = wday(
      started_at,
      label = TRUE, abbr = FALSE, week_start = 7
    )
  )

Summary Statistics

Define a mode function and calculate summary statistics.

# Return the most frequent value of `x` (the statistical mode).
#
# Args:
#   x: An atomic vector or factor.
#   na.rm: If TRUE, missing values are dropped before counting. Defaults to
#     FALSE, in which case NA is counted like any other value and can itself
#     be returned as the mode.
#
# Returns:
#   A length-one vector of the same type as `x`. When several values tie,
#   the one that appears first in `x` wins. Empty input yields NA.
get_mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps every element to its position in `ux`; tabulate() then
  # counts the occurrences of each position.
  ux[which.max(tabulate(match(x, ux)))]
}
# Overall shortest, longest and mean ride length, plus the most common
# weekday across all rides.
summary_stats <- summarise(
  full_data,
  min_ride_length = min(ride_length, na.rm = TRUE),
  max_ride_length = max(ride_length, na.rm = TRUE),
  avg_ride_length = mean(ride_length, na.rm = TRUE),
  mode_weekday = get_mode(weekday_name)
)
summary_stats
##   min_ride_length max_ride_length avg_ride_length mode_weekday
## 1      0.046 secs   94494.01 secs   963.5568 secs     Saturday

Grouped summary by membership type:

# Ride-length summary per membership type. Durations are converted to
# numeric seconds once up front, then scaled to the unit named in each
# output column.
summary_by_member <- full_data %>%
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(member_casual) %>%
  summarise(
    min_ride_length_seconds = min(ride_secs, na.rm = TRUE),
    max_ride_length_hours = max(ride_secs, na.rm = TRUE) / 3600,
    avg_ride_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    mode_weekday = get_mode(weekday_name),
    .groups = "drop"
  )
summary_by_member
## # A tibble: 2 × 5
##   member_casual min_ride_length_seconds max_ride_length_hours
##   <chr>                           <dbl>                 <dbl>
## 1 casual                         0.0460                  26.2
## 2 member                         0.0780                  25.0
## # ℹ 2 more variables: avg_ride_length_minutes <dbl>, mode_weekday <ord>

Visualizations

Number of Rides by Membership

Daily

# Ride counts per membership type, drawn as one panel per weekday.
ggplot(full_data, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  facet_wrap(~weekday_name) +
  labs(
    title = "Daily View",
    subtitle = "Membership type vs Number of Rides",
    y = "Number of Rides",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Monthly

# Ride counts per membership type, drawn as one panel per month.
ggplot(full_data, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  facet_wrap(~month_name) +
  labs(
    title = "Monthly View",
    subtitle = "Membership type vs Number of Rides",
    y = "Number of Rides",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Average Ride Length

Daily

# Ride-length statistics for every weekday / membership combination.
# Durations are converted to numeric seconds once, then scaled to the unit
# named in each output column.
Daily_data <- full_data %>%
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(weekday_name, member_casual) %>%
  summarise(
    min_ride_Daily_length_seconds = min(ride_secs, na.rm = TRUE),
    max_ride_Daily_length_hours = max(ride_secs, na.rm = TRUE) / 3600,
    avg_ride_Daily_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    .groups = "drop"
  )
# Side-by-side bars of mean ride length per weekday. The title says
# "Daily View" so it agrees with the "Daily" section heading and with the
# per-weekday ride-count chart, which carries the same title.
ggplot(
  Daily_data,
  aes(
    x = weekday_name,
    y = avg_ride_Daily_length_minutes,
    fill = member_casual
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Daily View",
    subtitle = "Membership type vs Average Ride length ",
    y = "Average Ride length in minutes",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Monthly

# Mean ride length in minutes for every month / membership combination.
Monthly_data <- full_data %>%
  mutate(ride_secs = as.numeric(ride_length, units = "secs")) %>%
  group_by(month_name, member_casual) %>%
  summarise(
    avg_Monthly_ride_length_minutes = mean(ride_secs, na.rm = TRUE) / 60,
    .groups = "drop"
  )
# Side-by-side bars comparing the two membership types within each month.
ggplot(
  Monthly_data,
  aes(
    x = month_name,
    y = avg_Monthly_ride_length_minutes,
    fill = member_casual
  )
) +
  geom_col(position = "dodge") +
  labs(
    title = "Monthly View ",
    subtitle = "Membership type vs Average Ride length ",
    y = "Average Ride length in minutes",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Total Ride Length by Membership

# Total time ridden per membership type, converted from seconds to days.
Total_data <- full_data %>%
  group_by(member_casual) %>%
  summarise(
    Total_ride_length_Days =
      sum(as.numeric(ride_length, units = "secs"), na.rm = TRUE) /
        (60 * 60 * 24),
    .groups = "drop"
  )
# The y axis keeps its default zero baseline. Bar height encodes the value,
# so starting the axis at 25000 visually overstated the gap between the two
# membership types.
ggplot(
  Total_data,
  aes(x = member_casual, y = Total_ride_length_Days, fill = member_casual)
) +
  geom_col() +
  labs(
    title = "Total View ",
    subtitle = "Membership type vs Total Ride length ",
    y = "Total Ride length in days",
    caption = "Data collected from 2024/10 to 2025/9"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Conclusion

This analysis provides insights into ride behavior by membership type, day, and month. The data cleaning process ensured reliable calculations of ride lengths and patterns.