First, we install and load the required packages

# Install only the required packages that cannot already be loaded, so that
# re-running the analysis does not reinstall everything each time.
required_pkgs <- c("tidyverse", "lubridate", "ggplot2", "leaflet", "scales")
is_available <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_available]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

## also installing the dependencies 'terra', 'raster'

## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'terra' had non-zero exit status

## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'raster' had non-zero exit status

# Retry the tidyverse install only if the package still cannot be loaded,
# instead of reinstalling it unconditionally.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(leaflet)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

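I had to install tidyverse a second time before loading it. I am not certain why, but the first `install.packages()` call reported failed installs for the 'terra' and 'raster' dependencies, which may be related. On to the next step.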

### Then we load the dataset

# Read the cycle trip CSV into `df`; readr guesses the column types.
df <- read_csv(file = "Cycle_trip.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): started_at, ended_at, start_station_name, end_station_name, member...
## dbl  (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Time to make sure everything is the correct data type

# Step 1: parse the start/end timestamps (read in as text) into date-times.
# Step 2: derive the ride length in minutes, the weekday name and the
# start hour from the parsed timestamps.
df <- df %>%
  mutate(
    started_at = ymd_hms(started_at),
    ended_at = ymd_hms(ended_at)
  ) %>%
  mutate(
    ride_length = as.numeric(difftime(ended_at, started_at, units = "mins")),
    day_of_week = wday(started_at, label = TRUE, abbr = FALSE),
    hour_of_day = hour(started_at)
  )

The power of correct data types should not be underestimated. Next, we filter out rides with unrealistic durations.

# Keep only plausible rides: strictly longer than 1 minute and strictly
# shorter than 24 hours (1440 minutes).
df <- filter(df, ride_length > 1 & ride_length < 1440)

Analysis time, run from it, hide from it, analysis arrives all the same

# Ride duration summary per user type: mean, median and number of rides
summarise(
  group_by(df, member_casual),
  avg_duration = mean(ride_length),
  median_duration = median(ride_length),
  count = n()
)

## # A tibble: 2 × 4
##   member_casual avg_duration median_duration  count
##   <chr>                <dbl>           <dbl>  <int>
## 1 casual                40.1           23.1   44302
## 2 member                11.6            8.65 374599

Average ride length by day of week

# Mean ride duration (minutes) for each user type and weekday, drawn as
# side-by-side bars.
df %>%
  group_by(member_casual, day_of_week) %>%
  # Drop the grouping explicitly so no residual groups (or the summarise
  # grouping message) carry over into the plot.
  summarise(avg_ride_length = mean(ride_length), .groups = "drop") %>%
  ggplot(aes(x = day_of_week, y = avg_ride_length, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Ride Duration by Day and User Type",
    y = "Minutes",
    x = "Day of Week"
  )

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Ride count by hour of day

# Number of rides started in each hour of the day, one line per user type.
df %>%
  group_by(hour_of_day, member_casual) %>%
  # Drop the grouping explicitly so no residual groups (or the summarise
  # grouping message) carry over into the plot.
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = hour_of_day, y = count, color = member_casual)) +
  # `linewidth` replaces the `size` aesthetic for lines, which is
  # deprecated since ggplot2 3.4.0.
  geom_line(linewidth = 1.2) +
  labs(title = "Rides by Hour of Day", x = "Hour", y = "Ride Count")

## `summarise()` has grouped output by 'hour_of_day'. You can override using the
## `.groups` argument.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Top 10 start stations

# Count rides per start station (name + coordinates) and user type, then keep
# the 10 busiest station/user-type combinations overall, busiest first.
# slice_max() keeps ties by default, so more than 10 rows are possible when
# counts tie at the cutoff.
top_stations <- df %>%
  group_by(start_station_name, start_lat, start_lng, member_casual) %>%
  # Drop the grouping here: otherwise the result stays grouped by station and
  # slice_max() runs inside each station group, returning every row instead
  # of a top 10.
  summarise(rides = n(), .groups = "drop") %>%
  slice_max(order_by = rides, n = 10) %>%
  arrange(desc(rides))

## `summarise()` has grouped output by 'start_station_name', 'start_lat',
## 'start_lng'. You can override using the `.groups` argument.

# Interactive leaflet map of `top_stations`: one circle per row, sized by the
# square root of its ride count and coloured red for casual riders, blue
# otherwise. Hovering a circle shows the station name and ride count.
leaflet(data = top_stations) %>%
  addTiles() %>%
  addCircleMarkers(
    lat = ~start_lat,
    lng = ~start_lng,
    label = ~paste(start_station_name, "-", rides, "rides"),
    color = ~ifelse(member_casual == "casual", "red", "blue"),
    radius = ~sqrt(rides),
    fillOpacity = 0.7
  ) %>%
  addLegend(
    position = "bottomright",
    title = "User Type",
    colors = c("blue", "red"),
    labels = c("Member", "Casual")
  )

cycle_r_casestudy

KBRC

2025-07-19

First, we install and load the required packages

Time to make sure everything is the correct data type

Analysis time, run from it, hide from it, analysis arrives all the same

Average ride length by day of week

Ride count by hour of day

Top 10 start stations

End