First, we load the packages.

#load packages and libraries
install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", "scales"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependencies 'terra', 'raster'
## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'terra' had non-zero exit status
## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'raster' had non-zero exit status
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(leaflet)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

I did have to install tidyverse twice (not sure why); on to the next one.

### Then we load the dataset

# Read the trip records from the CSV file into `df`
df <- read_csv(file = "Cycle_trip.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): started_at, ended_at, start_station_name, end_station_name, member...
## dbl  (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Time to make sure every column has the correct data type.

# Parse the timestamp columns and derive the per-ride fields used below
df <- df %>%
  mutate(
    # started_at / ended_at are read in as text; parse both with ymd_hms()
    across(c(started_at, ended_at), ymd_hms),
    # ride duration in minutes, recomputed from the parsed timestamps
    ride_length = as.numeric(difftime(ended_at, started_at, units = "mins")),
    # unabbreviated weekday label and hour taken from the ride's start time
    day_of_week = wday(started_at, label = TRUE, abbr = FALSE),
    hour_of_day = hour(started_at)
  )

The power of correct data types cannot be overstated. Also...

# Keep only rides longer than 1 minute and shorter than 24 hours (1440 minutes)
df <- df %>%
  filter(1 < ride_length & ride_length < 1440)

Analysis time. Run from it, hide from it; analysis arrives all the same.

# Per rider type: mean and median ride duration (minutes) and number of rides
group_by(df, member_casual) %>%
  summarise(
    avg_duration = mean(ride_length),
    median_duration = median(ride_length),
    count = n()
  )
## # A tibble: 2 × 4
##   member_casual avg_duration median_duration  count
##   <chr>                <dbl>           <dbl>  <int>
## 1 casual                40.1           23.1   44302
## 2 member                11.6            8.65 374599

Average ride length by day of week

# Mean ride duration for every rider type / weekday pair, as side-by-side bars
df %>%
  group_by(member_casual, day_of_week) %>%
  summarise(avg_ride_length = mean(ride_length)) %>%
  ggplot(mapping = aes(day_of_week, avg_ride_length, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    x = "Day of Week",
    y = "Minutes",
    title = "Average Ride Duration by Day and User Type"
  )
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Ride count by hour of day

# Number of rides starting in each hour of the day, one line per rider type
df %>%
  group_by(hour_of_day, member_casual) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = hour_of_day, y = count, color = member_casual)) +
  # `linewidth` replaces `size`, which was deprecated for lines in ggplot2 3.4.0
  geom_line(linewidth = 1.2) +
  labs(title = "Rides by Hour of Day", x = "Hour", y = "Ride Count")
## `summarise()` has grouped output by 'hour_of_day'. You can override using the
## `.groups` argument.

Top 10 start stations

# Count rides per start station (name + coordinates) and rider type, then keep
# the 10 busiest station / rider-type combinations overall.
#
# `.groups = "drop"` matters: without it the summary stays grouped by station
# name and coordinates, so `slice_max()` picks the "top 10" inside each station
# group (at most one row per rider type) and nothing is filtered out.
top_stations <- df %>%
  group_by(start_station_name, start_lat, start_lng, member_casual) %>%
  summarise(rides = n(), .groups = "drop") %>%
  slice_max(order_by = rides, n = 10) %>%
  arrange(desc(rides))

# Interactive leaflet map of `top_stations`: one circle per row, with radius
# sqrt(rides); rows for casual riders are drawn in red, all others in blue.
top_stations %>%
  leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    lat = ~start_lat,
    lng = ~start_lng,
    radius = ~sqrt(rides),
    color = ~ifelse(member_casual == "casual", "red", "blue"),
    fillOpacity = 0.7,
    label = ~paste(start_station_name, "-", rides, "rides")
  ) %>%
  addLegend(
    position = "bottomright",
    colors = c("blue", "red"),
    labels = c("Member", "Casual"),
    title = "User Type"
  )

End