#load packages and libraries
install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", "scales"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependencies 'terra', 'raster'
## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'terra' had non-zero exit status
## Warning in install.packages(c("tidyverse", "lubridate", "ggplot2", "leaflet", :
## installation of package 'raster' had non-zero exit status
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(leaflet)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
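I did have to install tidyverse a second time (I'm not sure why; the first `install.packages()` call did warn that the 'terra' and 'raster' dependencies failed to install). On to the next step.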
### Then we load the dataset
# Read the raw trip records from the CSV file into `df`.
# NOTE(review): readr reported parsing issues for this file; run `problems(df)`
# to inspect the affected rows before relying on them.
df <- read_csv(file = "Cycle_trip.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 426887 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): started_at, ended_at, start_station_name, end_station_name, member...
## dbl (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Parse the timestamp columns, then derive ride duration and calendar features.
df <- df %>%
  # Both timestamp columns were read as character, so parse them first.
  mutate(across(c(started_at, ended_at), ymd_hms)) %>%
  mutate(
    # Ride duration in minutes, recomputed from the parsed timestamps
    # (this overwrites the `ride_length` column that came from the CSV).
    ride_length = as.numeric(difftime(ended_at, started_at, units = "mins")),
    # Full (unabbreviated) weekday label of the ride start.
    day_of_week = wday(started_at, label = TRUE, abbr = FALSE),
    # Hour component of the ride start.
    hour_of_day = hour(started_at)
  )
The power of correct data types should not be underestimated. Also, the data needs some cleaning:
# Drop implausible rides: keep only durations strictly longer than 1 minute
# and strictly shorter than 24 hours (1440 minutes).
df <- df %>%
  filter(ride_length > 1 & ride_length < 1440)
# Ride-duration summary per user type: mean, median, and number of rides.
summarise(
  group_by(df, member_casual),
  avg_duration = mean(ride_length),
  median_duration = median(ride_length),
  count = n()
)
## # A tibble: 2 × 4
## member_casual avg_duration median_duration count
## <chr> <dbl> <dbl> <int>
## 1 casual 40.1 23.1 44302
## 2 member 11.6 8.65 374599
# Bar chart of the average ride duration (minutes) for each weekday,
# with one dodged bar per user type.
df %>%
  group_by(member_casual, day_of_week) %>%
  # Drop the grouping explicitly: the plot needs a plain data frame, and this
  # avoids the "grouped output" message summarise() otherwise prints.
  summarise(avg_ride_length = mean(ride_length), .groups = "drop") %>%
  ggplot(aes(x = day_of_week, y = avg_ride_length, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Ride Duration by Day and User Type",
    y = "Minutes",
    x = "Day of Week"
  )
# Line chart of the number of rides started in each hour of the day,
# with one line per user type.
df %>%
  group_by(hour_of_day, member_casual) %>%
  # Drop the grouping explicitly so no residual grouping (or the related
  # summarise() message) leaks into the plotting step.
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = hour_of_day, y = count, color = member_casual)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0).
  geom_line(linewidth = 1.2) +
  labs(title = "Rides by Hour of Day", x = "Hour", y = "Ride Count")
# Count rides per start station (and its coordinates) and user type, then keep
# the 10 station/user-type combinations with the most rides.
top_stations <- df %>%
  group_by(start_station_name, start_lat, start_lng, member_casual) %>%
  # `.groups = "drop"` matters here: by default summarise() leaves the result
  # grouped by station name/lat/lng, so slice_max() would pick the "top 10"
  # inside every station group and therefore never discard anything.
  summarise(rides = n(), .groups = "drop") %>%
  # slice_max() returns rows in descending order of `rides`, so no separate
  # arrange() is needed. Ties at the cutoff are kept (with_ties = TRUE).
  # NOTE(review): add `by = member_casual` if the intent is the top 10
  # stations for each user type rather than the top 10 overall.
  slice_max(order_by = rides, n = 10)
# Interactive leaflet map of the rows in `top_stations`: one circle marker per
# row, sized by the square root of its ride count and coloured by user type
# (red = casual, blue = everything else), with a matching legend.
leaflet(data = top_stations) %>%
  addTiles() %>%
  addCircleMarkers(
    lat = ~start_lat,
    lng = ~start_lng,
    label = ~paste(start_station_name, "-", rides, "rides"),
    color = ~ifelse(member_casual == "casual", "red", "blue"),
    radius = ~sqrt(rides),
    fillOpacity = 0.7
  ) %>%
  addLegend(
    position = "bottomright",
    colors = c("blue", "red"),
    labels = c("Member", "Casual"),
    title = "User Type"
  )