Making sure I have these R packages (bigrquery, dplyr, ggplot2, lubridate, readr) installed:
install.packages(c("bigrquery", "dplyr", "ggplot2", "lubridate", "readr"))
## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
Load them:
library(bigrquery)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
divvy_2019 <- read_csv("divvy_2019_q1_clean.csv")
## Rows: 365069 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): started_at, ended_at, start_station_name, end_station_name, member_...
## dbl (6): ride_id, start_station_id, end_station_id, ride_length_seconds, day...
## lgl (1): rideable_type
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
divvy_2020 <- read_csv("divvy_2020_q1_clean.csv")
## Rows: 426677 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, started_at, ended_at, start_station_name, end_station_name...
## dbl (4): start_station_id, end_station_id, ride_length_seconds, day_of_week
## lgl (2): gender, birthyear
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(divvy_2019)
## Rows: 365,069
## Columns: 13
## $ ride_id <dbl> 21960544, 21758497, 22102594, 21928696, 22136306, …
## $ started_at <chr> "2019-02-24 14:45:32 UTC", "2019-01-04 12:31:59 UT…
## $ ended_at <chr> "2019-02-24 15:02:54 UTC", "2019-01-04 12:39:50 UT…
## $ start_station_name <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "Ben…
## $ start_station_id <dbl> 456, 456, 596, 454, 454, 461, 461, 461, 458, 458, …
## $ end_station_name <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "211…
## $ end_station_id <dbl> 456, 456, 456, 456, 456, 456, 456, 456, 456, 456, …
## $ member_casual <chr> "member", "casual", "member", "member", "member", …
## $ ride_length_seconds <dbl> 1042, 471, 2038, 527, 532, 700, 893, 780, 496, 438…
## $ day_of_week <dbl> 1, 6, 5, 6, 3, 7, 7, 2, 4, 4, 4, 2, 5, 5, 3, 4, 5,…
## $ gender <chr> "Male", NA, "Male", "Male", "Male", "Male", "Male"…
## $ birthyear <dbl> 1990, NA, 1990, 1974, 1990, 1960, 1960, NA, 1974, …
## $ rideable_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
glimpse(divvy_2020)
## Rows: 426,677
## Columns: 13
## $ ride_id <chr> "F5543D0167308145", "57E5D44302ED1D9A", "94E4474C1…
## $ started_at <chr> "2020-02-23 17:13:48 UTC", "2020-03-15 15:58:08 UT…
## $ ended_at <chr> "2020-02-23 17:44:02 UTC", "2020-03-15 16:17:36 UT…
## $ start_station_name <chr> "Clark St & Grace St", "Western Ave & Leland Ave",…
## $ start_station_id <dbl> 165, 239, 243, 244, 294, 295, 299, 326, 344, 344, …
## $ end_station_name <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "211…
## $ end_station_id <dbl> 456, 456, 456, 456, 456, 456, 456, 456, 456, 456, …
## $ member_casual <chr> "member", "member", "member", "member", "member", …
## $ ride_length_seconds <dbl> 1814, 1168, 1010, 1336, 749, 1184, 2273, 1267, 691…
## $ day_of_week <dbl> 1, 1, 2, 4, 1, 3, 6, 7, 3, 4, 2, 3, 1, 3, 1, 2, 5,…
## $ gender <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ birthyear <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ rideable_type <chr> "docked_bike", "docked_bike", "docked_bike", "dock…
No cleaning — only analysis-ready enhancements
Note:
The same transformations are applied to the 2019 and 2020 datasets to
ensure consistency. This step converts the ride start time to a
date-time format, calculates ride duration in minutes, and extracts the
day of the week and hour of day for time-based analysis.
divvy_2019 <- divvy_2019 %>%
mutate(
started_at = as.POSIXct(started_at),
ride_length_min = ride_length_seconds / 60,
day_of_week_label = weekdays(started_at),
hour_of_day = hour(started_at)
)
divvy_2020 <- divvy_2020 %>%
mutate(
started_at = as.POSIXct(started_at),
ride_length_min = ride_length_seconds / 60,
day_of_week_label = weekdays(started_at),
hour_of_day = hour(started_at)
)
Total Rides by Rider Type
count_2019 <- divvy_2019 %>%
count(member_casual)
count_2020 <- divvy_2020 %>%
count(member_casual)
count_2019
## # A tibble: 2 × 2
## member_casual n
## <chr> <int>
## 1 casual 23163
## 2 member 341906
count_2020
## # A tibble: 2 × 2
## member_casual n
## <chr> <int>
## 1 casual 48270
## 2 member 378407
Average Ride Duration by Rider Type
duration_2019 <- divvy_2019 %>%
group_by(member_casual) %>%
summarise(
avg_duration = mean(ride_length_min),
min_duration = min(ride_length_min),
max_duration = max(ride_length_min)
)
duration_2020 <- divvy_2020 %>%
group_by(member_casual) %>%
summarise(
avg_duration = mean(ride_length_min),
min_duration = min(ride_length_min),
max_duration = max(ride_length_min)
)
duration_2019
## # A tibble: 2 × 4
## member_casual avg_duration min_duration max_duration
## <chr> <dbl> <dbl> <dbl>
## 1 casual 61.9 1.02 177200.
## 2 member 13.9 1.02 101607.
duration_2020
## # A tibble: 2 × 4
## member_casual avg_duration min_duration max_duration
## <chr> <dbl> <dbl> <dbl>
## 1 casual 96.2 0.0167 156450.
## 2 member 12.7 0.0167 93794.
Trips by Day of Week This step counts the number of rides for each rider type (member vs. casual) by day of the week for 2019 and 2020. It helps identify weekly riding patterns and differences between rider groups.
day_2019 <- divvy_2019 %>%
count(member_casual, day_of_week_label)
day_2020 <- divvy_2020 %>%
count(member_casual, day_of_week_label)
day_2019
## # A tibble: 14 × 3
## member_casual day_of_week_label n
## <chr> <chr> <int>
## 1 casual Friday 3375
## 2 casual Monday 1892
## 3 casual Saturday 5993
## 4 casual Sunday 3766
## 5 casual Thursday 2920
## 6 casual Tuesday 2728
## 7 casual Wednesday 2489
## 8 member Friday 59672
## 9 member Monday 48507
## 10 member Saturday 29309
## 11 member Sunday 24233
## 12 member Thursday 63983
## 13 member Tuesday 58277
## 14 member Wednesday 57925
day_2020
## # A tibble: 14 × 3
## member_casual day_of_week_label n
## <chr> <chr> <int>
## 1 casual Friday 5133
## 2 casual Monday 4802
## 3 casual Saturday 7480
## 4 casual Sunday 14886
## 5 casual Thursday 4851
## 6 casual Tuesday 5244
## 7 casual Wednesday 5874
## 8 member Friday 55496
## 9 member Monday 61923
## 10 member Saturday 30104
## 11 member Sunday 35964
## 12 member Thursday 61245
## 13 member Tuesday 69697
## 14 member Wednesday 63978
This bar chart visualizes the number of rides in Q1 2019 by day of the week, comparing member and casual riders to highlight differences in weekly usage patterns.
ggplot(day_2019, aes(day_of_week_label, n, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title = "2019 Q1: Trips by Day of Week", x = "Day", y = "Rides") +
theme_minimal()
This bar chart shows the number of rides in Q1 2020 by day of the week, comparing member and casual riders to identify changes in weekly riding patterns.
ggplot(day_2020, aes(day_of_week_label, n, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title = "2020 Q1: Trips by Day of Week", x = "Day", y = "Rides") +
theme_minimal()
Trips by Hour of Day
hour_2019 <- divvy_2019 %>%
count(member_casual, hour_of_day)
hour_2020 <- divvy_2020 %>%
count(member_casual, hour_of_day)
hour_2019
## # A tibble: 48 × 3
## member_casual hour_of_day n
## <chr> <int> <int>
## 1 casual 0 153
## 2 casual 1 107
## 3 casual 2 111
## 4 casual 3 44
## 5 casual 4 37
## 6 casual 5 60
## 7 casual 6 141
## 8 casual 7 288
## 9 casual 8 664
## 10 casual 9 758
## # ℹ 38 more rows
hour_2020
## # A tibble: 48 × 3
## member_casual hour_of_day n
## <chr> <int> <int>
## 1 casual 0 312
## 2 casual 1 224
## 3 casual 2 137
## 4 casual 3 101
## 5 casual 4 58
## 6 casual 5 136
## 7 casual 6 341
## 8 casual 7 757
## 9 casual 8 1338
## 10 casual 9 1829
## # ℹ 38 more rows
This line chart displays the number of trips in Q1 2019 by hour of the day, comparing riding patterns between member and casual riders to identify peak usage times.
ggplot(hour_2019, aes(hour_of_day, n, color = member_casual)) +
geom_line() +
labs(title = "2019 Q1: Trips by Hour", x = "Hour", y = "Rides") +
theme_minimal()
This line chart shows the number of trips in Q1 2020 by hour of the day, comparing hourly riding patterns between member and casual riders.
ggplot(hour_2020, aes(hour_of_day, n, color = member_casual)) +
geom_line() +
labs(title = "2020 Q1: Trips by Hour", x = "Hour", y = "Rides") +
theme_minimal()
###Export Summary Files This step exports the summarized results for 2019 and 2020 as CSV files. Each file contains aggregated metrics (trip counts, duration summaries, and trips by day and hour) that can be used for reporting, visualization, or sharing with stakeholders.
write_csv(count_2019, "2019_trip_counts.csv")
write_csv(duration_2019, "2019_duration_summary.csv")
write_csv(day_2019, "2019_trips_by_day.csv")
write_csv(hour_2019, "2019_trips_by_hour.csv")
write_csv(count_2020, "2020_trip_counts.csv")
write_csv(duration_2020, "2020_duration_summary.csv")
write_csv(day_2020, "2020_trips_by_day.csv")
write_csv(hour_2020, "2020_trips_by_hour.csv")