ANALYZE PHASE — USING R (WITH CLEANED SQL DATA)

Step 0: Setup

Making sure I have these R packages (bigrquery, dplyr, ggplot2, lubridate, readr) installed:

install.packages(c("bigrquery", "dplyr", "ggplot2", "lubridate", "readr"))

## Installing packages into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)

Load them:

library(bigrquery)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(readr)

Step 1: Import Cleaned Data from BigQuery

Step 2: Read both CSV files into R

divvy_2019 <- read_csv("divvy_2019_q1_clean.csv")

## Rows: 365069 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): started_at, ended_at, start_station_name, end_station_name, member_...
## dbl (6): ride_id, start_station_id, end_station_id, ride_length_seconds, day...
## lgl (1): rideable_type
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

divvy_2020 <- read_csv("divvy_2020_q1_clean.csv")

## Rows: 426677 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, started_at, ended_at, start_station_name, end_station_name...
## dbl (4): start_station_id, end_station_id, ride_length_seconds, day_of_week
## lgl (2): gender, birthyear
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step 3: Check schemas before combining (VERY IMPORTANT)

glimpse(divvy_2019)

## Rows: 365,069
## Columns: 13
## $ ride_id             <dbl> 21960544, 21758497, 22102594, 21928696, 22136306, …
## $ started_at          <chr> "2019-02-24 14:45:32 UTC", "2019-01-04 12:31:59 UT…
## $ ended_at            <chr> "2019-02-24 15:02:54 UTC", "2019-01-04 12:39:50 UT…
## $ start_station_name  <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "Ben…
## $ start_station_id    <dbl> 456, 456, 596, 454, 454, 461, 461, 461, 458, 458, …
## $ end_station_name    <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "211…
## $ end_station_id      <dbl> 456, 456, 456, 456, 456, 456, 456, 456, 456, 456, …
## $ member_casual       <chr> "member", "casual", "member", "member", "member", …
## $ ride_length_seconds <dbl> 1042, 471, 2038, 527, 532, 700, 893, 780, 496, 438…
## $ day_of_week         <dbl> 1, 6, 5, 6, 3, 7, 7, 2, 4, 4, 4, 2, 5, 5, 3, 4, 5,…
## $ gender              <chr> "Male", NA, "Male", "Male", "Male", "Male", "Male"…
## $ birthyear           <dbl> 1990, NA, 1990, 1974, 1990, 1960, 1960, NA, 1974, …
## $ rideable_type       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…

glimpse(divvy_2020)

## Rows: 426,677
## Columns: 13
## $ ride_id             <chr> "F5543D0167308145", "57E5D44302ED1D9A", "94E4474C1…
## $ started_at          <chr> "2020-02-23 17:13:48 UTC", "2020-03-15 15:58:08 UT…
## $ ended_at            <chr> "2020-02-23 17:44:02 UTC", "2020-03-15 16:17:36 UT…
## $ start_station_name  <chr> "Clark St & Grace St", "Western Ave & Leland Ave",…
## $ start_station_id    <dbl> 165, 239, 243, 244, 294, 295, 299, 326, 344, 344, …
## $ end_station_name    <chr> "2112 W Peterson Ave", "2112 W Peterson Ave", "211…
## $ end_station_id      <dbl> 456, 456, 456, 456, 456, 456, 456, 456, 456, 456, …
## $ member_casual       <chr> "member", "member", "member", "member", "member", …
## $ ride_length_seconds <dbl> 1814, 1168, 1010, 1336, 749, 1184, 2273, 1267, 691…
## $ day_of_week         <dbl> 1, 1, 2, 4, 1, 3, 6, 7, 3, 4, 2, 3, 1, 3, 1, 2, 5,…
## $ gender              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ birthyear           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ rideable_type       <chr> "docked_bike", "docked_bike", "docked_bike", "dock…

Step 4: Prepare Data for Analysis

No cleaning — only analysis-ready enhancements

Note:
The same transformations are applied to the 2019 and 2020 datasets to ensure consistency. This step converts the ride start time to a date-time format, calculates ride duration in minutes, and extracts the day of the week and hour of day for time-based analysis.

divvy_2019 <- divvy_2019 %>%
  mutate(
    started_at = as.POSIXct(started_at),
    ride_length_min = ride_length_seconds / 60,
    day_of_week_label = weekdays(started_at),
    hour_of_day = hour(started_at)
  )

divvy_2020 <- divvy_2020 %>%
  mutate(
    started_at = as.POSIXct(started_at),
    ride_length_min = ride_length_seconds / 60,
    day_of_week_label = weekdays(started_at),
    hour_of_day = hour(started_at)
  )

Step 5: Descriptive Analysis

Total Rides by Rider Type

count_2019 <- divvy_2019 %>%
  count(member_casual)

count_2020 <- divvy_2020 %>%
  count(member_casual)

count_2019

## # A tibble: 2 × 2
##   member_casual      n
##   <chr>          <int>
## 1 casual         23163
## 2 member        341906

count_2020

## # A tibble: 2 × 2
##   member_casual      n
##   <chr>          <int>
## 1 casual         48270
## 2 member        378407

Average Ride Duration by Rider Type

duration_2019 <- divvy_2019 %>%
  group_by(member_casual) %>%
  summarise(
    avg_duration = mean(ride_length_min),
    min_duration = min(ride_length_min),
    max_duration = max(ride_length_min)
  )

duration_2020 <- divvy_2020 %>%
  group_by(member_casual) %>%
  summarise(
    avg_duration = mean(ride_length_min),
    min_duration = min(ride_length_min),
    max_duration = max(ride_length_min)
  )

duration_2019

## # A tibble: 2 × 4
##   member_casual avg_duration min_duration max_duration
##   <chr>                <dbl>        <dbl>        <dbl>
## 1 casual                61.9         1.02      177200.
## 2 member                13.9         1.02      101607.

duration_2020

## # A tibble: 2 × 4
##   member_casual avg_duration min_duration max_duration
##   <chr>                <dbl>        <dbl>        <dbl>
## 1 casual                96.2       0.0167      156450.
## 2 member                12.7       0.0167       93794.

Trips by Day of Week This step counts the number of rides for each rider type (member vs. casual) by day of the week for 2019 and 2020. It helps identify weekly riding patterns and differences between rider groups.

day_2019 <- divvy_2019 %>%
  count(member_casual, day_of_week_label)

day_2020 <- divvy_2020 %>%
  count(member_casual, day_of_week_label)

day_2019

## # A tibble: 14 × 3
##    member_casual day_of_week_label     n
##    <chr>         <chr>             <int>
##  1 casual        Friday             3375
##  2 casual        Monday             1892
##  3 casual        Saturday           5993
##  4 casual        Sunday             3766
##  5 casual        Thursday           2920
##  6 casual        Tuesday            2728
##  7 casual        Wednesday          2489
##  8 member        Friday            59672
##  9 member        Monday            48507
## 10 member        Saturday          29309
## 11 member        Sunday            24233
## 12 member        Thursday          63983
## 13 member        Tuesday           58277
## 14 member        Wednesday         57925

day_2020

## # A tibble: 14 × 3
##    member_casual day_of_week_label     n
##    <chr>         <chr>             <int>
##  1 casual        Friday             5133
##  2 casual        Monday             4802
##  3 casual        Saturday           7480
##  4 casual        Sunday            14886
##  5 casual        Thursday           4851
##  6 casual        Tuesday            5244
##  7 casual        Wednesday          5874
##  8 member        Friday            55496
##  9 member        Monday            61923
## 10 member        Saturday          30104
## 11 member        Sunday            35964
## 12 member        Thursday          61245
## 13 member        Tuesday           69697
## 14 member        Wednesday         63978

This bar chart visualizes the number of rides in Q1 2019 by day of the week, comparing member and casual riders to highlight differences in weekly usage patterns.

ggplot(day_2019, aes(day_of_week_label, n, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title = "2019 Q1: Trips by Day of Week", x = "Day", y = "Rides") +
  theme_minimal()

This bar chart shows the number of rides in Q1 2020 by day of the week, comparing member and casual riders to identify changes in weekly riding patterns.

ggplot(day_2020, aes(day_of_week_label, n, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title = "2020 Q1: Trips by Day of Week", x = "Day", y = "Rides") +
  theme_minimal()

Trips by Hour of Day

hour_2019 <- divvy_2019 %>%
  count(member_casual, hour_of_day)

hour_2020 <- divvy_2020 %>%
  count(member_casual, hour_of_day)

hour_2019

## # A tibble: 48 × 3
##    member_casual hour_of_day     n
##    <chr>               <int> <int>
##  1 casual                  0   153
##  2 casual                  1   107
##  3 casual                  2   111
##  4 casual                  3    44
##  5 casual                  4    37
##  6 casual                  5    60
##  7 casual                  6   141
##  8 casual                  7   288
##  9 casual                  8   664
## 10 casual                  9   758
## # ℹ 38 more rows

hour_2020

## # A tibble: 48 × 3
##    member_casual hour_of_day     n
##    <chr>               <int> <int>
##  1 casual                  0   312
##  2 casual                  1   224
##  3 casual                  2   137
##  4 casual                  3   101
##  5 casual                  4    58
##  6 casual                  5   136
##  7 casual                  6   341
##  8 casual                  7   757
##  9 casual                  8  1338
## 10 casual                  9  1829
## # ℹ 38 more rows

This line chart displays the number of trips in Q1 2019 by hour of the day, comparing riding patterns between member and casual riders to identify peak usage times.

ggplot(hour_2019, aes(hour_of_day, n, color = member_casual)) +
  geom_line() +
  labs(title = "2019 Q1: Trips by Hour", x = "Hour", y = "Rides") +
  theme_minimal()

This line chart shows the number of trips in Q1 2020 by hour of the day, comparing hourly riding patterns between member and casual riders.

ggplot(hour_2020, aes(hour_of_day, n, color = member_casual)) +
  geom_line() +
  labs(title = "2020 Q1: Trips by Hour", x = "Hour", y = "Rides") +
  theme_minimal()

###Export Summary Files This step exports the summarized results for 2019 and 2020 as CSV files. Each file contains aggregated metrics (trip counts, duration summaries, and trips by day and hour) that can be used for reporting, visualization, or sharing with stakeholders.

write_csv(count_2019, "2019_trip_counts.csv")
write_csv(duration_2019, "2019_duration_summary.csv")
write_csv(day_2019, "2019_trips_by_day.csv")
write_csv(hour_2019, "2019_trips_by_hour.csv")

write_csv(count_2020, "2020_trip_counts.csv")
write_csv(duration_2020, "2020_duration_summary.csv")
write_csv(day_2020, "2020_trips_by_day.csv")
write_csv(hour_2020, "2020_trips_by_hour.csv")

DA Capstone Project

Samuel Sanni

2026-01-02