This exploratory data analysis (EDA) is based on the Divvy case study “‘Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman (found here: https://artscience.blog/home/divvy-dataviz-case-study) with the adjustment of using 2022 bike data. The purpose of this EDA is to consolidate downloaded Divvy data into a single dataframe and then answer the key question: “In what ways do members and casual riders use Divvy bikes differently?” This case study is completed to satisfy the capstone project requirement for the Google Data Analytics Professional certificate hosted through Coursera.

Load packages

library(tidyverse) 
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
library(rstatix)
## Warning: package 'rstatix' was built under R version 4.2.3

Load data

# Restore the saved workspace objects from cyclistic_data.Rdata; the code
# below expects this to provide the `cyclistic_data` data frame.
load("cyclistic_data.Rdata")

Data Cleaning

Inspect new table

# List the column names of the combined ride table.
colnames(cyclistic_data)  
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"      "ride_length"        "day_of_week"
# Number of rows in the table.
nrow(cyclistic_data) 
## [1] 5667792
# Dimensions of the table (rows, columns).
dim(cyclistic_data)  
## [1] 5667792      15
# Preview the first six rows.
head(cyclistic_data)  
## # A tibble: 6 × 15
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 2D3C93E50745DD18 docked_bike   2021-12-31 22:58:06 2022-01-03 17:32:18
## 2 35897CC59E57BAF4 docked_bike   2021-12-31 10:40:31 2022-01-03 13:40:26
## 3 E5E9F1F5158AD65D docked_bike   2021-12-31 23:43:48 2022-01-02 00:43:49
## 4 E4E9354762A63483 docked_bike   2021-12-31 23:43:37 2022-01-02 00:43:37
## 5 3E8E99C459819F76 docked_bike   2021-12-31 16:06:46 2022-01-01 18:39:37
## 6 EC5375236F014FA4 docked_bike   2021-12-31 16:20:39 2022-01-01 17:20:39
## # ℹ 11 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## #   start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>,
## #   ride_length <dttm>, day_of_week <dbl>
# Show each column's type together with its first few values.
str(cyclistic_data) 
## tibble [5,667,792 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5667792] "2D3C93E50745DD18" "35897CC59E57BAF4" "E5E9F1F5158AD65D" "E4E9354762A63483" ...
##  $ rideable_type     : chr [1:5667792] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ started_at        : POSIXct[1:5667792], format: "2021-12-31 22:58:06" "2021-12-31 10:40:31" ...
##  $ ended_at          : POSIXct[1:5667792], format: "2022-01-03 17:32:18" "2022-01-03 13:40:26" ...
##  $ start_station_name: chr [1:5667792] "LaSalle Dr & Huron St" "Sacramento Blvd & Franklin Blvd" "Fairbanks Ct & Grand Ave" "Fairbanks Ct & Grand Ave" ...
##  $ start_station_id  : chr [1:5667792] "KP1705001026" "KA1504000113" "TA1305000003" "TA1305000003" ...
##  $ end_station_name  : chr [1:5667792] "Kingsbury St & Kinzie St" "Base - 2132 W Hubbard Warehouse" NA NA ...
##  $ end_station_id    : chr [1:5667792] "KA1503000043" "Hubbard Bike-checking (LBS-WH-TEST)" NA NA ...
##  $ start_lat         : num [1:5667792] 41.9 41.9 41.9 41.9 42 ...
##  $ start_lng         : num [1:5667792] -87.6 -87.7 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:5667792] 41.9 41.9 NA NA 42 ...
##  $ end_lng           : num [1:5667792] -87.6 -87.7 NA NA -87.7 ...
##  $ member_casual     : chr [1:5667792] "casual" "casual" "casual" "casual" ...
##  $ ride_length       : POSIXct[1:5667792], format: "1900-01-02 18:34:12" "1900-01-03 02:59:55" ...
##  $ day_of_week       : num [1:5667792] 6 6 6 6 6 6 5 6 6 6 ...
# Per-column summary: quantiles for numeric/date-time columns, length and
# class for character columns, plus NA counts where present.
summary(cyclistic_data)
##    ride_id          rideable_type        started_at                    
##  Length:5667792     Length:5667792     Min.   :2021-12-23 02:11:40.00  
##  Class :character   Class :character   1st Qu.:2022-05-28 19:19:37.75  
##  Mode  :character   Mode  :character   Median :2022-07-22 15:02:37.00  
##                                        Mean   :2022-07-20 07:17:26.59  
##                                        3rd Qu.:2022-09-16 07:20:04.25  
##                                        Max.   :2022-12-31 23:59:26.00  
##                                                                        
##     ended_at                      start_station_name start_station_id  
##  Min.   :2022-01-01 00:00:12.00   Length:5667792     Length:5667792    
##  1st Qu.:2022-05-28 19:41:19.75   Class :character   Class :character  
##  Median :2022-07-22 15:23:20.00   Mode  :character   Mode  :character  
##  Mean   :2022-07-20 07:36:53.59                                        
##  3rd Qu.:2022-09-16 07:37:35.50                                        
##  Max.   :2023-01-02 04:56:45.00                                        
##                                                                        
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:5667792     Length:5667792     Min.   :41.64   Min.   :-87.84  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.65  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :45.64   Max.   :-73.80  
##                                                                        
##     end_lat         end_lng       member_casual     
##  Min.   : 0.00   Min.   :-88.14   Length:5667792    
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character  
##  Median :41.90   Median :-87.64   Mode  :character  
##  Mean   :41.90   Mean   :-87.65                     
##  3rd Qu.:41.93   3rd Qu.:-87.63                     
##  Max.   :42.37   Max.   :  0.00                     
##  NA's   :5862    NA's   :5862                       
##   ride_length                      day_of_week   
##  Min.   :1899-12-30 21:42:35.00   Min.   :1.000  
##  1st Qu.:1899-12-31 00:05:49.00   1st Qu.:2.000  
##  Median :1899-12-31 00:10:17.00   Median :4.000  
##  Mean   :1899-12-31 00:19:10.17   Mean   :4.103  
##  3rd Qu.:1899-12-31 00:18:28.00   3rd Qu.:6.000  
##  Max.   :1900-01-25 04:17:48.00   Max.   :7.000  
##  NA's   :5
The data can only be aggregated at the ride-level. We will want to add some additional columns of data for day, month, and year to provide additional opportunities to aggregate the data. There are some rides where trip duration is negative, which is due to bikes being removed for quality control.
Add columns that list the date, month, day, and year of each ride. Note that the new start_day_of_week column stores the day of the week as a word instead of a number (an ordered factor, unlike the numeric day_of_week column). Replace the ride_length column with a calculation of trip duration in minutes.
# Derive calendar fields from each ride's start time and replace ride_length
# with the trip duration in minutes.
# Columns are referenced by bare name so every expression is evaluated against
# the data flowing through the pipe, not against the global `cyclistic_data`
# object (which would silently misalign if the pipeline were ever grouped,
# filtered, or applied to a different table).
cyclistic_data <- cyclistic_data %>%
  mutate(
    start_date = date(started_at),
    start_month = month(started_at),
    start_day = day(started_at),
    start_year = year(started_at),
    # label = TRUE with abbr = FALSE gives full weekday names
    start_day_of_week = wday(started_at, label = TRUE, abbr = FALSE),
    # start-to-end interval expressed in units of one minute
    ride_length = interval(started_at, ended_at) / minutes(1)
  )

# Keep only rides with a non-negative duration. Rows whose ride_length is NA
# are dropped as well, because filter() discards rows where the condition
# evaluates to NA.
cyclistic_data <- filter(cyclistic_data, ride_length >= 0)

Check Summary Statistics

Check out the summary statistics for ride_length and then do the same by membership type. Casual customers do not have a membership, while member customers do. On average, bikes were used for 19.45 minutes, over a range of 0 minutes to 41,387 minutes. Presumably 0-minute rides and extremely prolonged rides were accidental. No further information is available for these instances; however, they could become a further point of study. Casual customers rode the bikes for an average of 29.16 minutes, while members rode the bikes for an average of 12.71 minutes. Although Saturday has the highest number of rides per day, Sunday has the longest average ride duration.
# Number of rides per membership status, with presentation-ready column names.
count(cyclistic_data, `Membership Status` = member_casual, name = "Total")
## # A tibble: 2 × 2
##   `Membership Status`   Total
##   <chr>                 <int>
## 1 casual              2322035
## 2 member              3345658
# Overall summary statistics for ride_length (minutes).
cyclistic_data %>%
  get_summary_stats(ride_length)
## # A tibble: 1 × 13
##   variable        n   min   max median    q1    q3   iqr   mad  mean    sd    se
##   <fct>       <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ride_leng… 5.67e6     0 41387   10.3  5.87  18.5  12.6  7.98  19.5  176. 0.074
## # ℹ 1 more variable: ci <dbl>
# Summary statistics for ride_length (minutes) per membership status.
# The column is passed to get_summary_stats() instead of select()-ing it
# first: select() on grouped data drops the grouping column and makes dplyr
# re-add it with an "Adding missing grouping variables" message.
cyclistic_data %>%
  group_by(member_casual) %>%
  get_summary_stats(ride_length)
## # A tibble: 2 × 14
##   member_casual variable          n   min    max median    q1    q3   iqr   mad
##   <chr>         <fct>         <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 casual        ride_length 2322035     0 41387   13     7.32  24.1  16.8 10.4 
## 2 member        ride_length 3345658     0  1560.   8.87  5.07  15.2  10.2  6.65
## # ℹ 4 more variables: mean <dbl>, sd <dbl>, se <dbl>, ci <dbl>
# Summary statistics for ride_length (minutes) per weekday of the ride start.
# The column is passed to get_summary_stats() instead of select()-ing it
# first: select() on grouped data drops the grouping column and makes dplyr
# re-add it with an "Adding missing grouping variables" message.
cyclistic_data %>%
  group_by(start_day_of_week) %>%
  get_summary_stats(ride_length)
## # A tibble: 7 × 14
##   start_day_of_week variable       n   min    max median    q1    q3   iqr   mad
##   <ord>             <fct>      <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Sunday            ride_len… 776219     0 36258.  12.0   6.5   22.0  15.6  9.86
## 2 Monday            ride_len… 751007     0 32035.   9.8   5.53  17.8  12.3  7.68
## 3 Tuesday           ride_len… 782349     0 31086.   9.37  5.43  16.4  10.9  6.99
## 4 Wednesday         ride_len… 798221     0 35821.   9.47  5.5   16.4  10.9  7.04
## 5 Thursday          ride_len… 841583     0 31024.   9.68  5.58  16.9  11.3  7.29
## 6 Friday            ride_len… 801855     0 32403.  10.0   5.8   17.9  12.1  7.66
## 7 Saturday          ride_len… 916459     0 41387   12.1   6.73  22    15.3  9.74
## # ℹ 4 more variables: mean <dbl>, sd <dbl>, se <dbl>, ci <dbl>

Accompanying Graphics

Create a data subset for use with the visualizations.
# Ride count and mean ride length (minutes, rounded to 2 decimals) for every
# membership status / weekday combination, with presentation-ready column
# names for the charts.
# `.groups = "drop"` returns an ungrouped tibble, so no leftover grouping
# leaks into later steps and summarise() no longer emits its grouping message.
cyclistic_subset <- cyclistic_data %>%
  group_by(member_casual, start_day_of_week) %>%
  summarize(
    `Number of Rides` = n(),
    `Average Duration` = round(mean(ride_length), 2),
    .groups = "drop"
  ) %>%
  arrange(start_day_of_week, member_casual) %>%
  rename(
    `Membership Status` = member_casual,
    `Day of Week - Ride Start` = start_day_of_week
  )
Consider the following two graphics displaying weekday usage and average ride duration by membership status. In the first graphic, bicycle usage regardless of membership status is near-equal on the weekends. Some sort of perk for weekend usage by members could be devised as an incentive for casual customers to begin a membership. In the second graphic, on all days it appears that casual customers tend to ride Cyclistic bikes longer than members. Some sort of a members-only mileage program can be implemented to encourage casual customers to begin memberships and encourage members to use the bikes more often.
# Dodged bar chart of ride counts per weekday, one bar per membership status.
ggplot(
  cyclistic_subset,
  aes(
    x = `Day of Week - Ride Start`,
    y = `Number of Rides`,
    fill = `Membership Status`
  )
) +
  geom_col(position = "dodge") +
  scale_y_continuous(
    breaks = seq(0, 500000, 50000),
    labels = scales::label_comma()
  ) +
  scale_fill_discrete(labels = c("Casual", "Member")) +
  guides(fill = guide_legend(title = "Membership Status")) +
  labs(title = "Weekday Usage by Membership Status") +
  theme_classic() +
  # centre the plot title
  theme(plot.title = element_text(hjust = 0.5))

  # Dodged bar chart of mean ride duration (minutes) per weekday, one bar per
  # membership status. The legend title and the explicit `x =` mapping match
  # the other membership charts in this report.
  ggplot(cyclistic_subset) +
    geom_col(
      aes(
        x = `Day of Week - Ride Start`,
        y = `Average Duration`,
        fill = `Membership Status`
      ),
      position = "dodge"
    ) +
    labs(
      title = "Average Ride Duration by Membership Status",
      y = "Average Duration (minutes)"
    ) +
    scale_y_continuous(breaks = seq(0, 40, 5)) +
    guides(fill = guide_legend(title = "Membership Status")) +
    scale_fill_discrete(labels = c("Casual", "Member")) +
    theme_classic() +
    # centre the plot title
    theme(plot.title = element_text(hjust = 0.5))

Members did not use docked bicycles at all, while casual customers barely used them. Devise a reward system (reduced rates, free time, etc.) for consistently using a docked bicycle. Casual customers tend to use electric bicycles more than classic bicycles. A promotion for reduced rates with classic bicycle usage may appeal to casual customers and encourage them to start a membership.
# Dodged bar chart of ride counts per bicycle type, split by membership status.
cyclistic_data %>%
  filter(!is.na(rideable_type)) %>%
  group_by(member_casual, rideable_type) %>%
  # the counts go straight into ggplot, so drop the grouping here; this also
  # stops summarise() from emitting its grouping message
  summarise(number_of_rides = n(), .groups = "drop") %>%
  ggplot() +
  geom_col(
    aes(x = rideable_type, y = number_of_rides, fill = member_casual),
    position = "dodge"
  ) +
  labs(
    title = "Bicycle Type by Membership Status",
    x = "Type of Bicycle",
    y = "Total"
  ) +
  # NOTE(review): these labels are matched by position - confirm the
  # rideable_type values sort in the order classic, docked, electric
  scale_x_discrete(labels = c("Classic", "Docked", "Electric")) +
  # comma-formatted axis labels, consistent with the weekday usage chart
  scale_y_continuous(
    breaks = seq(0, 2000000, 250000),
    labels = scales::label_comma()
  ) +
  guides(fill = guide_legend(title = "Membership Status")) +
  scale_fill_discrete(labels = c("Casual", "Member")) +
  theme_classic() +
  # centre the plot title
  theme(plot.title = element_text(hjust = 0.5))