Cyclistic Case Study

Installing the core package

# Install tidyverse only when it is missing, so that re-running or knitting
# this document does not reinstall the package (and hit the network) each time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

Loading the libraries

Notes: Setting up my R environment by loading the ‘tidyverse’, ‘dplyr’ and ‘ggplot2’ packages.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(ggplot2)

Loading the datasets for analysis

# Read the 2019 Q1 trips. ride_id is an identifier, not a quantity, so it is
# parsed as text: readr guesses it as a double for this file but as character
# for the 2020 Q1 file, and merging the two would otherwise depend on implicit
# number-to-string coercion. All other columns are still guessed by readr.
df_1 <- read_csv(
  "Divvy_Trips_2019_Q1.csv",
  col_types = cols(ride_id = col_character())
)

## Rows: 365069 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): start_station_name, end_station_name, member_casual
## dbl  (4): ride_id, start_station_id, end_station_id, day_of_week
## time (3): started_at, ended_at, ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Read the 2020 Q1 trips; column types are guessed by readr.
df_2 <- readr::read_csv(file = "Divvy_Trips_2020_Q1.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 426887 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): ride_id, start_station_name, end_station_name, member_casual
## dbl  (3): start_station_id, end_station_id, day_of_week
## time (3): started_at, ended_at, ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Merging the rows of the datasets

# Stack the two quarters into one data set.
# ride_id may have been read with a different type in each file (numeric in
# one, character in the other), so it is converted to character explicitly in
# both before binding instead of relying on rbind()'s silent coercion.
# bind_rows() matches columns by name and errors on any remaining type clash.
new_df <- bind_rows(
  mutate(df_1, ride_id = as.character(ride_id)),
  mutate(df_2, ride_id = as.character(ride_id))
)

Summarizing the new dataset

To get a sense of the structure and contents of the merged dataset, I inspect it with summary(), colnames(), str() and head().

# Per-column summary statistics of the merged data.
new_df %>% summary()

##    ride_id           started_at         ended_at        start_station_name
##  Length:791956      Length:791956     Length:791956     Length:791956     
##  Class :character   Class1:hms        Class1:hms        Class :character  
##  Mode  :character   Class2:difftime   Class2:difftime   Mode  :character  
##                     Mode  :numeric    Mode  :numeric                      
##                                                                           
##                                                                           
##                                                                           
##  start_station_id end_station_name   end_station_id  member_casual     
##  Min.   :  2.0    Length:791956      Min.   :  2.0   Length:791956     
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   Class :character  
##  Median :174.0    Mode  :character   Median :174.0   Mode  :character  
##  Mean   :204.4                       Mean   :204.4                     
##  3rd Qu.:291.0                       3rd Qu.:291.0                     
##  Max.   :675.0                       Max.   :675.0                     
##                                      NA's   :1                         
##  ride_length        day_of_week   
##  Length:791956     Min.   :1.000  
##  Class1:hms        1st Qu.:2.000  
##  Class2:difftime   Median :4.000  
##  Mode  :numeric    Mean   :3.923  
##                    3rd Qu.:6.000  
##                    Max.   :7.000  
##

# Column names of the merged data.
names(new_df)

##  [1] "ride_id"            "started_at"         "ended_at"          
##  [4] "start_station_name" "start_station_id"   "end_station_name"  
##  [7] "end_station_id"     "member_casual"      "ride_length"       
## [10] "day_of_week"

# Compact view of each column's type and first values.
utils::str(object = new_df)

## spc_tbl_ [791,956 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:791956] "21742443" "21742444" "21742445" "21742446" ...
##  $ started_at        : 'hms' num [1:791956] 00:04:37 01:04:37 02:04:37 03:04:37 ...
##   ..- attr(*, "units")= chr "secs"
##  $ ended_at          : 'hms' num [1:791956] 00:11:07 01:11:07 02:11:07 03:11:07 ...
##   ..- attr(*, "units")= chr "secs"
##  $ start_station_name: chr [1:791956] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ start_station_id  : num [1:791956] 199 44 15 123 173 98 98 211 150 268 ...
##  $ end_station_name  : chr [1:791956] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ end_station_id    : num [1:791956] 84 624 644 176 35 49 49 142 148 141 ...
##  $ member_casual     : chr [1:791956] "member" "member" "member" "member" ...
##  $ ride_length       : 'hms' num [1:791956] 00:06:30 00:06:30 00:06:30 00:06:30 ...
##   ..- attr(*, "units")= chr "secs"
##  $ day_of_week       : num [1:791956] 3 3 3 3 3 3 3 3 3 3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_double(),
##   ..   started_at = col_time(format = ""),
##   ..   ended_at = col_time(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   member_casual = col_character(),
##   ..   ride_length = col_time(format = ""),
##   ..   day_of_week = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# First six rows of the merged data.
head(new_df, n = 6L)

## # A tibble: 6 × 10
##   ride_id  started_at ended_at start_station_name               start_station_id
##   <chr>    <time>     <time>   <chr>                                       <dbl>
## 1 21742443 00:04:37   00:11:07 Wabash Ave & Grand Ave                        199
## 2 21742444 01:04:37   01:11:07 State St & Randolph St                         44
## 3 21742445 02:04:37   02:11:07 Racine Ave & 18th St                           15
## 4 21742446 03:04:37   03:11:07 California Ave & Milwaukee Ave                123
## 5 21742447 04:04:37   04:11:07 Mies van der Rohe Way & Chicago…              173
## 6 21742448 05:04:37   05:11:07 LaSalle St & Washington St                     98
## # ℹ 5 more variables: end_station_name <chr>, end_station_id <dbl>,
## #   member_casual <chr>, ride_length <time>, day_of_week <dbl>

Analyzing the new dataset

Here, I am calculating the mean and maximum ride length of the merged dataset, excluding rides with a zero or negative duration.

# Overall mean and maximum ride length for the merged data.
# Rows whose ride_length is zero, negative or missing are dropped first
# (filter() keeps only rows where the condition is TRUE). The column is then
# referenced directly through dplyr's data masking rather than `new_df$...`,
# which would ignore any earlier pipeline step or grouping.
new_df_summary <-
  new_df %>%
  filter(ride_length > 0) %>%
  summarise(
    mean_ride_len = mean(ride_length, na.rm = TRUE),
    max_ride_len = max(ride_length, na.rm = TRUE)
  )

head(new_df_summary)

## # A tibble: 1 × 2
##   mean_ride_len max_ride_len
##   <drtn>        <drtn>      
## 1 687.6315 secs 357387 secs

Analyzing the annual member riders

In this part, I am analyzing the annual members of Cyclistic by calculating the average, min, and max ride duration.

# Keep only the rides taken by annual members.
members_ride_len <- filter(new_df, member_casual == "member")

# Average, minimum and maximum ride length for members.
# Rides with a zero, negative or missing ride_length are excluded once, up
# front, and the column is referenced through data masking instead of
# repeating `members_ride_len$...` subsetting in every statistic.
members_summary <-
  members_ride_len %>%
  filter(ride_length > 0) %>%
  summarise(
    avg_ride_length = mean(ride_length, na.rm = TRUE),
    min_ride_length = min(ride_length, na.rm = TRUE),
    max_ride_length = max(ride_length, na.rm = TRUE)
  )

head(members_summary)

## # A tibble: 1 × 3
##   avg_ride_length min_ride_length max_ride_length
##   <drtn>          <drtn>          <drtn>         
## 1 559.137 secs    1 secs          347120 secs

Analyzing the casual riders

Here, I am analyzing the casual riders of Cyclistic by calculating the average, min and max ride duration.

# Keep only the rides taken by casual riders.
casuals_ride_len <- filter(new_df, member_casual == "casual")

# Average, minimum and maximum ride length for casual riders.
# Rides with a zero, negative or missing ride_length are excluded once, up
# front, and the column is referenced through data masking instead of
# repeating `casuals_ride_len$...` subsetting in every statistic.
casuals_summary <-
  casuals_ride_len %>%
  filter(ride_length > 0) %>%
  summarise(
    avg_ride_length = mean(ride_length, na.rm = TRUE),
    min_ride_length = min(ride_length, na.rm = TRUE),
    max_ride_length = max(ride_length, na.rm = TRUE)
  )

head(casuals_summary)

## # A tibble: 1 × 3
##   avg_ride_length min_ride_length max_ride_length
##   <drtn>          <drtn>          <drtn>         
## 1 1984.923 secs   1 secs          357387 secs

Calculating the ride count by Customer Type

# Number of rides per customer type (one row per member_casual value).
ride_count_df <- count(new_df, member_casual, name = "ride_count")
head(ride_count_df)

## # A tibble: 2 × 2
##   member_casual ride_count
##   <chr>              <int>
## 1 casual             71643
## 2 member            720313

Displaying the ride count by Customer Type

# Bar chart of the number of rides for each customer type.
new_df %>%
  ggplot(aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  labs(
    title = "Ride Count Vs Customer Type",
    x = "Customer Type",
    y = "Ride Count"
  )

Assigning the weekdays to numbers

Here, I am replacing the numeric codes in the day_of_week column with weekday names (1 = Sunday through 7 = Saturday) to make the data easier to read.

# Translate the numeric day codes (1-7) into weekday names, 1 being Sunday.
new_df <- new_df %>%
  mutate(
    day_of_week = recode(
      day_of_week,
      `1` = "Sunday",
      `2` = "Monday",
      `3` = "Tuesday",
      `4` = "Wednesday",
      `5` = "Thursday",
      `6` = "Friday",
      `7` = "Saturday"
    )
  )

Converting the weekdays to an ordered factor (Sunday to Saturday)

# Turn day_of_week into an ordered factor so that tables and plots list the
# days from Sunday to Saturday instead of alphabetically.
new_df$day_of_week <- factor(
  new_df$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)

Displaying ride count by Customer Type for weekdays

# Dodged bar chart: number of rides per weekday, split by customer type.
# count() returns one row per (member_casual, day_of_week) pair, already
# sorted by those two columns.
new_df %>%
  drop_na(member_casual) %>%
  count(member_casual, day_of_week, name = "number_of_rides") %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  labs(
    title = "Ride count by Customer Type for week days",
    x = "Week days",
    y = "Ride Count"
  ) +
  theme(axis.text.x = element_text(angle = 45))

Displaying average trip duration by Customer Type for weekdays

# Dodged bar chart: average trip duration per weekday, split by customer type.
# ride_length is a time/difftime column measured in seconds; it is converted
# to plain numeric seconds before averaging so that ggplot2 gets an ordinary
# continuous y value (no "Don't know how to automatically pick scale for
# object of type <difftime>" message) and the axis label can state the units.
# NOTE(review): unlike the summary tables earlier in this document, rides with
# a zero or negative ride_length are not excluded here -- confirm whether they
# should be.
new_df %>%
  drop_na(member_casual, ride_length) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(
    average_trip_duration = mean(as.numeric(ride_length)),
    .groups = "keep"
  ) %>%
  ggplot(aes(x = day_of_week, y = average_trip_duration, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Average trip duration by Customer Type for week days",
    x = "Week days",
    y = "Average trip duration (seconds)"
  )

## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.