Cyclistic Business Case Analysis

In this scenario I am a junior data analyst at the fictional company Cyclistic and I have been tasked with the overall goal of designing marketing strategies for Cyclistic, a bike sharing company. Cyclistic has provided me with a large dataset of bike rides that have happened over the past 12 months. With this analysis, I will show the different types rides that are recorded by Cyclistic, who rides the bikes, and more descriptive statistics.

This file is intended to only showcase the R code used to clean and analyze the data. For an in-depth analysis, check out my Medium page, referenced in the README file on Git.


Data Processing and Cleaning

Setting Work Directory and Loading Packages

setwd("C:/Users/gashi/Desktop/Google Data Analytics Case Study (Cyclistic)/Raw Data")
library(tidyverse) #transform data
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate) #work with dates
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(hms) #work with hours
## 
## Attaching package: 'hms'
## 
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2) #graphs
library(scales) #scales for graphs
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggchicklet) #geom chicklet
library(gtExtras) #fancy tables
library(gt) #fancy tables
library(webshot) #exporting fancy tables to jpg
#webshot::install_phantomjs()

Importing Data

Since there are several datasets, instead of importing them manually, I will do so with a for loop. However, I will import them as individual files because the names of the columns might differ between files and that would present a problem when trying to import them in one file.

#Read files named xyz1111.csv, xyz2222.csv, etc. in working directory
filenames <- list.files(pattern="*csv")

##Create list of data frame names without the ".csv" part 
names <-substr(filenames,1,21)

###Load all files
for(i in names){
    filepath <- file.path("C:/Users/gashi/Desktop/Google Data Analytics Case Study (Cyclistic)/Raw Data",paste(i,".csv",sep=""))
    assign(i, read.csv(filepath, sep = ",")) #the data is read in via read.csv and to change the column types, add col_type for each variable
}

Checking the column names to see if they match and allow appending the data

colnames(`202108-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202109-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202110-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202111-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202112-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202201-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202202-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202203-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202204-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202205-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202206-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
colnames(`202207-divvy-tripdata`)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Appending the Data

# Appending the data
cyclist <- rbind(`202108-divvy-tripdata`, `202109-divvy-tripdata`, `202110-divvy-tripdata`, `202111-divvy-tripdata`,
                 `202112-divvy-tripdata`, `202201-divvy-tripdata`, `202202-divvy-tripdata`, `202203-divvy-tripdata`,
                 `202204-divvy-tripdata`, `202205-divvy-tripdata`, `202206-divvy-tripdata`, `202207-divvy-tripdata`)

str(cyclist) # seeing the column types
## 'data.frame':    5901463 obs. of  13 variables:
##  $ ride_id           : chr  "99103BB87CC6C1BB" "EAFCCCFB0A3FC5A1" "9EF4F46C57AD234D" "5834D3208BFAF1DA" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2021-08-10 17:15:49" "2021-08-10 17:23:14" "2021-08-21 02:34:23" "2021-08-21 06:52:55" ...
##  $ ended_at          : chr  "2021-08-10 17:22:44" "2021-08-10 17:39:24" "2021-08-21 02:50:36" "2021-08-21 07:08:13" ...
##  $ start_station_name: chr  "" "" "" "" ...
##  $ start_station_id  : chr  "" "" "" "" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.8 41.8 42 42 41.8 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num  41.8 41.8 42 42 41.8 ...
##  $ end_lng           : num  -87.7 -87.6 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

Cleaning and Inspecting the Data

#Removing unnecessary columns
df2 <- cyclist %>% 
    select(-start_lat, -start_lng, -end_lat, -end_lng, -start_station_name, -start_station_id, -end_station_name, -end_station_id)


colnames(df2)
## [1] "ride_id"       "rideable_type" "started_at"    "ended_at"     
## [5] "member_casual"
nrow(df2)
## [1] 5901463
dim(df2)
## [1] 5901463       5
head(df2)
##            ride_id rideable_type          started_at            ended_at
## 1 99103BB87CC6C1BB electric_bike 2021-08-10 17:15:49 2021-08-10 17:22:44
## 2 EAFCCCFB0A3FC5A1 electric_bike 2021-08-10 17:23:14 2021-08-10 17:39:24
## 3 9EF4F46C57AD234D electric_bike 2021-08-21 02:34:23 2021-08-21 02:50:36
## 4 5834D3208BFAF1DA electric_bike 2021-08-21 06:52:55 2021-08-21 07:08:13
## 5 CD825CB87ED1D096 electric_bike 2021-08-19 11:55:29 2021-08-19 12:04:11
## 6 612F12C94A964F3E electric_bike 2021-08-19 12:41:12 2021-08-19 12:47:47
##   member_casual
## 1        member
## 2        member
## 3        member
## 4        member
## 5        member
## 6        member
str(df2)
## 'data.frame':    5901463 obs. of  5 variables:
##  $ ride_id      : chr  "99103BB87CC6C1BB" "EAFCCCFB0A3FC5A1" "9EF4F46C57AD234D" "5834D3208BFAF1DA" ...
##  $ rideable_type: chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at   : chr  "2021-08-10 17:15:49" "2021-08-10 17:23:14" "2021-08-21 02:34:23" "2021-08-21 06:52:55" ...
##  $ ended_at     : chr  "2021-08-10 17:22:44" "2021-08-10 17:39:24" "2021-08-21 02:50:36" "2021-08-21 07:08:13" ...
##  $ member_casual: chr  "member" "member" "member" "member" ...
summary(df2)
##    ride_id          rideable_type       started_at          ended_at        
##  Length:5901463     Length:5901463     Length:5901463     Length:5901463    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  member_casual     
##  Length:5901463    
##  Class :character  
##  Mode  :character
df2 %>% count(member_casual)
##   member_casual       n
## 1        casual 2522226
## 2        member 3379237
df2 %>% count(rideable_type)
##   rideable_type       n
## 1  classic_bike 3055641
## 2   docked_bike  226728
## 3 electric_bike 2619094

Generating Necessary Variables

df3 <- df2 %>% 
    mutate(year = year(started_at),
           month = month(started_at, label = T),
           day = day(started_at),
           day_of_week = wday(started_at, week_start = getOption("lubridate.week.start", 1), label = T), #starting the week from Monday (default Sunday - 7)
           date = date(started_at),
           ride_length = round(difftime(ymd_hms(ended_at), ymd_hms(started_at), units = "mins"), digits = 1))

Final Cleaning of the Data

df3 <- na.omit(df3) #remove rows with NA values
df3 <- distinct(df3) #remove duplicate rows 
df3 <- df3[!(df3$ride_length <=0),] #remove where ride_length is 0 or negative

Descriptive Analysis

Analysis

#Number of rides per type of member
df3 %>% 
    group_by(member_casual) %>% 
    summarize(count=n()) %>% 
    mutate(percent = count/sum(count)*100)
## # A tibble: 2 × 3
##   member_casual   count percent
##   <chr>           <int>   <dbl>
## 1 casual        2518963    42.8
## 2 member        3372596    57.2
#Summary statistics per type of member
df3 %>% 
    group_by(member_casual) %>% 
    summarize(mean = mean(ride_length),
              median = median(ride_length),
              min = min(ride_length),
              max = max(ride_length))
## # A tibble: 2 × 5
##   member_casual mean          median    min      max         
##   <chr>         <drtn>        <drtn>    <drtn>   <drtn>      
## 1 casual        29.24983 mins 14.4 mins 0.1 mins 41629.2 mins
## 2 member        12.95784 mins  9.0 mins 0.1 mins  1559.9 mins
#Average ride time by each day for members vs casual users
df3 %>% 
    group_by(member_casual, day_of_week) %>% 
    summarize(mean = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 3
## # Groups:   member_casual [2]
##    member_casual day_of_week mean         
##    <chr>         <ord>       <drtn>       
##  1 casual        Mon         29.75948 mins
##  2 casual        Tue         25.48510 mins
##  3 casual        Wed         25.03491 mins
##  4 casual        Thu         26.22729 mins
##  5 casual        Fri         27.44100 mins
##  6 casual        Sat         31.87516 mins
##  7 casual        Sun         34.01256 mins
##  8 member        Mon         12.59160 mins
##  9 member        Tue         12.16835 mins
## 10 member        Wed         12.19777 mins
## 11 member        Thu         12.42834 mins
## 12 member        Fri         12.63092 mins
## 13 member        Sat         14.50353 mins
## 14 member        Sun         14.66144 mins
#Average ride time by each day for members vs casual users
df3 %>% 
    group_by(member_casual, day_of_week) %>% 
    summarize(number_of_rides=n(),
              mean_ride_length = mean(ride_length))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual day_of_week number_of_rides mean_ride_length
##    <chr>         <ord>                 <int> <drtn>          
##  1 casual        Mon                  299270 29.75948 mins   
##  2 casual        Tue                  273468 25.48510 mins   
##  3 casual        Wed                  281406 25.03491 mins   
##  4 casual        Thu                  315740 26.22729 mins   
##  5 casual        Fri                  347203 27.44100 mins   
##  6 casual        Sat                  526895 31.87516 mins   
##  7 casual        Sun                  474981 34.01256 mins   
##  8 member        Mon                  471481 12.59160 mins   
##  9 member        Tue                  522429 12.16835 mins   
## 10 member        Wed                  521557 12.19777 mins   
## 11 member        Thu                  521734 12.42834 mins   
## 12 member        Fri                  465749 12.63092 mins   
## 13 member        Sat                  452566 14.50353 mins   
## 14 member        Sun                  417080 14.66144 mins

Visualisations

General Infosheet

#Infosheet Table
library(data.table) #creating data tables
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
atb <- c("Most Rides by Type", "Average Ride Length", "Busiest Month", "Busiest Weekday", "Most Used Bike")
info <- c("Member", "20 Minutes", "July", "Saturday", "Classic" )
infosheet <- data.table(atb, info)


infosheet %>% 
    gt() %>% 
    gt_theme_guardian() %>% 
    tab_style(locations = cells_body( #changes cells
        rows = everything(),
        columns = "info"
    ),
    style = list(cell_text(color = "#7d3ac1",
                           size = "medium",
                           weight = "bold"))
    ) %>% 
    tab_options(column_labels.hidden = TRUE)
Most Rides by Type Member
Average Ride Length 20 Minutes
Busiest Month July
Busiest Weekday Saturday
Most Used Bike Classic

Pie Chart of Total Rides

#Pie chart of Total Rides
df3 %>% 
    group_by(member_casual) %>% 
    summarize(number_of_rides = n()) %>% 
    ggplot(aes(x="", y=number_of_rides, fill=member_casual)) +
    geom_col(alpha=0.7, width=0.3) +
    coord_polar(theta = "y") + #makes the pie chart
    scale_fill_manual(values=c("#ea7369","#7d3ac1"), breaks = c("casual", "member"), labels=c("Casual", "Member")) +
    ggtitle("Total Rides by Subscription") +
    annotate("text", x = 0, y = 0, label = "5.9M Total Rides", colour="black", fontface="bold", size=7) + #adds text in the middle
    theme_minimal() +
    theme(legend.title = element_blank(),
          legend.text = element_text(face="bold"),
          axis.title = element_blank(),
          axis.text = element_blank(),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          plot.margin = margin(1,1,1.5,1.2, "cm"),
          plot.title = element_text(face="bold"))

Total Rides Table

#Total Rides Table
df3 %>% 
    group_by(member_casual) %>% 
    summarize(count=n()) %>% 
    mutate(percent = paste0(round((count/sum(count)*100), 1), "%"),
           count2 = label_comma()(count),
           member_casual = case_when(member_casual == "casual" ~ "Casual",
                                     member_casual == "member" ~ "Member")) %>%
    select(-count) %>%
    relocate(count2, .after=member_casual) %>% 
    rename(Subscriber = member_casual,
           'Total Rides' = count2,
           'Percentage of Total' = percent) %>% 
    gt() %>% 
    gt_theme_guardian() %>% 
    data_color(
        columns = 'Percentage of Total', colors = c("#7d3ac1"))
Subscriber Total Rides Percentage of Total
Casual 2,518,963 42.8%
Member 3,372,596 57.2%

Average Ride Length Table

#Average Ride Length Table
df3 %>% 
    group_by(member_casual) %>% 
    summarize(mean=mean(ride_length)) %>% 
    mutate(
        avg = paste0(                    #paste0 pastes better than just paste
            round(mean, 1)," ","Minutes"
        ),
        member_casual = case_when(member_casual == "casual" ~ "Casual",
                                  member_casual == "member" ~ "Member")
    ) %>%
    select(-mean) %>% 
    rename(Subscriber = member_casual,
           'Avg. Ride Length' = avg) %>% 
    gt() %>% 
    gt_theme_guardian() %>% 
    data_color(
        columns = 'Avg. Ride Length', colors = c("#7d3ac1"))
Subscriber Avg. Ride Length
Casual 29.2 Minutes
Member 13 Minutes

Bar Graph of Total Rides per Weekday

#Bar Chart of total rides by day of week (value labels are shown fully)    
df3 %>% 
    group_by(member_casual, day_of_week) %>% 
    summarize(number_of_rides = n()) %>% 
    ggplot(aes(x=day_of_week, y=number_of_rides, fill=member_casual)) +
    geom_chicklet(position = "stack", alpha=0.7, radius = grid::unit(5, "mm"), width = 0.6) +
    scale_y_continuous(labels = unit_format(unit = "K", scale = 1e-3)) +
    #    scale_fill_discrete(labels=c("Casual", "Member")) +
    scale_fill_manual(values=c("#ea7369","#7d3ac1"), breaks = c("casual", "member"), labels=c("Casual", "Member")) +
    geom_text(aes(label = label_comma()(number_of_rides)), position = position_stack(vjust = 1.05), colour="black", fontface = "bold") + #label values shown fully and with a comma
    ggtitle("Total Rides by Subscription and Day of Week") +
    labs(x = "Day of the week",
         y = "Number of rides",
         fill = "Type of Member") +
    theme_minimal() +
    theme(legend.title = element_blank(),
          legend.text = element_text(face="bold"),
          axis.title = element_blank(),
          axis.text = element_text(face="bold"),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          plot.margin = margin(1,1,1.5,1.2, "cm"),
          plot.title = element_text(face="bold"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Bar Graph of Total Rides per Bike Type

#Bar Chart of total rides by bike type (value labels are shown fully)    
df3 %>% 
    group_by(member_casual, rideable_type) %>% 
    summarize(number_of_rides = n()) %>% 
    ggplot(aes(x=rideable_type, y=number_of_rides, fill=member_casual)) +
    geom_chicklet(position = "stack", alpha=0.7, radius = grid::unit(5, "mm"), width = 0.6) +
    coord_flip() +
    scale_y_continuous(labels = unit_format(unit = "K", scale = 1e-3)) +
    scale_fill_manual(values=c("#ea7369","#7d3ac1"), breaks = c("casual", "member"), labels=c("Casual", "Member")) +
    scale_x_discrete(labels = c("Classic Bike", "Docked Bike", "Electric Bike")) + #labels axis ticks manually 
    geom_text(aes(label = label_comma()(number_of_rides)), position = position_stack(vjust = 0.5), colour="black", fontface="bold") + #label values shown fully
    ggtitle("Total Rides by Subscription and Bike Type") +
    labs(x = "Bike Type",
         y = "Number of rides",
         fill = "Type of Member") +
    theme_minimal() +
    theme(legend.title = element_blank(),
          legend.text = element_text(face="bold"),
          axis.title = element_blank(),
          axis.text = element_text(face="bold"),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          plot.margin = margin(1,1,1.5,1.2, "cm"),
          plot.title = element_text(face="bold"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Bar Graph of Total Rides per Month

#Bar Chart of total rides by month (value labels are shown fully)    
df3 %>% 
    group_by(member_casual, month) %>% 
    summarize(number_of_rides = n()) %>% 
    ggplot(aes(x=month, y=number_of_rides, fill=member_casual)) +
    geom_chicklet(position = "stack", alpha=0.7, radius = grid::unit(3, "mm"), width = 0.6) +
    scale_y_continuous(labels = unit_format(unit = "K", scale = 1e-3)) + #transforms axis label in thousands
    scale_fill_manual(values=c("#ea7369","#7d3ac1"), breaks = c("casual", "member"), labels=c("Casual", "Member")) +
    geom_text(aes(label = label_comma()(number_of_rides)), position = position_stack(vjust = 0.5), colour="black", fontface="bold") + #label values shown fully
    ggtitle("Total Rides by Month") +
    labs(x = "Month",
         y = "Number of rides",
         fill = "Type of Member") +
    theme_minimal() +
    theme(legend.title = element_blank(),
          legend.text = element_text(face="bold"),
          axis.title = element_blank(),
          axis.text = element_text(face="bold"),
          panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(),
          plot.margin = margin(1,1,1.5,1.2, "cm"),
          plot.title = element_text(face="bold"))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.