Google Capstone Cyclistic Analysis

How Does a Bike-Share Company Navigate Speedy Success?

Cyclistic is a public bike sharing company that provides its customers with access to bikes at accessible locations. It is the fastest and most convenient way to travel without having the responsibility of owning a bike. With its flexible pricing plan, Cyclistic can get you where you need to go.

Customers have the opportunity to purchase single-ride or full day passes. These customers are referred to as casual members while those that purchase the membership are annual members.

Business Task

To investigate the differences between Casual and Annual members in hopes to convince Casual members to invest in an annual membership.

Data Source

Google has provided data for solving the business task. The data is public and has been made available by Motivate International Inc.The data is in wide format.This data does not contain any user information such as credit card information and any means of personal identification.

ROCCC

Reliable - This data is free of biases and can answer the business task.

Original - This data is source by Google.

Comprehensive - There is sufficient data to tackle the business task.

Current - The data is up to date.

Cited - The data is made available by Motivate International Inc.

Tools

Rstudio (R programming language)

I’m using the R programming to perform my analysis. The R programming language makes it easier to work with a large dataset. This dataset has over 3 million entries. It also makes it easy for others to replicate my findings in this project.

Install and import Libraries

library(tidyverse)
library(lubridate)
library(modeest)

Import csv files

jun_2020 <- read_csv("~/Downloads/trip_data/202005-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
jul_2020 <- read_csv("~/Downloads/trip_data/202006-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
aug_2020 <- read_csv("~/Downloads/trip_data/202007-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
sep_2020 <- read_csv("~/Downloads/trip_data/202008-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
oct_2020 <- read_csv("~/Downloads/trip_data/202009-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
nov_2020 <- read_csv("~/Downloads/trip_data/202010-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
dec_2020 <- read_csv("~/Downloads/trip_data/202011-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_double(),
##   end_station_name = col_character(),
##   end_station_id = col_double(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
jan_2021 <- read_csv("~/Downloads/trip_data/202012-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
feb_2021 <- read_csv("~/Downloads/trip_data/202101-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
mar_2021 <- read_csv("~/Downloads/trip_data/202102-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
apr_2021 <- read_csv("~/Downloads/trip_data/202103-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
may_2021 <- read_csv("~/Downloads/trip_data/202104-divvy-tripdata.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )

Format all start_station_id and end_station_id

jun_2020 <- jun_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

jul_2020 <- jul_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

aug_2020 <- aug_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

sep_2020 <- sep_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

oct_2020 <- oct_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

nov_2020 <- nov_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

dec_2020 <- dec_2020 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

jan_2021 <- jan_2021 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

feb_2021 <- feb_2021 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

mar_2021 <- mar_2021 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

apr_2021 <- apr_2021 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

may_2021 <- may_2021 %>% 
  mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))

Combining all csv files into one

all_trip_data <- bind_rows(jun_2020,jul_2020,aug_2020,sep_2020,oct_2020,nov_2020,dec_2020,jan_2021,feb_2021,mar_2021,apr_2021,may_2021)

Taking a look at the combine table

glimpse(all_trip_data)
## Rows: 3,742,202
## Columns: 13
## $ ride_id            <chr> "02668AD35674B983", "7A50CCAF1EDDB28F", "2FFCDFDB91…
## $ rideable_type      <chr> "docked_bike", "docked_bike", "docked_bike", "docke…
## $ started_at         <dttm> 2020-05-27 10:03:52, 2020-05-25 10:47:11, 2020-05-…
## $ ended_at           <dttm> 2020-05-27 10:16:49, 2020-05-25 11:05:40, 2020-05-…
## $ start_station_name <chr> "Franklin St & Jackson Blvd", "Clark St & Wrightwoo…
## $ start_station_id   <dbl> 36, 340, 260, 251, 261, 206, 261, 180, 331, 219, 24…
## $ end_station_name   <chr> "Wabash Ave & Grand Ave", "Clark St & Leland Ave", …
## $ end_station_id     <dbl> 199, 326, 260, 157, 206, 22, 261, 180, 300, 305, 14…
## $ start_lat          <dbl> 41.8777, 41.9295, 41.9296, 41.9680, 41.8715, 41.847…
## $ start_lng          <dbl> -87.6353, -87.6431, -87.7079, -87.6500, -87.6699, -…
## $ end_lat            <dbl> 41.8915, 41.9671, 41.9296, 41.9367, 41.8472, 41.869…
## $ end_lng            <dbl> -87.6268, -87.6674, -87.7079, -87.6368, -87.6468, -…
## $ member_casual      <chr> "member", "casual", "casual", "casual", "member", "…

Calculating the ride duration

all_trip_data <- all_trip_data %>% 
  mutate(ride_length = difftime(ended_at,started_at,units = "min"))

Creating monthly/yearly,weekday columns

#create date column
all_trip_data$date <- as.Date(all_trip_data$started_at)
#create weekday column
all_trip_data$weekdays <- wday(all_trip_data$date,label = TRUE,abbr = TRUE)
#create month column
all_trip_data$months <- format(as.Date(all_trip_data$date),"%y %m")

Filtering out test data

final_data <- filter(all_trip_data,all_trip_data$start_station_name != "TEST"
                      |all_trip_data$start_station_name != "Test"
                      |all_trip_data$start_station_name != "test"
                      )
final_data <- filter(final_data,final_data$ride_length > 0)
#omit NA rows
final_data <- drop_na(final_data)

Analyze

Calculating frequent weekdays for riding

final_data %>%
  group_by(member_casual) %>% 
  summarise(frequent_weekday = mfv(weekdays))
## # A tibble: 2 x 2
##   member_casual frequent_weekday
##   <chr>         <fct>           
## 1 casual        Sat             
## 2 member        Sat

Calculating average ride duration for each rider_type

final_data %>%
  group_by(member_casual) %>% 
  summarise(average_ride_length = mean(ride_length))
## # A tibble: 2 x 2
##   member_casual average_ride_length
##   <chr>         <drtn>             
## 1 casual        45.82089 mins      
## 2 member        16.18987 mins

Visualiztions and Results

Maximum ride duration for member types

final_data %>% 
  group_by(member_casual,weekdays) %>% 
  summarise(num_of_rides = n()) %>% 
  ggplot(aes(x=weekdays,y=num_of_rides,fill=member_casual)) + geom_col(color = "Black", position = "dodge") +
  scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
  scale_y_continuous(breaks = c(0,100000,200000,300000,400000),labels = c("0","100k","200k","300k","400k"))+
  labs(title = "Number of rides per week in a year", y = "Number of Rides in a year (thousands)",x="Weekdays") 

Preferred bike type

final_data %>% 
  group_by(member_casual,rideable_type) %>% 
  summarise(num_of_rides = n()) %>% 
  ggplot(aes(x=rideable_type, y=num_of_rides,fill=member_casual)) +
  scale_y_continuous()+
  scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
  geom_col(color="Black",position = "dodge")+
  scale_y_continuous(breaks = c(0,500000,1000000),labels = c("0","500k","1mil"))+
  labs(title = "Preferred bike type", x = "Rideable Bike Type", y="Number of Rides (thousands)")

Average ride duration on each day

final_data%>% 
  group_by(member_casual,weekdays) %>% 
  summarise(avg_ride_length = mean(ride_length)) %>% 
  ggplot(aes(x=weekdays, y = avg_ride_length, fill = member_casual)) + 
  geom_col(color = "Black", position = "dodge")+
  scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
  labs(title = "Average ride duration per weekday in a year", y ="Average ride duration (minutes)", x = "Weekdays")

Number of rides each month

final_data %>% 
  group_by(member_casual,months) %>%
  arrange(months) %>% 
  summarise(num_of_rides = n()) %>% 
  ggplot(aes(x=months,y=num_of_rides, fill=member_casual))+
  geom_col(color="Black", position = "dodge")+
  scale_y_continuous(breaks = c(0,100000,200000,300000,400000),labels = c("0","100k","200k","300k","400k"))+
  scale_x_discrete(labels = c("May-20", "Jun-20", "Jul-20", "Aug-20","Sep-20","Oct-20","Nov-20","Dec-20","Jan-21","Feb-21","Mar-21","Apr-21"))+
  theme(axis.text.x = element_text(angle = 45))+
  scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
  labs(title = "Number of rides per month in a year ", y = "Number of rides (thousands)", x = "Months" )

Conclusions

Suggestions