To investigate the differences between Casual and Annual members in hopes to convince Casual members to invest in an annual membership.
Google has provided data for solving the business task. The data is public and has been made available by Motivate International Inc.The data is in wide format.This data does not contain any user information such as credit card information and any means of personal identification.
ROCCC
Reliable - This data is free of biases and can answer the business task.
Original - This data is source by Google.
Comprehensive - There is sufficient data to tackle the business task.
Current - The data is up to date.
Cited - The data is made available by Motivate International Inc.
Rstudio (R programming language)
I’m using the R programming to perform my analysis. The R programming language makes it easier to work with a large dataset. This dataset has over 3 million entries. It also makes it easy for others to replicate my findings in this project.
Install and import Libraries
library(tidyverse)
library(lubridate)
library(modeest)
Import csv files
jun_2020 <- read_csv("~/Downloads/trip_data/202005-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
jul_2020 <- read_csv("~/Downloads/trip_data/202006-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
aug_2020 <- read_csv("~/Downloads/trip_data/202007-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
sep_2020 <- read_csv("~/Downloads/trip_data/202008-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
oct_2020 <- read_csv("~/Downloads/trip_data/202009-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
nov_2020 <- read_csv("~/Downloads/trip_data/202010-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
dec_2020 <- read_csv("~/Downloads/trip_data/202011-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
jan_2021 <- read_csv("~/Downloads/trip_data/202012-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
feb_2021 <- read_csv("~/Downloads/trip_data/202101-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
mar_2021 <- read_csv("~/Downloads/trip_data/202102-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
apr_2021 <- read_csv("~/Downloads/trip_data/202103-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
may_2021 <- read_csv("~/Downloads/trip_data/202104-divvy-tripdata.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
Format all start_station_id and end_station_id
jun_2020 <- jun_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
jul_2020 <- jul_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
aug_2020 <- aug_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
sep_2020 <- sep_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
oct_2020 <- oct_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
nov_2020 <- nov_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
dec_2020 <- dec_2020 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
jan_2021 <- jan_2021 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
feb_2021 <- feb_2021 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
mar_2021 <- mar_2021 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
apr_2021 <- apr_2021 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
may_2021 <- may_2021 %>%
mutate(start_station_id = as.numeric(as.character(start_station_id)),end_station_id = as.numeric(as.character(end_station_id)))
Combining all csv files into one
all_trip_data <- bind_rows(jun_2020,jul_2020,aug_2020,sep_2020,oct_2020,nov_2020,dec_2020,jan_2021,feb_2021,mar_2021,apr_2021,may_2021)
Taking a look at the combine table
glimpse(all_trip_data)
## Rows: 3,742,202
## Columns: 13
## $ ride_id <chr> "02668AD35674B983", "7A50CCAF1EDDB28F", "2FFCDFDB91…
## $ rideable_type <chr> "docked_bike", "docked_bike", "docked_bike", "docke…
## $ started_at <dttm> 2020-05-27 10:03:52, 2020-05-25 10:47:11, 2020-05-…
## $ ended_at <dttm> 2020-05-27 10:16:49, 2020-05-25 11:05:40, 2020-05-…
## $ start_station_name <chr> "Franklin St & Jackson Blvd", "Clark St & Wrightwoo…
## $ start_station_id <dbl> 36, 340, 260, 251, 261, 206, 261, 180, 331, 219, 24…
## $ end_station_name <chr> "Wabash Ave & Grand Ave", "Clark St & Leland Ave", …
## $ end_station_id <dbl> 199, 326, 260, 157, 206, 22, 261, 180, 300, 305, 14…
## $ start_lat <dbl> 41.8777, 41.9295, 41.9296, 41.9680, 41.8715, 41.847…
## $ start_lng <dbl> -87.6353, -87.6431, -87.7079, -87.6500, -87.6699, -…
## $ end_lat <dbl> 41.8915, 41.9671, 41.9296, 41.9367, 41.8472, 41.869…
## $ end_lng <dbl> -87.6268, -87.6674, -87.7079, -87.6368, -87.6468, -…
## $ member_casual <chr> "member", "casual", "casual", "casual", "member", "…
Calculating the ride duration
all_trip_data <- all_trip_data %>%
mutate(ride_length = difftime(ended_at,started_at,units = "min"))
Creating monthly/yearly,weekday columns
#create date column
all_trip_data$date <- as.Date(all_trip_data$started_at)
#create weekday column
all_trip_data$weekdays <- wday(all_trip_data$date,label = TRUE,abbr = TRUE)
#create month column
all_trip_data$months <- format(as.Date(all_trip_data$date),"%y %m")
Filtering out test data
final_data <- filter(all_trip_data,all_trip_data$start_station_name != "TEST"
|all_trip_data$start_station_name != "Test"
|all_trip_data$start_station_name != "test"
)
final_data <- filter(final_data,final_data$ride_length > 0)
#omit NA rows
final_data <- drop_na(final_data)
Calculating frequent weekdays for riding
final_data %>%
group_by(member_casual) %>%
summarise(frequent_weekday = mfv(weekdays))
## # A tibble: 2 x 2
## member_casual frequent_weekday
## <chr> <fct>
## 1 casual Sat
## 2 member Sat
Calculating average ride duration for each rider_type
final_data %>%
group_by(member_casual) %>%
summarise(average_ride_length = mean(ride_length))
## # A tibble: 2 x 2
## member_casual average_ride_length
## <chr> <drtn>
## 1 casual 45.82089 mins
## 2 member 16.18987 mins
Maximum ride duration for member types
final_data %>%
group_by(member_casual,weekdays) %>%
summarise(num_of_rides = n()) %>%
ggplot(aes(x=weekdays,y=num_of_rides,fill=member_casual)) + geom_col(color = "Black", position = "dodge") +
scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
scale_y_continuous(breaks = c(0,100000,200000,300000,400000),labels = c("0","100k","200k","300k","400k"))+
labs(title = "Number of rides per week in a year", y = "Number of Rides in a year (thousands)",x="Weekdays")
Preferred bike type
final_data %>%
group_by(member_casual,rideable_type) %>%
summarise(num_of_rides = n()) %>%
ggplot(aes(x=rideable_type, y=num_of_rides,fill=member_casual)) +
scale_y_continuous()+
scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
geom_col(color="Black",position = "dodge")+
scale_y_continuous(breaks = c(0,500000,1000000),labels = c("0","500k","1mil"))+
labs(title = "Preferred bike type", x = "Rideable Bike Type", y="Number of Rides (thousands)")
Average ride duration on each day
final_data%>%
group_by(member_casual,weekdays) %>%
summarise(avg_ride_length = mean(ride_length)) %>%
ggplot(aes(x=weekdays, y = avg_ride_length, fill = member_casual)) +
geom_col(color = "Black", position = "dodge")+
scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
labs(title = "Average ride duration per weekday in a year", y ="Average ride duration (minutes)", x = "Weekdays")
Number of rides each month
final_data %>%
group_by(member_casual,months) %>%
arrange(months) %>%
summarise(num_of_rides = n()) %>%
ggplot(aes(x=months,y=num_of_rides, fill=member_casual))+
geom_col(color="Black", position = "dodge")+
scale_y_continuous(breaks = c(0,100000,200000,300000,400000),labels = c("0","100k","200k","300k","400k"))+
scale_x_discrete(labels = c("May-20", "Jun-20", "Jul-20", "Aug-20","Sep-20","Oct-20","Nov-20","Dec-20","Jan-21","Feb-21","Mar-21","Apr-21"))+
theme(axis.text.x = element_text(angle = 45))+
scale_fill_manual(name="Riders",labels = c("Casual","Annual"),values = c("#552583","#FDB927"))+
labs(title = "Number of rides per month in a year ", y = "Number of rides (thousands)", x = "Months" )