A bike-share company in Chicago. The director of marketing believes the company’s future success depends on maximizing the number of annual memberships. Therefore, your team wants to understand how casual riders and annual members use Cyclistic bikes differently. From these insights, your team will design a new marketing strategy to convert casual riders into annual members. But first, Cyclistic executives must approve your recommendations, so they must be backed up with compelling data insights and professional data visualizations.Characters and tea
#install.packages("tidyverse") #for data import and wrangling
#install.packages("lubridate") #for date function
#install.packages("scales") #for limit and ranges
#install.packages("ggplot2") #for visualisation
library(tidyverse) #for data import and wrangling
library(lubridate) #for date function
library(scales) #for limit and ranges
library(ggplot2) #for visualisation
q2_2019=read_csv('Data/Divvy_Trips_2019_Q2.csv')
q3_2019=read_csv('Data/Divvy_Trips_2019_Q3.csv')
q4_2019=read_csv('Data/Divvy_Trips_2019_Q4.csv')
q1_2020=read_csv('Data/Divvy_Trips_2020_Q1.csv')
colnames(q2_2019)
## [1] "01 - Rental Details Rental ID"
## [2] "01 - Rental Details Local Start Time"
## [3] "01 - Rental Details Local End Time"
## [4] "01 - Rental Details Bike ID"
## [5] "01 - Rental Details Duration In Seconds Uncapped"
## [6] "03 - Rental Start Station ID"
## [7] "03 - Rental Start Station Name"
## [8] "02 - Rental End Station ID"
## [9] "02 - Rental End Station Name"
## [10] "User Type"
## [11] "Member Gender"
## [12] "05 - Member Details Member Birthday Year"
colnames(q3_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q4_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
#It displays number of columns in each dataframe
ncol(q2_2019)
## [1] 12
ncol(q3_2019)
## [1] 12
ncol(q4_2019)
## [1] 12
ncol(q1_2020)
## [1] 13
q2_2019=rename(q2_2019,
ride_id = "01 - Rental Details Rental ID",
rideable_type = "01 - Rental Details Bike ID",
started_at = "01 - Rental Details Local Start Time",
ended_at = "01 - Rental Details Local End Time",
start_station_name = "03 - Rental Start Station Name",
start_station_id = "03 - Rental Start Station ID",
end_station_name = "02 - Rental End Station Name",
end_station_id = "02 - Rental End Station ID",
member_casual = "User Type"
)
q3_2019=rename(q3_2019,
ride_id = "trip_id",
rideable_type = "bikeid",
started_at = "start_time",
ended_at = "end_time",
start_station_name = "from_station_name",
start_station_id = "from_station_id",
end_station_name = "to_station_name",
end_station_id = "to_station_id",
member_casual = "usertype"
)
q4_2019=rename(q4_2019,
ride_id = "trip_id",
rideable_type = "bikeid",
started_at = "start_time",
ended_at = "end_time",
start_station_name = "from_station_name",
start_station_id = "from_station_id",
end_station_name = "to_station_name",
end_station_id = "to_station_id",
member_casual = "usertype"
)
sapply(q2_2019,class)
## $ride_id
## [1] "numeric"
##
## $started_at
## [1] "POSIXct" "POSIXt"
##
## $ended_at
## [1] "POSIXct" "POSIXt"
##
## $rideable_type
## [1] "numeric"
##
## $`01 - Rental Details Duration In Seconds Uncapped`
## [1] "numeric"
##
## $start_station_id
## [1] "numeric"
##
## $start_station_name
## [1] "character"
##
## $end_station_id
## [1] "numeric"
##
## $end_station_name
## [1] "character"
##
## $member_casual
## [1] "character"
##
## $`Member Gender`
## [1] "character"
##
## $`05 - Member Details Member Birthday Year`
## [1] "numeric"
sapply(q3_2019,class)
## $ride_id
## [1] "numeric"
##
## $started_at
## [1] "POSIXct" "POSIXt"
##
## $ended_at
## [1] "POSIXct" "POSIXt"
##
## $rideable_type
## [1] "numeric"
##
## $tripduration
## [1] "numeric"
##
## $start_station_id
## [1] "numeric"
##
## $start_station_name
## [1] "character"
##
## $end_station_id
## [1] "numeric"
##
## $end_station_name
## [1] "character"
##
## $member_casual
## [1] "character"
##
## $gender
## [1] "character"
##
## $birthyear
## [1] "numeric"
sapply(q4_2019,class)
## $ride_id
## [1] "numeric"
##
## $started_at
## [1] "POSIXct" "POSIXt"
##
## $ended_at
## [1] "POSIXct" "POSIXt"
##
## $rideable_type
## [1] "numeric"
##
## $tripduration
## [1] "numeric"
##
## $start_station_id
## [1] "numeric"
##
## $start_station_name
## [1] "character"
##
## $end_station_id
## [1] "numeric"
##
## $end_station_name
## [1] "character"
##
## $member_casual
## [1] "character"
##
## $gender
## [1] "character"
##
## $birthyear
## [1] "numeric"
sapply(q1_2020,class)
## $ride_id
## [1] "character"
##
## $rideable_type
## [1] "character"
##
## $started_at
## [1] "POSIXct" "POSIXt"
##
## $ended_at
## [1] "POSIXct" "POSIXt"
##
## $start_station_name
## [1] "character"
##
## $start_station_id
## [1] "numeric"
##
## $end_station_name
## [1] "character"
##
## $end_station_id
## [1] "numeric"
##
## $start_lat
## [1] "numeric"
##
## $start_lng
## [1] "numeric"
##
## $end_lat
## [1] "numeric"
##
## $end_lng
## [1] "numeric"
##
## $member_casual
## [1] "character"
#dataframe q2_2019, q3_2019, q4_2019 contains ride as numeric datatype
#to make it consistent with q1_2020 we have to mutate it as character.
q2_2019=mutate(q2_2019,
ride_id=as.character(ride_id),
rideable_type=as.character(rideable_type))
q3_2019=mutate(q3_2019,
ride_id=as.character(ride_id),
rideable_type=as.character(rideable_type))
q4_2019=mutate(q4_2019,
ride_id=as.character(ride_id),
rideable_type=as.character(rideable_type))
alltrip = bind_rows(q2_2019,q3_2019,q4_2019,q1_2020)
colnames(alltrip)
## [1] "ride_id"
## [2] "started_at"
## [3] "ended_at"
## [4] "rideable_type"
## [5] "01 - Rental Details Duration In Seconds Uncapped"
## [6] "start_station_id"
## [7] "start_station_name"
## [8] "end_station_id"
## [9] "end_station_name"
## [10] "member_casual"
## [11] "Member Gender"
## [12] "05 - Member Details Member Birthday Year"
## [13] "tripduration"
## [14] "gender"
## [15] "birthyear"
## [16] "start_lat"
## [17] "start_lng"
## [18] "end_lat"
## [19] "end_lng"
alltrip = alltrip %>%
select(-c("01 - Rental Details Duration In Seconds Uncapped", "Member Gender",
"05 - Member Details Member Birthday Year",
"tripduration", "gender", "birthyear",
"start_lat", "start_lng", "end_lat", "end_lng"))
colnames(alltrip)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
In the “member_casual” column, there are two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). We will need to consolidate that from four to two labels.
table(alltrip$member_casual)
##
## casual Customer member Subscriber
## 48480 857474 378407 2595461
alltrip = alltrip %>%
mutate(member_casual = recode(member_casual,
"Subscriber" = "member",
"Customer" = "casual"))
table(alltrip$member_casual)
##
## casual member
## 905954 2973868
alltrip$ride_length=difftime(
as.POSIXct(alltrip$ended_at, format="%Y-%m-%d %H:%M:%S"),
as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"),
unit="secs"
)
The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data – such as day, month, year – that provide additional opportunities to aggregate the data.
alltrip$date = format(as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"), "%d-%m-%Y")
alltrip$month = format(as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"), "%m")
alltrip$day = format(as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"), "%d")
alltrip$year = format(as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"), "%Y")
#alltrip$day_of_week = format(as.POSIXct(alltrip$started_at, format="%Y-%m-%d %H:%M:%S"), "%A", week_start = 1)
alltrip$day_of_week = wday(dmy(alltrip$date), label=TRUE, abbr=FALSE, week_start=1)
alltrip$ride_length <- as.numeric(as.character(alltrip$ride_length))
str(alltrip$ride_length,class)
## num [1:3879822] 446 1048 252 357 1007 ...
The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative
alltrip_v2=alltrip[!(alltrip$start_station_name=="HQ QR" | alltrip$ride_length<0),]
#storing filtered data of alltrip and filteration must be done [,] before comma
summary(alltrip_v2$ride_length) #min, Q1, mean, median, Q3, max
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 412 712 1479 1289 9387024
#average ride duration by member type
aggregate(alltrip_v2$ride_length ~alltrip_v2$member_casual, FUN=mean)
## alltrip_v2$member_casual alltrip_v2$ride_length
## 1 casual 3552.7502
## 2 member 850.0662
aggregate(alltrip_v2$ride_length ~alltrip_v2$member_casual, FUN=median)
## alltrip_v2$member_casual alltrip_v2$ride_length
## 1 casual 1546
## 2 member 589
aggregate(alltrip_v2$ride_length ~alltrip_v2$member_casual, FUN=max)
## alltrip_v2$member_casual alltrip_v2$ride_length
## 1 casual 9387024
## 2 member 9056634
aggregate(alltrip_v2$ride_length ~alltrip_v2$member_casual, FUN=min)
## alltrip_v2$member_casual alltrip_v2$ride_length
## 1 casual 2
## 2 member 1
aggregate(alltrip_v2$ride_length
~alltrip_v2$member_casual + alltrip_v2$day_of_week, FUN=mean)
## alltrip_v2$member_casual alltrip_v2$day_of_week alltrip_v2$ride_length
## 1 casual Monday 3372.2869
## 2 member Monday 842.5726
## 3 casual Tuesday 3596.3599
## 4 member Tuesday 826.1427
## 5 casual Wednesday 3718.6619
## 6 member Wednesday 823.9996
## 7 casual Thursday 3682.9847
## 8 member Thursday 823.9278
## 9 casual Friday 3773.8351
## 10 member Friday 824.5305
## 11 casual Saturday 3331.9138
## 12 member Saturday 968.9337
## 13 casual Sunday 3581.4054
## 14 member Sunday 919.9746
#Calculating the no. rides and the average duration and group by member type and weekdays
avg_trip=alltrip_v2 %>%
group_by(member_casual,day_of_week) %>% #it groups weekdays by member type
summarise(number_of_rides = n(), #calculates the no. of rides
average_duration = mean(ride_length)) %>% #calulates the average duration.
arrange(member_casual, day_of_week) #it sorts
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.