This capstone project is to use the knowledge from Google Data Analytics courses to answer business questions coming from bike-share service while using supporting evidence to persuade stakeholders & executive while the decisions have to be made.
The data can be download in the link below: https://divvy-tripdata.s3.amazonaws.com/index.htm
The data was obtained from 2020 October up to 2021 October for the bike sharing service’s website, and the remaining operation and result will be based on the given time frame.
Meanwhile, the original csv files were already download to the working directory. Therefore the operations would be depending on where users download the file to.
This section will load the required library, load in the .csv data into the data frame and identify each table’s format.
library(tidyverse)
library(lubridate)
library(ggplot2)
library(stats)
library(dplyr)
# Load csv file into individual data frame
Oct_2020 <- read_csv("202010-divvy-tripdata.csv")
Nov_2020 <- read_csv("202011-divvy-tripdata.csv")
Dec_2020 <- read_csv("202012-divvy-tripdata.csv")
Jan_2021 <- read_csv("202101-divvy-tripdata.csv")
Feb_2021 <- read_csv("202102-divvy-tripdata.csv")
Mar_2021 <- read_csv("202103-divvy-tripdata.csv")
Apr_2021 <- read_csv("202104-divvy-tripdata.csv")
May_2021 <- read_csv("202105-divvy-tripdata.csv")
Jun_2021 <- read_csv("202106-divvy-tripdata.csv")
Jul_2021 <- read_csv("202107-divvy-tripdata.csv")
Aug_2021 <- read_csv("202108-divvy-tripdata.csv")
Sep_2021 <- read_csv("202109-divvy-tripdata.csv")
Oct_2021 <- read_csv("202110-divvy-tripdata.csv")
# Verify column names at each data frame
colnames(Oct_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Nov_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Dec_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Jan_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Feb_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Mar_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Apr_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(May_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Jun_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Jul_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Aug_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Sep_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(Oct_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
## Verify the Structure of each data frame
str(Oct_2020)
## spec_tbl_df [388,653 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:388653] "ACB6B40CF5B9044C" "DF450C72FD109C01" "B6396B54A15AC0DF" "44A4AEE261B9E854" ...
## $ rideable_type : chr [1:388653] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:388653], format: "2020-10-31 19:39:43" "2020-10-31 23:50:08" ...
## $ ended_at : POSIXct[1:388653], format: "2020-10-31 19:57:12" "2020-11-01 00:04:16" ...
## $ start_station_name: chr [1:388653] "Lakeview Ave & Fullerton Pkwy" "Southport Ave & Waveland Ave" "Stony Island Ave & 67th St" "Clark St & Grace St" ...
## $ start_station_id : num [1:388653] 313 227 102 165 190 359 313 125 NA 174 ...
## $ end_station_name : chr [1:388653] "Rush St & Hubbard St" "Kedzie Ave & Milwaukee Ave" "University Ave & 57th St" "Broadway & Sheridan Rd" ...
## $ end_station_id : num [1:388653] 125 260 423 256 185 53 125 313 199 635 ...
## $ start_lat : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
## $ start_lng : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
## $ end_lng : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr [1:388653] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Nov_2020)
## spec_tbl_df [259,716 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:259716] "BD0A6FF6FFF9B921" "96A7A7A4BDE4F82D" "C61526D06582BDC5" "E533E89C32080B9E" ...
## $ rideable_type : chr [1:259716] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:259716], format: "2020-11-01 13:36:00" "2020-11-01 10:03:26" ...
## $ ended_at : POSIXct[1:259716], format: "2020-11-01 13:45:40" "2020-11-01 10:14:45" ...
## $ start_station_name: chr [1:259716] "Dearborn St & Erie St" "Franklin St & Illinois St" "Lake Shore Dr & Monroe St" "Leavitt St & Chicago Ave" ...
## $ start_station_id : num [1:259716] 110 672 76 659 2 72 76 NA 58 394 ...
## $ end_station_name : chr [1:259716] "St. Clair St & Erie St" "Noble St & Milwaukee Ave" "Federal St & Polk St" "Stave St & Armitage Ave" ...
## $ end_station_id : num [1:259716] 211 29 41 185 2 76 72 NA 288 273 ...
## $ start_lat : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:259716] -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:259716] -87.6 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:259716] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Dec_2020)
## spec_tbl_df [131,573 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:131573] "70B6A9A437D4C30D" "158A465D4E74C54A" "5262016E0F1F2F9A" "BE119628E44F871E" ...
## $ rideable_type : chr [1:131573] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:131573], format: "2020-12-27 12:44:29" "2020-12-18 17:37:15" ...
## $ ended_at : POSIXct[1:131573], format: "2020-12-27 12:55:06" "2020-12-18 17:44:19" ...
## $ start_station_name: chr [1:131573] "Aberdeen St & Jackson Blvd" NA NA NA ...
## $ start_station_id : chr [1:131573] "13157" NA NA NA ...
## $ end_station_name : chr [1:131573] "Desplaines St & Kinzie St" NA NA NA ...
## $ end_station_id : chr [1:131573] "TA1306000003" NA NA NA ...
## $ start_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ start_lng : num [1:131573] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ end_lng : num [1:131573] -87.6 -87.7 -87.7 -87.7 -87.6 ...
## $ member_casual : chr [1:131573] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Jan_2021)
## spec_tbl_df [96,834 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ rideable_type : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
## $ ended_at : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
## $ start_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ start_station_id : chr [1:96834] "17660" "17660" "17660" "17660" ...
## $ end_station_name : chr [1:96834] NA NA NA NA ...
## $ end_station_id : chr [1:96834] NA NA NA NA ...
## $ start_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:96834] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Feb_2021)
## spec_tbl_df [49,622 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:49622] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:49622] "member" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Mar_2021)
## spec_tbl_df [228,496 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:228496] "CFA86D4455AA1030" "30D9DC61227D1AF3" "846D87A15682A284" "994D05AA75A168F2" ...
## $ rideable_type : chr [1:228496] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:228496], format: "2021-03-16 08:32:30" "2021-03-28 01:26:28" ...
## $ ended_at : POSIXct[1:228496], format: "2021-03-16 08:36:34" "2021-03-28 01:36:55" ...
## $ start_station_name: chr [1:228496] "Humboldt Blvd & Armitage Ave" "Humboldt Blvd & Armitage Ave" "Shields Ave & 28th Pl" "Winthrop Ave & Lawrence Ave" ...
## $ start_station_id : chr [1:228496] "15651" "15651" "15443" "TA1308000021" ...
## $ end_station_name : chr [1:228496] "Stave St & Armitage Ave" "Central Park Ave & Bloomingdale Ave" "Halsted St & 35th St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr [1:228496] "13266" "18017" "TA1308000043" "13323" ...
## $ start_lat : num [1:228496] 41.9 41.9 41.8 42 42 ...
## $ start_lng : num [1:228496] -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num [1:228496] 41.9 41.9 41.8 42 42.1 ...
## $ end_lng : num [1:228496] -87.7 -87.7 -87.6 -87.6 -87.7 ...
## $ member_casual : chr [1:228496] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Apr_2021)
## spec_tbl_df [337,230 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:337230] "6C992BD37A98A63F" "1E0145613A209000" "E498E15508A80BAD" "1887262AD101C604" ...
## $ rideable_type : chr [1:337230] "classic_bike" "docked_bike" "docked_bike" "classic_bike" ...
## $ started_at : POSIXct[1:337230], format: "2021-04-12 18:25:36" "2021-04-27 17:27:11" ...
## $ ended_at : POSIXct[1:337230], format: "2021-04-12 18:56:55" "2021-04-27 18:31:29" ...
## $ start_station_name: chr [1:337230] "State St & Pearson St" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Honore St & Division St" ...
## $ start_station_id : chr [1:337230] "TA1307000061" "KA1503000069" "20121" "TA1305000034" ...
## $ end_station_name : chr [1:337230] "Southport Ave & Waveland Ave" "Dorchester Ave & 49th St" "Loomis Blvd & 84th St" "Southport Ave & Waveland Ave" ...
## $ end_station_id : chr [1:337230] "13235" "KA1503000069" "20121" "13235" ...
## $ start_lat : num [1:337230] 41.9 41.8 41.7 41.9 41.7 ...
## $ start_lng : num [1:337230] -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:337230] 41.9 41.8 41.7 41.9 41.7 ...
## $ end_lng : num [1:337230] -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:337230] "member" "casual" "casual" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(May_2021)
## spec_tbl_df [531,633 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:531633] "C809ED75D6160B2A" "DD59FDCE0ACACAF3" "0AB83CB88C43EFC2" "7881AC6D39110C60" ...
## $ rideable_type : chr [1:531633] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:531633], format: "2021-05-30 11:58:15" "2021-05-30 11:29:14" ...
## $ ended_at : POSIXct[1:531633], format: "2021-05-30 12:10:39" "2021-05-30 12:14:09" ...
## $ start_station_name: chr [1:531633] NA NA NA NA ...
## $ start_station_id : chr [1:531633] NA NA NA NA ...
## $ end_station_name : chr [1:531633] NA NA NA NA ...
## $ end_station_id : chr [1:531633] NA NA NA NA ...
## $ start_lat : num [1:531633] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:531633] -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:531633] 41.9 41.8 41.9 41.9 41.9 ...
## $ end_lng : num [1:531633] -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:531633] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Jun_2021)
## spec_tbl_df [729,595 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:729595] "99FEC93BA843FB20" "06048DCFC8520CAF" "9598066F68045DF2" "B03C0FE48C412214" ...
## $ rideable_type : chr [1:729595] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:729595], format: "2021-06-13 14:31:28" "2021-06-04 11:18:02" ...
## $ ended_at : POSIXct[1:729595], format: "2021-06-13 14:34:11" "2021-06-04 11:24:19" ...
## $ start_station_name: chr [1:729595] NA NA NA NA ...
## $ start_station_id : chr [1:729595] NA NA NA NA ...
## $ end_station_name : chr [1:729595] NA NA NA NA ...
## $ end_station_id : chr [1:729595] NA NA NA NA ...
## $ start_lat : num [1:729595] 41.8 41.8 41.8 41.8 41.8 ...
## $ start_lng : num [1:729595] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:729595] 41.8 41.8 41.8 41.8 41.8 ...
## $ end_lng : num [1:729595] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr [1:729595] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Jul_2021)
## spec_tbl_df [822,410 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:822410] "0A1B623926EF4E16" "B2D5583A5A5E76EE" "6F264597DDBF427A" "379B58EAB20E8AA5" ...
## $ rideable_type : chr [1:822410] "docked_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:822410], format: "2021-07-02 14:44:36" "2021-07-07 16:57:42" ...
## $ ended_at : POSIXct[1:822410], format: "2021-07-02 15:19:58" "2021-07-07 17:16:09" ...
## $ start_station_name: chr [1:822410] "Michigan Ave & Washington St" "California Ave & Cortez St" "Wabash Ave & 16th St" "California Ave & Cortez St" ...
## $ start_station_id : chr [1:822410] "13001" "17660" "SL-012" "17660" ...
## $ end_station_name : chr [1:822410] "Halsted St & North Branch St" "Wood St & Hubbard St" "Rush St & Hubbard St" "Carpenter St & Huron St" ...
## $ end_station_id : chr [1:822410] "KA1504000117" "13432" "KA1503000044" "13196" ...
## $ start_lat : num [1:822410] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:822410] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num [1:822410] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:822410] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr [1:822410] "casual" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Aug_2021)
## spec_tbl_df [804,352 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:804352] "99103BB87CC6C1BB" "EAFCCCFB0A3FC5A1" "9EF4F46C57AD234D" "5834D3208BFAF1DA" ...
## $ rideable_type : chr [1:804352] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:804352], format: "2021-08-10 17:15:49" "2021-08-10 17:23:14" ...
## $ ended_at : POSIXct[1:804352], format: "2021-08-10 17:22:44" "2021-08-10 17:39:24" ...
## $ start_station_name: chr [1:804352] NA NA NA NA ...
## $ start_station_id : chr [1:804352] NA NA NA NA ...
## $ end_station_name : chr [1:804352] NA NA NA NA ...
## $ end_station_id : chr [1:804352] NA NA NA NA ...
## $ start_lat : num [1:804352] 41.8 41.8 42 42 41.8 ...
## $ start_lng : num [1:804352] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:804352] 41.8 41.8 42 42 41.8 ...
## $ end_lng : num [1:804352] -87.7 -87.6 -87.7 -87.7 -87.6 ...
## $ member_casual : chr [1:804352] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Sep_2021)
## spec_tbl_df [756,147 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:756147] "9DC7B962304CBFD8" "F930E2C6872D6B32" "6EF72137900BB910" "78D1DE133B3DBF55" ...
## $ rideable_type : chr [1:756147] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:756147], format: "2021-09-28 16:07:10" "2021-09-28 14:24:51" ...
## $ ended_at : POSIXct[1:756147], format: "2021-09-28 16:09:54" "2021-09-28 14:40:05" ...
## $ start_station_name: chr [1:756147] NA NA NA NA ...
## $ start_station_id : chr [1:756147] NA NA NA NA ...
## $ end_station_name : chr [1:756147] NA NA NA NA ...
## $ end_station_id : chr [1:756147] NA NA NA NA ...
## $ start_lat : num [1:756147] 41.9 41.9 41.8 41.8 41.9 ...
## $ start_lng : num [1:756147] -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:756147] 41.9 42 41.8 41.8 41.9 ...
## $ end_lng : num [1:756147] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:756147] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(Oct_2021)
## spec_tbl_df [631,226 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:631226] "620BC6107255BF4C" "4471C70731AB2E45" "26CA69D43D15EE14" "362947F0437E1514" ...
## $ rideable_type : chr [1:631226] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:631226], format: "2021-10-22 12:46:42" "2021-10-21 09:12:37" ...
## $ ended_at : POSIXct[1:631226], format: "2021-10-22 12:49:50" "2021-10-21 09:14:14" ...
## $ start_station_name: chr [1:631226] "Kingsbury St & Kinzie St" NA NA NA ...
## $ start_station_id : chr [1:631226] "KA1503000043" NA NA NA ...
## $ end_station_name : chr [1:631226] NA NA NA NA ...
## $ end_station_id : chr [1:631226] NA NA NA NA ...
## $ start_lat : num [1:631226] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:631226] -87.6 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:631226] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:631226] -87.6 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:631226] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
After the data are loaded, some of the data format were not the same for same columns. Therefore, transfer these variable time to be the same before cleaning NA items to ensure all data points are valid.
This section will also calculate the ride length based on the transformed time frame interval. The conversion will be ensure the ride length format are in seconds.
## Rename the conflicted variables as character variables
Oct_2020 <- mutate(Oct_2020, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Nov_2020 <- mutate(Nov_2020, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Dec_2020 <- mutate(Dec_2020, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Jan_2021 <- mutate(Jan_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Feb_2021 <- mutate(Feb_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Mar_2021 <- mutate(Mar_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Apr_2021 <- mutate(Apr_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
May_2021 <- mutate(May_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Jun_2021 <- mutate(Jun_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Jul_2021 <- mutate(Jul_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Aug_2021 <- mutate(Aug_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Sep_2021 <- mutate(Sep_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
Oct_2021 <- mutate(Oct_2021, ride_id = as.character(ride_id), rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id),end_station_id = as.character(end_station_id))
## Stack all data as one data frame
alltrips <- bind_rows(Oct_2020, Nov_2020, Dec_2020, Jan_2021, Feb_2021, Mar_2021, Apr_2021, May_2021, Jun_2021, Jul_2021, Aug_2021, Sep_2021, Oct_2021)
## Remove unnecessary columns for the data extracts
alltrips <- alltrips %>%
select(-c(start_lat, start_lng, end_lat, end_lng))
## Inspect the dimension of data
colnames(alltrips)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "member_casual"
nrow(alltrips)
## [1] 5767487
dim(alltrips)
## [1] 5767487 9
head(alltrips)
## # A tibble: 6 x 9
## ride_id rideable_type started_at ended_at start_station_n~
## <chr> <chr> <dttm> <dttm> <chr>
## 1 ACB6B4~ electric_bike 2020-10-31 19:39:43 2020-10-31 19:57:12 Lakeview Ave & ~
## 2 DF450C~ electric_bike 2020-10-31 23:50:08 2020-11-01 00:04:16 Southport Ave &~
## 3 B6396B~ electric_bike 2020-10-31 23:00:01 2020-10-31 23:08:22 Stony Island Av~
## 4 44A4AE~ electric_bike 2020-10-31 22:16:43 2020-10-31 22:19:35 Clark St & Grac~
## 5 10B7DD~ electric_bike 2020-10-31 19:38:19 2020-10-31 19:54:32 Southport Ave &~
## 6 DA6C37~ electric_bike 2020-10-29 17:38:04 2020-10-29 17:45:43 Larrabee St & D~
## # ... with 4 more variables: start_station_id <chr>, end_station_name <chr>,
## # end_station_id <chr>, member_casual <chr>
tail(alltrips)
## # A tibble: 6 x 9
## ride_id rideable_type started_at ended_at start_station_n~
## <chr> <chr> <dttm> <dttm> <chr>
## 1 817A85~ classic_bike 2021-10-15 18:01:23 2021-10-15 18:09:41 Franklin St & J~
## 2 BA077F~ classic_bike 2021-10-14 21:45:05 2021-10-14 22:07:25 Franklin St & C~
## 3 B7D992~ classic_bike 2021-10-02 15:28:28 2021-10-02 15:51:02 Streeter Dr & G~
## 4 BCCFD6~ electric_bike 2021-10-08 16:47:10 2021-10-08 16:52:43 Calumet Ave & 2~
## 5 623E0F~ classic_bike 2021-10-08 07:49:47 2021-10-08 07:55:15 Calumet Ave & 2~
## 6 83FA6A~ classic_bike 2021-10-02 12:55:45 2021-10-02 13:21:10 Winthrop Ave & ~
## # ... with 4 more variables: start_station_id <chr>, end_station_name <chr>,
## # end_station_id <chr>, member_casual <chr>
str(alltrips)
## tibble [5,767,487 x 9] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5767487] "ACB6B40CF5B9044C" "DF450C72FD109C01" "B6396B54A15AC0DF" "44A4AEE261B9E854" ...
## $ rideable_type : chr [1:5767487] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:5767487], format: "2020-10-31 19:39:43" "2020-10-31 23:50:08" ...
## $ ended_at : POSIXct[1:5767487], format: "2020-10-31 19:57:12" "2020-11-01 00:04:16" ...
## $ start_station_name: chr [1:5767487] "Lakeview Ave & Fullerton Pkwy" "Southport Ave & Waveland Ave" "Stony Island Ave & 67th St" "Clark St & Grace St" ...
## $ start_station_id : chr [1:5767487] "313" "227" "102" "165" ...
## $ end_station_name : chr [1:5767487] "Rush St & Hubbard St" "Kedzie Ave & Milwaukee Ave" "University Ave & 57th St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr [1:5767487] "125" "260" "423" "256" ...
## $ member_casual : chr [1:5767487] "casual" "casual" "casual" "casual" ...
summary(alltrips)
## ride_id rideable_type started_at
## Length:5767487 Length:5767487 Min. :2020-10-01 00:00:06
## Class :character Class :character 1st Qu.:2021-04-26 19:16:25
## Mode :character Mode :character Median :2021-07-05 14:28:03
## Mean :2021-06-10 12:36:52
## 3rd Qu.:2021-08-29 15:51:40
## Max. :2021-10-31 23:59:49
## ended_at start_station_name start_station_id
## Min. :2020-10-01 00:05:09 Length:5767487 Length:5767487
## 1st Qu.:2021-04-26 19:39:39 Class :character Class :character
## Median :2021-07-05 14:57:35 Mode :character Mode :character
## Mean :2021-06-10 12:57:19
## 3rd Qu.:2021-08-29 16:13:46
## Max. :2021-11-03 21:45:48
## end_station_name end_station_id member_casual
## Length:5767487 Length:5767487 Length:5767487
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Use re-code to rename the membership mode
alltrips <- alltrips %>%
mutate(member_casual = recode(member_casual,"Subscriber" = "member", "Customer" = "casual"))
table(alltrips$member_casual)
##
## casual member
## 2615529 3151958
## Convert POSI Dates as date format
alltrips$date <- as.Date(alltrips$started_at)
alltrips$month <- format(as.Date(alltrips$date), "%m")
alltrips$day <- format(as.Date(alltrips$date), "%d")
alltrips$year <- format(as.Date(alltrips$date), "%Y")
alltrips$day_of_week <- format(as.Date(alltrips$date), "%A")
## Calculate Ride Length
alltrips$ride_length <- difftime(alltrips$ended_at, alltrips$started_at)
## Recheck Updated Structure
str(alltrips)
## tibble [5,767,487 x 15] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5767487] "ACB6B40CF5B9044C" "DF450C72FD109C01" "B6396B54A15AC0DF" "44A4AEE261B9E854" ...
## $ rideable_type : chr [1:5767487] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:5767487], format: "2020-10-31 19:39:43" "2020-10-31 23:50:08" ...
## $ ended_at : POSIXct[1:5767487], format: "2020-10-31 19:57:12" "2020-11-01 00:04:16" ...
## $ start_station_name: chr [1:5767487] "Lakeview Ave & Fullerton Pkwy" "Southport Ave & Waveland Ave" "Stony Island Ave & 67th St" "Clark St & Grace St" ...
## $ start_station_id : chr [1:5767487] "313" "227" "102" "165" ...
## $ end_station_name : chr [1:5767487] "Rush St & Hubbard St" "Kedzie Ave & Milwaukee Ave" "University Ave & 57th St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr [1:5767487] "125" "260" "423" "256" ...
## $ member_casual : chr [1:5767487] "casual" "casual" "casual" "casual" ...
## $ date : Date[1:5767487], format: "2020-10-31" "2020-10-31" ...
## $ month : chr [1:5767487] "10" "10" "10" "10" ...
## $ day : chr [1:5767487] "31" "31" "31" "31" ...
## $ year : chr [1:5767487] "2020" "2020" "2020" "2020" ...
## $ day_of_week : chr [1:5767487] "星期六" "星期六" "星期六" "星期六" ...
## $ ride_length : 'difftime' num [1:5767487] 1049 848 501 172 ...
## ..- attr(*, "units")= chr "secs"
## Convert ride length from factor to numeric for calculation
is.factor(alltrips$ride_length)
## [1] FALSE
alltrips$ride_length <- as.numeric(as.character(alltrips$ride_length))
## Verify the character type is numeric
is.numeric(alltrips$ride_length)
## [1] TRUE
## Remove Bad Data from either incomplete ride length (less than 0) or taken out of docks.
alltrips_v2 <- alltrips[!(alltrips$start_station_name == "HQ QR" | alltrips$ride_length < 0), ]
## Remove the NA values to get a more precise data summary
alltrips_v3 <- na.omit(alltrips_v2)
This section will order the weekdays as required sequence, meanwhile summarizing the data
## `summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
## # A tibble: 14 x 4
## # Groups: member_casual [2]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 casual 週日 422101 2282.
## 2 casual 週一 234683 1960.
## 3 casual 週二 222659 1763.
## 4 casual 週三 227208 1710.
## 5 casual 週四 237675 1685.
## 6 casual 週五 307261 1889.
## 7 casual 週六 501384 2125.
## 8 member 週日 332489 923.
## 9 member 週一 356405 777.
## 10 member 週二 396651 764.
## 11 member 週三 411003 768.
## 12 member 週四 397746 760.
## 13 member 週五 391594 796.
## 14 member 週六 389965 905.
## geom_col: width = NULL, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_dodge
## `summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
Based on the visual display and the data, the following conclusion can be made for the recommendation associated with the observed trends.
After seeing this trend, perhaps the following strategy can be implemented for the casual riders to buy memberships.