This is a capstone project as part of my Google Data Analytics Professional Certificate course. I will be using R Programming language and Rstudio to perform the analysis and data vizualization.
The 5 phases of data analysis will be followed to complete the case study:
Statement of the business task
*To understand how casual riders use Cyclistic bikes differently to annual members in order to make focused marketing strategies to attract casual users to change to annual membership.
Key Stakeholders
Source of data
Public data set was collected and made available by Motivate International Inc. under this license
Data set is available for download [here] (https://divvy-tripdata.s3.amazonaws.com/index.html)
Data Organization
Data credibility
Load and combine all data sets into a single data frame
#install and load r packages
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Linh/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Linh\AppData\Local\Temp\Rtmp2lUZNw\downloaded_packages
install.packages("hms", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Linh/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'hms' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Linh\AppData\Local\Temp\Rtmp2lUZNw\downloaded_packages
install.packages("kableExtra", dependencies = TRUE, repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Linh/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'kableExtra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Linh\AppData\Local\Temp\Rtmp2lUZNw\downloaded_packages
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(skimr)
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 2 > 1' in coercion to 'logical(1)'
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
#import data to RStudio Desktop
nov_21 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202111-divvy-tripdata.csv")
dec_21 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202112-divvy-tripdata.csv")
jan_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202201-divvy-tripdata.csv")
feb_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202202-divvy-tripdata.csv")
mar_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202203-divvy-tripdata.csv")
apr_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202204-divvy-tripdata.csv")
may_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202205-divvy-tripdata.csv")
jun_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202206-divvy-tripdata.csv")
jul_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202207-divvy-tripdata.csv")
aug_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202208-divvy-tripdata.csv")
sep_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202209-divvy-publictripdata.csv")
oct_22 <- read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\202210-divvy-tripdata.csv")
test_stations <-read.csv("C:\\Users\\Linh\\Desktop\\Data Analytic\\New folder\\R Projects\\test_stations.csv")
#check data structure
str(nov_21)
## 'data.frame': 359978 obs. of 13 variables:
## $ ride_id : chr "7C00A93E10556E47" "90854840DFD508BA" "0A7D10CDD144061C" "2F3BE33085BCFF02" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-11-27 13:27:38" "2021-11-27 13:38:25" "2021-11-26 22:03:34" "2021-11-27 09:56:49" ...
## $ ended_at : chr "2021-11-27 13:46:38" "2021-11-27 13:56:10" "2021-11-26 22:05:56" "2021-11-27 10:01:50" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 42 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ end_lat : num 42 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(dec_21)
## 'data.frame': 247540 obs. of 13 variables:
## $ ride_id : chr "46F8167220E4431F" "73A77762838B32FD" "4CF42452054F59C5" "3278BA87BF698339" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "2021-12-07 15:06:07" "2021-12-11 03:43:29" "2021-12-15 23:10:28" "2021-12-26 16:16:10" ...
## $ ended_at : chr "2021-12-07 15:13:42" "2021-12-11 04:10:23" "2021-12-15 23:23:14" "2021-12-26 16:30:53" ...
## $ start_station_name: chr "Laflin St & Cullerton St" "LaSalle Dr & Huron St" "Halsted St & North Branch St" "Halsted St & North Branch St" ...
## $ start_station_id : chr "13307" "KP1705001026" "KA1504000117" "KA1504000117" ...
## $ end_station_name : chr "Morgan St & Polk St" "Clarendon Ave & Leland Ave" "Broadway & Barry Ave" "LaSalle Dr & Huron St" ...
## $ end_station_id : chr "TA1307000130" "TA1307000119" "13137" "KP1705001026" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num 41.9 42 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
str(jan_22)
## 'data.frame': 103770 obs. of 13 variables:
## $ ride_id : chr "C2F7DD78E82EC875" "A6CF8980A652D272" "BD0F91DFF741C66D" "CBB80ED419105406" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-01-13 11:59:47" "2022-01-10 08:41:56" "2022-01-25 04:53:40" "2022-01-04 00:18:04" ...
## $ ended_at : chr "2022-01-13 12:02:44" "2022-01-10 08:46:17" "2022-01-25 04:58:01" "2022-01-04 00:33:00" ...
## $ start_station_name: chr "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Sheffield Ave & Fullerton Ave" "Clark St & Bryn Mawr Ave" ...
## $ start_station_id : chr "525" "525" "TA1306000016" "KA1504000151" ...
## $ end_station_name : chr "Clark St & Touhy Ave" "Clark St & Touhy Ave" "Greenview Ave & Fullerton Ave" "Paulina St & Montrose Ave" ...
## $ end_station_id : chr "RP-007" "RP-007" "TA1307000001" "TA1309000021" ...
## $ start_lat : num 42 42 41.9 42 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num 42 42 41.9 42 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "member" "casual" ...
str(feb_22)
## 'data.frame': 115609 obs. of 13 variables:
## $ ride_id : chr "E1E065E7ED285C02" "1602DCDC5B30FFE3" "BE7DD2AF4B55C4AF" "A1789BDF844412BE" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-02-19 18:08:41" "2022-02-20 17:41:30" "2022-02-25 18:55:56" "2022-02-14 11:57:03" ...
## $ ended_at : chr "2022-02-19 18:23:56" "2022-02-20 17:45:56" "2022-02-25 19:09:34" "2022-02-14 12:04:00" ...
## $ start_station_name: chr "State St & Randolph St" "Halsted St & Wrightwood Ave" "State St & Randolph St" "Southport Ave & Waveland Ave" ...
## $ start_station_id : chr "TA1305000029" "TA1309000061" "TA1305000029" "13235" ...
## $ end_station_name : chr "Clark St & Lincoln Ave" "Southport Ave & Wrightwood Ave" "Canal St & Adams St" "Broadway & Sheridan Rd" ...
## $ end_station_id : chr "13179" "TA1307000113" "13011" "13323" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 42 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(mar_22)
## 'data.frame': 284042 obs. of 13 variables:
## $ ride_id : chr "47EC0A7F82E65D52" "8494861979B0F477" "EFE527AF80B66109" "9F446FD9DEE3F389" ...
## $ rideable_type : chr "classic_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-03-21 13:45:01" "2022-03-16 09:37:16" "2022-03-23 19:52:02" "2022-03-01 19:12:26" ...
## $ ended_at : chr "2022-03-21 13:51:18" "2022-03-16 09:43:34" "2022-03-23 19:54:48" "2022-03-01 19:22:14" ...
## $ start_station_name: chr "Wabash Ave & Wacker Pl" "Michigan Ave & Oak St" "Broadway & Berwyn Ave" "Wabash Ave & Wacker Pl" ...
## $ start_station_id : chr "TA1307000131" "13042" "13109" "TA1307000131" ...
## $ end_station_name : chr "Kingsbury St & Kinzie St" "Orleans St & Chestnut St (NEXT Apts)" "Broadway & Ridge Ave" "Franklin St & Jackson Blvd" ...
## $ end_station_id : chr "KA1503000043" "620" "15578" "TA1305000025" ...
## $ start_lat : num 41.9 41.9 42 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.7 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.7 -87.6 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(apr_22)
## 'data.frame': 371249 obs. of 13 variables:
## $ ride_id : chr "3564070EEFD12711" "0B820C7FCF22F489" "89EEEE32293F07FF" "84D4751AEB31888D" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-04-06 17:42:48" "2022-04-24 19:23:07" "2022-04-20 19:29:08" "2022-04-22 21:14:06" ...
## $ ended_at : chr "2022-04-06 17:54:36" "2022-04-24 19:43:17" "2022-04-20 19:35:16" "2022-04-22 21:23:29" ...
## $ start_station_name: chr "Paulina St & Howard St" "Wentworth Ave & Cermak Rd" "Halsted St & Polk St" "Wentworth Ave & Cermak Rd" ...
## $ start_station_id : chr "515" "13075" "TA1307000121" "13075" ...
## $ end_station_name : chr "University Library (NU)" "Green St & Madison St" "Green St & Madison St" "Delano Ct & Roosevelt Rd" ...
## $ end_station_id : chr "605" "TA1307000120" "TA1307000120" "KA1706005007" ...
## $ start_lat : num 42 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 42.1 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "casual" ...
str(may_22)
## 'data.frame': 634858 obs. of 13 variables:
## $ ride_id : chr "EC2DE40644C6B0F4" "1C31AD03897EE385" "1542FBEC830415CF" "6FF59852924528F8" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-05-23 23:06:58" "2022-05-11 08:53:28" "2022-05-26 18:36:28" "2022-05-10 07:30:07" ...
## $ ended_at : chr "2022-05-23 23:40:19" "2022-05-11 09:31:22" "2022-05-26 18:58:18" "2022-05-10 07:38:49" ...
## $ start_station_name: chr "Wabash Ave & Grand Ave" "DuSable Lake Shore Dr & Monroe St" "Clinton St & Madison St" "Clinton St & Madison St" ...
## $ start_station_id : chr "TA1307000117" "13300" "TA1305000032" "TA1305000032" ...
## $ end_station_name : chr "Halsted St & Roscoe St" "Field Blvd & South Water St" "Wood St & Milwaukee Ave" "Clark St & Randolph St" ...
## $ end_station_id : chr "TA1309000025" "15534" "13221" "TA1305000030" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.7 -87.6 -87.7 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(jun_22)
## 'data.frame': 769204 obs. of 13 variables:
## $ ride_id : chr "600CFD130D0FD2A4" "F5E6B5C1682C6464" "B6EB6D27BAD771D2" "C9C320375DE1D5C6" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2022-06-30 17:27:53" "2022-06-30 18:39:52" "2022-06-30 11:49:25" "2022-06-30 11:15:25" ...
## $ ended_at : chr "2022-06-30 17:35:15" "2022-06-30 18:47:28" "2022-06-30 12:02:54" "2022-06-30 11:19:43" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.8 41.9 ...
## $ start_lng : num -87.6 -87.6 -87.7 -87.7 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.8 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(jul_22)
## 'data.frame': 823488 obs. of 13 variables:
## $ ride_id : chr "954144C2F67B1932" "292E027607D218B6" "57765852588AD6E0" "B5B6BE44314590E6" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "2022-07-05 08:12:47" "2022-07-26 12:53:38" "2022-07-03 13:58:49" "2022-07-31 17:44:21" ...
## $ ended_at : chr "2022-07-05 08:24:32" "2022-07-26 12:55:31" "2022-07-03 14:06:32" "2022-07-31 18:42:50" ...
## $ start_station_name: chr "Ashland Ave & Blackhawk St" "Buckingham Fountain (Temp)" "Buckingham Fountain (Temp)" "Buckingham Fountain (Temp)" ...
## $ start_station_id : chr "13224" "15541" "15541" "15541" ...
## $ end_station_name : chr "Kingsbury St & Kinzie St" "Michigan Ave & 8th St" "Michigan Ave & 8th St" "Woodlawn Ave & 55th St" ...
## $ end_station_id : chr "KA1503000043" "623" "623" "TA1307000164" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.9 41.9 41.8 41.9 ...
## $ end_lng : num -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ member_casual : chr "member" "casual" "casual" "casual" ...
str(aug_22)
## 'data.frame': 785932 obs. of 13 variables:
## $ ride_id : chr "550CF7EFEAE0C618" "DAD198F405F9C5F5" "E6F2BC47B65CB7FD" "F597830181C2E13C" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2022-08-07 21:34:15" "2022-08-08 14:39:21" "2022-08-08 15:29:50" "2022-08-08 02:43:50" ...
## $ ended_at : chr "2022-08-07 21:41:46" "2022-08-08 14:53:23" "2022-08-08 15:40:34" "2022-08-08 02:58:53" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 41.9 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 42 42 41.8 ...
## $ end_lng : num -87.7 -87.6 -87.7 -87.7 -87.7 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(sep_22)
## 'data.frame': 701339 obs. of 13 variables:
## $ ride_id : chr "5156990AC19CA285" "E12D4A16BF51C274" "A02B53CD7DB72DD7" "C82E05FEE872DF11" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2022-09-01 08:36:22" "2022-09-01 17:11:29" "2022-09-01 17:15:50" "2022-09-01 09:00:28" ...
## $ ended_at : chr "2022-09-01 08:39:05" "2022-09-01 17:14:45" "2022-09-01 17:16:12" "2022-09-01 09:10:32" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "California Ave & Milwaukee Ave" "" "" "" ...
## $ end_station_id : chr "13084" "" "" "" ...
## $ start_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.6 -87.6 -87.7 -87.7 ...
## $ end_lat : num 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.6 -87.6 -87.7 -87.7 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
str(oct_22)
## 'data.frame': 558685 obs. of 13 variables:
## $ ride_id : chr "A50255C1E17942AB" "DB692A70BD2DD4E3" "3C02727AAF60F873" "47E653FDC2D99236" ...
## $ rideable_type : chr "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2022-10-14 17:13:30" "2022-10-01 16:29:26" "2022-10-19 18:55:40" "2022-10-31 07:52:36" ...
## $ ended_at : chr "2022-10-14 17:19:39" "2022-10-01 16:49:06" "2022-10-19 19:03:30" "2022-10-31 07:58:49" ...
## $ start_station_name: chr "Noble St & Milwaukee Ave" "Damen Ave & Charleston St" "Hoyne Ave & Balmoral Ave" "Rush St & Cedar St" ...
## $ start_station_id : chr "13290" "13288" "655" "KA1504000133" ...
## $ end_station_name : chr "Larrabee St & Division St" "Damen Ave & Cullerton St" "Western Ave & Leland Ave" "Orleans St & Chestnut St (NEXT Apts)" ...
## $ end_station_id : chr "KA1504000079" "13089" "TA1307000140" "620" ...
## $ start_lat : num 41.9 41.9 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ end_lat : num 41.9 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.6 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
We will merge all the data sets into a single data frame for analysis
bike_share <- bind_rows(nov_21, dec_21, jan_22, feb_22, mar_22, apr_22, may_22, jun_22, jul_22, aug_22, sep_22, oct_22)
#View new merged data frame
View(bike_share)
Data cleaning
#check merged data frame
colnames(bike_share) #see list of column names
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
head(bike_share) #see the first 6 rows of data
## ride_id rideable_type started_at ended_at
## 1 7C00A93E10556E47 electric_bike 2021-11-27 13:27:38 2021-11-27 13:46:38
## 2 90854840DFD508BA electric_bike 2021-11-27 13:38:25 2021-11-27 13:56:10
## 3 0A7D10CDD144061C electric_bike 2021-11-26 22:03:34 2021-11-26 22:05:56
## 4 2F3BE33085BCFF02 electric_bike 2021-11-27 09:56:49 2021-11-27 10:01:50
## 5 D67B4781A19928D4 electric_bike 2021-11-26 19:09:28 2021-11-26 19:30:41
## 6 02F85C2C3C5F7D46 electric_bike 2021-11-26 18:34:07 2021-11-26 18:52:49
## start_station_name start_station_id end_station_name end_station_id
## 1
## 2
## 3
## 4
## 5
## 6 Michigan Ave & Oak St 13042
## start_lat start_lng end_lat end_lng member_casual
## 1 41.93000 -87.72000 41.96 -87.73 casual
## 2 41.96000 -87.70000 41.92 -87.70 casual
## 3 41.96000 -87.70000 41.96 -87.70 casual
## 4 41.94000 -87.79000 41.93 -87.79 casual
## 5 41.90000 -87.63000 41.88 -87.62 casual
## 6 41.90086 -87.62379 41.90 -87.63 casual
str(bike_share) #see list of column names and data type
## 'data.frame': 5755694 obs. of 13 variables:
## $ ride_id : chr "7C00A93E10556E47" "90854840DFD508BA" "0A7D10CDD144061C" "2F3BE33085BCFF02" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-11-27 13:27:38" "2021-11-27 13:38:25" "2021-11-26 22:03:34" "2021-11-27 09:56:49" ...
## $ ended_at : chr "2021-11-27 13:46:38" "2021-11-27 13:56:10" "2021-11-26 22:05:56" "2021-11-27 10:01:50" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 42 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ end_lat : num 42 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
summary(bike_share) #see statistical summary of data
## ride_id rideable_type started_at ended_at
## Length:5755694 Length:5755694 Length:5755694 Length:5755694
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5755694 Length:5755694 Length:5755694 Length:5755694
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.64 Min. :-87.84 Min. :41.39 Min. :-88.97
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :45.64 Max. :-73.80 Max. :42.37 Max. :-87.30
## NA's :5835 NA's :5835
## member_casual
## Length:5755694
## Class :character
## Mode :character
##
##
##
##
As shown in the results of summary function, there are 5,835 N/A (NULL) values in the data frame, I will use drop-na function to remove all N/A from the data frame.
cleaned_bike_share <- drop_na(bike_share)
Add date, month, year and day_of_week columns to the data frame
cleaned_bike_share$date <- as.Date(cleaned_bike_share$started_at)
cleaned_bike_share$month <- format(as.Date(cleaned_bike_share$date), "%m")
cleaned_bike_share$day <- format(as.Date(cleaned_bike_share$date), "%d")
cleaned_bike_share$year <- format(as.Date(cleaned_bike_share$date), "%Y")
cleaned_bike_share$day_of_week <- format(as.Date(cleaned_bike_share$date), "%A")
Add a new ride_length column to the data frame. The value will be converted to minutes and variable as numeric.
cleaned_bike_share$ride_length <- round(as.numeric(difftime(cleaned_bike_share$ended_at,cleaned_bike_share$started_at, units = "mins")),2)
Let’s check the structure of the data frame
str(cleaned_bike_share)
## 'data.frame': 5749859 obs. of 19 variables:
## $ ride_id : chr "7C00A93E10556E47" "90854840DFD508BA" "0A7D10CDD144061C" "2F3BE33085BCFF02" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "2021-11-27 13:27:38" "2021-11-27 13:38:25" "2021-11-26 22:03:34" "2021-11-27 09:56:49" ...
## $ ended_at : chr "2021-11-27 13:46:38" "2021-11-27 13:56:10" "2021-11-26 22:05:56" "2021-11-27 10:01:50" ...
## $ start_station_name: chr "" "" "" "" ...
## $ start_station_id : chr "" "" "" "" ...
## $ end_station_name : chr "" "" "" "" ...
## $ end_station_id : chr "" "" "" "" ...
## $ start_lat : num 41.9 42 42 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ end_lat : num 42 41.9 42 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.8 -87.6 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
## $ date : Date, format: "2021-11-27" "2021-11-27" ...
## $ month : chr "11" "11" "11" "11" ...
## $ day : chr "27" "27" "26" "27" ...
## $ year : chr "2021" "2021" "2021" "2021" ...
## $ day_of_week : chr "Saturday" "Saturday" "Friday" "Saturday" ...
## $ ride_length : num 19 17.75 2.37 5.02 21.22 ...
Rename columns for analysis
cleaned_bike_share <- rename(cleaned_bike_share, "bike_type" = "rideable_type", "user_type" = "member_casual")
Upon my initial inspection of the csv files, I have noticed the data sets included trip where bikes were taken out from dock for test or repair. The list of testing/repair stations as following:
kable(test_stations,
col.names = c("Test Station"),
align="lcc", escape = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Test Station |
|---|
| Hubbard Bike-checking (LBS-WH-TEST) |
| Pawel Bialowas - Test- PBSC charging station |
| DIVVY CASSETTE REPAIR MOBILE STATION |
| DIVVY 001 - Warehouse test station |
Bike trips with start_station_id and end_station_id which are testing/repair stations will be removed.
final_bike_share <- cleaned_bike_share %>%
filter(!start_station_id %in% c("Hubbard Bike-checking (LBS-WH-TEST)", "Pawel Bialowas - Test- PBSC charging station", "DIVVY CASSETTE REPAIR MOBILE STATION", "DIVVY 001 - Warehouse test station"))
final_bike_share <- final_bike_share %>%
filter(!end_station_id %in% c("Hubbard Bike-checking (LBS-WH-TEST)", "Pawel Bialowas - Test- PBSC charging station", "DIVVY CASSETTE REPAIR MOBILE STATION", "DIVVY 001 - Warehouse test station"))
Confirm removal of rides related to test/repair stations
final_bike_share %>%
filter(start_station_id %in% c("Hubbard Bike-checking (LBS-WH-TEST)", "Pawel Bialowas - Test- PBSC charging station", "DIVVY CASSETTE REPAIR MOBILE STATION", "DIVVY 001 - Warehouse test station")) %>%
count(start_station_id)
## [1] start_station_id n
## <0 rows> (or 0-length row.names)
final_bike_share %>%
filter(end_station_id %in% c("Hubbard Bike-checking (LBS-WH-TEST)", "Pawel Bialowas - Test- PBSC charging station", "DIVVY CASSETTE REPAIR MOBILE STATION", "DIVVY 001 - Warehouse test station")) %>%
count(end_station_id)
## [1] end_station_id n
## <0 rows> (or 0-length row.names)
Remove of rides with negative ride_length as these are invalid data, and rides that are less than a minute as any trips that were below 60 seconds in length are potentially false starts or users trying to re-dock a bike to ensure it was secured.Also rides that are longer than 1440 minutes (24 hours) as these are considered invalid outliers for this project purpose.
final_bike_share <- final_bike_share %>%
filter(ride_length>=1 & ride_length<= 1440)
Confirm rides shorter than 1 minute are not included in the data frame.
final_bike_share %>%
filter(ride_length<1) %>%
count(ride_length)
## [1] ride_length n
## <0 rows> (or 0-length row.names)
Confirm rides longer than 24 hours are not included in the data frame.
final_bike_share %>%
filter(ride_length>1440) %>%
count(ride_length)
## [1] ride_length n
## <0 rows> (or 0-length row.names)
Identify Trends and Relationships
Let’s look at the ride_length statistics
summary(final_bike_share$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.08 10.55 16.63 18.80 1439.85
Key Insights: * Average ride length is 16.63 minutes * Minimum ride length is 1 minute * Maximum ride length is under 24 hours
Calculate the mean, max and min values of ride_length by user types.
#Average ride_length in minutes
aggregate(final_bike_share$ride_length~final_bike_share$user_type,FUN = mean)
## final_bike_share$user_type final_bike_share$ride_length
## 1 casual 22.36659
## 2 member 12.66003
#Longest ride_length in minutes
aggregate(final_bike_share$ride_length~final_bike_share$user_type,FUN = max)
## final_bike_share$user_type final_bike_share$ride_length
## 1 casual 1439.85
## 2 member 1436.33
#Shortest ride_length in minutes
aggregate(final_bike_share$ride_length~final_bike_share$user_type,FUN = min)
## final_bike_share$user_type final_bike_share$ride_length
## 1 casual 1
## 2 member 1
summary_type_weekday <- final_bike_share %>%
mutate(weekday = wday(started_at,label = TRUE, week_start = getOption("lubridate.week.start" , 1) ))%>%
group_by(user_type,weekday) %>%
summarise(number_of_rides = n(),average_duration = round(mean(ride_length) , 2)) %>%
arrange(user_type , weekday)
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
Let’s look at the average ride time by user type for each day of the week.
summary_type_weekday %>%
ggplot(mapping = aes(x=weekday, y = average_duration, fill=user_type)) +
geom_col(position = "dodge" , width = .7 , color = "white") + theme(panel.background = element_rect(fill = "white") ,
panel.grid = element_blank() , plot.background = element_rect(fill = "white"),
legend.background=element_rect(fill="white") ,
axis.title.x = element_text(colour = "black" , vjust = -2) ,
axis.title.y = element_text(colour = "black", vjust = 10),
axis.line = element_line(colour = "black"),
axis.text = element_text(colour = "black"),
legend.text = element_text(colour = "black")) +
theme(plot.margin=unit(c(1,1,2,3),"cm"), text = element_text(size = 12)) +
labs(title= "Average Ride Length by User Type", x="Day of Week", y="Ride Length (in mins)") + scale_y_continuous()
Key Insights:
Let’s look at the number of rides by user type.
summary_type_weekday %>%
ggplot(mapping = aes(x=weekday, y = number_of_rides, fill= user_type)) +
geom_col(position = "dodge" , width = .7 , color = "white") +
theme(panel.background = element_rect(fill = "white") ,panel.grid = element_blank() ,
plot.background = element_rect(fill = "white"),
legend.background=element_rect(fill="white") ,
axis.title.x = element_text(colour = "black" , vjust = -3) ,
axis.title.y = element_text(colour = "black", vjust = 10),
axis.line = element_line(colour = "black"),
axis.text = element_text(colour = "black"),
legend.text = element_text(colour = "black")) +
theme(plot.margin=unit(c(1,1,2,3),"cm") , text = element_text(size = 12)) +
labs(title= "Number of Rides by User Type",x="Day of week" , y = "Number of rides") +
scale_y_continuous(breaks = c(100000,200000,300000,400000,500000,600000,700000) ,
labels = c("100k","200k","300k","400k","500k","600k","700k"))
Key Insights:
summary_type_month <- final_bike_share %>%
group_by(user_type, month) %>%
summarise(count = n()) %>%
arrange(user_type , month)
## `summarise()` has grouped output by 'user_type'. You can override using the
## `.groups` argument.
summary_type_month %>%
ggplot(mapping = aes(x=month, y = count, fill= user_type)) +
geom_col(position = "dodge" , width = .7 , color = "white") +
theme(panel.background = element_rect(fill = "white") ,panel.grid = element_blank() ,
plot.background = element_rect(fill = "white"),
legend.background=element_rect(fill="white") ,
axis.title.x = element_text(colour = "black" , vjust = -3) ,
axis.title.y = element_text(colour = "black", vjust = 10),
axis.line = element_line(colour = "black"),
axis.text = element_text(colour = "black"),
legend.text = element_text(colour = "black")) +
theme(plot.margin=unit(c(1,1,2,3),"cm") , text = element_text(size = 12)) +
labs(title= "Number of Rides by User Type",x="Month" , y = "Number of rides")
Key Insights: