The Capstone Project is the last portion of the Google Professional Data Analytics specialization. Under this project we have been supplied by a dataset by Motivate International Inc. under the license. We would be looking at the dataset and finding how differently the members and casual riders use the bikeshare app Divvy for riding bikes.
suppressPackageStartupMessages(library(tidyverse))
df1<- list.files(path='D:/Data Backup 18.12.21/Data Analytics Google/Case Study - bikesharing/CSV Files',pattern='*.csv') %>%
map_df(~read_csv(.,show_col_types = FALSE))
colnames(df1)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
str(df1)
## spc_tbl_ [5,667,717 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5667717] "C2F7DD78E82EC875" "A6CF8980A652D272" "BD0F91DFF741C66D" "CBB80ED419105406" ...
## $ rideable_type : chr [1:5667717] "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:5667717], format: "2022-01-13 11:59:47" "2022-01-10 08:41:56" ...
## $ ended_at : POSIXct[1:5667717], format: "2022-01-13 12:02:44" "2022-01-10 08:46:17" ...
## $ start_station_name: chr [1:5667717] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Sheffield Ave & Fullerton Ave" "Clark St & Bryn Mawr Ave" ...
## $ start_station_id : chr [1:5667717] "525" "525" "TA1306000016" "KA1504000151" ...
## $ end_station_name : chr [1:5667717] "Clark St & Touhy Ave" "Clark St & Touhy Ave" "Greenview Ave & Fullerton Ave" "Paulina St & Montrose Ave" ...
## $ end_station_id : chr [1:5667717] "RP-007" "RP-007" "TA1307000001" "TA1309000021" ...
## $ start_lat : num [1:5667717] 42 42 41.9 42 41.9 ...
## $ start_lng : num [1:5667717] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:5667717] 42 42 41.9 42 41.9 ...
## $ end_lng : num [1:5667717] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ member_casual : chr [1:5667717] "casual" "casual" "member" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(df1)
## ride_id rideable_type started_at
## Length:5667717 Length:5667717 Min. :2022-01-01 00:00:05.00
## Class :character Class :character 1st Qu.:2022-05-28 19:21:05.00
## Mode :character Mode :character Median :2022-07-22 15:03:59.00
## Mean :2022-07-20 07:21:18.74
## 3rd Qu.:2022-09-16 07:21:29.00
## Max. :2022-12-31 23:59:26.00
##
## ended_at start_station_name start_station_id
## Min. :2022-01-01 00:01:48.00 Length:5667717 Length:5667717
## 1st Qu.:2022-05-28 19:43:07.00 Class :character Class :character
## Median :2022-07-22 15:24:44.00 Mode :character Mode :character
## Mean :2022-07-20 07:40:45.33
## 3rd Qu.:2022-09-16 07:39:03.00
## Max. :2023-01-02 04:56:45.00
##
## end_station_name end_station_id start_lat start_lng
## Length:5667717 Length:5667717 Min. :41.64 Min. :-87.84
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :45.64 Max. :-73.80
##
## end_lat end_lng member_casual
## Min. : 0.00 Min. :-88.14 Length:5667717
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character
## Median :41.90 Median :-87.64 Mode :character
## Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.37 Max. : 0.00
## NA's :5858 NA's :5858
We can see there’s a lot of empty data in different columns and some of the names can be modified to make them more intuitive
df1 <- df1 %>% rename(user_type=member_casual)
na_count <- data.frame(sapply(df1,function(y) sum(length(which(is.na(y))))))
print(na_count)
## sapply.df1..function.y..sum.length.which.is.na.y.....
## ride_id 0
## rideable_type 0
## started_at 0
## ended_at 0
## start_station_name 833064
## start_station_id 833064
## end_station_name 892742
## end_station_id 892742
## start_lat 0
## start_lng 0
## end_lat 5858
## end_lng 5858
## user_type 0
df2 <- df1 %>% drop_na()
na_count_1 <- data.frame(sapply(df2,function(y) sum(length(which(is.na(y))))))
print(na_count_1)
## sapply.df2..function.y..sum.length.which.is.na.y.....
## ride_id 0
## rideable_type 0
## started_at 0
## ended_at 0
## start_station_name 0
## start_station_id 0
## end_station_name 0
## end_station_id 0
## start_lat 0
## start_lng 0
## end_lat 0
## end_lng 0
## user_type 0
df2$ride_length<-df2$ended_at-df2$started_at
df2$week_of_day <- weekdays.POSIXt(df2$started_at)
df2$week_of_day<-as.factor(df2$week_of_day)
df2$user_type<-as.factor(df2$user_type)
df2 <- df2 %>% rename(type_of_bike=rideable_type)
df2$ride_length<-as.numeric(df2$ride_length)
df2$type_of_bike<-gsub('_bike','',df2$type_of_bike)
df2 <- df2[df2$type_of_bike!='docked',]
df2$type_of_bike<-as.factor(df2$type_of_bike)
Now we can see that there are no more empty rows of data and we have significantly changed the names of the columns to make them more presentable.
Now we will summarize our data and plot them
summary_df<-df2 %>% group_by(week_of_day,type_of_bike,user_type) %>%
summarize(avg_ride_length=mean(ride_length),num_of_rides=n()) %>% as.data.frame()
## `summarise()` has grouped output by 'week_of_day', 'type_of_bike'. You can
## override using the `.groups` argument.
print(summary_df)
## week_of_day type_of_bike user_type avg_ride_length num_of_rides
## 1 Friday classic casual 1382.0759 122722
## 2 Friday classic member 778.3264 231666
## 3 Friday electric casual 970.4448 103056
## 4 Friday electric member 652.9350 128388
## 5 Monday classic casual 1494.4424 103966
## 6 Monday classic member 768.4842 247261
## 7 Monday electric casual 1008.6913 84596
## 8 Monday electric member 632.0427 127910
## 9 Saturday classic casual 1595.8202 196654
## 10 Saturday classic member 887.2466 227187
## 11 Saturday electric casual 1143.6643 130334
## 12 Saturday electric member 739.9633 111092
## 13 Sunday classic casual 1630.9284 158094
## 14 Sunday classic member 876.4591 200952
## 15 Sunday electric casual 1145.6660 108030
## 16 Sunday electric member 736.3617 96781
## 17 Thursday classic casual 1328.8531 113477
## 18 Thursday classic member 768.6592 267279
## 19 Thursday electric casual 893.1744 97044
## 20 Thursday electric member 637.7513 148611
## 21 Tuesday classic casual 1348.9980 95811
## 22 Tuesday classic member 753.2221 268159
## 23 Tuesday electric casual 883.7330 83068
## 24 Tuesday electric member 621.6335 143090
## 25 Wednesday classic casual 1294.4558 98056
## 26 Wednesday classic member 756.0897 266142
## 27 Wednesday electric casual 871.7371 88423
## 28 Wednesday electric member 628.6449 146653
We can see from the plots that classic bikes are the most popular and members utilize it often on daily basis but the casual riders utilize more often during weekends rather than on weekdays