##Install packages for data cleaning

install.packages('tidyverse')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

##Import data (divvy-tripdata)

# Load the February 2021 trip file; readr guesses the column types.
divvy_0221 <- readr::read_csv(file = "/cloud/project/202102-divvy-tripdata.csv")
## Rows: 49622 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the January 2021 trip file; readr guesses the column types.
divvy_0121 <- readr::read_csv(file = "/cloud/project/202101-divvy-tripdata.csv")
## Rows: 96834 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the December 2020 trip file; readr guesses the column types.
divvy_1220 <- readr::read_csv(file = "/cloud/project/202012-divvy-tripdata.csv")
## Rows: 131573 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

##Inspecting data: All three datasets have the same columns in the same formats. There are some NA values in the start_station_name, start_station_id, end_station_name and end_station_id columns. The rows that contain NA values are removed after the datasets are merged.

# Show the column names, types and first values of the February table.
utils::str(divvy_0221)
## spec_tbl_df [49,622 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:49622] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat         : num [1:49622] 42 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:49622] 42 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:49622] "member" "casual" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Show the column names, types and first values of the January table.
utils::str(divvy_0121)
## spec_tbl_df [96,834 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ rideable_type     : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
##  $ ended_at          : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
##  $ start_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr [1:96834] "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr [1:96834] NA NA NA NA ...
##  $ end_station_id    : chr [1:96834] NA NA NA NA ...
##  $ start_lat         : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:96834] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Show the column names, types and first values of the December table.
utils::str(divvy_1220)
## spec_tbl_df [131,573 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:131573] "70B6A9A437D4C30D" "158A465D4E74C54A" "5262016E0F1F2F9A" "BE119628E44F871E" ...
##  $ rideable_type     : chr [1:131573] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:131573], format: "2020-12-27 12:44:29" "2020-12-18 17:37:15" ...
##  $ ended_at          : POSIXct[1:131573], format: "2020-12-27 12:55:06" "2020-12-18 17:44:19" ...
##  $ start_station_name: chr [1:131573] "Aberdeen St & Jackson Blvd" NA NA NA ...
##  $ start_station_id  : chr [1:131573] "13157" NA NA NA ...
##  $ end_station_name  : chr [1:131573] "Desplaines St & Kinzie St" NA NA NA ...
##  $ end_station_id    : chr [1:131573] "TA1306000003" NA NA NA ...
##  $ start_lat         : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:131573] -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:131573] -87.6 -87.7 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr [1:131573] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Merging datasets:

I merge 3 datasets into 1 summary dataset to be able to easily transform and analyze data

# Stack the three monthly tables row-wise into a single data frame.
summary_divvy_data <- dplyr::bind_rows(
  divvy_0221,
  divvy_0121,
  divvy_1220
)
utils::str(summary_divvy_data)
## spec_tbl_df [278,029 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:278029] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:278029] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:278029], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:278029], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:278029] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:278029] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:278029] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:278029] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat         : num [1:278029] 42 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:278029] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:278029] 42 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:278029] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:278029] "member" "casual" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

There are some NA values in the dataframe. I decided to delete the rows that contain an NA value (37,613 of the 278,029 rows, about 13.5%), on the assumption that removing them does not influence the outcome of the analysis.

# Keep only the rows that have no missing value in any column.
summary_divvy_data <- tidyr::drop_na(summary_divvy_data)

Data transformation:

I decided to add 3 more columns to prepare for the data analysis: ride_length, day_of_week and hour. The first one, ride_length, is the length of each ride in minutes.

# Ride duration in minutes, computed as ended_at minus started_at.
# The unit is spelled out as "mins" (the value difftime() documents) instead
# of relying on partial matching of "min".
summary_divvy_data <- summary_divvy_data %>%
  mutate(ride_length = difftime(ended_at, started_at, units = "mins"))
str(summary_divvy_data)
## tibble [240,416 × 14] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:240416] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat         : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:240416] "member" "casual" "member" "member" ...
##  $ ride_length       : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
##   ..- attr(*, "units")= chr "mins"

Adding a day_of_week column to identify the weekday on which each ride occurred

# Full (unabbreviated) weekday name of each ride's start time.
summary_divvy_data <- mutate(
  summary_divvy_data,
  day_of_week = weekdays(started_at, abbreviate = FALSE)
)
utils::str(summary_divvy_data)
## tibble [240,416 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:240416] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat         : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:240416] "member" "casual" "member" "member" ...
##  $ ride_length       : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
##   ..- attr(*, "units")= chr "mins"
##  $ day_of_week       : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...

Adding an hour column to identify the hour at which each ride occurred

To do that, I first have to install and load the lubridate package.

#### Loading the lubridate package

install.packages('lubridate')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Adding ‘hour’ column

# Hour of the day, taken from each ride's start time with lubridate.
summary_divvy_data <- mutate(
  summary_divvy_data,
  hour = lubridate::hour(started_at)
)
utils::str(summary_divvy_data)
## tibble [240,416 × 16] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:240416] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat         : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:240416] 42 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:240416] "member" "casual" "member" "member" ...
##  $ ride_length       : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
##   ..- attr(*, "units")= chr "mins"
##  $ day_of_week       : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...
##  $ hour              : int [1:240416] 16 17 19 17 15 15 17 18 15 8 ...

Creating a summary dataframe and removing the columns that are not needed for the analysis, which are start_lat, start_lng, end_lat and end_lng.

# Keep the twelve columns used in the analysis, in this order; the four
# coordinate columns (start_lat, start_lng, end_lat, end_lng) are left out.
summary_divvy_data_final <- select(
  summary_divvy_data,
  all_of(c(
    "ride_id", "rideable_type", "started_at", "ended_at",
    "start_station_name", "start_station_id",
    "end_station_name", "end_station_id",
    "member_casual", "ride_length", "day_of_week", "hour"
  ))
)

Remove duplicate rides, if there are any

# Keep the first row for each ride_id. The result is assigned back to
# summary_divvy_data_final; the previous target name ended in a capital "L",
# so the de-duplicated table was stored in a separate, unused object and the
# rest of the analysis kept using the table that had not been de-duplicated.
summary_divvy_data_final <- distinct(
  summary_divvy_data_final,
  ride_id,
  .keep_all = TRUE
)
str(summary_divvy_data_final)
## tibble [240,416 × 12] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ rideable_type     : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ ended_at          : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ start_station_id  : chr [1:240416] "525" "525" "KA1503000012" "637" ...
##  $ end_station_name  : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ end_station_id    : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ member_casual     : chr [1:240416] "member" "casual" "member" "member" ...
##  $ ride_length       : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
##   ..- attr(*, "units")= chr "mins"
##  $ day_of_week       : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...
##  $ hour              : int [1:240416] 16 17 19 17 15 15 17 18 15 8 ...

##Data Analysis

Installing the skimr package for analysis

install.packages('skimr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(skimr)

Let’s have an overview of the dataframe

# Per-column summary of the final table.
summary_divvy_data_final %>%
  summary()
##    ride_id          rideable_type        started_at                 
##  Length:240416      Length:240416      Min.   :2020-12-01 00:07:08  
##  Class :character   Class :character   1st Qu.:2020-12-14 15:46:07  
##  Mode  :character   Mode  :character   Median :2021-01-04 14:25:26  
##                                        Mean   :2021-01-06 10:35:49  
##                                        3rd Qu.:2021-01-23 13:42:20  
##                                        Max.   :2021-02-28 23:59:41  
##     ended_at                   start_station_name start_station_id  
##  Min.   :2020-11-25 07:40:56   Length:240416      Length:240416     
##  1st Qu.:2020-12-14 14:59:43   Class :character   Class :character  
##  Median :2021-01-04 14:40:54   Mode  :character   Mode  :character  
##  Mean   :2021-01-06 10:14:36                                        
##  3rd Qu.:2021-01-23 13:57:53                                        
##  Max.   :2021-03-05 15:11:45                                        
##  end_station_name   end_station_id     member_casual      ride_length      
##  Length:240416      Length:240416      Length:240416      Length:240416    
##  Class :character   Class :character   Class :character   Class :difftime  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :numeric   
##                                                                            
##                                                                            
##                                                                            
##  day_of_week             hour      
##  Length:240416      Min.   : 0.00  
##  Class :character   1st Qu.:11.00  
##  Mode  :character   Median :14.00  
##                     Mean   :13.59  
##                     3rd Qu.:17.00  
##                     Max.   :23.00
# skimr overview of the final table, without the inline charts.
skimr::skim_without_charts(summary_divvy_data_final)
Data summary
Name summary_divvy_data_final
Number of rows 240416
Number of columns 12
_______________________
Column type frequency:
character 8
difftime 1
numeric 1
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1 16 16 0 240416 0
rideable_type 0 1 11 13 0 3 0
start_station_name 0 1 10 51 0 659 0
start_station_id 0 1 3 35 0 655 0
end_station_name 0 1 10 53 0 667 0
end_station_id 0 1 3 35 0 662 0
member_casual 0 1 6 6 0 2 0
day_of_week 0 1 6 9 0 7 0

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
ride_length 0 1 -29049.97 mins 30129.23 mins 9.85 mins 6977

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
hour 0 1 13.59 4.4 0 11 14 17 23

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2020-12-01 00:07:08 2021-02-28 23:59:41 2021-01-04 14:25:26 232281
ended_at 0 1 2020-11-25 07:40:56 2021-03-05 15:11:45 2021-01-04 14:40:54 231891

After running the skim_without_charts code, I notice 2 things:

  1. The number of unique ride_length values is much smaller than the number of ride_id values, which means that many rides have the same length.

  2. The minimum ride_length is negative and the maximum ride_length is 30129 minutes, which is excessively higher than the other values, so there must be some outliers in the dataset. I should have a better look at this variable.

###First, I decided to filter out all the non-positive values (negative or zero ride lengths)

# Keep only rides whose duration is strictly greater than zero.
ride_length_filtered <- dplyr::filter(summary_divvy_data_final, ride_length > 0)

Then I summarized some metrics needed to analyze

# Spread, centre, extremes and quartiles of ride_length over the filtered rides.
ride_length_filtered %>%
  summarize(
    sd = sd(ride_length),
    mean_ride_length = mean(ride_length),
    median_ride_length = median(ride_length),
    min = min(ride_length),
    max = max(ride_length),
    Q1 = quantile(ride_length, probs = 0.25),
    Q3 = quantile(ride_length, probs = 0.75)
  )
## # A tibble: 1 × 7
##      sd mean_ride_length median_ride_length min             max     Q1     Q3   
##   <dbl> <drtn>           <drtn>             <drtn>          <drtn>  <drtn> <drt>
## 1  122. 16.27021 mins    9.866667 mins      0.01666667 mins 30129.… 5.883… 17.3…

I use visualizations to have a better view of the ride_length variable

install.packages('ggplot2')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(ggplot2)
# Histogram of ride_length with bars filled by rider type.
ggplot(ride_length_filtered, aes(x = ride_length, fill = member_casual)) +
  geom_histogram()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Because some values are much higher than the others, the visualization couldn’t show an overview of the data. So I decided to narrow it down to the rides whose ride_length is less than 1000 minutes

# Keep rides with a ride_length above 0 and below 1000, then redraw the
# histogram on the narrower range.
ride_length_filtered <- dplyr::filter(
  summary_divvy_data_final,
  ride_length > 0,
  ride_length < 1000
)
ggplot(ride_length_filtered, aes(x = ride_length, fill = member_casual)) +
  geom_histogram()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

For the next step, I summarize the dataframe grouped by the member_casual variable to compare the two types of riders

# One summary row per rider type (member_casual); the grouping is dropped
# from the result.
ride_length_grouped <- summarize(
  group_by(ride_length_filtered, member_casual),
  sd = sd(ride_length),
  mean_ride_length = mean(ride_length),
  median_ride_length = median(ride_length),
  min = min(ride_length),
  max = max(ride_length),
  .groups = "drop"
)

Have a view of the dataframe

# Display the per-rider-type summary table.
print(ride_length_grouped)
## # A tibble: 2 × 6
##   member_casual    sd mean_ride_length median_ride_length min             max   
##   <chr>         <dbl> <drtn>           <drtn>             <drtn>          <drtn>
## 1 casual         34.2 23.08165 mins    13.933333 mins     0.01666667 mins 991.5…
## 2 member         15.0 12.52177 mins     9.116667 mins     0.01666667 mins 986.7…

After I summarized the data, it is easily seen that the average ride_length for casual riders (about 23.1 minutes) is almost twice that of the members (about 12.5 minutes). Let’s make a visualization to be clearer

### Now, let’s make a comparison of the number of members’ rides and casual riders’ rides.

First, let’s assign the start date and end date of the dataset

# First and last calendar date on which a filtered ride started; both are
# used in the chart captions.
mindate <- ride_length_filtered$started_at %>%
  lubridate::date() %>%
  min()
maxdate <- ride_length_filtered$started_at %>%
  lubridate::date() %>%
  max()

After that, I run this code to make a bar chart about the number of rides by casual and member riders

# Bar chart of the number of rides per rider type.
# Caption fixed: "Data form" -> "Data from", and a space added before "to"
# so the two dates no longer run together.
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = member_casual)) +
  labs(
    title = "Number of rides by casual and member riders",
    x = "Type of riders",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )

In this bar chart, it’s clearly seen that the number of members’ rides is much greater than the number of casual riders’ rides. This is understandable since member riders could want to make use of their purchases. But although the members take many more rides than the casual riders, their rides seem to be much shorter according to the previous findings. So let’s dig deeper into this behavior, starting with finding out the range of ride lengths for both member and casual riders.

# Frequency polygon of ride_length, one line per rider type.
# The title and x label were copied from the bar chart ("Type of riders") and
# did not describe this plot; the x axis is ride_length, which is in minutes.
# Caption fixed: "Data form" -> "Data from", and a space added before "to".
ggplot(data = ride_length_filtered) +
  geom_freqpoly(mapping = aes(x = ride_length, color = member_casual)) +
  labs(
    title = "Distribution of ride length for casual and member riders",
    x = "Ride length (minutes)",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

As seen from the charts, the casual riders take far fewer rides than the members, but they tend to ride longer on average, and the number of rides over 100 mins is slightly greater for the casual riders. In summary, casual riders tend to take fewer rides than members, but they tend to take more long rides than members

###Now let’s take a look at the frequency of the each type of rider using visualizations

# Number of rides per weekday, one panel per rider type.
# day_of_week is a character column, so it would be plotted alphabetically.
# It is converted to a factor in calendar order (Monday first). The level
# names come from weekdays() on a known Monday (2021-01-04) plus the six
# following days, so they match the day_of_week values in the current locale.
# Caption fixed: a space added before "to".
ggplot(data = ride_length_filtered) +
  geom_bar(
    mapping = aes(
      x = factor(day_of_week, levels = weekdays(as.Date("2021-01-04") + 0:6)),
      fill = member_casual
    )
  ) +
  facet_wrap(~member_casual) +
  # hjust = 1 anchors the rotated labels at their tick marks.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Weekly frequency for casual and member riders",
    x = "Weekday",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )

From the charts, we can identify the weekdays that have the most rides for casual and member riders. For the casuals, Saturday and Sunday tend to have the most rides. For the members, Tuesday to Thursday are the weekdays with the most rides, Sunday seems to be the weekday with the lowest number of rides, and the number of rides varies from Monday to Saturday

### Let’s see how the time of day at which rides occur differs between these 2 types of riders

# Number of rides per start hour, one panel per rider type.
# Caption fixed: a space added before "to" so the dates no longer run together.
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = hour, fill = member_casual)) +
  facet_wrap(~member_casual) +
  labs(
    title = "Hour distribution of divvy rides",
    x = "Hour",
    y = "number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )

# Number of rides per start hour, in a grid of weekday (rows) by rider type
# (columns).
# The facet formula is written in the standard rows ~ columns form; the
# previous `~day_of_week ~member_casual` had a stray leading tilde.
# Caption fixed: a space added before "to".
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = hour, fill = member_casual)) +
  facet_grid(day_of_week ~ member_casual) +
  labs(
    title = "Hourly distribution broken down by week days of divvy rides",
    x = "Hour",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )

Making a freqpoly chart for the visualization to be clearer

# Frequency polygon of rides per start hour, one line per rider type and one
# panel per weekday.
# hour holds whole hours, so binwidth = 1 gives exactly one bin per hour. The
# default of 30 bins is narrower than one hour, which leaves some bins empty
# and draws artificial dips to zero; setting binwidth also removes the
# `stat_bin()` message about the default bin count.
# Caption fixed: a space added before "to".
ggplot(data = ride_length_filtered) +
  geom_freqpoly(
    mapping = aes(x = hour, color = member_casual),
    binwidth = 1
  ) +
  facet_wrap(~day_of_week) +
  labs(
    title = "Hourly distribution broken down by week days of divvy rides",
    x = "Hour",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )

From the charts, it seems that from Monday to Friday, member riders tend to use the service throughout the day from 7 o’clock to 16 o’clock, with the number of rides reaching its peak at 17 o’clock. Meanwhile, the casual riders tend to use the service more often on the weekends than on the weekdays.

##Conclusions

Many member riders use the divvy service mainly from 7 o’clock to 17 o’clock for the daily commute, while casual riders use the divvy service mainly on the weekends. The business could use these findings to design marketing campaigns that promote the service to daily commuters