install.packages('tidyverse')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library('tidyverse')
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##Import data (divvy-tripdata)
# Load the February 2021 Divvy trip file from the project directory.
divvy_0221 <- readr::read_csv(
  file = "/cloud/project/202102-divvy-tripdata.csv"
)
## Rows: 49622 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the January 2021 Divvy trip file from the project directory.
divvy_0121 <- readr::read_csv(
  file = "/cloud/project/202101-divvy-tripdata.csv"
)
## Rows: 96834 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the December 2020 Divvy trip file from the project directory.
divvy_1220 <- readr::read_csv(
  file = "/cloud/project/202012-divvy-tripdata.csv"
)
## Rows: 131573 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##Inspecting data: All three files use the same column names and types. There are some NA values in the start_station_name, start_station_id, end_station_name and end_station_id columns. At this stage I keep the rows that contain NA values; they are removed later, after the three datasets are merged.
# Print the structure (column names, types, first values) of the 2021-02 data.
utils::str(object = divvy_0221)
## spec_tbl_df [49,622 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:49622] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:49622] "member" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Print the structure (column names, types, first values) of the 2021-01 data.
utils::str(object = divvy_0121)
## spec_tbl_df [96,834 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ rideable_type : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
## $ ended_at : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
## $ start_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ start_station_id : chr [1:96834] "17660" "17660" "17660" "17660" ...
## $ end_station_name : chr [1:96834] NA NA NA NA ...
## $ end_station_id : chr [1:96834] NA NA NA NA ...
## $ start_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:96834] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Print the structure (column names, types, first values) of the 2020-12 data.
utils::str(object = divvy_1220)
## spec_tbl_df [131,573 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:131573] "70B6A9A437D4C30D" "158A465D4E74C54A" "5262016E0F1F2F9A" "BE119628E44F871E" ...
## $ rideable_type : chr [1:131573] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:131573], format: "2020-12-27 12:44:29" "2020-12-18 17:37:15" ...
## $ ended_at : POSIXct[1:131573], format: "2020-12-27 12:55:06" "2020-12-18 17:44:19" ...
## $ start_station_name: chr [1:131573] "Aberdeen St & Jackson Blvd" NA NA NA ...
## $ start_station_id : chr [1:131573] "13157" NA NA NA ...
## $ end_station_name : chr [1:131573] "Desplaines St & Kinzie St" NA NA NA ...
## $ end_station_id : chr [1:131573] "TA1306000003" NA NA NA ...
## $ start_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ start_lng : num [1:131573] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ end_lng : num [1:131573] -87.6 -87.7 -87.7 -87.7 -87.6 ...
## $ member_casual : chr [1:131573] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
I merge the three datasets into one summary dataset so that the data can be transformed and analyzed more easily.
# Stack the three monthly tables row-wise into one table, then inspect it.
summary_divvy_data <- list(divvy_0221, divvy_0121, divvy_1220) %>%
  bind_rows()
str(summary_divvy_data)
## spec_tbl_df [278,029 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:278029] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:278029] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:278029], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:278029], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:278029] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:278029] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:278029] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:278029] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:278029] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:278029] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:278029] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:278029] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:278029] "member" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
There are some NA values in the data frame. I decided to delete the rows that contain NA values, since doing so does not influence the outcome of the analysis.
# Discard every row that has an NA in any column.
summary_divvy_data <- drop_na(data = summary_divvy_data)
I decided to add three more columns (ride_length, day_of_week and hour) to prepare for the data analysis. First, I added the ride_length column, which holds the length of each ride in minutes.
# Ride duration in minutes: end time minus start time, for every ride.
summary_divvy_data <- mutate(
  summary_divvy_data,
  ride_length = difftime(time1 = ended_at, time2 = started_at, units = "min")
)
str(summary_divvy_data)
## tibble [240,416 × 14] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:240416] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:240416] "member" "casual" "member" "member" ...
## $ ride_length : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
## ..- attr(*, "units")= chr "mins"
# Full (non-abbreviated) weekday name of the moment each ride started.
summary_divvy_data <- mutate(
  summary_divvy_data,
  day_of_week = weekdays(x = started_at, abbreviate = FALSE)
)
str(summary_divvy_data)
## tibble [240,416 × 15] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:240416] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:240416] "member" "casual" "member" "member" ...
## $ ride_length : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
## ..- attr(*, "units")= chr "mins"
## $ day_of_week : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...
To extract the hour at which each ride started, I first have to install and load the lubridate package. #### Loading the lubridate package
install.packages('lubridate')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Hour of the day (integer) at which each ride started, via lubridate.
summary_divvy_data <- mutate(
  summary_divvy_data,
  hour = lubridate::hour(started_at)
)
str(summary_divvy_data)
## tibble [240,416 × 16] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:240416] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:240416] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:240416] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:240416] "member" "casual" "member" "member" ...
## $ ride_length : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
## ..- attr(*, "units")= chr "mins"
## $ day_of_week : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...
## $ hour : int [1:240416] 16 17 19 17 15 15 17 18 15 8 ...
# Build the analysis table: keep only the columns used later (the four
# latitude/longitude columns are dropped), then keep one row per ride_id.
#
# Fix: the de-duplicated result used to be assigned to
# `summary_divvy_data_finaL` (capital "L"), a different object, so the
# de-duplication never reached `summary_divvy_data_final`, which is the table
# every later step reads.
summary_divvy_data_final <- summary_divvy_data %>%
  select(
    ride_id, rideable_type, started_at, ended_at,
    start_station_name, start_station_id,
    end_station_name, end_station_id,
    member_casual, ride_length, day_of_week, hour
  ) %>%
  distinct(ride_id, .keep_all = TRUE)
str(summary_divvy_data_final)
## tibble [240,416 × 12] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:240416] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ rideable_type : chr [1:240416] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:240416], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ ended_at : POSIXct[1:240416], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ start_station_name: chr [1:240416] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ start_station_id : chr [1:240416] "525" "525" "KA1503000012" "637" ...
## $ end_station_name : chr [1:240416] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ end_station_id : chr [1:240416] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ member_casual : chr [1:240416] "member" "casual" "member" "member" ...
## $ ride_length : 'difftime' num [1:240416] 6.78333333333333 19.5166666666667 8.86666666666667 4.41666666666667 ...
## ..- attr(*, "units")= chr "mins"
## $ day_of_week : chr [1:240416] "Friday" "Sunday" "Tuesday" "Tuesday" ...
## $ hour : int [1:240416] 16 17 19 17 15 15 17 18 15 8 ...
##Data Analysis Installing the skimr package for the analysis
install.packages('skimr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(skimr)
Let’s have an overview of the dataframe
# Per-column summary statistics of the analysis table.
base::summary(object = summary_divvy_data_final)
## ride_id rideable_type started_at
## Length:240416 Length:240416 Min. :2020-12-01 00:07:08
## Class :character Class :character 1st Qu.:2020-12-14 15:46:07
## Mode :character Mode :character Median :2021-01-04 14:25:26
## Mean :2021-01-06 10:35:49
## 3rd Qu.:2021-01-23 13:42:20
## Max. :2021-02-28 23:59:41
## ended_at start_station_name start_station_id
## Min. :2020-11-25 07:40:56 Length:240416 Length:240416
## 1st Qu.:2020-12-14 14:59:43 Class :character Class :character
## Median :2021-01-04 14:40:54 Mode :character Mode :character
## Mean :2021-01-06 10:14:36
## 3rd Qu.:2021-01-23 13:57:53
## Max. :2021-03-05 15:11:45
## end_station_name end_station_id member_casual ride_length
## Length:240416 Length:240416 Length:240416 Length:240416
## Class :character Class :character Class :character Class :difftime
## Mode :character Mode :character Mode :character Mode :numeric
##
##
##
## day_of_week hour
## Length:240416 Min. : 0.00
## Class :character 1st Qu.:11.00
## Mode :character Median :14.00
## Mean :13.59
## 3rd Qu.:17.00
## Max. :23.00
# skimr overview of every column, without the inline chart columns.
skimr::skim_without_charts(data = summary_divvy_data_final)
| Name | summary_divvy_data_final |
| Number of rows | 240416 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| difftime | 1 |
| numeric | 1 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1 | 16 | 16 | 0 | 240416 | 0 |
| rideable_type | 0 | 1 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 0 | 1 | 10 | 51 | 0 | 659 | 0 |
| start_station_id | 0 | 1 | 3 | 35 | 0 | 655 | 0 |
| end_station_name | 0 | 1 | 10 | 53 | 0 | 667 | 0 |
| end_station_id | 0 | 1 | 3 | 35 | 0 | 662 | 0 |
| member_casual | 0 | 1 | 6 | 6 | 0 | 2 | 0 |
| day_of_week | 0 | 1 | 6 | 9 | 0 | 7 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | -29049.97 mins | 30129.23 mins | 9.85 mins | 6977 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| hour | 0 | 1 | 13.59 | 4.4 | 0 | 11 | 14 | 17 | 23 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2020-12-01 00:07:08 | 2021-02-28 23:59:41 | 2021-01-04 14:25:26 | 232281 |
| ended_at | 0 | 1 | 2020-11-25 07:40:56 | 2021-03-05 15:11:45 | 2021-01-04 14:40:54 | 231891 |
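After running the skim_without_charts code, I noticed two things: 1. The number of unique ride_length values (6,977) is much smaller than the number of ride_id values (240,416), which means that many rides have the same length. 2. The minimum ride_length is negative (-29049.97 mins), which is not a valid ride duration.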
###First, I decided to filter out all the negative and zero values
# Keep only rides with a strictly positive duration.
ride_length_filtered <- filter(summary_divvy_data_final, ride_length > 0)
Then I summarized the metrics needed for the analysis
# Overall spread and location statistics of ride_length (one-row result).
ride_length_filtered %>%
  summarize(
    sd = sd(ride_length),
    mean_ride_length = mean(ride_length),
    median_ride_length = median(ride_length),
    min = min(ride_length),
    max = max(ride_length),
    Q1 = quantile(ride_length, probs = 0.25),
    Q3 = quantile(ride_length, probs = 0.75)
  )
## # A tibble: 1 × 7
## sd mean_ride_length median_ride_length min max Q1 Q3
## <dbl> <drtn> <drtn> <drtn> <drtn> <drtn> <drt>
## 1 122. 16.27021 mins 9.866667 mins 0.01666667 mins 30129.… 5.883… 17.3…
I use visualizations to have a better view of the ride_length variable
install.packages('ggplot2')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
library(ggplot2)
# Histogram of ride_length, with bars filled by rider type.
ride_length_filtered %>%
  ggplot(mapping = aes(x = ride_length, fill = member_casual)) +
  geom_histogram()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Because some values are much higher than the others, the visualization could not give a useful overview of the data. So I decided to narrow the data down to the rides whose ride_length is less than 1000 minutes
# Rebuild the filtered table from the full analysis table, keeping rides whose
# duration is strictly between 0 and 1000 minutes.
ride_length_filtered <- filter(
  summary_divvy_data_final,
  ride_length > 0,
  ride_length < 1000
)
# Same histogram as before, now on the narrowed data.
ride_length_filtered %>%
  ggplot(mapping = aes(x = ride_length, fill = member_casual)) +
  geom_histogram()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For the next step, I summarize the data frame grouped by the member_casual variable, in order to compare the two types of riders
# Ride-length statistics computed separately for each rider type; the result
# is returned ungrouped (.groups = "drop").
ride_length_grouped <- summarize(
  group_by(ride_length_filtered, member_casual),
  sd = sd(ride_length),
  mean_ride_length = mean(ride_length),
  median_ride_length = median(ride_length),
  min = min(ride_length),
  max = max(ride_length),
  .groups = "drop"
)
Have a view of the dataframe
# Display the per-rider-type summary table.
print(ride_length_grouped)
## # A tibble: 2 × 6
## member_casual sd mean_ride_length median_ride_length min max
## <chr> <dbl> <drtn> <drtn> <drtn> <drtn>
## 1 casual 34.2 23.08165 mins 13.933333 mins 0.01666667 mins 991.5…
## 2 member 15.0 12.52177 mins 9.116667 mins 0.01666667 mins 986.7…
After summarizing the data, it is easy to see that the average ride_length of casual riders (about 23.1 minutes) is almost twice that of members (about 12.5 minutes). Let’s make a visualization to make this clearer. ### Now, let’s compare the number of rides made by members and by casual riders. First, let’s assign the start date and the end date of the dataset
# Earliest and latest ride start dates in the filtered data; both are used in
# the plot captions.
mindate <- min(lubridate::date(ride_length_filtered[["started_at"]]))
maxdate <- max(lubridate::date(ride_length_filtered[["started_at"]]))
After that, I run this code to make a bar chart about the number of rides by casual and member riders
# Bar chart of the number of rides for each rider type.
# Fix: the caption used to read "Data form <min>to <max>" (typo in "from" and
# no space before "to").
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = member_casual)) +
  labs(
    title = "Number of rides by casual and member riders",
    x = "Type of riders",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
In this bar chart, it is clearly seen that the number of rides by members is much greater than the number of rides by casual riders. This is understandable, since member riders may want to make use of their purchases. But although members make many more rides than casual riders, their rides seem to be much shorter, according to the previous findings. So let’s dig deeper into this behavior, starting with the range of ride lengths for both member and casual riders.
# Frequency polygon of ride_length (in minutes), one line per rider type.
# Fix: the title and x-axis label were copied from the bar chart
# ("Type of riders") although the x axis shows ride_length; the caption also
# had the typo "Data form" and no space before "to".
ggplot(data = ride_length_filtered) +
  geom_freqpoly(mapping = aes(x = ride_length, color = member_casual)) +
  labs(
    title = "Ride length distribution for casual and member riders",
    x = "Ride length (minutes)",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
As seen from the charts, casual riders make far fewer rides than members, but on average they ride longer, and the number of rides over 100 minutes is slightly greater for casual riders. In summary, casual riders tend to make fewer rides than members, but more of their rides are long rides.
###Now let’s take a look at the ride frequency of each type of rider using visualizations
# Number of rides per weekday, one panel per rider type.
# Fixes:
#  * day_of_week is a character column, so the bars were drawn in alphabetical
#    order; they are now ordered Monday..Sunday.
#  * the caption had no space before "to".
#  * rotated tick labels are right-aligned (hjust = 1) so each label ends at
#    its tick.
# The level names come from weekdays() itself, the same function that created
# day_of_week, so they match in the session's locale. 2021-02-01 is a Monday.
weekday_levels <- weekdays(as.Date("2021-02-01") + 0:6)
ggplot(data = ride_length_filtered) +
  geom_bar(
    mapping = aes(
      x = factor(day_of_week, levels = weekday_levels),
      fill = member_casual
    )
  ) +
  facet_wrap(~member_casual) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Weekly frequency for casual and member riders",
    x = "Weekday",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
From the charts, we can identify the weekdays that have the most rides for casual and member riders. For casual riders, Saturday and Sunday tend to have the most rides. For members, Tuesday to Thursday are the weekdays with the most rides, Sunday seems to be the weekday with the lowest number of rides, and the number of rides varies from Monday to Saturday. ### Let’s see how the two types of riders differ in the time of day at which their rides occur
# Number of rides per start hour, one panel per rider type.
# Fix: the caption had no space before "to" ("<min>to <max>").
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = hour, fill = member_casual)) +
  facet_wrap(~member_casual) +
  labs(
    title = "Hour distribution of divvy rides",
    x = "Hour",
    y = "number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
# Number of rides per start hour in a grid: one row per weekday, one column
# per rider type.
# Fixes:
#  * the caption had no space before "to".
#  * the facet formula was written `~day_of_week ~member_casual` (two tildes);
#    it is now the conventional rows ~ columns form. NOTE(review): the old
#    form is expected to resolve to the same rows/columns layout -- confirm on
#    the rendered plot.
ggplot(data = ride_length_filtered) +
  geom_bar(mapping = aes(x = hour, fill = member_casual)) +
  facet_grid(day_of_week ~ member_casual) +
  labs(
    title = "Hourly distribution broken down by week days of divvy rides",
    x = "Hour",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
Making a freqpoly chart for the visualization to be clearer
# Frequency polygon of rides per start hour, one line per rider type and one
# panel per weekday.
# Fix: the caption had no space before "to" ("<min>to <max>").
ggplot(data = ride_length_filtered) +
  geom_freqpoly(mapping = aes(x = hour, color = member_casual)) +
  facet_wrap(~day_of_week) +
  labs(
    title = "Hourly distribution broken down by week days of divvy rides",
    x = "Hour",
    y = "Number of rides",
    caption = paste0("Data from ", mindate, " to ", maxdate)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From the charts, it seems that from Monday to Friday, member riders use the service at a varying rate from 7 o’clock to 16 o’clock and reach their peak at 17 o’clock. Meanwhile, casual riders tend to use the service much more often on the weekends.
##Conclusions Many member riders use the Divvy service mainly from 7 o’clock to 17 o’clock for their daily commute, while casual riders use the Divvy service mainly on the weekends. The business could use these findings to design marketing campaigns that promote the service to daily commuters