This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.7 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readr)
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Import the January 2021 trip records from the local CSV export.
x202101_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202101-divvy-tripdata.csv"
)
## Rows: 96834 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the February 2021 trip records from the local CSV export.
x202102_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202102-divvy-tripdata.csv"
)
## Rows: 49622 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the March 2021 trip records from the local CSV export.
x202103_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202103-divvy-tripdata.csv"
)
## Rows: 228496 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the April 2021 trip records from the local CSV export.
x202104_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202104-divvy-tripdata.csv"
)
## Rows: 337230 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the May 2021 trip records from the local CSV export.
x202105_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202105-divvy-tripdata.csv"
)
## Rows: 531633 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the June 2021 trip records from the local CSV export.
x202106_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202106-divvy-tripdata.csv"
)
## Rows: 729595 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the July 2021 trip records from the local CSV export.
x202107_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202107-divvy-tripdata.csv"
)
## Rows: 822410 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the August 2021 trip records from the local CSV export.
x202108_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202108-divvy-tripdata.csv"
)
## Rows: 804352 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the September 2021 trip records from the local CSV export.
x202109_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202109-divvy-tripdata.csv"
)
## Rows: 756147 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the October 2021 trip records from the local CSV export.
x202110_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202110-divvy-tripdata.csv"
)
## Rows: 631226 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the November 2021 trip records from the local CSV export.
x202111_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202111-divvy-tripdata.csv"
)
## Rows: 359978 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import the December 2021 trip records from the local CSV export.
x202112_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202112-divvy-tripdata.csv"
)
## Rows: 247540 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the December table, then print a per-column
# summary (missing counts, ranges, unique values) of the January table.
head(x202112_dataset, n = 6L)
skim_without_charts(data = x202101_dataset)
| Name | x202101_dataset |
| Number of rows | 96834 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 96834 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 8625 | 0.91 | 10 | 51 | 0 | 640 | 0 |
| start_station_id | 8625 | 0.91 | 3 | 35 | 0 | 638 | 0 |
| end_station_name | 10277 | 0.89 | 10 | 53 | 0 | 632 | 0 |
| end_station_id | 10277 | 0.89 | 3 | 35 | 0 | 629 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.06 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.78 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 103 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| end_lng | 103 | 1 | -87.65 | 0.03 | -87.81 | -87.66 | -87.64 | -87.63 | -87.51 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-01-01 00:02:05 | 2021-01-31 23:57:00 | 2021-01-15 06:05:04 | 93736 |
| ended_at | 0 | 1 | 2021-01-01 00:08:39 | 2021-02-01 15:33:15 | 2021-01-15 06:19:58 | 93582 |
# Per-column summary of the February table.
skim_without_charts(data = x202102_dataset)
| Name | x202102_dataset |
| Number of rows | 49622 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 49622 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 4046 | 0.92 | 10 | 51 | 0 | 582 | 0 |
| start_station_id | 4046 | 0.92 | 3 | 35 | 0 | 582 | 0 |
| end_station_name | 5358 | 0.89 | 10 | 53 | 0 | 584 | 0 |
| end_station_id | 5358 | 0.89 | 3 | 35 | 0 | 584 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.06 |
| start_lng | 0 | 1 | -87.64 | 0.03 | -87.77 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 214 | 1 | 41.90 | 0.04 | 41.54 | 41.88 | 41.90 | 41.93 | 42.07 |
| end_lng | 214 | 1 | -87.64 | 0.03 | -87.77 | -87.66 | -87.64 | -87.63 | -87.53 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-02-01 00:55:44 | 2021-02-28 23:59:41 | 2021-02-22 13:17:53 | 48139 |
| ended_at | 0 | 1 | 2021-02-01 01:22:48 | 2021-03-05 15:11:45 | 2021-02-22 13:39:20 | 48035 |
# Per-column summary of the March table.
skim_without_charts(data = x202103_dataset)
| Name | x202103_dataset |
| Number of rows | 228496 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 228496 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 14848 | 0.94 | 10 | 53 | 0 | 673 | 0 |
| start_station_id | 14848 | 0.94 | 3 | 35 | 0 | 673 | 0 |
| end_station_name | 16727 | 0.93 | 10 | 53 | 0 | 673 | 0 |
| end_station_id | 16727 | 0.93 | 3 | 35 | 0 | 673 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.64 | 0.03 | -87.78 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 167 | 1 | 41.90 | 0.04 | 41.64 | 41.88 | 41.90 | 41.93 | 42.08 |
| end_lng | 167 | 1 | -87.65 | 0.03 | -88.07 | -87.66 | -87.64 | -87.63 | -87.53 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-03-01 00:01:09 | 2021-03-31 23:59:08 | 2021-03-19 17:37:20 | 209025 |
| ended_at | 0 | 1 | 2021-03-01 00:06:28 | 2021-04-06 11:00:11 | 2021-03-19 17:55:05 | 208629 |
# Per-column summary of the April table.
skim_without_charts(data = x202104_dataset)
| Name | x202104_dataset |
| Number of rows | 337230 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 337230 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 26056 | 0.92 | 10 | 53 | 0 | 681 | 0 |
| start_station_id | 26056 | 0.92 | 3 | 35 | 0 | 681 | 0 |
| end_station_name | 28174 | 0.92 | 10 | 53 | 0 | 681 | 0 |
| end_station_id | 28174 | 0.92 | 3 | 35 | 0 | 681 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.64 | 0.03 | -87.78 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 267 | 1 | 41.90 | 0.05 | 41.59 | 41.88 | 41.90 | 41.93 | 42.15 |
| end_lng | 267 | 1 | -87.65 | 0.03 | -87.85 | -87.66 | -87.64 | -87.63 | -87.52 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-04-01 00:03:18 | 2021-04-30 23:59:53 | 2021-04-15 22:37:04 | 298722 |
| ended_at | 0 | 1 | 2021-04-01 00:14:29 | 2021-05-05 22:14:39 | 2021-04-15 23:00:10 | 298625 |
# Per-column summary of the May table.
skim_without_charts(data = x202105_dataset)
| Name | x202105_dataset |
| Number of rows | 531633 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 531633 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 53744 | 0.90 | 10 | 53 | 0 | 687 | 0 |
| start_station_id | 53744 | 0.90 | 3 | 35 | 0 | 686 | 0 |
| end_station_name | 58194 | 0.89 | 10 | 53 | 0 | 683 | 0 |
| end_station_id | 58194 | 0.89 | 3 | 35 | 0 | 682 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.64 | 0.03 | -87.78 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 452 | 1 | 41.90 | 0.05 | 41.56 | 41.88 | 41.90 | 41.93 | 42.09 |
| end_lng | 452 | 1 | -87.64 | 0.03 | -87.85 | -87.66 | -87.64 | -87.63 | -87.52 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-05-01 00:00:11 | 2021-05-31 23:59:16 | 2021-05-19 07:44:31 | 447224 |
| ended_at | 0 | 1 | 2021-05-01 00:03:26 | 2021-06-10 22:17:11 | 2021-05-19 07:59:43 | 447217 |
# Per-column summary of the June table.
skim_without_charts(data = x202106_dataset)
| Name | x202106_dataset |
| Number of rows | 729595 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 729595 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 80093 | 0.89 | 10 | 53 | 0 | 689 | 0 |
| start_station_id | 80093 | 0.89 | 3 | 35 | 0 | 689 | 0 |
| end_station_name | 86387 | 0.88 | 10 | 53 | 0 | 690 | 0 |
| end_station_id | 86387 | 0.88 | 3 | 35 | 0 | 690 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.64 | 0.03 | -87.78 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 717 | 1 | 41.90 | 0.04 | 41.51 | 41.88 | 41.90 | 41.93 | 42.08 |
| end_lng | 717 | 1 | -87.64 | 0.03 | -87.86 | -87.66 | -87.64 | -87.63 | -87.49 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-06-01 00:00:38 | 2021-06-30 23:59:59 | 2021-06-14 19:46:47 | 589805 |
| ended_at | 0 | 1 | 2021-06-01 00:06:22 | 2021-07-13 22:51:35 | 2021-06-14 20:13:55 | 589069 |
# Per-column summary of the July table.
skim_without_charts(data = x202107_dataset)
| Name | x202107_dataset |
| Number of rows | 822410 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 822410 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 87263 | 0.89 | 10 | 53 | 0 | 717 | 0 |
| start_station_id | 87262 | 0.89 | 3 | 36 | 0 | 710 | 0 |
| end_station_name | 93158 | 0.89 | 10 | 53 | 0 | 714 | 0 |
| end_station_id | 93158 | 0.89 | 3 | 36 | 0 | 707 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 731 | 1 | 41.90 | 0.04 | 41.63 | 41.88 | 41.90 | 41.93 | 42.15 |
| end_lng | 731 | 1 | -87.65 | 0.03 | -87.85 | -87.66 | -87.64 | -87.63 | -87.49 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-07-01 00:00:22 | 2021-07-31 23:59:58 | 2021-07-17 13:58:37 | 659640 |
| ended_at | 0 | 1 | 2021-07-01 00:04:51 | 2021-08-12 17:45:41 | 2021-07-17 14:28:04 | 658663 |
# Per-column summary of the August table.
skim_without_charts(data = x202108_dataset)
| Name | x202108_dataset |
| Number of rows | 804352 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 804352 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 88458 | 0.89 | 3 | 53 | 0 | 727 | 0 |
| start_station_id | 88458 | 0.89 | 3 | 35 | 0 | 726 | 0 |
| end_station_name | 94115 | 0.88 | 10 | 53 | 0 | 727 | 0 |
| end_station_id | 94115 | 0.88 | 3 | 35 | 0 | 727 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 706 | 1 | 41.90 | 0.04 | 41.58 | 41.88 | 41.90 | 41.93 | 42.15 |
| end_lng | 706 | 1 | -87.65 | 0.03 | -87.85 | -87.66 | -87.64 | -87.63 | -87.51 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-08-01 00:00:04 | 2021-08-31 23:59:35 | 2021-08-16 07:57:11 | 646516 |
| ended_at | 0 | 1 | 2021-08-01 00:03:11 | 2021-09-01 17:37:35 | 2021-08-16 08:12:14 | 645299 |
# Per-column summary of the September table.
skim_without_charts(data = x202109_dataset)
| Name | x202109_dataset |
| Number of rows | 756147 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 756147 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 93113 | 0.88 | 10 | 53 | 0 | 758 | 0 |
| start_station_id | 93111 | 0.88 | 3 | 35 | 0 | 758 | 0 |
| end_station_name | 99261 | 0.87 | 10 | 53 | 0 | 756 | 0 |
| end_station_id | 99261 | 0.87 | 3 | 35 | 0 | 756 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 595 | 1 | 41.90 | 0.05 | 41.57 | 41.88 | 41.90 | 41.93 | 42.17 |
| end_lng | 595 | 1 | -87.65 | 0.03 | -87.87 | -87.66 | -87.64 | -87.63 | -87.50 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-09-01 00:00:06 | 2021-09-30 23:59:48 | 2021-09-15 16:43:37 | 611240 |
| ended_at | 0 | 1 | 2021-09-01 00:00:41 | 2021-10-01 22:55:35 | 2021-09-15 17:01:16 | 610277 |
# Per-column summary of the October table.
skim_without_charts(data = x202110_dataset)
| Name | x202110_dataset |
| Number of rows | 631226 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 631226 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 108210 | 0.83 | 10 | 53 | 0 | 793 | 0 |
| start_station_id | 108210 | 0.83 | 3 | 35 | 0 | 793 | 0 |
| end_station_name | 114834 | 0.82 | 10 | 53 | 0 | 790 | 0 |
| end_station_id | 114834 | 0.82 | 3 | 35 | 0 | 790 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.65 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.83 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 484 | 1 | 41.90 | 0.05 | 41.60 | 41.88 | 41.90 | 41.93 | 42.13 |
| end_lng | 484 | 1 | -87.65 | 0.03 | -87.96 | -87.66 | -87.64 | -87.63 | -87.52 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-10-01 00:00:09 | 2021-10-31 23:59:49 | 2021-10-15 05:31:57 | 524629 |
| ended_at | 0 | 1 | 2021-10-01 00:03:11 | 2021-11-03 21:45:48 | 2021-10-15 05:56:26 | 523397 |
# Per-column summary of the November table.
skim_without_charts(data = x202111_dataset)
| Name | x202111_dataset |
| Number of rows | 359978 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 359978 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 75290 | 0.79 | 10 | 53 | 0 | 815 | 0 |
| start_station_id | 75290 | 0.79 | 3 | 35 | 0 | 815 | 0 |
| end_station_name | 79187 | 0.78 | 10 | 53 | 0 | 805 | 0 |
| end_station_id | 79187 | 0.78 | 3 | 35 | 0 | 805 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.89 | 0.05 | 41.65 | 41.88 | 41.89 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 191 | 1 | 41.89 | 0.05 | 41.39 | 41.88 | 41.89 | 41.93 | 42.12 |
| end_lng | 191 | 1 | -87.65 | 0.03 | -88.97 | -87.66 | -87.64 | -87.63 | -87.53 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-11-01 00:00:14 | 2021-11-30 23:59:56 | 2021-11-12 08:32:12 | 320477 |
| ended_at | 0 | 1 | 2021-11-01 00:04:06 | 2021-12-02 06:41:33 | 2021-11-12 08:46:55 | 320071 |
# Per-column summary of the December table.
skim_without_charts(data = x202112_dataset)
| Name | x202112_dataset |
| Number of rows | 247540 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 247540 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 51063 | 0.79 | 10 | 53 | 0 | 818 | 0 |
| start_station_id | 51063 | 0.79 | 3 | 35 | 0 | 816 | 0 |
| end_station_name | 53498 | 0.78 | 10 | 53 | 0 | 800 | 0 |
| end_station_id | 53498 | 0.78 | 3 | 35 | 0 | 798 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.67 | -87.64 | -87.63 | -87.52 |
| end_lat | 144 | 1 | 41.90 | 0.05 | 41.48 | 41.88 | 41.90 | 41.93 | 42.07 |
| end_lng | 144 | 1 | -87.65 | 0.03 | -87.85 | -87.67 | -87.64 | -87.63 | -87.52 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-12-01 00:00:01 | 2021-12-31 23:59:48 | 2021-12-13 13:04:54 | 228845 |
| ended_at | 0 | 1 | 2021-12-01 00:02:40 | 2022-01-03 17:32:18 | 2021-12-13 13:18:39 | 228657 |
Observation: Some variables have many missing values: 1. start_station_name 2. start_station_id 3. end_station_name 4. end_station_id 5. end_lat 6. end_lng
# Stack the twelve monthly tables into a single table covering all of 2021.
# bind_rows() matches columns by name and stops with an error if a column's
# type differs between months, rather than coercing it silently as rbind()
# can do.
tripdata_all <- bind_rows(
  x202101_dataset, x202102_dataset, x202103_dataset, x202104_dataset,
  x202105_dataset, x202106_dataset, x202107_dataset, x202108_dataset,
  x202109_dataset, x202110_dataset, x202111_dataset, x202112_dataset
)
# Preview the first rows of the combined table.
head(tripdata_all)
## # A tibble: 6 x 13
## ride_id rideable_type started_at ended_at start_station_n~
## <chr> <chr> <dttm> <dttm> <chr>
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## # end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, member_casual <chr>
# Per-column summary of the combined full-year table.
skim_without_charts(data = tripdata_all)
| Name | tripdata_all |
| Number of rows | 5595063 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 7 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5595063 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| start_station_name | 690809 | 0.88 | 3 | 53 | 0 | 847 | 0 |
| start_station_id | 690806 | 0.88 | 3 | 36 | 0 | 834 | 0 |
| end_station_name | 739170 | 0.87 | 10 | 53 | 0 | 844 | 0 |
| end_station_id | 739170 | 0.87 | 3 | 36 | 0 | 832 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 4771 | 1 | 41.90 | 0.05 | 41.39 | 41.88 | 41.90 | 41.93 | 42.17 |
| end_lng | 4771 | 1 | -87.65 | 0.03 | -88.97 | -87.66 | -87.64 | -87.63 | -87.49 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-01-01 00:02:05 | 2021-12-31 23:59:48 | 2021-08-01 01:52:11 | 4677998 |
| ended_at | 0 | 1 | 2021-01-01 00:08:39 | 2022-01-03 17:32:18 | 2021-08-01 02:21:55 | 4671372 |
# List every ride_id that occurs more than once in the combined table;
# an empty result means each ride_id is unique.
tripdata_all %>%
  count(ride_id) %>%
  filter(n > 1)
## # A tibble: 0 x 2
## # ... with 2 variables: ride_id <chr>, n <int>
# Preview the first six rows of the combined table.
head(tripdata_all, n = 6L)
## # A tibble: 6 x 13
## ride_id rideable_type started_at ended_at start_station_n~
## <chr> <chr> <dttm> <dttm> <chr>
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## # end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, member_casual <chr>
Observation: We need to check whether every station ID is mapped correctly to its station name.
# Count the rides for each (start_station_id, start_station_name) pair, add
# the character length of both fields, and sort from the rarest pair to the
# most common one.
start_station_sumamry <- tripdata_all %>%
  count(
    start_station_id,
    start_station_name,
    name = "count_numbers"
  ) %>%
  mutate(
    ID_length = str_length(start_station_id),
    NAME_length = str_length(start_station_name)
  ) %>%
  arrange(count_numbers)

# Open the summary in the data viewer for manual inspection.
view(start_station_sumamry)
# List the pairs whose station ID is longer than 14 characters.
filter(start_station_sumamry, ID_length > 14)
## # A tibble: 3 x 5
## start_station_id start_station_n~ count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~ 2 30 30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~ 4 36 36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~ 313 35 31
Observation:
The station id is wrong and is showing as a station name; we need to fix it.
# Pairs that have a station name but no station id.
filter(start_station_sumamry, is.na(ID_length), !is.na(NAME_length))
## # A tibble: 0 x 5
## # ... with 5 variables: start_station_id <chr>, start_station_name <chr>,
## # count_numbers <int>, ID_length <int>, NAME_length <int>
# Look up the three suspicious stations by name to see which ids they carry.
filter(
  start_station_sumamry,
  start_station_name %in% c(
    "Throop/Hastings Mobile Station",
    "DIVVY CASSETTE REPAIR MOBILE STATION",
    "Base - 2132 W Hubbard Warehouse"
  )
)
## # A tibble: 3 x 5
## start_station_id start_station_n~ count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~ 2 30 30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~ 4 36 36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~ 313 35 31
Observation: We searched for a proper id for these stations but could not find one in the data frame. Task: Figure out whether all the station names are correct.
# Station names shorter than 10 characters.
filter(start_station_sumamry, NAME_length < 10)
## # A tibble: 1 x 5
## start_station_id start_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 351 351 2 3 3
Observation: The start_station_name should not be identical to the start_station_id.
# Pairs that have a station id but no station name.
filter(start_station_sumamry, is.na(NAME_length), !is.na(ID_length))
## # A tibble: 3 x 5
## start_station_id start_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 13221 <NA> 1 5 NA
## 2 20215 <NA> 1 5 NA
## 3 WL-008 <NA> 1 6 NA
Condition: The id length is not NA and the name length is NA.
Observation: There are 3 missing station name where it has station-id
# Show every name recorded for the four affected station ids.
filter(
  start_station_sumamry,
  start_station_id %in% c("351", "13221", "20215", "WL-008")
)
## # A tibble: 8 x 5
## start_station_id start_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 13221 <NA> 1 5 NA
## 2 20215 <NA> 1 5 NA
## 3 WL-008 <NA> 1 6 NA
## 4 351 351 2 3 3
## 5 20215 Hegewisch Metra Station 86 5 23
## 6 351 Mulligan Ave & Wellingto~ 264 3 29
## 7 WL-008 Clinton St & Roosevelt Rd 8376 6 25
## 8 13221 Wood St & Milwaukee Ave 16765 5 23
Observation: We figured out the correct station names and will replace the NAs and the ids used as names with the correct values.
# Add new_start_station_name (placed just before start_station_name): for the
# four affected station ids the name is set explicitly, every other row keeps
# its original start_station_name.
# start_station_id is a character column, so it is compared against string
# literals rather than relying on implicit number-to-string coercion.
tripdata_all_new <- tripdata_all %>%
  mutate(
    new_start_station_name = case_when(
      start_station_id == "13221" ~ "Wood St & Milwaukee Ave",
      start_station_id == "WL-008" ~ "Clinton St & Roosevelt Rd",
      start_station_id == "351" ~ "Mulligan Ave & Wellington Ave",
      start_station_id == "20215" ~ "Hegewisch Metra Station",
      TRUE ~ start_station_name
    ),
    .before = start_station_name
  )
# Verify: each affected id should now map to exactly one name.
tripdata_all_new %>%
  filter(start_station_id %in% c("351", "13221", "20215", "WL-008")) %>%
  count(start_station_id, new_start_station_name)
## # A tibble: 4 x 3
## start_station_id new_start_station_name n
## <chr> <chr> <int>
## 1 13221 Wood St & Milwaukee Ave 16766
## 2 20215 Hegewisch Metra Station 87
## 3 351 Mulligan Ave & Wellington Ave 266
## 4 WL-008 Clinton St & Roosevelt Rd 8377
# Tally trips for every (end_station_id, end_station_name) pair and record the
# character length of both fields; rows are ordered by trip count, ascending.
end_station_summary <- tripdata_all %>%
  group_by(end_station_id, end_station_name) %>%
  summarise(count_numbers = n(), .groups = "drop") %>%
  mutate(
    ID_length = str_length(end_station_id),
    NAME_length = str_length(end_station_name)
  ) %>%
  arrange(count_numbers)
# Open the summary in the data viewer for manual inspection.
view(end_station_summary)
# Station ids longer than 14 characters.
filter(end_station_summary, ID_length > 14)
## # A tibble: 3 x 5
## end_station_id end_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~ 1 30 30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~ 4 36 36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~ 578 35 31
# Pairs that have a station name but no station id.
filter(end_station_summary, is.na(ID_length), !is.na(NAME_length))
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## # count_numbers <int>, ID_length <int>, NAME_length <int>
# Look up the three suspicious stations by name to see which ids they carry.
filter(
  end_station_summary,
  end_station_name %in% c(
    "Throop/Hastings Mobile Station",
    "DIVVY CASSETTE REPAIR MOBILE STATION",
    "Base - 2132 W Hubbard Warehouse"
  )
)
## # A tibble: 3 x 5
## end_station_id end_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~ 1 30 30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~ 4 36 36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~ 578 35 31
# Station names shorter than 10 characters.
filter(end_station_summary, NAME_length < 10)
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## # count_numbers <int>, ID_length <int>, NAME_length <int>
# Pairs that have a station id but no station name.
filter(end_station_summary, is.na(NAME_length), !is.na(ID_length))
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## # count_numbers <int>, ID_length <int>, NAME_length <int>
# Show every name recorded for the four station ids that were inconsistent
# on the start-station side.
filter(
  end_station_summary,
  end_station_id %in% c("351", "13221", "20215", "WL-008")
)
## # A tibble: 4 x 5
## end_station_id end_station_name count_numbers ID_length NAME_length
## <chr> <chr> <int> <int> <int>
## 1 20215 Hegewisch Metra Station 72 5 23
## 2 351 Mulligan Ave & Wellington ~ 208 3 29
## 3 WL-008 Clinton St & Roosevelt Rd 8697 6 25
## 4 13221 Wood St & Milwaukee Ave 17172 5 23
Observation: There are no wrong end_station_name values.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Derive per-ride features:
#   ride_length   - ended_at minus started_at, stored as hms
#   start_weekday - abbreviated weekday label of started_at
#   end_weekday   - abbreviated weekday label of ended_at
#   start_my      - "YYYY-MM" of started_at
#   end_my        - "YYYY-MM" of ended_at
# Columns are referenced directly inside mutate() (not via tripdata_all_new$...)
# so the computation always uses the rows flowing through the pipe, and the
# difference is taken in explicit seconds instead of auto-selected units.
tripdata_all_new_2 <- tripdata_all_new %>%
  mutate(
    ride_length = hms::as_hms(difftime(ended_at, started_at, units = "secs")),
    start_weekday = wday(started_at, label = TRUE, abbr = TRUE),
    end_weekday = wday(ended_at, label = TRUE, abbr = TRUE),
    start_my = format(started_at, "%Y-%m"),
    end_my = format(ended_at, "%Y-%m")
  )
Observation: The start and end weekdays are now split out into their own columns.
# Print a skimr summary (without charts) of every column in the enriched trip table.
skim_without_charts(tripdata_all_new_2)
| Name | tripdata_all_new_2 |
| Number of rows | 5595063 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| difftime | 1 |
| factor | 2 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5595063 | 0 |
| rideable_type | 0 | 1.00 | 11 | 13 | 0 | 3 | 0 |
| new_start_station_name | 690806 | 0.88 | 10 | 53 | 0 | 846 | 0 |
| start_station_name | 690809 | 0.88 | 3 | 53 | 0 | 847 | 0 |
| start_station_id | 690806 | 0.88 | 3 | 36 | 0 | 834 | 0 |
| end_station_name | 739170 | 0.87 | 10 | 53 | 0 | 844 | 0 |
| end_station_id | 739170 | 0.87 | 3 | 36 | 0 | 832 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| start_my | 0 | 1.00 | 7 | 7 | 0 | 12 | 0 |
| end_my | 0 | 1.00 | 7 | 7 | 0 | 13 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | -3482 secs | 3356649 secs | 00:12:00 | 25645 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| start_weekday | 0 | 1 | TRUE | 7 | Sat: 991047, Sun: 857285, Fri: 810508, Wed: 756142 |
| end_weekday | 0 | 1 | TRUE | 7 | Sat: 987780, Sun: 863702, Fri: 806655, Wed: 756208 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -87.52 |
| end_lat | 4771 | 1 | 41.90 | 0.05 | 41.39 | 41.88 | 41.90 | 41.93 | 42.17 |
| end_lng | 4771 | 1 | -87.65 | 0.03 | -88.97 | -87.66 | -87.64 | -87.63 | -87.49 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2021-01-01 00:02:05 | 2021-12-31 23:59:48 | 2021-08-01 01:52:11 | 4677998 |
| ended_at | 0 | 1 | 2021-01-01 00:08:39 | 2022-01-03 17:32:18 | 2021-08-01 02:21:55 | 4671372 |
# Rides whose computed length is zero or negative.
tripdata_all_new_2 %>%
  filter(ride_length <= 0) %>%
  select(ride_id, started_at, ended_at, ride_length)
## # A tibble: 653 x 4
## ride_id started_at ended_at ride_length
## <chr> <dttm> <dttm> <time>
## 1 FC1EFEF4475D7F56 2021-01-09 15:42:45 2021-01-09 15:41:02 -01'43"
## 2 6B51296F8E269F2F 2021-01-15 16:40:47 2021-01-15 16:40:47 00'00"
## 3 B1235D38EB2F8A9E 2021-01-06 18:33:12 2021-01-06 18:31:07 -02'05"
## 4 3F02776D0D38F96E 2021-01-29 21:02:41 2021-01-29 21:02:41 00'00"
## 5 417EE43395E20C71 2021-01-14 17:30:55 2021-01-14 17:30:55 00'00"
## 6 FBFC52F121A5C3E1 2021-01-14 17:26:46 2021-01-14 17:26:46 00'00"
## 7 578B5E6B37A8C1A9 2021-02-24 21:14:58 2021-02-24 21:14:58 00'00"
## 8 2A63B43959DE477F 2021-02-08 11:18:56 2021-02-08 11:18:56 00'00"
## 9 D69558609E2E6E5B 2021-02-13 17:32:01 2021-02-13 17:32:01 00'00"
## 10 52061914ECE4B8D9 2021-02-22 17:24:04 2021-02-22 17:24:04 00'00"
## # ... with 643 more rows
Observation: A ride cannot last zero seconds or less, so we need to remove these records.
# Rides longer than one day (86400 = 60 * 60 * 24 seconds).
tripdata_all_new_2 %>%
  filter(ride_length / 86400 > 1) %>%
  select(ride_id, started_at, ended_at, ride_length)
## # A tibble: 4,016 x 4
## ride_id started_at ended_at ride_length
## <chr> <dttm> <dttm> <time>
## 1 C832F2F65BBA9BDB 2021-01-07 00:31:02 2021-01-08 01:30:47 24:59:45
## 2 573647A9A72AF73A 2021-01-28 11:30:49 2021-01-29 12:30:42 24:59:53
## 3 0348F90AFE78CC44 2021-01-24 00:46:04 2021-01-25 01:45:59 24:59:55
## 4 410D8AAB684CF9D1 2021-01-19 19:52:14 2021-01-20 20:22:24 24:30:10
## 5 B477C62F547027A0 2021-01-21 17:54:37 2021-01-22 18:54:26 24:59:49
## 6 B67A4DBE0EAF7EE9 2021-01-16 13:48:56 2021-01-17 14:48:42 24:59:46
## 7 2DA3CC5031CF1D83 2021-01-18 14:46:47 2021-01-19 15:46:38 24:59:51
## 8 3DBA9CCB6B88B134 2021-01-18 17:42:02 2021-01-19 18:41:57 24:59:55
## 9 C6AC2730FD0E5EAA 2021-01-06 19:07:16 2021-01-07 20:07:06 24:59:50
## 10 7EBFBC8A8E73577A 2021-01-11 16:18:26 2021-01-12 17:18:07 24:59:41
## # ... with 4,006 more rows
60 × 60 × 24 = 86400 (the number of seconds in one day)
# Build the cleaned table used for analysis:
#   * drop the original start_station_name (new_start_station_name replaces it)
#   * keep only rides with a strictly positive ride_length
#   * rename "Lake Shore Dr & Monroe St" to "DuSable Lake Shore Dr & Monroe St"
# The station name must be a single-line literal: a line break inside the
# string becomes part of the value and the comparison then matches nothing.
# %in% is used for the match so rows with a missing name are left untouched.
tripdata_all_new_3 <- tripdata_all_new_2 %>%
  select(-start_station_name) %>%
  filter(ride_length > 0) %>%
  mutate(
    new_start_station_name = if_else(
      new_start_station_name %in% "Lake Shore Dr & Monroe St",
      "DuSable Lake Shore Dr & Monroe St",
      new_start_station_name
    )
  )
Next steps: Analysis of the cleaned data
# Preview the first six rows of the cleaned table.
head(tripdata_all_new_3, n = 6)
## # A tibble: 6 x 18
## ride_id rideable_type started_at ended_at new_start_stati~
## <chr> <chr> <dttm> <dttm> <chr>
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 13 more variables: start_station_id <chr>, end_station_name <chr>,
## # end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, member_casual <chr>, ride_length <time>,
## # start_weekday <ord>, end_weekday <ord>, start_my <chr>, end_my <chr>
**********************************Data Analysis**********************************
# Average and maximum ride length per user type, formatted as hms.
ave_max_ride_length_all <- summarise(
  group_by(tripdata_all_new_3, member_casual),
  average_ride_length = hms::as_hms(mean(ride_length)),
  max_ride_length = hms::as_hms(max(ride_length))
)
print(ave_max_ride_length_all)
## # A tibble: 2 x 3
## member_casual average_ride_length max_ride_length
## <chr> <time> <time>
## 1 casual 32'00.346797" 932:24:09
## 2 member 13'38.072709" 25:59:56
Observation: 1. Avg ride for casual users is 32 min 2. Avg ride for member user is 13 min
# Average ride length per user type and start weekday, one column per weekday.
# .groups = "drop" returns an ungrouped result, which avoids the summarise()
# regrouping message and a leftover member_casual grouping on the wide table.
tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(
    average_ride_length = hms::as_hms(mean(ride_length)),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 2 x 8
## # Groups: member_casual [2]
## member_casual Sun Mon Tue Wed
## <chr> <time> <time> <time> <time>
## 1 casual 37'34.257336" 31'52.747311" 27'58.529274" 27'39.664429"
## 2 member 15'39.553754" 13'14.895591" 12'47.345070" 12'49.212422"
## # ... with 3 more variables: Thu <time>, Fri <time>, Sat <time>
Observation: Casual users ride for roughly 28 to 38 minutes on average, depending on the weekday. Members ride for roughly 13 to 16 minutes on average.
# Number of rides per user type and start weekday, one column per weekday.
tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(count_rides = n(), .groups = "drop") %>%
  pivot_wider(names_from = start_weekday, values_from = count_rides)
## # A tibble: 2 x 8
## member_casual Sun Mon Tue Wed Thu Fri Sat
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 casual 481048 286340 274357 278910 286038 364037 557934
## 2 member 376086 416181 465474 477117 451490 446384 433014
Observation:
Casual users ride mostly on weekends, whereas members ride mostly on weekdays, so members may be commuting to the office or to school.
# Number of rides per user type and start month, one column per month.
pivot_wider(
  count(tripdata_all_new_3, member_casual, start_my, name = "count_rides"),
  names_from = start_my,
  values_from = count_rides
)
## # A tibble: 2 x 13
## member_casual `2021-01` `2021-02` `2021-03` `2021-04` `2021-05` `2021-06`
## <chr> <int> <int> <int> <int> <int> <int>
## 1 casual 18117 10130 84028 136590 256888 370636
## 2 member 78711 39488 144456 200602 274691 358893
## # ... with 6 more variables: `2021-07` <int>, `2021-08` <int>, `2021-09` <int>,
## # `2021-10` <int>, `2021-11` <int>, `2021-12` <int>
Observation: Casual users ride far more in the summer months, whereas members use the bikes according to their needs throughout the year.
# Ten most-used start stations among casual users (rides without a station
# name are excluded).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "casual") %>%
  count(new_start_station_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 x 2
## new_start_station_name n
## <chr> <int>
## 1 Streeter Dr & Grand Ave 66353
## 2 Millennium Park 33578
## 3 Michigan Ave & Oak St 29778
## 4 Shedd Aquarium 23249
## 5 Theater on the Lake 21349
## 6 Wells St & Concord Ln 19888
## 7 Lake Shore Dr & Monroe St 19616
## 8 Clark St & Lincoln Ave 17033
## 9 Wells St & Elm St 16664
## 10 Indiana Ave & Roosevelt Rd 16628
Observation: Top ten station for casual users
# Ten most-used start stations among members (rides without a station name
# are excluded).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  slice_head(n = 10)
## # A tibble: 10 x 2
## new_start_station_name n
## <chr> <int>
## 1 Clark St & Elm St 24739
## 2 Wells St & Concord Ln 23716
## 3 Kingsbury St & Kinzie St 23562
## 4 Wells St & Elm St 21019
## 5 Dearborn St & Erie St 19585
## 6 Wells St & Huron St 19188
## 7 St. Clair St & Erie St 18901
## 8 Broadway & Barry Ave 17800
## 9 Clinton St & Madison St 16913
## 10 Desplaines St & Kinzie St 16820
Observation: Top 10 start stations for member users
library(ggplot2)
library(knitr)
# Average and maximum ride length per user type, kept as raw durations for plotting.
ave_max_ride_length_all_sec <- tripdata_all_new_3 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = mean(ride_length),
    max_ride_length = max(ride_length)
  )
# draw plot: Average
# The duration is mapped to y as numeric seconds; mapping the difftime itself
# makes ggplot2 emit "Don't know how to automatically pick scale for object of
# type difftime". Bar labels still show the duration formatted as hms.
ggplot(
  data = ave_max_ride_length_all_sec,
  mapping = aes(
    x = member_casual,
    y = as.numeric(average_ride_length, units = "secs")
  )
) +
  geom_col(mapping = aes(fill = member_casual)) +
  labs(title = "Average ride length by member", x = "Member", y = "Average") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(label = hms::as_hms(ceiling(average_ride_length))),
    vjust = 2.5
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
# draw plot: max
# The duration is mapped to y as numeric seconds; mapping the difftime itself
# makes ggplot2 emit "Don't know how to automatically pick scale for object of
# type difftime". Bar labels still show the duration formatted as hms.
ggplot(
  data = ave_max_ride_length_all_sec,
  mapping = aes(
    x = member_casual,
    y = as.numeric(max_ride_length, units = "secs")
  )
) +
  geom_col(mapping = aes(fill = member_casual)) +
  labs(title = "Max ride length by member", x = "Member", y = "MAX") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(label = hms::as_hms(ceiling(max_ride_length))),
    vjust = 1.5
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
Observation: The maximum ride lengths are outliers: about 932 hours for casual
users and about 26 hours for member users.
# Render the average/max summary as a pipe-style table.
kable(ave_max_ride_length_all, format = "pipe")
| member_casual | average_ride_length | max_ride_length |
|---|---|---|
| casual | 00:32:00.346797 | 932:24:09 |
| member | 00:13:38.072709 | 25:59:56 |
# Average ride length per user type and start weekday (used for the plot and
# the table that follow). .groups = "drop" returns an ungrouped result and
# avoids the summarise() regrouping message.
ave_ride_length_weekday_all <- tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(average_ride_length = mean(ride_length), .groups = "drop")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
# draw plot
# Dodged bars of average ride length per weekday and user type, with a smoothed
# trend line per user type. The duration is mapped to y as numeric seconds;
# mapping the difftime itself makes ggplot2 emit "Don't know how to
# automatically pick scale for object of type difftime".
ggplot(
  data = ave_ride_length_weekday_all,
  mapping = aes(
    x = start_weekday,
    y = as.numeric(average_ride_length, units = "secs")
  )
) +
  geom_col(mapping = aes(fill = member_casual), position = "dodge") +
  labs(title = "Average ride length by weekday", x = "Weekday", y = "Average(secs)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(
    mapping = aes(colour = member_casual, group = member_casual),
    se = FALSE
  )
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Observation: The time spent by member users is stable and roughly
uniform across the week.
# display the table: one column per weekday
ave_ride_length_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length) %>%
  kable()
| member_casual | Sun | Mon | Tue | Wed | Thu | Fri | Sat |
|---|---|---|---|---|---|---|---|
| casual | 2254.2573 secs | 1912.7473 secs | 1678.5293 secs | 1659.6644 secs | 1662.347 secs | 1821.1061 secs | 2082.5980 secs |
| member | 939.5538 secs | 794.8956 secs | 767.3451 secs | 769.2124 secs | 766.622 secs | 799.5648 secs | 915.9313 secs |
# prepare data: number of rides per user type and start weekday
nums_rides_weekday_all <- tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(count_rides = n(), .groups = "drop")
# draw plot: dodged bars per weekday plus a smoothed trend line per user type
ggplot(
  data = nums_rides_weekday_all,
  mapping = aes(x = start_weekday, y = count_rides)
) +
  geom_col(mapping = aes(fill = member_casual), position = "dodge") +
  labs(title = "The number of rides for users by weekday", x = "Weekday", y = "Number") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(
    mapping = aes(colour = member_casual, group = member_casual),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Observation: Members take more rides on weekdays than casual users do.
# Ride counts as a table with one column per weekday.
nums_rides_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = count_rides) %>%
  kable()
| member_casual | Sun | Mon | Tue | Wed | Thu | Fri | Sat |
|---|---|---|---|---|---|---|---|
| casual | 481048 | 286340 | 274357 | 278910 | 286038 | 364037 | 557934 |
| member | 376086 | 416181 | 465474 | 477117 | 451490 | 446384 | 433014 |
# Number of rides per user type and start month.
nums_rides_months_all <- tripdata_all_new_3 %>%
  group_by(member_casual, start_my) %>%
  summarise(count_rides = n(), .groups = "drop")
# draw plot: dodged bars per month plus a smoothed trend line per user type
ggplot(
  data = nums_rides_months_all,
  mapping = aes(x = start_my, y = count_rides)
) +
  geom_col(mapping = aes(fill = member_casual), position = "dodge") +
  labs(title = "The number of rides for users by month", x = "Month", y = "Number") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0.5)
  ) +
  geom_smooth(
    mapping = aes(colour = member_casual, group = member_casual),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Ride counts as a table with one column per month.
nums_rides_months_all %>%
  pivot_wider(names_from = start_my, values_from = count_rides) %>%
  kable()
| member_casual | 2021-01 | 2021-02 | 2021-03 | 2021-04 | 2021-05 | 2021-06 | 2021-07 | 2021-08 | 2021-09 | 2021-10 | 2021-11 | 2021-12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| casual | 18117 | 10130 | 84028 | 136590 | 256888 | 370636 | 442011 | 412608 | 363840 | 257203 | 106884 | 69729 |
| member | 78711 | 39488 | 144456 | 200602 | 274691 | 358893 | 380317 | 391637 | 392200 | 373953 | 253008 | 177790 |
# Ten start stations with the most rides overall (rides without a station
# name are excluded).
top10start_stations <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  count(new_start_station_name, sort = TRUE) %>%
  slice_head(n = 10)
# Names of those ten stations.
top10start_stations_names <- top10start_stations[["new_start_station_name"]]
# Rides per station and user type, plus each user type's share of the
# station's rides (percent is computed within each station).
percent_top10_stations <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_names) %>%
  count(new_start_station_name, member_casual) %>%
  group_by(new_start_station_name) %>%
  mutate(percent = n / sum(n))
# Horizontal stacked bars, stations ordered by n, labelled with the share.
ggplot(
  data = percent_top10_stations,
  mapping = aes(
    x = reorder(new_start_station_name, n),
    y = n,
    fill = member_casual
  )
) +
  geom_col() +
  coord_flip() +
  labs(x = "Stations", y = "Rides", title = "TOP10 Rides of Started Stations") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  )
# Ten start stations with the most member rides, stored in ascending order of
# rides; the station name is converted to a factor.
top10start_stations_m <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  slice_head(n = 10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))
# Names of those ten stations.
top10start_stations_m_names <- top10start_stations_m[["new_start_station_name"]]
# Rides per station and user type, plus each user type's share of the
# station's rides. The station becomes an ordered factor whose levels follow
# the row order of top10start_stations_m, which fixes the bar order.
percent_top10_stations_m <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_m_names) %>%
  count(new_start_station_name, member_casual) %>%
  group_by(new_start_station_name) %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_m$new_start_station_name,
      ordered = TRUE
    )
  )
# Horizontal stacked bars; members highlighted, casual riders in grey.
ggplot(
  data = percent_top10_stations_m,
  mapping = aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_col() +
  coord_flip() +
  labs(x = "Stations", y = "Rides", title = "TOP10 Rides of Started Stations by member") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  scale_fill_manual(values = c("member" = "cyan3", "casual" = "grey90"))
# Ten start stations with the most casual rides, stored in ascending order of
# rides; the station name is converted to a factor.
top10start_stations_c <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  filter(member_casual == "casual") %>%
  count(new_start_station_name) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))
# extract names
top10start_stations_c_names <- top10start_stations_c$new_start_station_name
# Rides per station and user type, plus each user type's share of the
# station's rides (tally() drops the member_casual grouping, so percent is
# computed within each station). The station becomes an ordered factor whose
# levels follow the row order of top10start_stations_c.
percent_top10_stations_c <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_c_names) %>%
  group_by(new_start_station_name, member_casual) %>%
  tally() %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_c$new_start_station_name,
      ordered = TRUE
    )
  )
# draw plot
# Horizontal stacked bars with the stacking order reversed; casual riders
# highlighted, members in grey. The former `group = ...` argument to ggplot()
# sat outside aes() and had no effect, so it is removed; the title typo
# ("Stitions") is corrected.
ggplot(
  data = percent_top10_stations_c,
  mapping = aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
  coord_flip() +
  labs(x = "Stations", y = "Rides", title = "TOP10 Rides of Started Stations by Casual") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5, reverse = TRUE),
    size = 3
  ) +
  scale_fill_manual(values = c("casual" = "coral", "member" = "grey90"))
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:magrittr':
##
## inset
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
# One row per named start station seen in 2021-01: rows are ordered by
# started_at (latest first) and the first row for each station name is kept.
stations_202101 <- tripdata_all_new_3 %>%
  select(new_start_station_name, start_lat, start_lng, start_my, started_at) %>%
  filter(!is.na(new_start_station_name), start_my == "2021-01") %>%
  arrange(desc(started_at)) %>%
  distinct(new_start_station_name, .keep_all = TRUE)
# prepare data: rides per user type and bike type, plus each bike type's
# share within its user type
bike_type_p <- tripdata_all_new_3 %>%
  count(member_casual, rideable_type) %>%
  group_by(member_casual) %>%
  mutate(per = n / sum(n))
# draw plot: one pie chart per user type, slices labelled with the share
ggplot(
  data = bike_type_p,
  mapping = aes(x = "", y = per, fill = rideable_type)
) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "The percentage of Bike types") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    mapping = aes(y = per, label = paste0(round(per * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  facet_wrap(~member_casual, strip.position = "bottom")
Observation: Member users don’t use docked bikes