R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(skimr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(readr)
library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# Load the January 2021 Divvy trip file; one read_csv() call per month follows.
# NOTE(review): the path is absolute and user-specific, so it will not resolve
# on another machine -- confirm whether a project-relative path is intended.
x202101_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202101-divvy-tripdata.csv")
## Rows: 96834 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202102_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202102-divvy-tripdata.csv")
## Rows: 49622 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202103_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202103-divvy-tripdata.csv")
## Rows: 228496 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202104_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202104-divvy-tripdata.csv")
## Rows: 337230 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202105_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202105-divvy-tripdata.csv")
## Rows: 531633 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202106_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202106-divvy-tripdata.csv")
## Rows: 729595 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202107_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202107-divvy-tripdata.csv")
## Rows: 822410 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202108_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202108-divvy-tripdata.csv")
## Rows: 804352 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202109_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202109-divvy-tripdata.csv")
## Rows: 756147 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202110_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202110-divvy-tripdata.csv")
## Rows: 631226 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202111_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202111-divvy-tripdata.csv")
## Rows: 359978 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
x202112_dataset <-read_csv("/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202112-divvy-tripdata.csv")
## Rows: 247540 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(x202112_dataset)

skim_without_charts(x202101_dataset)
Data summary
Name x202101_dataset
Number of rows 96834
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 96834 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 8625 0.91 10 51 0 640 0
start_station_id 8625 0.91 3 35 0 638 0
end_station_name 10277 0.89 10 53 0 632 0
end_station_id 10277 0.89 3 35 0 629 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.64 41.88 41.90 41.93 42.06
start_lng 0 1 -87.65 0.03 -87.78 -87.66 -87.64 -87.63 -87.53
end_lat 103 1 41.90 0.05 41.64 41.88 41.90 41.93 42.07
end_lng 103 1 -87.65 0.03 -87.81 -87.66 -87.64 -87.63 -87.51

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-01-01 00:02:05 2021-01-31 23:57:00 2021-01-15 06:05:04 93736
ended_at 0 1 2021-01-01 00:08:39 2021-02-01 15:33:15 2021-01-15 06:19:58 93582
skim_without_charts(x202102_dataset)
Data summary
Name x202102_dataset
Number of rows 49622
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 49622 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 4046 0.92 10 51 0 582 0
start_station_id 4046 0.92 3 35 0 582 0
end_station_name 5358 0.89 10 53 0 584 0
end_station_id 5358 0.89 3 35 0 584 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.65 41.88 41.90 41.93 42.06
start_lng 0 1 -87.64 0.03 -87.77 -87.66 -87.64 -87.63 -87.53
end_lat 214 1 41.90 0.04 41.54 41.88 41.90 41.93 42.07
end_lng 214 1 -87.64 0.03 -87.77 -87.66 -87.64 -87.63 -87.53

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-02-01 00:55:44 2021-02-28 23:59:41 2021-02-22 13:17:53 48139
ended_at 0 1 2021-02-01 01:22:48 2021-03-05 15:11:45 2021-02-22 13:39:20 48035
skim_without_charts(x202103_dataset)
Data summary
Name x202103_dataset
Number of rows 228496
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 228496 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 14848 0.94 10 53 0 673 0
start_station_id 14848 0.94 3 35 0 673 0
end_station_name 16727 0.93 10 53 0 673 0
end_station_id 16727 0.93 3 35 0 673 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.64 0.03 -87.78 -87.66 -87.64 -87.63 -87.53
end_lat 167 1 41.90 0.04 41.64 41.88 41.90 41.93 42.08
end_lng 167 1 -87.65 0.03 -88.07 -87.66 -87.64 -87.63 -87.53

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-03-01 00:01:09 2021-03-31 23:59:08 2021-03-19 17:37:20 209025
ended_at 0 1 2021-03-01 00:06:28 2021-04-06 11:00:11 2021-03-19 17:55:05 208629
skim_without_charts(x202104_dataset)
Data summary
Name x202104_dataset
Number of rows 337230
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 337230 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 26056 0.92 10 53 0 681 0
start_station_id 26056 0.92 3 35 0 681 0
end_station_name 28174 0.92 10 53 0 681 0
end_station_id 28174 0.92 3 35 0 681 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.64 41.88 41.90 41.93 42.07
start_lng 0 1 -87.64 0.03 -87.78 -87.66 -87.64 -87.63 -87.52
end_lat 267 1 41.90 0.05 41.59 41.88 41.90 41.93 42.15
end_lng 267 1 -87.65 0.03 -87.85 -87.66 -87.64 -87.63 -87.52

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-04-01 00:03:18 2021-04-30 23:59:53 2021-04-15 22:37:04 298722
ended_at 0 1 2021-04-01 00:14:29 2021-05-05 22:14:39 2021-04-15 23:00:10 298625
skim_without_charts(x202105_dataset)
Data summary
Name x202105_dataset
Number of rows 531633
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 531633 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 53744 0.90 10 53 0 687 0
start_station_id 53744 0.90 3 35 0 686 0
end_station_name 58194 0.89 10 53 0 683 0
end_station_id 58194 0.89 3 35 0 682 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.64 0.03 -87.78 -87.66 -87.64 -87.63 -87.52
end_lat 452 1 41.90 0.05 41.56 41.88 41.90 41.93 42.09
end_lng 452 1 -87.64 0.03 -87.85 -87.66 -87.64 -87.63 -87.52

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-05-01 00:00:11 2021-05-31 23:59:16 2021-05-19 07:44:31 447224
ended_at 0 1 2021-05-01 00:03:26 2021-06-10 22:17:11 2021-05-19 07:59:43 447217
skim_without_charts(x202106_dataset)
Data summary
Name x202106_dataset
Number of rows 729595
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 729595 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 80093 0.89 10 53 0 689 0
start_station_id 80093 0.89 3 35 0 689 0
end_station_name 86387 0.88 10 53 0 690 0
end_station_id 86387 0.88 3 35 0 690 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.64 41.88 41.90 41.93 42.07
start_lng 0 1 -87.64 0.03 -87.78 -87.66 -87.64 -87.63 -87.52
end_lat 717 1 41.90 0.04 41.51 41.88 41.90 41.93 42.08
end_lng 717 1 -87.64 0.03 -87.86 -87.66 -87.64 -87.63 -87.49

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-06-01 00:00:38 2021-06-30 23:59:59 2021-06-14 19:46:47 589805
ended_at 0 1 2021-06-01 00:06:22 2021-07-13 22:51:35 2021-06-14 20:13:55 589069
skim_without_charts(x202107_dataset)
Data summary
Name x202107_dataset
Number of rows 822410
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 822410 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 87263 0.89 10 53 0 717 0
start_station_id 87262 0.89 3 36 0 710 0
end_station_name 93158 0.89 10 53 0 714 0
end_station_id 93158 0.89 3 36 0 707 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.52
end_lat 731 1 41.90 0.04 41.63 41.88 41.90 41.93 42.15
end_lng 731 1 -87.65 0.03 -87.85 -87.66 -87.64 -87.63 -87.49

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-07-01 00:00:22 2021-07-31 23:59:58 2021-07-17 13:58:37 659640
ended_at 0 1 2021-07-01 00:04:51 2021-08-12 17:45:41 2021-07-17 14:28:04 658663
skim_without_charts(x202108_dataset)
Data summary
Name x202108_dataset
Number of rows 804352
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 804352 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 88458 0.89 3 53 0 727 0
start_station_id 88458 0.89 3 35 0 726 0
end_station_name 94115 0.88 10 53 0 727 0
end_station_id 94115 0.88 3 35 0 727 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.04 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.52
end_lat 706 1 41.90 0.04 41.58 41.88 41.90 41.93 42.15
end_lng 706 1 -87.65 0.03 -87.85 -87.66 -87.64 -87.63 -87.51

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-08-01 00:00:04 2021-08-31 23:59:35 2021-08-16 07:57:11 646516
ended_at 0 1 2021-08-01 00:03:11 2021-09-01 17:37:35 2021-08-16 08:12:14 645299
skim_without_charts(x202109_dataset)
Data summary
Name x202109_dataset
Number of rows 756147
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 756147 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 93113 0.88 10 53 0 758 0
start_station_id 93111 0.88 3 35 0 758 0
end_station_name 99261 0.87 10 53 0 756 0
end_station_id 99261 0.87 3 35 0 756 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.52
end_lat 595 1 41.90 0.05 41.57 41.88 41.90 41.93 42.17
end_lng 595 1 -87.65 0.03 -87.87 -87.66 -87.64 -87.63 -87.50

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-09-01 00:00:06 2021-09-30 23:59:48 2021-09-15 16:43:37 611240
ended_at 0 1 2021-09-01 00:00:41 2021-10-01 22:55:35 2021-09-15 17:01:16 610277
skim_without_charts(x202110_dataset)
Data summary
Name x202110_dataset
Number of rows 631226
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 631226 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 108210 0.83 10 53 0 793 0
start_station_id 108210 0.83 3 35 0 793 0
end_station_name 114834 0.82 10 53 0 790 0
end_station_id 114834 0.82 3 35 0 790 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.65 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.83 -87.66 -87.64 -87.63 -87.52
end_lat 484 1 41.90 0.05 41.60 41.88 41.90 41.93 42.13
end_lng 484 1 -87.65 0.03 -87.96 -87.66 -87.64 -87.63 -87.52

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-10-01 00:00:09 2021-10-31 23:59:49 2021-10-15 05:31:57 524629
ended_at 0 1 2021-10-01 00:03:11 2021-11-03 21:45:48 2021-10-15 05:56:26 523397
skim_without_charts(x202111_dataset)
Data summary
Name x202111_dataset
Number of rows 359978
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 359978 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 75290 0.79 10 53 0 815 0
start_station_id 75290 0.79 3 35 0 815 0
end_station_name 79187 0.78 10 53 0 805 0
end_station_id 79187 0.78 3 35 0 805 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.89 0.05 41.65 41.88 41.89 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.53
end_lat 191 1 41.89 0.05 41.39 41.88 41.89 41.93 42.12
end_lng 191 1 -87.65 0.03 -88.97 -87.66 -87.64 -87.63 -87.53

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-11-01 00:00:14 2021-11-30 23:59:56 2021-11-12 08:32:12 320477
ended_at 0 1 2021-11-01 00:04:06 2021-12-02 06:41:33 2021-11-12 08:46:55 320071
skim_without_charts(x202112_dataset)
Data summary
Name x202112_dataset
Number of rows 247540
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 247540 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 51063 0.79 10 53 0 818 0
start_station_id 51063 0.79 3 35 0 816 0
end_station_name 53498 0.78 10 53 0 800 0
end_station_id 53498 0.78 3 35 0 798 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.64 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.67 -87.64 -87.63 -87.52
end_lat 144 1 41.90 0.05 41.48 41.88 41.90 41.93 42.07
end_lng 144 1 -87.65 0.03 -87.85 -87.67 -87.64 -87.63 -87.52

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-12-01 00:00:01 2021-12-31 23:59:48 2021-12-13 13:04:54 228845
ended_at 0 1 2021-12-01 00:02:40 2022-01-03 17:32:18 2021-12-13 13:18:39 228657

Observation: Several variables have many missing values: (1) start_station_name, (2) start_station_id, (3) end_station_name, (4) end_station_id, (5) end_lat, (6) end_lng.

# Stack the twelve monthly tables into a single table for the whole year.
# bind_rows() matches columns by name and raises an error if a column has
# incompatible types across months, instead of coercing it.
tripdata_all <- bind_rows(
  x202101_dataset, x202102_dataset, x202103_dataset, x202104_dataset,
  x202105_dataset, x202106_dataset, x202107_dataset, x202108_dataset,
  x202109_dataset, x202110_dataset, x202111_dataset, x202112_dataset
)
head(tripdata_all)
## # A tibble: 6 x 13
##   ride_id rideable_type started_at          ended_at            start_station_n~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>
skim_without_charts(tripdata_all)
Data summary
Name tripdata_all
Number of rows 5595063
Number of columns 13
_______________________
Column type frequency:
character 7
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 5595063 0
rideable_type 0 1.00 11 13 0 3 0
start_station_name 690809 0.88 3 53 0 847 0
start_station_id 690806 0.88 3 36 0 834 0
end_station_name 739170 0.87 10 53 0 844 0
end_station_id 739170 0.87 3 36 0 832 0
member_casual 0 1.00 6 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.64 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.52
end_lat 4771 1 41.90 0.05 41.39 41.88 41.90 41.93 42.17
end_lng 4771 1 -87.65 0.03 -88.97 -87.66 -87.64 -87.63 -87.49

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-01-01 00:02:05 2021-12-31 23:59:48 2021-08-01 01:52:11 4677998
ended_at 0 1 2021-01-01 00:08:39 2022-01-03 17:32:18 2021-08-01 02:21:55 4671372
# Uniqueness check: list every ride_id that appears more than once.
# An empty result means ride_id uniquely identifies a row of tripdata_all.
tripdata_all %>%
  count(ride_id) %>%
  filter(n > 1)
## # A tibble: 0 x 2
## # ... with 2 variables: ride_id <chr>, n <int>
head(tripdata_all)
## # A tibble: 6 x 13
##   ride_id rideable_type started_at          ended_at            start_station_n~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>

Observation: We need to check whether every station ID is mapped correctly to its station name.

# One row per distinct (start_station_id, start_station_name) pair with the
# number of rides for that pair, plus the character length of the id and of
# the name (used below to spot malformed values). Sorted by ride count,
# ascending.
start_station_sumamry <- count(
  tripdata_all,
  start_station_id,
  start_station_name,
  name = "count_numbers"
) %>%
  mutate(
    ID_length = str_length(start_station_id),
    NAME_length = str_length(start_station_name)
  ) %>%
  arrange(count_numbers)
view(start_station_sumamry)
start_station_sumamry %>%
  filter(ID_length > 14)
## # A tibble: 3 x 5
##   start_station_id          start_station_n~ count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             2        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           313        35          31

Observation:

The station ID is wrong for these rows: it shows a station name instead of an ID, so we need to fix it.

start_station_sumamry %>%
  filter(is.na(ID_length) & !is.na(NAME_length))
## # A tibble: 0 x 5
## # ... with 5 variables: start_station_id <chr>, start_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>
# Look up the three stations whose id field holds name-like text, this time
# by station name, to see which ids are recorded for them.
start_station_sumamry %>%
  filter(
    start_station_name %in% c(
      "Throop/Hastings Mobile Station",
      "DIVVY CASSETTE REPAIR MOBILE STATION",
      "Base - 2132 W Hubbard Warehouse"
    )
  )
## # A tibble: 3 x 5
##   start_station_id          start_station_n~ count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             2        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           313        35          31

Observation: We searched for a proper ID for these stations but could not find one in the data frame. Task: Figure out whether all the station names are correct.

start_station_sumamry %>%
  filter(NAME_length < 10)
## # A tibble: 1 x 5
##   start_station_id start_station_name count_numbers ID_length NAME_length
##   <chr>            <chr>                      <int>     <int>       <int>
## 1 351              351                            2         3           3

Observation: The start_station_name should not be identical to the start_station_id.

start_station_sumamry %>%
  filter(is.na(NAME_length) & !is.na(ID_length))
## # A tibble: 3 x 5
##   start_station_id start_station_name count_numbers ID_length NAME_length
##   <chr>            <chr>                      <int>     <int>       <int>
## 1 13221            <NA>                           1         5          NA
## 2 20215            <NA>                           1         5          NA
## 3 WL-008           <NA>                           1         6          NA

Condition: ID length should not be NA and name length should be NA.

Observation: There are 3 records with a missing station name even though a station ID is present.

start_station_sumamry %>%
  filter(start_station_id %in% c("351", "13221", "20215", "WL-008"))
## # A tibble: 8 x 5
##   start_station_id start_station_name        count_numbers ID_length NAME_length
##   <chr>            <chr>                             <int>     <int>       <int>
## 1 13221            <NA>                                  1         5          NA
## 2 20215            <NA>                                  1         5          NA
## 3 WL-008           <NA>                                  1         6          NA
## 4 351              351                                   2         3           3
## 5 20215            Hegewisch Metra Station              86         5          23
## 6 351              Mulligan Ave & Wellingto~           264         3          29
## 7 WL-008           Clinton St & Roosevelt Rd          8376         6          25
## 8 13221            Wood St & Milwaukee Ave           16765         5          23

Observation: We figured out the correct station names and will replace the NA values and the ID-like names with the correct values.

# Add new_start_station_name (placed just before start_station_name): for the
# four station ids found above to have a missing or id-like name, use the
# correct station name; every other row keeps its original start_station_name.
# start_station_id is a character column, so the ids are compared as strings
# rather than relying on implicit number-to-string coercion.
tripdata_all_new <- tripdata_all %>%
  mutate(
    new_start_station_name = case_when(
      start_station_id == "13221" ~ "Wood St & Milwaukee Ave",
      start_station_id == "WL-008" ~ "Clinton St & Roosevelt Rd",
      start_station_id == "351" ~ "Mulligan Ave & Wellington Ave",
      start_station_id == "20215" ~ "Hegewisch Metra Station",
      TRUE ~ start_station_name
    ),
    .before = start_station_name
  )
tripdata_all_new %>%
  filter(start_station_id %in% c("351", "13221", "20215", "WL-008")) %>%
  count(start_station_id, new_start_station_name)
## # A tibble: 4 x 3
##   start_station_id new_start_station_name            n
##   <chr>            <chr>                         <int>
## 1 13221            Wood St & Milwaukee Ave       16766
## 2 20215            Hegewisch Metra Station          87
## 3 351              Mulligan Ave & Wellington Ave   266
## 4 WL-008           Clinton St & Roosevelt Rd      8377
# One row per distinct (end_station_id, end_station_name) pair with its ride
# count and the character length of the id and of the name, sorted by ride
# count, ascending. Mirrors the start-station summary.
end_station_summary <- count(
  tripdata_all,
  end_station_id,
  end_station_name,
  name = "count_numbers"
) %>%
  mutate(
    ID_length = str_length(end_station_id),
    NAME_length = str_length(end_station_name)
  ) %>%
  arrange(count_numbers)
view(end_station_summary)
end_station_summary %>%
  filter(ID_length > 14)
## # A tibble: 3 x 5
##   end_station_id            end_station_name count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             1        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           578        35          31
end_station_summary %>%
  filter(is.na(ID_length) & !is.na(NAME_length))
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>
# Look up the same three name-like-id stations by end station name.
end_station_summary %>%
  filter(
    end_station_name %in% c(
      "Throop/Hastings Mobile Station",
      "DIVVY CASSETTE REPAIR MOBILE STATION",
      "Base - 2132 W Hubbard Warehouse"
    )
  )
## # A tibble: 3 x 5
##   end_station_id            end_station_name count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             1        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           578        35          31
# End stations whose name is shorter than 10 characters.
filter(end_station_summary, NAME_length < 10)
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>
# End stations that have an ID but no name.
filter(end_station_summary, is.na(NAME_length), !is.na(ID_length))
## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>
# Show the end-station names recorded for the same four station IDs that were
# inspected on the start-station side.
filter(
  end_station_summary,
  end_station_id %in% c("351", "13221", "20215", "WL-008")
)
## # A tibble: 4 x 5
##   end_station_id end_station_name            count_numbers ID_length NAME_length
##   <chr>          <chr>                               <int>     <int>       <int>
## 1 20215          Hegewisch Metra Station                72         5          23
## 2 351            Mulligan Ave & Wellington ~           208         3          29
## 3 WL-008         Clinton St & Roosevelt Rd            8697         6          25
## 4 13221          Wood St & Milwaukee Ave             17172         5          23

Observation: there are no incorrect end_station_name values.

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Add the derived columns used by the rest of the analysis:
#   ride_length   - ended_at minus started_at, stored as an hms duration
#   start_weekday - abbreviated weekday label of started_at
#   end_weekday   - abbreviated weekday label of ended_at
#   start_my      - "YYYY-MM" of started_at
#   end_my        - "YYYY-MM" of ended_at
# The columns are referenced through the data mask rather than via
# `tripdata_all_new$...`, so the step keeps working if a filter or a grouping
# is ever inserted earlier in the pipeline.
tripdata_all_new_2 <- tripdata_all_new %>%
  mutate(
    ride_length = hms::as_hms(ended_at - started_at),
    start_weekday = wday(started_at, label = TRUE),
    end_weekday = wday(ended_at, label = TRUE, abbr = TRUE),
    start_my = format(started_at, "%Y-%m"),
    end_my = format(ended_at, "%Y-%m")
  )

Observation: The start and end weekdays are now split out into their own columns.

# Column-by-column skimr summary of the enriched trip data. The data frame is
# passed directly (not piped) so the report's "Name" field shows
# tripdata_all_new_2.
skim_without_charts(tripdata_all_new_2)
Data summary
Name tripdata_all_new_2
Number of rows 5595063
Number of columns 19
_______________________
Column type frequency:
character 10
difftime 1
factor 2
numeric 4
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ride_id 0 1.00 16 16 0 5595063 0
rideable_type 0 1.00 11 13 0 3 0
new_start_station_name 690806 0.88 10 53 0 846 0
start_station_name 690809 0.88 3 53 0 847 0
start_station_id 690806 0.88 3 36 0 834 0
end_station_name 739170 0.87 10 53 0 844 0
end_station_id 739170 0.87 3 36 0 832 0
member_casual 0 1.00 6 6 0 2 0
start_my 0 1.00 7 7 0 12 0
end_my 0 1.00 7 7 0 13 0

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
ride_length 0 1 -3482 secs 3356649 secs 00:12:00 25645

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
start_weekday 0 1 TRUE 7 Sat: 991047, Sun: 857285, Fri: 810508, Wed: 756142
end_weekday 0 1 TRUE 7 Sat: 987780, Sun: 863702, Fri: 806655, Wed: 756208

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
start_lat 0 1 41.90 0.05 41.64 41.88 41.90 41.93 42.07
start_lng 0 1 -87.65 0.03 -87.84 -87.66 -87.64 -87.63 -87.52
end_lat 4771 1 41.90 0.05 41.39 41.88 41.90 41.93 42.17
end_lng 4771 1 -87.65 0.03 -88.97 -87.66 -87.64 -87.63 -87.49

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
started_at 0 1 2021-01-01 00:02:05 2021-12-31 23:59:48 2021-08-01 01:52:11 4677998
ended_at 0 1 2021-01-01 00:08:39 2022-01-03 17:32:18 2021-08-01 02:21:55 4671372
# Rides whose computed duration is zero or negative.
tripdata_all_new_2 %>%
  filter(ride_length <= 0) %>%
  select(ride_id, started_at, ended_at, ride_length)
## # A tibble: 653 x 4
##    ride_id          started_at          ended_at            ride_length
##    <chr>            <dttm>              <dttm>              <time>     
##  1 FC1EFEF4475D7F56 2021-01-09 15:42:45 2021-01-09 15:41:02 -01'43"    
##  2 6B51296F8E269F2F 2021-01-15 16:40:47 2021-01-15 16:40:47  00'00"    
##  3 B1235D38EB2F8A9E 2021-01-06 18:33:12 2021-01-06 18:31:07 -02'05"    
##  4 3F02776D0D38F96E 2021-01-29 21:02:41 2021-01-29 21:02:41  00'00"    
##  5 417EE43395E20C71 2021-01-14 17:30:55 2021-01-14 17:30:55  00'00"    
##  6 FBFC52F121A5C3E1 2021-01-14 17:26:46 2021-01-14 17:26:46  00'00"    
##  7 578B5E6B37A8C1A9 2021-02-24 21:14:58 2021-02-24 21:14:58  00'00"    
##  8 2A63B43959DE477F 2021-02-08 11:18:56 2021-02-08 11:18:56  00'00"    
##  9 D69558609E2E6E5B 2021-02-13 17:32:01 2021-02-13 17:32:01  00'00"    
## 10 52061914ECE4B8D9 2021-02-22 17:24:04 2021-02-22 17:24:04  00'00"    
## # ... with 643 more rows

Observation: A ride cannot last 0 seconds or less, hence we need to clean these records.

# Rides that lasted longer than one day (86400 seconds).
tripdata_all_new_2 %>%
  filter(ride_length > 86400) %>%
  select(ride_id, started_at, ended_at, ride_length)
## # A tibble: 4,016 x 4
##    ride_id          started_at          ended_at            ride_length
##    <chr>            <dttm>              <dttm>              <time>     
##  1 C832F2F65BBA9BDB 2021-01-07 00:31:02 2021-01-08 01:30:47 24:59:45   
##  2 573647A9A72AF73A 2021-01-28 11:30:49 2021-01-29 12:30:42 24:59:53   
##  3 0348F90AFE78CC44 2021-01-24 00:46:04 2021-01-25 01:45:59 24:59:55   
##  4 410D8AAB684CF9D1 2021-01-19 19:52:14 2021-01-20 20:22:24 24:30:10   
##  5 B477C62F547027A0 2021-01-21 17:54:37 2021-01-22 18:54:26 24:59:49   
##  6 B67A4DBE0EAF7EE9 2021-01-16 13:48:56 2021-01-17 14:48:42 24:59:46   
##  7 2DA3CC5031CF1D83 2021-01-18 14:46:47 2021-01-19 15:46:38 24:59:51   
##  8 3DBA9CCB6B88B134 2021-01-18 17:42:02 2021-01-19 18:41:57 24:59:55   
##  9 C6AC2730FD0E5EAA 2021-01-06 19:07:16 2021-01-07 20:07:06 24:59:50   
## 10 7EBFBC8A8E73577A 2021-01-11 16:18:26 2021-01-12 17:18:07 24:59:41   
## # ... with 4,006 more rows

60 × 60 × 24 = 86400 (the number of seconds in one day)

# Build the cleaned analysis table:
#   * drop the raw start_station_name column (new_start_station_name is kept),
#   * keep only rides with a strictly positive duration,
#   * rename "Lake Shore Dr & Monroe St" to "DuSable Lake Shore Dr & Monroe St".
# The old name is written as a single-line literal. The earlier version broke
# the string across two lines, which embedded a newline and indentation in it,
# so the comparison never matched and the rename silently did nothing.
# `%in%` is used instead of `==` so rows with a missing station name yield
# FALSE rather than NA and are passed through unchanged.
tripdata_all_new_3 <- tripdata_all_new_2 %>%
  select(-start_station_name) %>%
  filter(ride_length > 0) %>%
  mutate(
    new_start_station_name = if_else(
      new_start_station_name %in% "Lake Shore Dr & Monroe St",
      "DuSable Lake Shore Dr & Monroe St",
      new_start_station_name
    )
  )

Next Steps: Analysis of the cleaned data

# Preview the first six rows of the cleaned table.
tripdata_all_new_3 %>%
  head(n = 6)
## # A tibble: 6 x 18
##   ride_id rideable_type started_at          ended_at            new_start_stati~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 13 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>, ride_length <time>,
## #   start_weekday <ord>, end_weekday <ord>, start_my <chr>, end_my <chr>

**********************************Data Analysis**********************************

# Mean and maximum ride duration per rider type, formatted as hms values.
ave_max_ride_length_all <- tripdata_all_new_3 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = hms::as_hms(mean(ride_length)),
    max_ride_length = hms::as_hms(max(ride_length))
  )
print(ave_max_ride_length_all)
## # A tibble: 2 x 3
##   member_casual average_ride_length max_ride_length
##   <chr>         <time>              <time>         
## 1 casual        32'00.346797"       932:24:09      
## 2 member        13'38.072709"        25:59:56

Observation: 1. The average ride for casual users is about 32 minutes. 2. The average ride for member users is about 13.6 minutes.

# Mean ride duration per rider type and start weekday, shown as one row per
# rider type with one column per weekday.
# `.groups = "drop"` states the grouping of the result explicitly: the summary
# comes back ungrouped, and dplyr no longer prints its "has grouped output"
# message.
tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(
    average_ride_length = hms::as_hms(mean(ride_length)),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 2 x 8
## # Groups:   member_casual [2]
##   member_casual Sun           Mon           Tue           Wed          
##   <chr>         <time>        <time>        <time>        <time>       
## 1 casual        37'34.257336" 31'52.747311" 27'58.529274" 27'39.664429"
## 2 member        15'39.553754" 13'14.895591" 12'47.345070" 12'49.212422"
## # ... with 3 more variables: Thu <time>, Fri <time>, Sat <time>

Observation: Casual users spend on average about 28 to 38 minutes per ride depending on the weekday, while member users spend about 13 to 16 minutes.

# Number of rides per rider type and start weekday, one column per weekday.
pivot_wider(
  count(tripdata_all_new_3, member_casual, start_weekday, name = "count_rides"),
  names_from = start_weekday,
  values_from = count_rides
)
## # A tibble: 2 x 8
##   member_casual    Sun    Mon    Tue    Wed    Thu    Fri    Sat
##   <chr>          <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1 casual        481048 286340 274357 278910 286038 364037 557934
## 2 member        376086 416181 465474 477117 451490 446384 433014

Observation:

Our observation is that casual users mostly use the bikes on weekends, whereas member users mostly use the bikes on weekdays, so they may be commuting to the office or school.

# Number of rides per rider type and start month, one column per "YYYY-MM".
pivot_wider(
  count(tripdata_all_new_3, member_casual, start_my, name = "count_rides"),
  names_from = start_my,
  values_from = count_rides
)
## # A tibble: 2 x 13
##   member_casual `2021-01` `2021-02` `2021-03` `2021-04` `2021-05` `2021-06`
##   <chr>             <int>     <int>     <int>     <int>     <int>     <int>
## 1 casual            18117     10130     84028    136590    256888    370636
## 2 member            78711     39488    144456    200602    274691    358893
## # ... with 6 more variables: `2021-07` <int>, `2021-08` <int>, `2021-09` <int>,
## #   `2021-10` <int>, `2021-11` <int>, `2021-12` <int>

Observation: Usage by casual users is much higher in the summer, whereas member users ride more steadily, based on their needs.

# Ten most-used start stations among casual riders (rides without a start
# station name are excluded).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "casual") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)
## # A tibble: 10 x 2
##    new_start_station_name         n
##    <chr>                      <int>
##  1 Streeter Dr & Grand Ave    66353
##  2 Millennium Park            33578
##  3 Michigan Ave & Oak St      29778
##  4 Shedd Aquarium             23249
##  5 Theater on the Lake        21349
##  6 Wells St & Concord Ln      19888
##  7 Lake Shore Dr & Monroe St  19616
##  8 Clark St & Lincoln Ave     17033
##  9 Wells St & Elm St          16664
## 10 Indiana Ave & Roosevelt Rd 16628

Observation: Top ten station for casual users

# Ten most-used start stations among member riders (rides without a start
# station name are excluded).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)
## # A tibble: 10 x 2
##    new_start_station_name        n
##    <chr>                     <int>
##  1 Clark St & Elm St         24739
##  2 Wells St & Concord Ln     23716
##  3 Kingsbury St & Kinzie St  23562
##  4 Wells St & Elm St         21019
##  5 Dearborn St & Erie St     19585
##  6 Wells St & Huron St       19188
##  7 St. Clair St & Erie St    18901
##  8 Broadway & Barry Ave      17800
##  9 Clinton St & Madison St   16913
## 10 Desplaines St & Kinzie St 16820

Observation: Top 10 stations for member users

library(ggplot2)
library(knitr) 
# Mean and maximum ride duration per rider type, kept as raw durations (not
# re-formatted with hms) so they can be plotted.
ave_max_ride_length_all_sec <- tripdata_all_new_3 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = mean(ride_length),
    max_ride_length = max(ride_length)
  )
# draw plot: average ride length, one bar per rider type, labelled with the
# rounded-up duration
ggplot(
  data = ave_max_ride_length_all_sec,
  mapping = aes(x = member_casual, y = average_ride_length)
) +
  geom_col(aes(fill = member_casual)) +
  geom_text(
    aes(label = hms::as_hms(ceiling(average_ride_length))),
    vjust = 2.5
  ) +
  labs(title = "Average ride length by member", x = "Member", y = "Average") +
  theme(plot.title = element_text(hjust = 0.5))
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# draw plot: maximum ride length, one bar per rider type, labelled with the
# rounded-up duration
ggplot(
  data = ave_max_ride_length_all_sec,
  mapping = aes(x = member_casual, y = max_ride_length)
) +
  geom_col(aes(fill = member_casual)) +
  geom_text(
    aes(label = hms::as_hms(ceiling(max_ride_length))),
    vjust = 1.5
  ) +
  labs(title = "Max ride length by member", x = "Member", y = "MAX") +
  theme(plot.title = element_text(hjust = 0.5))
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

Observation: The maximum ride lengths (about 932 hours for casual users and about 26 hours for member users) are outliers.

# Render the average/max ride-length summary as a pipe-format table.
kable(ave_max_ride_length_all, format = "pipe")
member_casual average_ride_length max_ride_length
casual 00:32:00.346797 932:24:09
member 00:13:38.072709 25:59:56
# Mean ride duration per rider type and start weekday.
# `.groups = "drop"` returns an ungrouped summary and silences dplyr's
# "has grouped output" message; the plot and the table built from this object
# do not rely on any leftover grouping.
ave_ride_length_weekday_all <- tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(average_ride_length = mean(ride_length), .groups = "drop")
# draw plot: dodged bars per weekday and rider type, with one smoothed trend
# line per rider type drawn on top
ggplot(
  data = ave_ride_length_weekday_all,
  mapping = aes(x = start_weekday, y = average_ride_length)
) +
  geom_col(aes(fill = member_casual), position = "dodge") +
  geom_smooth(
    aes(colour = member_casual, group = member_casual),
    se = FALSE
  ) +
  labs(
    title = "Average ride length by weekday",
    x = "Weekday",
    y = "Average(secs)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Observation: The time spent by member users is stable and roughly uniform across weekdays.

# display the table: one row per rider type, one column per weekday
ave_ride_length_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length) %>%
  kable()
member_casual Sun Mon Tue Wed Thu Fri Sat
casual 2254.2573 secs 1912.7473 secs 1678.5293 secs 1659.6644 secs 1662.347 secs 1821.1061 secs 2082.5980 secs
member 939.5538 secs 794.8956 secs 767.3451 secs 769.2124 secs 766.622 secs 799.5648 secs 915.9313 secs
# prepare data: number of rides per rider type and start weekday
nums_rides_weekday_all <- count(
  tripdata_all_new_3,
  member_casual,
  start_weekday,
  name = "count_rides"
)
# draw plot: dodged bars per weekday and rider type, with one smoothed trend
# line per rider type drawn on top
ggplot(
  data = nums_rides_weekday_all,
  mapping = aes(x = start_weekday, y = count_rides)
) +
  geom_col(aes(fill = member_casual), position = "dodge") +
  geom_smooth(
    aes(colour = member_casual, group = member_casual),
    se = FALSE
  ) +
  labs(
    title = "The number of rides for users by weekday",
    x = "Weekday",
    y = "Number"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Observation: Member users take more rides on weekdays than on weekends.

# Ride counts as a table: one row per rider type, one column per weekday.
nums_rides_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = count_rides) %>%
  kable()
member_casual Sun Mon Tue Wed Thu Fri Sat
casual 481048 286340 274357 278910 286038 364037 557934
member 376086 416181 465474 477117 451490 446384 433014
# Number of rides per rider type and start month ("YYYY-MM").
nums_rides_months_all <- count(
  tripdata_all_new_3,
  member_casual,
  start_my,
  name = "count_rides"
)
# draw plot: dodged bars per month and rider type, with one smoothed trend
# line per rider type; month labels are tilted 45 degrees
ggplot(
  data = nums_rides_months_all,
  mapping = aes(x = start_my, y = count_rides)
) +
  geom_col(aes(fill = member_casual), position = "dodge") +
  geom_smooth(
    aes(colour = member_casual, group = member_casual),
    se = FALSE
  ) +
  labs(
    title = "The number of rides for users by month",
    x = "Month",
    y = "Number"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0.5)
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Ride counts as a table: one row per rider type, one column per month.
nums_rides_months_all %>%
  pivot_wider(names_from = start_my, values_from = count_rides) %>%
  kable()
member_casual 2021-01 2021-02 2021-03 2021-04 2021-05 2021-06 2021-07 2021-08 2021-09 2021-10 2021-11 2021-12
casual 18117 10130 84028 136590 256888 370636 442011 412608 363840 257203 106884 69729
member 78711 39488 144456 200602 274691 358893 380317 391637 392200 373953 253008 177790
# find the ten start stations with the most rides overall (rides without a
# start station name are excluded)
top10start_stations <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)
# extract names
top10start_stations_names <- top10start_stations[["new_start_station_name"]]
# prepare data: rides per (station, rider type) for those ten stations, plus
# each rider type's share of the station's rides; the result stays grouped by
# station, which is what makes sum(n) a per-station total
percent_top10_stations <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_names) %>%
  count(new_start_station_name, member_casual) %>%
  group_by(new_start_station_name) %>%
  mutate(percent = n / sum(n))
# draw plot: horizontal stacked bars, each segment labelled with its share
ggplot(
  data = percent_top10_stations,
  aes(x = reorder(new_start_station_name, n), y = n, fill = member_casual)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Stations", y = "Rides", title = "TOP10 Rides of Started Stations") +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  )

# find the ten start stations with the most member rides, re-sorted by
# ascending ride count, with the station name stored as a factor
top10start_stations_m <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))
# extract names
top10start_stations_m_names <- top10start_stations_m[["new_start_station_name"]]
# prepare data: rides per (station, rider type) for those ten stations, each
# rider type's share of the station's rides, and the station name as an ordered
# factor whose levels follow the ascending-count order computed above
percent_top10_stations_m <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_m_names) %>%
  count(new_start_station_name, member_casual) %>%
  group_by(new_start_station_name) %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_m$new_start_station_name,
      ordered = TRUE
    )
  )
# draw plot: horizontal stacked bars with the member segment highlighted
ggplot(
  data = percent_top10_stations_m,
  aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    x = "Stations",
    y = "Rides",
    title = "TOP10 Rides of Started Stations by member"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  scale_fill_manual(values = c("member" = "cyan3", "casual" = "grey90"))

# find the ten start stations with the most casual rides, re-sorted by
# ascending ride count, with the station name stored as a factor
top10start_stations_c <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  filter(member_casual == "casual") %>%
  count(new_start_station_name) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))
# extract names
top10start_stations_c_names <- top10start_stations_c$new_start_station_name
# prepare data: rides per (station, rider type) for those ten stations, each
# rider type's share of the station's rides, and the station name as an ordered
# factor whose levels follow the ascending-count order computed above
percent_top10_stations_c <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_c_names) %>%
  group_by(new_start_station_name, member_casual) %>%
  tally() %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_c$new_start_station_name,
      ordered = TRUE
    )
  )
# draw plot: horizontal stacked bars with the casual segment highlighted.
# The stack order is reversed for both the bars and the labels so they stay
# aligned. The former `group = new_start_station_name` argument to ggplot()
# was removed: it sat outside aes(), so it was never used as an aesthetic.
ggplot(
  data = percent_top10_stations_c,
  aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
  coord_flip() +
  labs(
    x = "Stations",
    y = "Rides",
    title = "TOP10 Rides of Started Stations by Casual"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5, reverse = TRUE),
    size = 3
  ) +
  scale_fill_manual(values = c("casual" = "coral", "member" = "grey90"))

library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:magrittr':
## 
##     inset
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
# One row per named start station seen in January 2021: rides are ordered
# newest first and only the first row of each station name is kept, so each
# station carries the coordinates of its most recent January ride.
stations_202101 <- tripdata_all_new_3 %>%
  select(new_start_station_name, start_lat, start_lng, start_my, started_at) %>%
  filter(!is.na(new_start_station_name), start_my == "2021-01") %>%
  arrange(desc(started_at)) %>%
  filter(!duplicated(new_start_station_name))
# prepare data: rides per (rider type, bike type) and each bike type's share
# within its rider type; the result stays grouped by member_casual, which is
# what makes sum(n) a per-rider-type total
bike_type_p <- tripdata_all_new_3 %>%
  count(member_casual, rideable_type) %>%
  group_by(member_casual) %>%
  mutate(per = n / sum(n))
# draw plot: one pie per rider type, slices labelled with their percentage
ggplot(data = bike_type_p, aes(x = "", y = per, fill = rideable_type)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y", start = 0) +
  facet_wrap(~member_casual, strip.position = "bottom") +
  geom_text(
    aes(y = per, label = paste0(round(per * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  labs(title = "The percentage of Bike types") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5))

Observation: Member users don’t use docked bikes