RMarkdown

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(skimr)
library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(readr)
library(dplyr)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

# Load the January 2021 trip records.
x202101_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202101-divvy-tripdata.csv"
)

## Rows: 96834 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the February 2021 trip records.
x202102_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202102-divvy-tripdata.csv"
)

## Rows: 49622 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the March 2021 trip records.
x202103_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202103-divvy-tripdata.csv"
)

## Rows: 228496 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the April 2021 trip records.
x202104_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202104-divvy-tripdata.csv"
)

## Rows: 337230 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the May 2021 trip records.
x202105_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202105-divvy-tripdata.csv"
)

## Rows: 531633 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the June 2021 trip records.
x202106_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202106-divvy-tripdata.csv"
)

## Rows: 729595 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the July 2021 trip records.
x202107_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202107-divvy-tripdata.csv"
)

## Rows: 822410 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the August 2021 trip records.
x202108_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202108-divvy-tripdata.csv"
)

## Rows: 804352 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the September 2021 trip records.
x202109_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202109-divvy-tripdata.csv"
)

## Rows: 756147 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the October 2021 trip records.
x202110_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202110-divvy-tripdata.csv"
)

## Rows: 631226 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the November 2021 trip records.
x202111_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202111-divvy-tripdata.csv"
)

## Rows: 359978 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the December 2021 trip records.
x202112_dataset <- read_csv(
  file = "/Users/zfroz/OneDrive/Masaüstü/cyclistic_project/202112-divvy-tripdata.csv"
)

## Rows: 247540 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(x202112_dataset)

skim_without_charts(x202101_dataset)

Data summary
Name	x202101_dataset
Number of rows	96834
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	96834
rideable_type	0	1.00	11	13	3
start_station_name	8625	0.91	10	51	640
start_station_id	8625	0.91	3	35	638
end_station_name	10277	0.89	10	53	632
end_station_id	10277	0.89	3	35	629
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	42.06
start_lng	0	1	-87.65	0.03	-87.78	-87.66	-87.64	-87.63	-87.53
end_lat	103	1	41.90	0.05	41.64	41.88	41.90	41.93	42.07
end_lng	103	1	-87.65	0.03	-87.81	-87.66	-87.64	-87.63	-87.51

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-01-01 00:02:05	2021-01-31 23:57:00	2021-01-15 06:05:04	93736
ended_at	0	1	2021-01-01 00:08:39	2021-02-01 15:33:15	2021-01-15 06:19:58	93582

skim_without_charts(x202102_dataset)

Data summary
Name	x202102_dataset
Number of rows	49622
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	49622
rideable_type	0	1.00	11	13	3
start_station_name	4046	0.92	10	51	582
start_station_id	4046	0.92	3	35	582
end_station_name	5358	0.89	10	53	584
end_station_id	5358	0.89	3	35	584
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.65	41.88	41.90	41.93	42.06
start_lng	0	1	-87.64	0.03	-87.77	-87.66	-87.64	-87.63	-87.53
end_lat	214	1	41.90	0.04	41.54	41.88	41.90	41.93	42.07
end_lng	214	1	-87.64	0.03	-87.77	-87.66	-87.64	-87.63	-87.53

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-02-01 00:55:44	2021-02-28 23:59:41	2021-02-22 13:17:53	48139
ended_at	0	1	2021-02-01 01:22:48	2021-03-05 15:11:45	2021-02-22 13:39:20	48035

skim_without_charts(x202103_dataset)

Data summary
Name	x202103_dataset
Number of rows	228496
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	228496
rideable_type	0	1.00	11	13	3
start_station_name	14848	0.94	10	53	673
start_station_id	14848	0.94	3	35	673
end_station_name	16727	0.93	10	53	673
end_station_id	16727	0.93	3	35	673
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.64	0.03	-87.78	-87.66	-87.64	-87.63	-87.53
end_lat	167	1	41.90	0.04	41.64	41.88	41.90	41.93	42.08
end_lng	167	1	-87.65	0.03	-88.07	-87.66	-87.64	-87.63	-87.53

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-03-01 00:01:09	2021-03-31 23:59:08	2021-03-19 17:37:20	209025
ended_at	0	1	2021-03-01 00:06:28	2021-04-06 11:00:11	2021-03-19 17:55:05	208629

skim_without_charts(x202104_dataset)

Data summary
Name	x202104_dataset
Number of rows	337230
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	337230
rideable_type	0	1.00	11	13	3
start_station_name	26056	0.92	10	53	681
start_station_id	26056	0.92	3	35	681
end_station_name	28174	0.92	10	53	681
end_station_id	28174	0.92	3	35	681
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.64	0.03	-87.78	-87.66	-87.64	-87.63	-87.52
end_lat	267	1	41.90	0.05	41.59	41.88	41.90	41.93	42.15
end_lng	267	1	-87.65	0.03	-87.85	-87.66	-87.64	-87.63	-87.52

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-04-01 00:03:18	2021-04-30 23:59:53	2021-04-15 22:37:04	298722
ended_at	0	1	2021-04-01 00:14:29	2021-05-05 22:14:39	2021-04-15 23:00:10	298625

skim_without_charts(x202105_dataset)

Data summary
Name	x202105_dataset
Number of rows	531633
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	531633
rideable_type	0	1.00	11	13	3
start_station_name	53744	0.90	10	53	687
start_station_id	53744	0.90	3	35	686
end_station_name	58194	0.89	10	53	683
end_station_id	58194	0.89	3	35	682
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.64	0.03	-87.78	-87.66	-87.64	-87.63	-87.52
end_lat	452	1	41.90	0.05	41.56	41.88	41.90	41.93	42.09
end_lng	452	1	-87.64	0.03	-87.85	-87.66	-87.64	-87.63	-87.52

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-05-01 00:00:11	2021-05-31 23:59:16	2021-05-19 07:44:31	447224
ended_at	0	1	2021-05-01 00:03:26	2021-06-10 22:17:11	2021-05-19 07:59:43	447217

skim_without_charts(x202106_dataset)

Data summary
Name	x202106_dataset
Number of rows	729595
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	729595
rideable_type	0	1.00	11	13	3
start_station_name	80093	0.89	10	53	689
start_station_id	80093	0.89	3	35	689
end_station_name	86387	0.88	10	53	690
end_station_id	86387	0.88	3	35	690
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.64	0.03	-87.78	-87.66	-87.64	-87.63	-87.52
end_lat	717	1	41.90	0.04	41.51	41.88	41.90	41.93	42.08
end_lng	717	1	-87.64	0.03	-87.86	-87.66	-87.64	-87.63	-87.49

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-06-01 00:00:38	2021-06-30 23:59:59	2021-06-14 19:46:47	589805
ended_at	0	1	2021-06-01 00:06:22	2021-07-13 22:51:35	2021-06-14 20:13:55	589069

skim_without_charts(x202107_dataset)

Data summary
Name	x202107_dataset
Number of rows	822410
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	822410
rideable_type	0	1.00	11	13	3
start_station_name	87263	0.89	10	53	717
start_station_id	87262	0.89	3	36	710
end_station_name	93158	0.89	10	53	714
end_station_id	93158	0.89	3	36	707
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52
end_lat	731	1	41.90	0.04	41.63	41.88	41.90	41.93	42.15
end_lng	731	1	-87.65	0.03	-87.85	-87.66	-87.64	-87.63	-87.49

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-07-01 00:00:22	2021-07-31 23:59:58	2021-07-17 13:58:37	659640
ended_at	0	1	2021-07-01 00:04:51	2021-08-12 17:45:41	2021-07-17 14:28:04	658663

skim_without_charts(x202108_dataset)

Data summary
Name	x202108_dataset
Number of rows	804352
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	804352
rideable_type	0	1.00	11	13	3
start_station_name	88458	0.89	3	53	727
start_station_id	88458	0.89	3	35	726
end_station_name	94115	0.88	10	53	727
end_station_id	94115	0.88	3	35	727
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52
end_lat	706	1	41.90	0.04	41.58	41.88	41.90	41.93	42.15
end_lng	706	1	-87.65	0.03	-87.85	-87.66	-87.64	-87.63	-87.51

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-08-01 00:00:04	2021-08-31 23:59:35	2021-08-16 07:57:11	646516
ended_at	0	1	2021-08-01 00:03:11	2021-09-01 17:37:35	2021-08-16 08:12:14	645299

skim_without_charts(x202109_dataset)

Data summary
Name	x202109_dataset
Number of rows	756147
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	756147
rideable_type	0	1.00	11	13	3
start_station_name	93113	0.88	10	53	758
start_station_id	93111	0.88	3	35	758
end_station_name	99261	0.87	10	53	756
end_station_id	99261	0.87	3	35	756
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52
end_lat	595	1	41.90	0.05	41.57	41.88	41.90	41.93	42.17
end_lng	595	1	-87.65	0.03	-87.87	-87.66	-87.64	-87.63	-87.50

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-09-01 00:00:06	2021-09-30 23:59:48	2021-09-15 16:43:37	611240
ended_at	0	1	2021-09-01 00:00:41	2021-10-01 22:55:35	2021-09-15 17:01:16	610277

skim_without_charts(x202110_dataset)

Data summary
Name	x202110_dataset
Number of rows	631226
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	631226
rideable_type	0	1.00	11	13	3
start_station_name	108210	0.83	10	53	793
start_station_id	108210	0.83	3	35	793
end_station_name	114834	0.82	10	53	790
end_station_id	114834	0.82	3	35	790
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.65	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.83	-87.66	-87.64	-87.63	-87.52
end_lat	484	1	41.90	0.05	41.60	41.88	41.90	41.93	42.13
end_lng	484	1	-87.65	0.03	-87.96	-87.66	-87.64	-87.63	-87.52

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-10-01 00:00:09	2021-10-31 23:59:49	2021-10-15 05:31:57	524629
ended_at	0	1	2021-10-01 00:03:11	2021-11-03 21:45:48	2021-10-15 05:56:26	523397

skim_without_charts(x202111_dataset)

Data summary
Name	x202111_dataset
Number of rows	359978
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	359978
rideable_type	0	1.00	11	13	3
start_station_name	75290	0.79	10	53	815
start_station_id	75290	0.79	3	35	815
end_station_name	79187	0.78	10	53	805
end_station_id	79187	0.78	3	35	805
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.89	0.05	41.65	41.88	41.89	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.53
end_lat	191	1	41.89	0.05	41.39	41.88	41.89	41.93	42.12
end_lng	191	1	-87.65	0.03	-88.97	-87.66	-87.64	-87.63	-87.53

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-11-01 00:00:14	2021-11-30 23:59:56	2021-11-12 08:32:12	320477
ended_at	0	1	2021-11-01 00:04:06	2021-12-02 06:41:33	2021-11-12 08:46:55	320071

skim_without_charts(x202112_dataset)

Data summary
Name	x202112_dataset
Number of rows	247540
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	247540
rideable_type	0	1.00	11	13	3
start_station_name	51063	0.79	10	53	818
start_station_id	51063	0.79	3	35	816
end_station_name	53498	0.78	10	53	800
end_station_id	53498	0.78	3	35	798
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.67	-87.64	-87.63	-87.52
end_lat	144	1	41.90	0.05	41.48	41.88	41.90	41.93	42.07
end_lng	144	1	-87.65	0.03	-87.85	-87.67	-87.64	-87.63	-87.52

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-12-01 00:00:01	2021-12-31 23:59:48	2021-12-13 13:04:54	228845
ended_at	0	1	2021-12-01 00:02:40	2022-01-03 17:32:18	2021-12-13 13:18:39	228657

Observation: Several variables have many missing values: 1. start_station_name 2. start_station_id 3. end_station_name 4. end_station_id 5. end_lat 6. end_lng

# Stack the twelve monthly tables into a single table covering all of 2021.
# bind_rows() is the dplyr verb for combining data frames row-wise; it matches
# columns by name, so the result does not depend on column order.
tripdata_all <- bind_rows(
  x202101_dataset, x202102_dataset, x202103_dataset, x202104_dataset,
  x202105_dataset, x202106_dataset, x202107_dataset, x202108_dataset,
  x202109_dataset, x202110_dataset, x202111_dataset, x202112_dataset
)

head(tripdata_all)

## # A tibble: 6 x 13
##   ride_id rideable_type started_at          ended_at            start_station_n~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>

skim_without_charts(tripdata_all)

Data summary
Name	tripdata_all
Number of rows	5595063
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	5595063
rideable_type	0	1.00	11	13	3
start_station_name	690809	0.88	3	53	847
start_station_id	690806	0.88	3	36	834
end_station_name	739170	0.87	10	53	844
end_station_id	739170	0.87	3	36	832
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52
end_lat	4771	1	41.90	0.05	41.39	41.88	41.90	41.93	42.17
end_lng	4771	1	-87.65	0.03	-88.97	-87.66	-87.64	-87.63	-87.49

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-01-01 00:02:05	2021-12-31 23:59:48	2021-08-01 01:52:11	4677998
ended_at	0	1	2021-01-01 00:08:39	2022-01-03 17:32:18	2021-08-01 02:21:55	4671372

# Check that ride_id is unique: count the rows for each ride_id and keep only
# the ids that appear more than once. An empty result means no duplicates.
# count() yields the same ride_id / n columns as group_by() + summarise(n()),
# without the redundant sum() around the per-group row count.
tripdata_all %>%
  count(ride_id) %>%
  filter(n > 1)

## # A tibble: 0 x 2
## # ... with 2 variables: ride_id <chr>, n <int>

head(tripdata_all)

## # A tibble: 6 x 13
##   ride_id rideable_type started_at          ended_at            start_station_n~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 8 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>

Observation: We need to check whether every station ID is mapped to the correct station name.

# Tally the trips for every (start_station_id, start_station_name) pair and
# record the character length of both fields; rarest pairs are listed first.
start_station_sumamry <-
  tripdata_all %>%
  count(start_station_id, start_station_name, name = "count_numbers") %>%
  mutate(
    ID_length = str_length(start_station_id),
    NAME_length = str_length(start_station_name)
  ) %>%
  arrange(count_numbers)

view(start_station_sumamry)

start_station_sumamry %>%
  filter(ID_length > 14)

## # A tibble: 3 x 5
##   start_station_id          start_station_n~ count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             2        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           313        35          31

Observation:

These station IDs are wrong: they contain name-like text instead of an ID code, so we need to fix them.

start_station_sumamry %>%
  filter(is.na(ID_length) & !is.na(NAME_length))

## # A tibble: 0 x 5
## # ... with 5 variables: start_station_id <chr>, start_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>

# Show the summary rows for three specific start station names.
start_station_sumamry %>%
  filter(
    start_station_name %in% c(
      "Throop/Hastings Mobile Station",
      "DIVVY CASSETTE REPAIR MOBILE STATION",
      "Base - 2132 W Hubbard Warehouse"
    )
  )

## # A tibble: 3 x 5
##   start_station_id          start_station_n~ count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             2        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           313        35          31

Observation: We searched for a proper ID for these stations but could not find one in the data frame. Task: Figure out whether all the station names are correct.

start_station_sumamry %>%
  filter(NAME_length < 10)

## # A tibble: 1 x 5
##   start_station_id start_station_name count_numbers ID_length NAME_length
##   <chr>            <chr>                      <int>     <int>       <int>
## 1 351              351                            2         3           3

Observation: The start_station_name should not be identical to the start_station_id.

start_station_sumamry %>%
  filter(is.na(NAME_length) & !is.na(ID_length))

## # A tibble: 3 x 5
##   start_station_id start_station_name count_numbers ID_length NAME_length
##   <chr>            <chr>                      <int>     <int>       <int>
## 1 13221            <NA>                           1         5          NA
## 2 20215            <NA>                           1         5          NA
## 3 WL-008           <NA>                           1         6          NA

Condition: ID length should not be NA and name length should be NA.

Observation: There are 3 cases where the station name is missing although the station ID is present.

start_station_sumamry %>%
  filter(start_station_id %in% c("351", "13221", "20215", "WL-008"))

## # A tibble: 8 x 5
##   start_station_id start_station_name        count_numbers ID_length NAME_length
##   <chr>            <chr>                             <int>     <int>       <int>
## 1 13221            <NA>                                  1         5          NA
## 2 20215            <NA>                                  1         5          NA
## 3 WL-008           <NA>                                  1         6          NA
## 4 351              351                                   2         3           3
## 5 20215            Hegewisch Metra Station              86         5          23
## 6 351              Mulligan Ave & Wellingto~           264         3          29
## 7 WL-008           Clinton St & Roosevelt Rd          8376         6          25
## 8 13221            Wood St & Milwaukee Ave           16765         5          23

Observation: We figured out the correct station names and will replace the NA values and the IDs used as names with the correct values.

# Add new_start_station_name, placed just before start_station_name: for the
# four station ids listed below the name is set to a fixed value, and every
# other row keeps its existing start_station_name.
# start_station_id is a character column, so it is compared against string
# literals instead of relying on implicit number-to-string coercion.
tripdata_all_new <- tripdata_all %>%
  mutate(
    new_start_station_name = case_when(
      start_station_id == "13221" ~ "Wood St & Milwaukee Ave",
      start_station_id == "WL-008" ~ "Clinton St & Roosevelt Rd",
      start_station_id == "351" ~ "Mulligan Ave & Wellington Ave",
      start_station_id == "20215" ~ "Hegewisch Metra Station",
      # Fallback, also reached when start_station_id is NA.
      TRUE ~ start_station_name
    ),
    .before = start_station_name
  )

# Confirm the recode: each of the four station ids should now pair with
# exactly one new_start_station_name.
tripdata_all_new %>%
  filter(
    start_station_id %in% c("351", "13221", "20215", "WL-008")
  ) %>%
  count(
    start_station_id,
    new_start_station_name
  )

## # A tibble: 4 x 3
##   start_station_id new_start_station_name            n
##   <chr>            <chr>                         <int>
## 1 13221            Wood St & Milwaukee Ave       16766
## 2 20215            Hegewisch Metra Station          87
## 3 351              Mulligan Ave & Wellington Ave   266
## 4 WL-008           Clinton St & Roosevelt Rd      8377

# Tally the trips for every (end_station_id, end_station_name) pair and record
# the character length of both fields; rarest pairs are listed first.
end_station_summary <- tripdata_all %>%
  count(end_station_id, end_station_name, name = "count_numbers") %>%
  mutate(
    ID_length = str_length(end_station_id),
    NAME_length = str_length(end_station_name)
  ) %>%
  arrange(count_numbers)

view(end_station_summary)

end_station_summary %>%
  filter(ID_length > 14)

## # A tibble: 3 x 5
##   end_station_id            end_station_name count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             1        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           578        35          31

end_station_summary %>%
  filter(is.na(ID_length) & !is.na(NAME_length))

## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>

# Show the summary rows for three specific end station names.
end_station_summary %>%
  filter(
    end_station_name %in% c(
      "Throop/Hastings Mobile Station",
      "DIVVY CASSETTE REPAIR MOBILE STATION",
      "Base - 2132 W Hubbard Warehouse"
    )
  )

## # A tibble: 3 x 5
##   end_station_id            end_station_name count_numbers ID_length NAME_length
##   <chr>                     <chr>                    <int>     <int>       <int>
## 1 Throop/Hastings Mobile S~ Throop/Hastings~             1        30          30
## 2 DIVVY CASSETTE REPAIR MO~ DIVVY CASSETTE ~             4        36          36
## 3 Hubbard Bike-checking (L~ Base - 2132 W H~           578        35          31

end_station_summary %>%
  filter(NAME_length < 10)

## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>

end_station_summary %>%
  filter(is.na(NAME_length) & !is.na(ID_length))

## # A tibble: 0 x 5
## # ... with 5 variables: end_station_id <chr>, end_station_name <chr>,
## #   count_numbers <int>, ID_length <int>, NAME_length <int>

# Look up four end stations by their ids.
filter(
  end_station_summary,
  end_station_id %in% c("351", "13221", "20215", "WL-008")
)

## # A tibble: 4 x 5
##   end_station_id end_station_name            count_numbers ID_length NAME_length
##   <chr>          <chr>                               <int>     <int>       <int>
## 1 20215          Hegewisch Metra Station                72         5          23
## 2 351            Mulligan Ave & Wellington ~           208         3          29
## 3 WL-008         Clinton St & Roosevelt Rd            8697         6          25
## 4 13221          Wood St & Milwaukee Ave             17172         5          23

Observation: there are no wrong end_station_name values.

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Derive per-ride features:
#   ride_length   - trip duration (ended_at - started_at) as an hms time
#   start_weekday - abbreviated weekday label of the trip start
#   end_weekday   - abbreviated weekday label of the trip end
#   start_my      - "YYYY-MM" of the trip start
#   end_my        - "YYYY-MM" of the trip end
# Columns are referenced through mutate()'s data mask instead of
# `tripdata_all_new$...`, so the calculation always uses the rows being piped.
# The duration is computed with explicit units so it never depends on the
# automatic unit selection of POSIXct subtraction.
tripdata_all_new_2 <- tripdata_all_new %>%
  mutate(
    ride_length = hms::as_hms(difftime(ended_at, started_at, units = "secs")),
    start_weekday = wday(started_at, label = TRUE, abbr = TRUE),
    end_weekday = wday(ended_at, label = TRUE, abbr = TRUE),
    start_my = format(started_at, "%Y-%m"),
    end_my = format(ended_at, "%Y-%m")
  )

Observation: The weekday and year-month of each ride's start and end are now split out into separate columns.

# Summarise every column of the enriched trip data (skimr, without histograms).
skim_without_charts(tripdata_all_new_2)

Data summary
Name	tripdata_all_new_2
Number of rows	5595063
Number of columns	19
_______________________
Column type frequency:
character	10
difftime	1
factor	2
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	5595063
rideable_type	0	1.00	11	13	3
new_start_station_name	690806	0.88	10	53	846
start_station_name	690809	0.88	3	53	847
start_station_id	690806	0.88	3	36	834
end_station_name	739170	0.87	10	53	844
end_station_id	739170	0.87	3	36	832
member_casual	0	1.00	6	6	2
start_my	0	1.00	7	7	12
end_my	0	1.00	7	7	13

Variable type: difftime

skim_variable	n_missing	complete_rate	min	max	median	n_unique
ride_length	0	1	-3482 secs	3356649 secs	00:12:00	25645

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
start_weekday	0	1	TRUE	7	Sat: 991047, Sun: 857285, Fri: 810508, Wed: 756142
end_weekday	0	1	TRUE	7	Sat: 987780, Sun: 863702, Fri: 806655, Wed: 756208

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52
end_lat	4771	1	41.90	0.05	41.39	41.88	41.90	41.93	42.17
end_lng	4771	1	-87.65	0.03	-88.97	-87.66	-87.64	-87.63	-87.49

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-01-01 00:02:05	2021-12-31 23:59:48	2021-08-01 01:52:11	4677998
ended_at	0	1	2021-01-01 00:08:39	2022-01-03 17:32:18	2021-08-01 02:21:55	4671372

# Rides whose recorded duration is zero or negative.
tripdata_all_new_2 %>%
  filter(ride_length <= 0) %>%
  select(ride_id, started_at, ended_at, ride_length)

## # A tibble: 653 x 4
##    ride_id          started_at          ended_at            ride_length
##    <chr>            <dttm>              <dttm>              <time>     
##  1 FC1EFEF4475D7F56 2021-01-09 15:42:45 2021-01-09 15:41:02 -01'43"    
##  2 6B51296F8E269F2F 2021-01-15 16:40:47 2021-01-15 16:40:47  00'00"    
##  3 B1235D38EB2F8A9E 2021-01-06 18:33:12 2021-01-06 18:31:07 -02'05"    
##  4 3F02776D0D38F96E 2021-01-29 21:02:41 2021-01-29 21:02:41  00'00"    
##  5 417EE43395E20C71 2021-01-14 17:30:55 2021-01-14 17:30:55  00'00"    
##  6 FBFC52F121A5C3E1 2021-01-14 17:26:46 2021-01-14 17:26:46  00'00"    
##  7 578B5E6B37A8C1A9 2021-02-24 21:14:58 2021-02-24 21:14:58  00'00"    
##  8 2A63B43959DE477F 2021-02-08 11:18:56 2021-02-08 11:18:56  00'00"    
##  9 D69558609E2E6E5B 2021-02-13 17:32:01 2021-02-13 17:32:01  00'00"    
## 10 52061914ECE4B8D9 2021-02-22 17:24:04 2021-02-22 17:24:04  00'00"    
## # ... with 643 more rows

Observation: A ride cannot last zero or fewer seconds, so these records need to be removed.

# Rides whose duration, divided by 86400, exceeds one.
tripdata_all_new_2 %>%
  filter(ride_length / 86400 > 1) %>%
  select(ride_id, started_at, ended_at, ride_length)

## # A tibble: 4,016 x 4
##    ride_id          started_at          ended_at            ride_length
##    <chr>            <dttm>              <dttm>              <time>     
##  1 C832F2F65BBA9BDB 2021-01-07 00:31:02 2021-01-08 01:30:47 24:59:45   
##  2 573647A9A72AF73A 2021-01-28 11:30:49 2021-01-29 12:30:42 24:59:53   
##  3 0348F90AFE78CC44 2021-01-24 00:46:04 2021-01-25 01:45:59 24:59:55   
##  4 410D8AAB684CF9D1 2021-01-19 19:52:14 2021-01-20 20:22:24 24:30:10   
##  5 B477C62F547027A0 2021-01-21 17:54:37 2021-01-22 18:54:26 24:59:49   
##  6 B67A4DBE0EAF7EE9 2021-01-16 13:48:56 2021-01-17 14:48:42 24:59:46   
##  7 2DA3CC5031CF1D83 2021-01-18 14:46:47 2021-01-19 15:46:38 24:59:51   
##  8 3DBA9CCB6B88B134 2021-01-18 17:42:02 2021-01-19 18:41:57 24:59:55   
##  9 C6AC2730FD0E5EAA 2021-01-06 19:07:16 2021-01-07 20:07:06 24:59:50   
## 10 7EBFBC8A8E73577A 2021-01-11 16:18:26 2021-01-12 17:18:07 24:59:41   
## # ... with 4,006 more rows

60 × 60 × 24 = 86400 (the number of seconds in one day)

# Build the cleaned data set used for the analysis:
#   * drop the raw start_station_name column (new_start_station_name is kept),
#   * keep only rides with a strictly positive duration,
#   * rename the start station "Lake Shore Dr & Monroe St" to
#     "DuSable Lake Shore Dr & Monroe St".
# The old name must be written on a single line: a string literal broken
# across lines contains the newline and indentation and never matches.
# `%in%` is used for the comparison so missing station names yield FALSE
# rather than NA.
tripdata_all_new_3 <- tripdata_all_new_2 %>%
  select(-start_station_name) %>%
  filter(ride_length > 0) %>%
  mutate(
    new_start_station_name = replace(
      new_start_station_name,
      new_start_station_name %in% "Lake Shore Dr & Monroe St",
      "DuSable Lake Shore Dr & Monroe St"
    )
  )

Next steps: analysis of the cleaned data

# Preview the first rows of the cleaned data.
tripdata_all_new_3 %>%
  head()

## # A tibble: 6 x 18
##   ride_id rideable_type started_at          ended_at            new_start_stati~
##   <chr>   <chr>         <dttm>              <dttm>              <chr>           
## 1 E19E6F~ electric_bike 2021-01-23 16:14:19 2021-01-23 16:24:44 California Ave ~
## 2 DC88F2~ electric_bike 2021-01-27 18:43:08 2021-01-27 18:47:12 California Ave ~
## 3 EC45C9~ electric_bike 2021-01-21 22:35:54 2021-01-21 22:37:14 California Ave ~
## 4 4FA453~ electric_bike 2021-01-07 13:31:13 2021-01-07 13:42:55 California Ave ~
## 5 BE5E8E~ electric_bike 2021-01-23 02:24:02 2021-01-23 02:24:45 California Ave ~
## 6 5D8969~ electric_bike 2021-01-09 14:24:07 2021-01-09 15:17:54 California Ave ~
## # ... with 13 more variables: start_station_id <chr>, end_station_name <chr>,
## #   end_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, member_casual <chr>, ride_length <time>,
## #   start_weekday <ord>, end_weekday <ord>, start_my <chr>, end_my <chr>

**********************************Data Analysis**********************************

# Average and maximum ride length per user type, formatted as hms times.
ave_max_ride_length_all <- tripdata_all_new_3 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = hms::as_hms(mean(ride_length)),
    max_ride_length = hms::as_hms(max(ride_length))
  )
# Show the summary table.
print(ave_max_ride_length_all)

## # A tibble: 2 x 3
##   member_casual average_ride_length max_ride_length
##   <chr>         <time>              <time>         
## 1 casual        32'00.346797"       932:24:09      
## 2 member        13'38.072709"        25:59:56

Observation: 1. Avg ride for casual users is 32 min 2. Avg ride for member user is 13 min

# Average ride length per user type and start weekday, shown as one row per
# user type with a column per weekday.
# `.groups = "drop"` returns an ungrouped result, which silences the
# "summarise() has grouped output" message and avoids carrying a leftover
# grouping into pivot_wider().
tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(
    average_ride_length = hms::as_hms(mean(ride_length)),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length)

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 2 x 8
## # Groups:   member_casual [2]
##   member_casual Sun           Mon           Tue           Wed          
##   <chr>         <time>        <time>        <time>        <time>       
## 1 casual        37'34.257336" 31'52.747311" 27'58.529274" 27'39.664429"
## 2 member        15'39.553754" 13'14.895591" 12'47.345070" 12'49.212422"
## # ... with 3 more variables: Thu <time>, Fri <time>, Sat <time>

Observation: Casual users ride for about 28 to 38 minutes on average depending on the weekday, whereas member users ride for about 13 to 16 minutes on average.

# Number of rides per user type, spread into one column per start weekday.
pivot_wider(
  count(tripdata_all_new_3, member_casual, start_weekday, name = "count_rides"),
  names_from = start_weekday,
  values_from = count_rides
)

## # A tibble: 2 x 8
##   member_casual    Sun    Mon    Tue    Wed    Thu    Fri    Sat
##   <chr>          <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1 casual        481048 286340 274357 278910 286038 364037 557934
## 2 member        376086 416181 465474 477117 451490 446384 433014

Observation:

Casual users ride mostly on weekends, whereas member users ride mostly on weekdays, so members may be commuting to the office or school.

# Number of rides per user type, spread into one column per start month.
pivot_wider(
  count(tripdata_all_new_3, member_casual, start_my, name = "count_rides"),
  names_from = start_my,
  values_from = count_rides
)

## # A tibble: 2 x 13
##   member_casual `2021-01` `2021-02` `2021-03` `2021-04` `2021-05` `2021-06`
##   <chr>             <int>     <int>     <int>     <int>     <int>     <int>
## 1 casual            18117     10130     84028    136590    256888    370636
## 2 member            78711     39488    144456    200602    274691    358893
## # ... with 6 more variables: `2021-07` <int>, `2021-08` <int>, `2021-09` <int>,
## #   `2021-10` <int>, `2021-11` <int>, `2021-12` <int>

Observation: Ride counts for casual users peak in the summer months, whereas member users ride more steadily throughout the year, based on their needs.

# Ten most used start stations among casual rides (rows without a station
# name are ignored).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "casual") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)

## # A tibble: 10 x 2
##    new_start_station_name         n
##    <chr>                      <int>
##  1 Streeter Dr & Grand Ave    66353
##  2 Millennium Park            33578
##  3 Michigan Ave & Oak St      29778
##  4 Shedd Aquarium             23249
##  5 Theater on the Lake        21349
##  6 Wells St & Concord Ln      19888
##  7 Lake Shore Dr & Monroe St  19616
##  8 Clark St & Lincoln Ave     17033
##  9 Wells St & Elm St          16664
## 10 Indiana Ave & Roosevelt Rd 16628

Observation: Top ten station for casual users

# Ten most used start stations among member rides (rows without a station
# name are ignored).
tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)

## # A tibble: 10 x 2
##    new_start_station_name        n
##    <chr>                     <int>
##  1 Clark St & Elm St         24739
##  2 Wells St & Concord Ln     23716
##  3 Kingsbury St & Kinzie St  23562
##  4 Wells St & Elm St         21019
##  5 Dearborn St & Erie St     19585
##  6 Wells St & Huron St       19188
##  7 St. Clair St & Erie St    18901
##  8 Broadway & Barry Ave      17800
##  9 Clinton St & Madison St   16913
## 10 Desplaines St & Kinzie St 16820

Observation: Top 10 start stations for member users

library(ggplot2)
library(knitr)

# Mean and maximum ride length per user type (not converted with as_hms, so
# the values can be plotted directly).
ave_max_ride_length_all_sec <- tripdata_all_new_3 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = mean(ride_length),
    max_ride_length = max(ride_length)
  )

# Bar chart of the average ride length per user type; each bar is labelled
# with the value rounded up to whole seconds and shown as an hms time.
ggplot(ave_max_ride_length_all_sec) +
  geom_col(aes(member_casual, average_ride_length, fill = member_casual)) +
  labs(
    title = "Average ride length by member",
    x = "Member",
    y = "Average"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(
      member_casual,
      average_ride_length,
      label = hms::as_hms(ceiling(average_ride_length)),
      vjust = 2.5
    )
  )

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# Bar chart of the maximum ride length per user type; each bar is labelled
# with the value rounded up to whole seconds and shown as an hms time.
ggplot(ave_max_ride_length_all_sec) +
  geom_col(aes(member_casual, max_ride_length, fill = member_casual)) +
  labs(
    title = "Max ride length by member",
    x = "Member",
    y = "MAX"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(
      member_casual,
      max_ride_length,
      label = hms::as_hms(ceiling(max_ride_length)),
      vjust = 1.5
    )
  )

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

Observation: The maximum ride lengths (about 932 hours for casual users and about 26 hours for member users) are outliers.

# Render the average/max ride length summary as a pipe table.
kable(ave_max_ride_length_all, format = "pipe")

member_casual	average_ride_length	max_ride_length
casual	00:32:00.346797	932:24:09
member	00:13:38.072709	25:59:56

# Average ride length per user type and start weekday (long format, one row
# per combination), used for the weekday plot and table.
# `.groups = "drop"` stores an ungrouped result, which silences the
# "summarise() has grouped output" message and keeps a leftover grouping from
# leaking into later steps.
ave_ride_length_weekday_all <- tripdata_all_new_3 %>%
  group_by(member_casual, start_weekday) %>%
  summarise(average_ride_length = mean(ride_length), .groups = "drop")

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# Dodged bars of the average ride length per weekday for each user type,
# overlaid with one smoothed line per user type (no confidence band).
ggplot(ave_ride_length_weekday_all) +
  geom_col(
    aes(start_weekday, average_ride_length, fill = member_casual),
    position = "dodge"
  ) +
  labs(
    title = "Average ride length by weekday",
    x = "Weekday",
    y = "Average(secs)"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(
    aes(
      start_weekday,
      average_ride_length,
      colour = member_casual,
      group = member_casual
    ),
    se = FALSE
  )

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Observation: The average ride length of member users is stable and nearly uniform across the weekdays.

# Show the weekday averages as a table: one row per user type, one column
# per weekday.
ave_ride_length_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = average_ride_length) %>%
  kable()

member_casual	Sun	Mon	Tue	Wed	Thu	Fri	Sat
casual	2254.2573 secs	1912.7473 secs	1678.5293 secs	1659.6644 secs	1662.347 secs	1821.1061 secs	2082.5980 secs
member	939.5538 secs	794.8956 secs	767.3451 secs	769.2124 secs	766.622 secs	799.5648 secs	915.9313 secs

# Number of rides per user type and start weekday (long format).
nums_rides_weekday_all <- count(
  tripdata_all_new_3,
  member_casual,
  start_weekday,
  name = "count_rides"
)

# Dodged bars of the ride counts per weekday for each user type, overlaid
# with one smoothed line per user type (no confidence band).
ggplot(nums_rides_weekday_all) +
  geom_col(
    aes(start_weekday, count_rides, fill = member_casual),
    position = "dodge"
  ) +
  labs(
    title = "The number of rides for users by weekday",
    x = "Weekday",
    y = "Number"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_smooth(
    aes(
      start_weekday,
      count_rides,
      colour = member_casual,
      group = member_casual
    ),
    se = FALSE
  )

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Observation: Member users take more rides on weekdays than casual users do.

# Show the weekday ride counts as a table: one row per user type, one column
# per weekday.
nums_rides_weekday_all %>%
  pivot_wider(names_from = start_weekday, values_from = count_rides) %>%
  kable()

member_casual	Sun	Mon	Tue	Wed	Thu	Fri	Sat
casual	481048	286340	274357	278910	286038	364037	557934
member	376086	416181	465474	477117	451490	446384	433014

# Number of rides per user type and start month (long format).
nums_rides_months_all <- count(
  tripdata_all_new_3,
  member_casual,
  start_my,
  name = "count_rides"
)

# Dodged bars of the ride counts per month for each user type, with tilted
# month labels and one smoothed line per user type (no confidence band).
ggplot(nums_rides_months_all) +
  geom_col(
    aes(start_my, count_rides, fill = member_casual),
    position = "dodge"
  ) +
  labs(
    title = "The number of rides for users by month",
    x = "Month",
    y = "Number"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, vjust = 0.5)
  ) +
  geom_smooth(
    aes(
      start_my,
      count_rides,
      colour = member_casual,
      group = member_casual
    ),
    se = FALSE
  )

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Show the monthly ride counts as a table: one row per user type, one column
# per month.
nums_rides_months_all %>%
  pivot_wider(names_from = start_my, values_from = count_rides) %>%
  kable()

member_casual	2021-01	2021-02	2021-03	2021-04	2021-05	2021-06	2021-07	2021-08	2021-09	2021-10	2021-11	2021-12
casual	18117	10130	84028	136590	256888	370636	442011	412608	363840	257203	106884	69729
member	78711	39488	144456	200602	274691	358893	380317	391637	392200	373953	253008	177790

# Ten busiest start stations across all users (rows without a station name
# are ignored).
top10start_stations <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10)

# Names of those ten stations.
top10start_stations_names <- top10start_stations[["new_start_station_name"]]

# Ride count per station and user type, plus each user type's share of the
# station's rides (the tally stays grouped by station, so sum(n) is per
# station).
percent_top10_stations <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_names) %>%
  group_by(new_start_station_name, member_casual) %>%
  tally() %>%
  mutate(percent = n / sum(n))

# Horizontal stacked bars per station, labelled with the share of each user
# type in the middle of its segment.
ggplot(
  data = percent_top10_stations,
  aes(x = reorder(new_start_station_name, n), y = n, fill = member_casual)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    x = "Stations",
    y = "Rides",
    title = "TOP10 Rides of Started Stations"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  )

# Ten busiest start stations for member rides, re-sorted from fewest to most
# rides; the station name is stored as a factor.
top10start_stations_m <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name), member_casual == "member") %>%
  count(new_start_station_name, sort = TRUE) %>%
  head(10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))

# Names of those ten stations.
top10start_stations_m_names <- top10start_stations_m[["new_start_station_name"]]

# Ride count per station and user type for those stations, plus each user
# type's share of the station's rides. Station names become an ordered factor
# whose levels follow the row order of top10start_stations_m.
percent_top10_stations_m <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_m_names) %>%
  group_by(new_start_station_name, member_casual) %>%
  tally() %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_m$new_start_station_name,
      ordered = TRUE
    )
  )

# Horizontal stacked bars per station with the member share highlighted and
# each segment labelled with its percentage.
ggplot(
  data = percent_top10_stations_m,
  aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    x = "Stations",
    y = "Rides",
    title = "TOP10 Rides of Started Stations by member"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  scale_fill_manual(values = c("member" = "cyan3", "casual" = "grey90"))

# Ten busiest start stations for casual rides, re-sorted from fewest to most
# rides; the station name is stored as a factor.
top10start_stations_c <- tripdata_all_new_3 %>%
  filter(!is.na(new_start_station_name)) %>%
  filter(member_casual == "casual") %>%
  count(new_start_station_name) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  arrange(n) %>%
  mutate(new_start_station_name = factor(new_start_station_name))

# Names of those ten stations.
top10start_stations_c_names <- top10start_stations_c$new_start_station_name

# Ride count per station and user type for those stations, plus each user
# type's share of the station's rides. Station names become an ordered factor
# whose levels follow the row order of top10start_stations_c.
percent_top10_stations_c <- tripdata_all_new_3 %>%
  filter(new_start_station_name %in% top10start_stations_c_names) %>%
  group_by(new_start_station_name, member_casual) %>%
  tally() %>%
  mutate(
    percent = n / sum(n),
    new_start_station_name = factor(
      new_start_station_name,
      levels = top10start_stations_c$new_start_station_name,
      ordered = TRUE
    )
  )

# Horizontal stacked bars per station with the casual share highlighted and
# each segment labelled with its percentage. The stacking order is reversed
# for both bars and labels so they stay aligned.
# The former `group = new_start_station_name` argument was passed to ggplot()
# outside aes(), where it is silently ignored, so it has been removed. The
# title typo ("Stitions") is corrected to match the member plot.
ggplot(
  data = percent_top10_stations_c,
  aes(x = new_start_station_name, y = n, fill = member_casual)
) +
  geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
  coord_flip() +
  labs(
    x = "Stations",
    y = "Rides",
    title = "TOP10 Rides of Started Stations by Casual"
  ) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(label = paste0(round(percent * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5, reverse = TRUE),
    size = 3
  ) +
  scale_fill_manual(values = c("casual" = "coral", "member" = "grey90"))

library(ggmap)

## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.

## Please cite ggmap if you use it! See citation("ggmap") for details.

## 
## Attaching package: 'ggmap'

## The following object is masked from 'package:magrittr':
## 
##     inset

library(maps)

## 
## Attaching package: 'maps'

## The following object is masked from 'package:purrr':
## 
##     map

# Start stations seen in January 2021: one row per station name, taken from
# that station's most recently started ride in the month.
stations_202101 <- tripdata_all_new_3 %>%
  select(new_start_station_name, start_lat, start_lng, start_my, started_at) %>%
  filter(!is.na(new_start_station_name), start_my == "2021-01") %>%
  arrange(desc(started_at)) %>%
  # keep the first (latest) row for every station name
  distinct(new_start_station_name, .keep_all = TRUE)

# Ride count per user type and bike type, plus each bike type's share within
# its user type (the tally stays grouped by member_casual, so sum(n) is per
# user type).
bike_type_p <- tripdata_all_new_3 %>%
  group_by(member_casual, rideable_type) %>%
  tally() %>%
  mutate(per = n / sum(n))

# One pie chart per user type showing the bike-type shares, with each slice
# labelled by its percentage.
ggplot(bike_type_p, aes(x = "", y = per, fill = rideable_type)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  labs(title = "The percentage of Bike types") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(
    aes(y = per, label = paste0(round(per * 100, digits = 1), "%")),
    position = position_stack(vjust = 0.5),
    size = 3
  ) +
  facet_wrap(~member_casual, strip.position = "bottom")

Observation: Member users don’t use docked bikes

RMarkdown

Zafer Ozdemir

2022-06-13

R Markdown