This analysis is based on the Divvy case study Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman (found here: https://artscience.blog/home/divvy-dataviz-case-study).
The purpose of this script is to consolidate downloaded Divvy data into a single data frame and then conduct simple analysis to help answer the key question: “In what ways do members and casual riders use Divvy bikes differently?”
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate)
library(ggplot2)
getwd()
## [1] "C:/Users/HP/Desktop/CapStone Project/Visualized Data"
setwd("C:/Users/HP/Desktop/All Data/All Data")
this were the steps taking to Upload Divvy datasets (csv files)
q1_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202101-divvy-tripdata.csv")
## Rows: 96834 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q2_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202102-divvy-tripdata.csv")
## Rows: 49622 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202103-divvy-tripdata.csv")
## Rows: 228496 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q4_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202004-divvy-tripdata.csv")
## Rows: 84776 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q5_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202005-divvy-tripdata.csv")
## Rows: 200274 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q6_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202006-divvy-tripdata.csv")
## Rows: 343005 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q7_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202007-divvy-tripdata.csv")
## Rows: 551480 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q8_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202008-divvy-tripdata.csv")
## Rows: 622361 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q9_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202009-divvy-tripdata.csv")
## Rows: 532958 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q10_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202010-divvy-tripdata.csv")
## Rows: 388653 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q11_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202011-divvy-tripdata.csv")
## Rows: 259716 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q12_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202012-divvy-tripdata.csv")
## Rows: 131573 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Compare column names for each of the files. This is to ensure that they match perfectly before i can use a command to join them into a single file.
colnames(q1_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q2_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q3_2021)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q4_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q5_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q6_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q7_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q8_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q9_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q10_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q11_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q12_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Rename columns to make them consistent with q1_2021 (as this will be the supposed going-forward table design for Divvy).
(q1_2021 <- rename(q1_2021
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 96,834 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 E19E6F1B8D4C… electr… 2021-01-23 16:14:19 2021-01-23 16:24:44 Califo… 17660
## 2 DC88F20C2C55… electr… 2021-01-27 18:43:08 2021-01-27 18:47:12 Califo… 17660
## 3 EC45C94683FE… electr… 2021-01-21 22:35:54 2021-01-21 22:37:14 Califo… 17660
## 4 4FA453A75AE3… electr… 2021-01-07 13:31:13 2021-01-07 13:42:55 Califo… 17660
## 5 BE5E8EB4E726… electr… 2021-01-23 02:24:02 2021-01-23 02:24:45 Califo… 17660
## 6 5D8969F88C77… electr… 2021-01-09 14:24:07 2021-01-09 15:17:54 Califo… 17660
## 7 09275CC10F85… electr… 2021-01-04 05:05:04 2021-01-04 05:10:39 Califo… 17660
## 8 DF7A32A217AE… electr… 2021-01-14 15:07:00 2021-01-14 15:13:40 Califo… 17660
## 9 C2EFC62379EB… electr… 2021-01-09 09:57:55 2021-01-09 10:00:26 Califo… 17660
## 10 B9F73448DFBE… classi… 2021-01-24 19:15:38 2021-01-24 19:22:51 Califo… 17660
## # … with 96,824 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q2_2021 <- rename(q2_2021
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 49,622 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 89E7AA6C2922… classi… 2021-02-12 16:14:56 2021-02-12 16:21:43 Glenwo… 525
## 2 0FEFDE260356… classi… 2021-02-14 17:52:38 2021-02-14 18:12:09 Glenwo… 525
## 3 E6159D746B2D… electr… 2021-02-09 19:10:18 2021-02-09 19:19:10 Clark … KA1503…
## 4 B32D3199F1C2… classi… 2021-02-02 17:49:41 2021-02-02 17:54:06 Wood S… 637
## 5 83E463F23575… electr… 2021-02-23 15:07:23 2021-02-23 15:22:37 State … 13216
## 6 BDAA7E3494E8… electr… 2021-02-24 15:43:33 2021-02-24 15:49:05 Fairba… 18003
## 7 A77274235117… classi… 2021-02-01 17:47:42 2021-02-01 17:48:33 LaSall… KP1705…
## 8 295476889D9B… classi… 2021-02-11 18:33:53 2021-02-11 18:35:09 Fairba… 18003
## 9 362087194BA4… classi… 2021-02-27 15:13:39 2021-02-27 15:36:36 LaSall… KP1705…
## 10 21630F715038… classi… 2021-02-20 08:59:42 2021-02-20 09:17:04 LaSall… KP1705…
## # … with 49,612 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q3_2021 <- rename(q3_2021
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 228,496 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 CFA86D4455AA… classi… 2021-03-16 08:32:30 2021-03-16 08:36:34 Humbol… 15651
## 2 30D9DC61227D… classi… 2021-03-28 01:26:28 2021-03-28 01:36:55 Humbol… 15651
## 3 846D87A15682… classi… 2021-03-11 21:17:29 2021-03-11 21:33:53 Shield… 15443
## 4 994D05AA75A1… classi… 2021-03-11 13:26:42 2021-03-11 13:55:41 Winthr… TA1308…
## 5 DF7464FBE92D… classi… 2021-03-21 09:09:37 2021-03-21 09:27:33 Glenwo… 525
## 6 CEBA8516FD17… classi… 2021-03-20 11:08:47 2021-03-20 11:29:39 Glenwo… 525
## 7 297268586B79… classi… 2021-03-20 14:10:41 2021-03-20 14:22:13 State … 13050
## 8 F39301858B60… electr… 2021-03-23 07:56:51 2021-03-23 08:05:50 Shore … TA1308…
## 9 D297F199D875… electr… 2021-03-31 15:31:19 2021-03-31 15:35:58 Clinto… 13021
## 10 36B877141175… classi… 2021-03-11 17:37:37 2021-03-11 17:52:44 Michig… TA1305…
## # … with 228,486 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q4_2020 <- rename(q4_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 84,776 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 A847FADBBC63… docked… 2020-04-26 17:45:14 2020-04-26 18:12:03 Eckhar… 86
## 2 5405B80E996F… docked… 2020-04-17 17:08:54 2020-04-17 17:17:03 Drake … 503
## 3 5DD24A79A4E0… docked… 2020-04-01 17:54:13 2020-04-01 18:08:36 McClur… 142
## 4 2A59BBDF5CDB… docked… 2020-04-07 12:50:19 2020-04-07 13:02:31 Califo… 216
## 5 27AD306C119C… docked… 2020-04-18 10:22:59 2020-04-18 11:15:54 Rush S… 125
## 6 356216E87513… docked… 2020-04-30 17:55:47 2020-04-30 18:01:11 Mies v… 173
## 7 A2759CB06A81… docked… 2020-04-02 14:47:19 2020-04-02 14:52:32 Street… 35
## 8 FC8BC2E2D54F… docked… 2020-04-07 12:22:20 2020-04-07 13:38:09 Ogden … 434
## 9 9EC5648678DE… docked… 2020-04-15 10:30:11 2020-04-15 10:35:55 LaSall… 627
## 10 A8FFF89140C3… docked… 2020-04-04 15:02:28 2020-04-04 15:19:47 Kedzie… 377
## # … with 84,766 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q5_2020 <- rename(q5_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 200,274 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 02668AD35674… docked… 2020-05-27 10:03:52 2020-05-27 10:16:49 Frankl… 36
## 2 7A50CCAF1EDD… docked… 2020-05-25 10:47:11 2020-05-25 11:05:40 Clark … 340
## 3 2FFCDFDB91FE… docked… 2020-05-02 14:11:03 2020-05-02 15:48:21 Kedzie… 260
## 4 58991CF1DB75… docked… 2020-05-02 16:25:36 2020-05-02 16:39:28 Claren… 251
## 5 A79651EFECC2… docked… 2020-05-29 12:49:54 2020-05-29 13:27:11 Hermit… 261
## 6 1466C5B39F68… docked… 2020-05-29 13:27:24 2020-05-29 14:14:45 Halste… 206
## 7 2500D7957D4D… docked… 2020-05-20 12:51:41 2020-05-20 13:46:47 Hermit… 261
## 8 ED42D3E06AFB… docked… 2020-05-06 18:21:42 2020-05-06 19:07:07 Ritchi… 180
## 9 23AFBD962F9C… docked… 2020-05-30 17:00:58 2020-05-30 17:19:52 Halste… 331
## 10 52C0D13F6B81… docked… 2020-05-23 10:22:02 2020-05-23 10:52:02 Damen … 219
## # … with 200,264 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q6_2020 <- rename(q6_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 343,005 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 8CD5DE2C2B6C… docked… 2020-06-13 23:24:48 2020-06-13 23:36:55 Wilton… 117
## 2 9A191EB2C751… docked… 2020-06-26 07:26:10 2020-06-26 07:31:58 Federa… 41
## 3 F37D14B0B565… docked… 2020-06-23 17:12:41 2020-06-23 17:21:14 Daley … 81
## 4 C41237B506E8… docked… 2020-06-20 01:09:35 2020-06-20 01:28:24 Broadw… 303
## 5 4B51B3B0BDA7… docked… 2020-06-25 16:59:25 2020-06-25 17:08:48 Sheffi… 327
## 6 D50DF288196B… docked… 2020-06-17 18:07:18 2020-06-17 18:18:14 Sheffi… 327
## 7 165FA6D223E5… docked… 2020-06-25 07:24:33 2020-06-25 07:31:11 Federa… 41
## 8 D8236CFC050E… docked… 2020-06-19 00:00:56 2020-06-19 00:09:15 Sheffi… 115
## 9 9D82B9B53C37… docked… 2020-06-30 12:11:36 2020-06-30 12:32:43 Calume… 338
## 10 3DFF4AB10A68… docked… 2020-06-28 14:17:09 2020-06-28 14:27:51 Milwau… 84
## # … with 342,995 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q7_2020 <- rename(q7_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 551,480 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 762198876D69… docked… 2020-07-09 15:22:02 2020-07-09 15:25:52 Ritchi… 180
## 2 BEC9C9FBA0D4… docked… 2020-07-24 23:56:30 2020-07-25 00:20:17 Halste… 299
## 3 D2FD8EA432C7… docked… 2020-07-08 19:49:07 2020-07-08 19:56:22 Lake S… 329
## 4 54AE594E20B3… docked… 2020-07-17 19:06:42 2020-07-17 19:27:38 LaSall… 181
## 5 54025FDC7440… docked… 2020-07-04 10:39:57 2020-07-04 10:45:05 Lake S… 268
## 6 65636B619E24… docked… 2020-07-28 16:33:03 2020-07-28 16:49:10 Fairba… 635
## 7 22DB94283ECF… docked… 2020-07-30 11:58:12 2020-07-30 12:16:12 Bissel… 113
## 8 C9D789BEF899… docked… 2020-07-13 16:48:03 2020-07-13 16:57:00 St. Cl… 211
## 9 FBE24D943CDE… docked… 2020-07-30 11:00:12 2020-07-30 11:14:48 Clark … 176
## 10 8A2BBE457325… docked… 2020-07-06 18:05:29 2020-07-06 18:15:38 Frankl… 31
## # … with 551,470 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q8_2020 <- rename(q8_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 622,361 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 322BD23D2877… docked… 2020-08-20 18:08:14 2020-08-20 18:17:51 Lake S… 329
## 2 2A3AEF1AB905… electr… 2020-08-27 18:46:04 2020-08-27 19:54:51 Michig… 168
## 3 67DC1D133E8B… electr… 2020-08-26 19:44:14 2020-08-26 21:53:07 Columb… 195
## 4 C79FBBD412E5… electr… 2020-08-27 12:05:41 2020-08-27 12:53:45 Daley … 81
## 5 13814D3D661E… electr… 2020-08-27 16:49:02 2020-08-27 16:59:49 Leavit… 658
## 6 56349A5A42F0… electr… 2020-08-27 17:26:23 2020-08-27 18:07:50 Leavit… 658
## 7 EB6ABC5570C2… electr… 2020-08-26 20:14:02 2020-08-26 20:34:00 Cityfr… 196
## 8 B4ECE389A1DE… electr… 2020-08-26 21:59:50 2020-08-26 22:12:35 Sheffi… 67
## 9 0B355B0FE076… electr… 2020-08-26 19:17:42 2020-08-26 19:32:14 Southp… 153
## 10 1ECE04F779E9… electr… 2020-08-27 15:13:57 2020-08-27 15:41:59 Theate… 177
## # … with 622,351 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q9_2020 <- rename(q9_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 532,958 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 2B22BD5F95FB… electr… 2020-09-17 14:27:11 2020-09-17 14:44:24 Michig… 52
## 2 A7FB70B4AFC6… electr… 2020-09-17 15:07:31 2020-09-17 15:07:45 W Oakd… NA
## 3 86057FA01BAC… electr… 2020-09-17 15:09:04 2020-09-17 15:09:35 W Oakd… NA
## 4 57F6DC9A153D… electr… 2020-09-17 18:10:46 2020-09-17 18:35:49 Ashlan… 246
## 5 B9C4712F78C1… electr… 2020-09-17 15:16:13 2020-09-17 15:52:55 Fairba… 24
## 6 378BBCE1E444… electr… 2020-09-17 18:37:04 2020-09-17 19:23:28 Clark … 94
## 7 0CB5E2496B6F… electr… 2020-09-16 21:39:57 2020-09-16 21:53:44 Wells … 291
## 8 9673F5D39BDB… electr… 2020-09-17 12:18:06 2020-09-17 12:18:19 <NA> NA
## 9 54B91F5C95B2… electr… 2020-09-17 17:09:17 2020-09-17 17:34:20 <NA> NA
## 10 91CEBB66076D… electr… 2020-09-17 12:20:25 2020-09-17 12:29:47 <NA> NA
## # … with 532,948 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q10_2020 <- rename(q10_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 388,653 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 ACB6B40CF5B9… electr… 2020-10-31 19:39:43 2020-10-31 19:57:12 Lakevi… 313
## 2 DF450C72FD10… electr… 2020-10-31 23:50:08 2020-11-01 00:04:16 Southp… 227
## 3 B6396B54A15A… electr… 2020-10-31 23:00:01 2020-10-31 23:08:22 Stony … 102
## 4 44A4AEE261B9… electr… 2020-10-31 22:16:43 2020-10-31 22:19:35 Clark … 165
## 5 10B7DD76A6A2… electr… 2020-10-31 19:38:19 2020-10-31 19:54:32 Southp… 190
## 6 DA6C37596601… electr… 2020-10-29 17:38:04 2020-10-29 17:45:43 Larrab… 359
## 7 C2F3808FD56B… electr… 2020-10-29 09:03:06 2020-10-29 09:17:56 Lakevi… 313
## 8 15B13B5A508B… electr… 2020-10-29 16:37:21 2020-10-29 16:52:40 Rush S… 125
## 9 285D224410C1… electr… 2020-10-28 23:12:03 2020-10-28 23:24:32 <NA> NA
## 10 E1FB79FFE6DB… electr… 2020-10-29 16:38:44 2020-10-29 16:50:17 Canal … 174
## # … with 388,643 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q11_2020 <- rename(q11_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 259,716 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <dbl>
## 1 BD0A6FF6FFF9… electr… 2020-11-01 13:36:00 2020-11-01 13:45:40 Dearbo… 110
## 2 96A7A7A4BDE4… electr… 2020-11-01 10:03:26 2020-11-01 10:14:45 Frankl… 672
## 3 C61526D06582… electr… 2020-11-01 00:34:05 2020-11-01 01:03:06 Lake S… 76
## 4 E533E89C3208… electr… 2020-11-01 00:45:16 2020-11-01 00:54:31 Leavit… 659
## 5 1C9F4EF18C16… electr… 2020-11-01 15:43:25 2020-11-01 16:16:52 Buckin… 2
## 6 7259585D8276… electr… 2020-11-14 15:55:17 2020-11-14 16:44:38 Wabash… 72
## 7 91FE5C8F8A67… electr… 2020-11-14 16:47:29 2020-11-14 17:03:03 Lake S… 76
## 8 9E7A79ADA90C… electr… 2020-11-14 16:04:15 2020-11-14 16:19:33 <NA> NA
## 9 A5B02C0D41DB… electr… 2020-11-14 16:24:09 2020-11-14 16:51:34 Marshf… 58
## 10 8234407C29FE… electr… 2020-11-14 01:24:22 2020-11-14 01:31:42 Clark … 394
## # … with 259,706 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
(q12_2020 <- rename(q12_2020
,trip_id = ride_id
,bike_id = rideable_type
,start_time = started_at
,end_time = ended_at
,from_station_name = start_station_name
,from_station_id = start_station_id
,to_station_name = end_station_name
,to_station_id = end_station_id
,usertype = member_casual))
## # A tibble: 131,573 × 13
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 70B6A9A437D4… classi… 2020-12-27 12:44:29 2020-12-27 12:55:06 Aberde… 13157
## 2 158A465D4E74… electr… 2020-12-18 17:37:15 2020-12-18 17:44:19 <NA> <NA>
## 3 5262016E0F1F… electr… 2020-12-15 15:04:33 2020-12-15 15:11:28 <NA> <NA>
## 4 BE119628E44F… electr… 2020-12-15 15:54:18 2020-12-15 16:00:11 <NA> <NA>
## 5 69AF78D57854… electr… 2020-12-22 12:08:17 2020-12-22 12:10:59 <NA> <NA>
## 6 C1DECC4AB488… electr… 2020-12-22 13:26:37 2020-12-22 13:34:50 <NA> <NA>
## 7 B014A60B856C… electr… 2020-12-03 16:23:48 2020-12-03 16:33:39 <NA> <NA>
## 8 1E127B1929C0… electr… 2020-12-03 15:03:38 2020-12-03 15:12:39 <NA> <NA>
## 9 05F41F5137B5… electr… 2020-12-12 09:26:17 2020-12-12 09:26:35 <NA> <NA>
## 10 BB807646588D… electr… 2020-12-18 12:52:06 2020-12-18 12:52:23 <NA> <NA>
## # … with 131,563 more rows, 7 more variables: to_station_name <chr>,
## # to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## # end_lng <dbl>, usertype <chr>, and abbreviated variable names
## # ¹from_station_name, ²from_station_id
Inspect the dataframes and look for incongruencies
str(q1_2021)
## spc_tbl_ [96,834 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ bike_id : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
## $ end_time : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
## $ from_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ from_station_id : chr [1:96834] "17660" "17660" "17660" "17660" ...
## $ to_station_name : chr [1:96834] NA NA NA NA ...
## $ to_station_id : chr [1:96834] NA NA NA NA ...
## $ start_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ usertype : chr [1:96834] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q2_2021)
## spc_tbl_ [49,622 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
## $ bike_id : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ start_time : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
## $ end_time : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
## $ from_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
## $ from_station_id : chr [1:49622] "525" "525" "KA1503000012" "637" ...
## $ to_station_name : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
## $ to_station_id : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
## $ start_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ start_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ usertype : chr [1:49622] "member" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q3_2021)
## spc_tbl_ [228,496 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:228496] "CFA86D4455AA1030" "30D9DC61227D1AF3" "846D87A15682A284" "994D05AA75A168F2" ...
## $ bike_id : chr [1:228496] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ start_time : POSIXct[1:228496], format: "2021-03-16 08:32:30" "2021-03-28 01:26:28" ...
## $ end_time : POSIXct[1:228496], format: "2021-03-16 08:36:34" "2021-03-28 01:36:55" ...
## $ from_station_name: chr [1:228496] "Humboldt Blvd & Armitage Ave" "Humboldt Blvd & Armitage Ave" "Shields Ave & 28th Pl" "Winthrop Ave & Lawrence Ave" ...
## $ from_station_id : chr [1:228496] "15651" "15651" "15443" "TA1308000021" ...
## $ to_station_name : chr [1:228496] "Stave St & Armitage Ave" "Central Park Ave & Bloomingdale Ave" "Halsted St & 35th St" "Broadway & Sheridan Rd" ...
## $ to_station_id : chr [1:228496] "13266" "18017" "TA1308000043" "13323" ...
## $ start_lat : num [1:228496] 41.9 41.9 41.8 42 42 ...
## $ start_lng : num [1:228496] -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num [1:228496] 41.9 41.9 41.8 42 42.1 ...
## $ end_lng : num [1:228496] -87.7 -87.7 -87.6 -87.6 -87.7 ...
## $ usertype : chr [1:228496] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q4_2020)
## spc_tbl_ [84,776 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:84776] "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
## $ bike_id : chr [1:84776] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ start_time : POSIXct[1:84776], format: "2020-04-26 17:45:14" "2020-04-17 17:08:54" ...
## $ end_time : POSIXct[1:84776], format: "2020-04-26 18:12:03" "2020-04-17 17:17:03" ...
## $ from_station_name: chr [1:84776] "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
## $ from_station_id : num [1:84776] 86 503 142 216 125 173 35 434 627 377 ...
## $ to_station_name : chr [1:84776] "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
## $ to_station_id : num [1:84776] 152 499 255 657 323 35 635 382 359 508 ...
## $ start_lat : num [1:84776] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:84776] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:84776] 41.9 41.9 41.9 41.9 42 ...
## $ end_lng : num [1:84776] -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ usertype : chr [1:84776] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q5_2020)
## spc_tbl_ [200,274 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:200274] "02668AD35674B983" "7A50CCAF1EDDB28F" "2FFCDFDB91FE9A52" "58991CF1DB75BA84" ...
## $ bike_id : chr [1:200274] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ start_time : POSIXct[1:200274], format: "2020-05-27 10:03:52" "2020-05-25 10:47:11" ...
## $ end_time : POSIXct[1:200274], format: "2020-05-27 10:16:49" "2020-05-25 11:05:40" ...
## $ from_station_name: chr [1:200274] "Franklin St & Jackson Blvd" "Clark St & Wrightwood Ave" "Kedzie Ave & Milwaukee Ave" "Clarendon Ave & Leland Ave" ...
## $ from_station_id : num [1:200274] 36 340 260 251 261 206 261 180 331 219 ...
## $ to_station_name : chr [1:200274] "Wabash Ave & Grand Ave" "Clark St & Leland Ave" "Kedzie Ave & Milwaukee Ave" "Lake Shore Dr & Wellington Ave" ...
## $ to_station_id : num [1:200274] 199 326 260 157 206 22 261 180 300 305 ...
## $ start_lat : num [1:200274] 41.9 41.9 41.9 42 41.9 ...
## $ start_lng : num [1:200274] -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:200274] 41.9 42 41.9 41.9 41.8 ...
## $ end_lng : num [1:200274] -87.6 -87.7 -87.7 -87.6 -87.6 ...
## $ usertype : chr [1:200274] "member" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q6_2020)
## spc_tbl_ [343,005 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:343005] "8CD5DE2C2B6C4CFC" "9A191EB2C751D85D" "F37D14B0B5659BCF" "C41237B506E85FA1" ...
## $ bike_id : chr [1:343005] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ start_time : POSIXct[1:343005], format: "2020-06-13 23:24:48" "2020-06-26 07:26:10" ...
## $ end_time : POSIXct[1:343005], format: "2020-06-13 23:36:55" "2020-06-26 07:31:58" ...
## $ from_station_name: chr [1:343005] "Wilton Ave & Belmont Ave" "Federal St & Polk St" "Daley Center Plaza" "Broadway & Cornelia Ave" ...
## $ from_station_id : num [1:343005] 117 41 81 303 327 327 41 115 338 84 ...
## $ to_station_name : chr [1:343005] "Damen Ave & Clybourn Ave" "Daley Center Plaza" "State St & Harrison St" "Broadway & Berwyn Ave" ...
## $ to_station_id : num [1:343005] 163 81 5 294 117 117 81 303 164 53 ...
## $ start_lat : num [1:343005] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:343005] -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:343005] 41.9 41.9 41.9 42 41.9 ...
## $ end_lng : num [1:343005] -87.7 -87.6 -87.6 -87.7 -87.7 ...
## $ usertype : chr [1:343005] "casual" "member" "member" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q7_2020)
## spc_tbl_ [551,480 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:551480] "762198876D69004D" "BEC9C9FBA0D4CF1B" "D2FD8EA432C77EC1" "54AE594E20B35881" ...
## $ bike_id : chr [1:551480] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ start_time : POSIXct[1:551480], format: "2020-07-09 15:22:02" "2020-07-24 23:56:30" ...
## $ end_time : POSIXct[1:551480], format: "2020-07-09 15:25:52" "2020-07-25 00:20:17" ...
## $ from_station_name: chr [1:551480] "Ritchie Ct & Banks St" "Halsted St & Roscoe St" "Lake Shore Dr & Diversey Pkwy" "LaSalle St & Illinois St" ...
## $ from_station_id : num [1:551480] 180 299 329 181 268 635 113 211 176 31 ...
## $ to_station_name : chr [1:551480] "Wells St & Evergreen Ave" "Broadway & Ridge Ave" "Clark St & Wellington Ave" "Clark St & Armitage Ave" ...
## $ to_station_id : num [1:551480] 291 461 156 94 301 289 140 31 191 142 ...
## $ start_lat : num [1:551480] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:551480] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:551480] 41.9 42 41.9 41.9 41.9 ...
## $ end_lng : num [1:551480] -87.6 -87.7 -87.6 -87.6 -87.6 ...
## $ usertype : chr [1:551480] "member" "member" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q8_2020)
## spc_tbl_ [622,361 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:622361] "322BD23D287743ED" "2A3AEF1AB9054D8B" "67DC1D133E8B5816" "C79FBBD412E578A7" ...
## $ bike_id : chr [1:622361] "docked_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:622361], format: "2020-08-20 18:08:14" "2020-08-27 18:46:04" ...
## $ end_time : POSIXct[1:622361], format: "2020-08-20 18:17:51" "2020-08-27 19:54:51" ...
## $ from_station_name: chr [1:622361] "Lake Shore Dr & Diversey Pkwy" "Michigan Ave & 14th St" "Columbus Dr & Randolph St" "Daley Center Plaza" ...
## $ from_station_id : num [1:622361] 329 168 195 81 658 658 196 67 153 177 ...
## $ to_station_name : chr [1:622361] "Clark St & Lincoln Ave" "Michigan Ave & 14th St" "State St & Randolph St" "State St & Kinzie St" ...
## $ to_station_id : num [1:622361] 141 168 44 47 658 658 49 229 225 305 ...
## $ start_lat : num [1:622361] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:622361] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:622361] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:622361] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ usertype : chr [1:622361] "member" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q9_2020)
## spc_tbl_ [532,958 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:532958] "2B22BD5F95FB2629" "A7FB70B4AFC6CAF2" "86057FA01BAC778E" "57F6DC9A153DB98C" ...
## $ bike_id : chr [1:532958] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:532958], format: "2020-09-17 14:27:11" "2020-09-17 15:07:31" ...
## $ end_time : POSIXct[1:532958], format: "2020-09-17 14:44:24" "2020-09-17 15:07:45" ...
## $ from_station_name: chr [1:532958] "Michigan Ave & Lake St" "W Oakdale Ave & N Broadway" "W Oakdale Ave & N Broadway" "Ashland Ave & Belle Plaine Ave" ...
## $ from_station_id : num [1:532958] 52 NA NA 246 24 94 291 NA NA NA ...
## $ to_station_name : chr [1:532958] "Green St & Randolph St" "W Oakdale Ave & N Broadway" "W Oakdale Ave & N Broadway" "Montrose Harbor" ...
## $ to_station_id : num [1:532958] 112 NA NA 249 24 NA 256 NA NA NA ...
## $ start_lat : num [1:532958] 41.9 41.9 41.9 42 41.9 ...
## $ start_lng : num [1:532958] -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:532958] 41.9 41.9 41.9 42 41.9 ...
## $ end_lng : num [1:532958] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ usertype : chr [1:532958] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q10_2020)
## spc_tbl_ [388,653 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:388653] "ACB6B40CF5B9044C" "DF450C72FD109C01" "B6396B54A15AC0DF" "44A4AEE261B9E854" ...
## $ bike_id : chr [1:388653] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:388653], format: "2020-10-31 19:39:43" "2020-10-31 23:50:08" ...
## $ end_time : POSIXct[1:388653], format: "2020-10-31 19:57:12" "2020-11-01 00:04:16" ...
## $ from_station_name: chr [1:388653] "Lakeview Ave & Fullerton Pkwy" "Southport Ave & Waveland Ave" "Stony Island Ave & 67th St" "Clark St & Grace St" ...
## $ from_station_id : num [1:388653] 313 227 102 165 190 359 313 125 NA 174 ...
## $ to_station_name : chr [1:388653] "Rush St & Hubbard St" "Kedzie Ave & Milwaukee Ave" "University Ave & 57th St" "Broadway & Sheridan Rd" ...
## $ to_station_id : num [1:388653] 125 260 423 256 185 53 125 313 199 635 ...
## $ start_lat : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
## $ start_lng : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ end_lat : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
## $ end_lng : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
## $ usertype : chr [1:388653] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q11_2020)
## spc_tbl_ [259,716 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:259716] "BD0A6FF6FFF9B921" "96A7A7A4BDE4F82D" "C61526D06582BDC5" "E533E89C32080B9E" ...
## $ bike_id : chr [1:259716] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:259716], format: "2020-11-01 13:36:00" "2020-11-01 10:03:26" ...
## $ end_time : POSIXct[1:259716], format: "2020-11-01 13:45:40" "2020-11-01 10:14:45" ...
## $ from_station_name: chr [1:259716] "Dearborn St & Erie St" "Franklin St & Illinois St" "Lake Shore Dr & Monroe St" "Leavitt St & Chicago Ave" ...
## $ from_station_id : num [1:259716] 110 672 76 659 2 72 76 NA 58 394 ...
## $ to_station_name : chr [1:259716] "St. Clair St & Erie St" "Noble St & Milwaukee Ave" "Federal St & Polk St" "Stave St & Armitage Ave" ...
## $ to_station_id : num [1:259716] 211 29 41 185 2 76 72 NA 288 273 ...
## $ start_lat : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:259716] -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:259716] -87.6 -87.7 -87.6 -87.7 -87.6 ...
## $ usertype : chr [1:259716] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(q12_2020)
## spc_tbl_ [131,573 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : chr [1:131573] "70B6A9A437D4C30D" "158A465D4E74C54A" "5262016E0F1F2F9A" "BE119628E44F871E" ...
## $ bike_id : chr [1:131573] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:131573], format: "2020-12-27 12:44:29" "2020-12-18 17:37:15" ...
## $ end_time : POSIXct[1:131573], format: "2020-12-27 12:55:06" "2020-12-18 17:44:19" ...
## $ from_station_name: chr [1:131573] "Aberdeen St & Jackson Blvd" NA NA NA ...
## $ from_station_id : chr [1:131573] "13157" NA NA NA ...
## $ to_station_name : chr [1:131573] "Desplaines St & Kinzie St" NA NA NA ...
## $ to_station_id : chr [1:131573] "TA1306000003" NA NA NA ...
## $ start_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ start_lng : num [1:131573] -87.7 -87.7 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
## $ end_lng : num [1:131573] -87.6 -87.7 -87.7 -87.7 -87.6 ...
## $ usertype : chr [1:131573] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Convert trip_id, bike_id, from_station_id and to_station_id to character so that they can stack correctly.
q1_2021 <- mutate(q1_2021, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q2_2021 <- mutate(q2_2021, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q3_2021 <- mutate(q3_2021, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q4_2020 <- mutate(q4_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q5_2020 <- mutate(q5_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q6_2020 <- mutate(q6_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q7_2020 <- mutate(q7_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q8_2020 <- mutate(q8_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q9_2020 <- mutate(q9_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q10_2020 <- mutate(q10_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q11_2020 <- mutate(q11_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
q12_2020 <- mutate(q12_2020, trip_id = as.character(trip_id)
,bike_id = as.character(bike_id)
,from_station_id = as.character(from_station_id)
,to_station_id = as.character(to_station_id))
Stack individual quarter’s data frames into one big data frame.
library(dplyr)
all_trips <- bind_rows(q1_2021, q2_2021, q3_2021, q4_2020, q5_2020, q6_2020, q7_2020, q8_2020, q9_2020, q10_2020, q11_2020, q12_2020)
View(all_trips)
Remove start_lat, start_long, end_lat, and end_long fields as this data were unnecessary for the calculations and visualization.
all_trips <- all_trips %>%
select(-c(start_lat, start_lng, end_lat, end_lng))
This involves Cleaning Up and adding Data To Prepare for Analysis.
First, Inspect the new table that has been created
colnames(all_trips) #List of column names
## [1] "trip_id" "bike_id" "start_time"
## [4] "end_time" "from_station_name" "from_station_id"
## [7] "to_station_name" "to_station_id" "usertype"
nrow(all_trips) #How many rows are in data frame?
## [1] 3489748
dim(all_trips) #Dimensions of the data frame?
## [1] 3489748 9
head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips)
## # A tibble: 6 × 9
## trip_id bike_id start_time end_time from_…¹ from_…²
## <chr> <chr> <dttm> <dttm> <chr> <chr>
## 1 E19E6F1B8D4C4… electr… 2021-01-23 16:14:19 2021-01-23 16:24:44 Califo… 17660
## 2 DC88F20C2C55F… electr… 2021-01-27 18:43:08 2021-01-27 18:47:12 Califo… 17660
## 3 EC45C94683FE3… electr… 2021-01-21 22:35:54 2021-01-21 22:37:14 Califo… 17660
## 4 4FA453A75AE37… electr… 2021-01-07 13:31:13 2021-01-07 13:42:55 Califo… 17660
## 5 BE5E8EB4E7263… electr… 2021-01-23 02:24:02 2021-01-23 02:24:45 Califo… 17660
## 6 5D8969F88C773… electr… 2021-01-09 14:24:07 2021-01-09 15:17:54 Califo… 17660
## # … with 3 more variables: to_station_name <chr>, to_station_id <chr>,
## # usertype <chr>, and abbreviated variable names ¹from_station_name,
## # ²from_station_id
str(all_trips) #See list of columns and data types (numeric, character, etc)
## tibble [3,489,748 × 9] (S3: tbl_df/tbl/data.frame)
## $ trip_id : chr [1:3489748] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ bike_id : chr [1:3489748] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:3489748], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
## $ end_time : POSIXct[1:3489748], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
## $ from_station_name: chr [1:3489748] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ from_station_id : chr [1:3489748] "17660" "17660" "17660" "17660" ...
## $ to_station_name : chr [1:3489748] NA NA NA NA ...
## $ to_station_id : chr [1:3489748] NA NA NA NA ...
## $ usertype : chr [1:3489748] "member" "member" "member" "member" ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
## trip_id bike_id start_time
## Length:3489748 Length:3489748 Min. :2020-04-01 00:00:30.00
## Class :character Class :character 1st Qu.:2020-07-14 19:38:28.00
## Mode :character Mode :character Median :2020-08-29 14:50:36.50
## Mean :2020-09-10 01:21:45.98
## 3rd Qu.:2020-10-20 18:14:13.00
## Max. :2021-03-31 23:59:08.00
## end_time from_station_name from_station_id
## Min. :2020-04-01 00:10:45.00 Length:3489748 Length:3489748
## 1st Qu.:2020-07-14 20:13:07.75 Class :character Class :character
## Median :2020-08-29 15:21:13.00 Mode :character Mode :character
## Mean :2020-09-10 01:46:31.98
## 3rd Qu.:2020-10-20 18:28:46.25
## Max. :2021-04-06 11:00:11.00
## to_station_name to_station_id usertype
## Length:3489748 Length:3489748 Length:3489748
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
table(all_trips$usertype)
##
## casual member
## 1430376 2059372
Reassign to the desired values for member & casual
all_trips <- all_trips %>%
mutate(usertype = recode(usertype
,"member" = "Subscriber"
,"casual" = "Customer"))
table(all_trips$usertype)
##
## Customer Subscriber
## 1430376 2059372
Add columns that list the date, month, day, and year of each ride.This will allow us to aggregate ride data for each month, day, or year.
Note: Before completing these operations we could only aggregate at the ride level: https://www.statmethods.net/input/dates.html more on date formats in R found at this link.
all_trips$date <- as.Date(all_trips$start_time) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
calculation to all_trips (in seconds) refer this link: https://stat.ethz.ch/R-manual/R-devel/library/base/html/difftime.html for more information.
all_trips$ride_length <- difftime(all_trips$end_time, all_trips$start_time)
str(all_trips)
## tibble [3,489,748 × 15] (S3: tbl_df/tbl/data.frame)
## $ trip_id : chr [1:3489748] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
## $ bike_id : chr [1:3489748] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ start_time : POSIXct[1:3489748], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
## $ end_time : POSIXct[1:3489748], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
## $ from_station_name: chr [1:3489748] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
## $ from_station_id : chr [1:3489748] "17660" "17660" "17660" "17660" ...
## $ to_station_name : chr [1:3489748] NA NA NA NA ...
## $ to_station_id : chr [1:3489748] NA NA NA NA ...
## $ usertype : chr [1:3489748] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ date : Date[1:3489748], format: "2021-01-23" "2021-01-27" ...
## $ month : chr [1:3489748] "01" "01" "01" "01" ...
## $ day : chr [1:3489748] "23" "27" "21" "07" ...
## $ year : chr [1:3489748] "2021" "2021" "2021" "2021" ...
## $ day_of_week : chr [1:3489748] "Saturday" "Wednesday" "Thursday" "Thursday" ...
## $ ride_length : 'difftime' num [1:3489748] 625 244 80 702 ...
## ..- attr(*, "units")= chr "secs"
is.factor(all_trips$ride_length)
## [1] FALSE
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE
The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative
Therefore, it is required to create a new version of the dataframe (v2) since data is being removed. https://www.datasciencemadesimple.com/delete-or-drop-rows-in-r-with-conditions-2/ for more information.
all_trips_v2 <- all_trips[!(all_trips$from_station_name == "HQ QR" | all_trips$ride_length<0),]
Descriptive analysis on ride_length (all figures in seconds).
mean(all_trips_v2$ride_length, na.rm = TRUE) #straight average (total ride length / rides)
## [1] 1704.123
median(all_trips_v2$ride_length, na.rm = TRUE) #midpoint number in the ascending array of ride lengths
## [1] 884
max(all_trips_v2$ride_length, na.rm = TRUE) #longest ride
## [1] 3523202
min(all_trips_v2$ride_length, na.rm = TRUE) #shortest ride
## [1] 0
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0 483 884 1704 1616 3523202 122128
Remove rows with missing values in the ride_length column using the na.omit() function.
all_trips_v2_clean <- na.omit(all_trips_v2)
View(all_trips_v2_clean)
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = mean)
## all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1 Customer 2727.148
## 2 Subscriber 955.989
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = median)
## all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1 Customer 1298
## 2 Subscriber 696
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = max)
## all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1 Customer 3341033
## 2 Subscriber 3523202
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = min)
## all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1 Customer 0
## 2 Subscriber 0
See the average ride time by each day for Subscriber vs customer users.
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)
## all_trips_v2_clean$usertype all_trips_v2_clean$day_of_week
## 1 Customer Friday
## 2 Subscriber Friday
## 3 Customer Monday
## 4 Subscriber Monday
## 5 Customer Saturday
## 6 Subscriber Saturday
## 7 Customer Sunday
## 8 Subscriber Sunday
## 9 Customer Thursday
## 10 Subscriber Thursday
## 11 Customer Tuesday
## 12 Subscriber Tuesday
## 13 Customer Wednesday
## 14 Subscriber Wednesday
## all_trips_v2_clean$ride_length
## 1 2595.3146
## 2 934.0045
## 3 2729.2281
## 4 905.6358
## 5 2838.4543
## 6 1061.0147
## 7 3068.1051
## 8 1088.6371
## 9 2615.7031
## 10 901.8053
## 11 2455.6268
## 12 898.2676
## 13 2452.9892
## 14 902.8526
all_trips_v2_clean$day_of_week <- ordered(all_trips_v2_clean$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
all_trips_v2_clean$day_of_week <- ordered(all_trips_v2_clean$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)
## all_trips_v2_clean$usertype all_trips_v2_clean$day_of_week
## 1 Customer Sunday
## 2 Subscriber Sunday
## 3 Customer Monday
## 4 Subscriber Monday
## 5 Customer Tuesday
## 6 Subscriber Tuesday
## 7 Customer Wednesday
## 8 Subscriber Wednesday
## 9 Customer Thursday
## 10 Subscriber Thursday
## 11 Customer Friday
## 12 Subscriber Friday
## 13 Customer Saturday
## 14 Subscriber Saturday
## all_trips_v2_clean$ride_length
## 1 3068.1051
## 2 1088.6371
## 3 2729.2281
## 4 905.6358
## 5 2455.6268
## 6 898.2676
## 7 2452.9892
## 8 902.8526
## 9 2615.7031
## 10 901.8053
## 11 2595.3146
## 12 934.0045
## 13 2838.4543
## 14 1061.0147
Analyze ridership data by type and weekday
all_trips_v2_clean %>%
mutate(weekday = wday(start_time, label = TRUE)) %>% #creates weekday field using wday()
group_by(usertype, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(usertype, weekday)
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups: usertype [2]
## usertype weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 Customer Sun 249761 3068.
## 2 Customer Mon 142042 2729.
## 3 Customer Tue 136218 2456.
## 4 Customer Wed 148352 2453.
## 5 Customer Thu 156177 2616.
## 6 Customer Fri 196451 2595.
## 7 Customer Sat 319017 2838.
## 8 Subscriber Sun 250465 1089.
## 9 Subscriber Mon 251905 906.
## 10 Subscriber Tue 268223 898.
## 11 Subscriber Wed 288390 903.
## 12 Subscriber Thu 283731 902.
## 13 Subscriber Fri 288897 934.
## 14 Subscriber Sat 304608 1061.
all_trips_v2_clean %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(usertype, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(usertype, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = usertype)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
all_trips_v2_clean %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(usertype, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(usertype, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = usertype)) +
geom_col(position = "dodge")
## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.
Now, Create a csv file that we will visualize in Excel, Tableau, or my presentation software.
counts <- aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)
write.csv(counts, file = "C:/Users/HP/Desktop/All Data/All Data/avg_ride_length2.csv")
write.csv(all_trips_v2_clean, file = "C:/Users/HP/Desktop/All Data/All Data/trips_clean_data2.csv")