Working on Cyclistic Bike Ride 12 Months Dataset

Divvy(cyclistic 12 Months Dataset Analysis.

This analysis is based on the Divvy case study Sophisticated, Clear, and Polished’: Divvy and Data Visualization” written by Kevin Hartman (found here: https://artscience.blog/home/divvy-dataviz-case-study).

The purpose of this script is to consolidate downloaded Divvy data into a single data frame and then conduct simple analysis to help answer the key question: “In what ways do members and casual riders use Divvy bikes differently?”

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'readr' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(lubridate)
library(ggplot2)
getwd()

## [1] "C:/Users/HP/Desktop/CapStone Project/Visualized Data"

setwd("C:/Users/HP/Desktop/All Data/All Data")

STEP 1: COLLECT DATA

this were the steps taking to Upload Divvy datasets (csv files)

q1_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202101-divvy-tripdata.csv")

## Rows: 96834 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q2_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202102-divvy-tripdata.csv")

## Rows: 49622 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q3_2021 <- read_csv("C:/Users/HP/Desktop/All Data/202103-divvy-tripdata.csv")

## Rows: 228496 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q4_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202004-divvy-tripdata.csv")

## Rows: 84776 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q5_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202005-divvy-tripdata.csv")

## Rows: 200274 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q6_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202006-divvy-tripdata.csv")

## Rows: 343005 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q7_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202007-divvy-tripdata.csv")

## Rows: 551480 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q8_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202008-divvy-tripdata.csv")

## Rows: 622361 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q9_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202009-divvy-tripdata.csv")

## Rows: 532958 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q10_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202010-divvy-tripdata.csv")

## Rows: 388653 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q11_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202011-divvy-tripdata.csv")

## Rows: 259716 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl  (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q12_2020 <- read_csv("C:/Users/HP/Desktop/All Data/202012-divvy-tripdata.csv")

## Rows: 131573 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

STEP 2:

WRANGLE DATA AND COMBINE INTO A SINGLE FILE

Compare column names for each of the files. This is to ensure that they match perfectly before i can use a command to join them into a single file.

colnames(q1_2021)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q2_2021)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q3_2021)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q4_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q5_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q6_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q7_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q8_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q9_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q10_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q11_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(q12_2020)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Renaming the Columns

Rename columns to make them consistent with q1_2021 (as this will be the supposed going-forward table design for Divvy).

(q1_2021 <- rename(q1_2021
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 96,834 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>   <chr>  
##  1 E19E6F1B8D4C… electr… 2021-01-23 16:14:19 2021-01-23 16:24:44 Califo… 17660  
##  2 DC88F20C2C55… electr… 2021-01-27 18:43:08 2021-01-27 18:47:12 Califo… 17660  
##  3 EC45C94683FE… electr… 2021-01-21 22:35:54 2021-01-21 22:37:14 Califo… 17660  
##  4 4FA453A75AE3… electr… 2021-01-07 13:31:13 2021-01-07 13:42:55 Califo… 17660  
##  5 BE5E8EB4E726… electr… 2021-01-23 02:24:02 2021-01-23 02:24:45 Califo… 17660  
##  6 5D8969F88C77… electr… 2021-01-09 14:24:07 2021-01-09 15:17:54 Califo… 17660  
##  7 09275CC10F85… electr… 2021-01-04 05:05:04 2021-01-04 05:10:39 Califo… 17660  
##  8 DF7A32A217AE… electr… 2021-01-14 15:07:00 2021-01-14 15:13:40 Califo… 17660  
##  9 C2EFC62379EB… electr… 2021-01-09 09:57:55 2021-01-09 10:00:26 Califo… 17660  
## 10 B9F73448DFBE… classi… 2021-01-24 19:15:38 2021-01-24 19:22:51 Califo… 17660  
## # … with 96,824 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q2_2021 <- rename(q2_2021
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 49,622 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>   <chr>  
##  1 89E7AA6C2922… classi… 2021-02-12 16:14:56 2021-02-12 16:21:43 Glenwo… 525    
##  2 0FEFDE260356… classi… 2021-02-14 17:52:38 2021-02-14 18:12:09 Glenwo… 525    
##  3 E6159D746B2D… electr… 2021-02-09 19:10:18 2021-02-09 19:19:10 Clark … KA1503…
##  4 B32D3199F1C2… classi… 2021-02-02 17:49:41 2021-02-02 17:54:06 Wood S… 637    
##  5 83E463F23575… electr… 2021-02-23 15:07:23 2021-02-23 15:22:37 State … 13216  
##  6 BDAA7E3494E8… electr… 2021-02-24 15:43:33 2021-02-24 15:49:05 Fairba… 18003  
##  7 A77274235117… classi… 2021-02-01 17:47:42 2021-02-01 17:48:33 LaSall… KP1705…
##  8 295476889D9B… classi… 2021-02-11 18:33:53 2021-02-11 18:35:09 Fairba… 18003  
##  9 362087194BA4… classi… 2021-02-27 15:13:39 2021-02-27 15:36:36 LaSall… KP1705…
## 10 21630F715038… classi… 2021-02-20 08:59:42 2021-02-20 09:17:04 LaSall… KP1705…
## # … with 49,612 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q3_2021 <- rename(q3_2021
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 228,496 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>   <chr>  
##  1 CFA86D4455AA… classi… 2021-03-16 08:32:30 2021-03-16 08:36:34 Humbol… 15651  
##  2 30D9DC61227D… classi… 2021-03-28 01:26:28 2021-03-28 01:36:55 Humbol… 15651  
##  3 846D87A15682… classi… 2021-03-11 21:17:29 2021-03-11 21:33:53 Shield… 15443  
##  4 994D05AA75A1… classi… 2021-03-11 13:26:42 2021-03-11 13:55:41 Winthr… TA1308…
##  5 DF7464FBE92D… classi… 2021-03-21 09:09:37 2021-03-21 09:27:33 Glenwo… 525    
##  6 CEBA8516FD17… classi… 2021-03-20 11:08:47 2021-03-20 11:29:39 Glenwo… 525    
##  7 297268586B79… classi… 2021-03-20 14:10:41 2021-03-20 14:22:13 State … 13050  
##  8 F39301858B60… electr… 2021-03-23 07:56:51 2021-03-23 08:05:50 Shore … TA1308…
##  9 D297F199D875… electr… 2021-03-31 15:31:19 2021-03-31 15:35:58 Clinto… 13021  
## 10 36B877141175… classi… 2021-03-11 17:37:37 2021-03-11 17:52:44 Michig… TA1305…
## # … with 228,486 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q4_2020 <- rename(q4_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 84,776 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 A847FADBBC63… docked… 2020-04-26 17:45:14 2020-04-26 18:12:03 Eckhar…      86
##  2 5405B80E996F… docked… 2020-04-17 17:08:54 2020-04-17 17:17:03 Drake …     503
##  3 5DD24A79A4E0… docked… 2020-04-01 17:54:13 2020-04-01 18:08:36 McClur…     142
##  4 2A59BBDF5CDB… docked… 2020-04-07 12:50:19 2020-04-07 13:02:31 Califo…     216
##  5 27AD306C119C… docked… 2020-04-18 10:22:59 2020-04-18 11:15:54 Rush S…     125
##  6 356216E87513… docked… 2020-04-30 17:55:47 2020-04-30 18:01:11 Mies v…     173
##  7 A2759CB06A81… docked… 2020-04-02 14:47:19 2020-04-02 14:52:32 Street…      35
##  8 FC8BC2E2D54F… docked… 2020-04-07 12:22:20 2020-04-07 13:38:09 Ogden …     434
##  9 9EC5648678DE… docked… 2020-04-15 10:30:11 2020-04-15 10:35:55 LaSall…     627
## 10 A8FFF89140C3… docked… 2020-04-04 15:02:28 2020-04-04 15:19:47 Kedzie…     377
## # … with 84,766 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q5_2020 <- rename(q5_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 200,274 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 02668AD35674… docked… 2020-05-27 10:03:52 2020-05-27 10:16:49 Frankl…      36
##  2 7A50CCAF1EDD… docked… 2020-05-25 10:47:11 2020-05-25 11:05:40 Clark …     340
##  3 2FFCDFDB91FE… docked… 2020-05-02 14:11:03 2020-05-02 15:48:21 Kedzie…     260
##  4 58991CF1DB75… docked… 2020-05-02 16:25:36 2020-05-02 16:39:28 Claren…     251
##  5 A79651EFECC2… docked… 2020-05-29 12:49:54 2020-05-29 13:27:11 Hermit…     261
##  6 1466C5B39F68… docked… 2020-05-29 13:27:24 2020-05-29 14:14:45 Halste…     206
##  7 2500D7957D4D… docked… 2020-05-20 12:51:41 2020-05-20 13:46:47 Hermit…     261
##  8 ED42D3E06AFB… docked… 2020-05-06 18:21:42 2020-05-06 19:07:07 Ritchi…     180
##  9 23AFBD962F9C… docked… 2020-05-30 17:00:58 2020-05-30 17:19:52 Halste…     331
## 10 52C0D13F6B81… docked… 2020-05-23 10:22:02 2020-05-23 10:52:02 Damen …     219
## # … with 200,264 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q6_2020 <- rename(q6_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 343,005 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 8CD5DE2C2B6C… docked… 2020-06-13 23:24:48 2020-06-13 23:36:55 Wilton…     117
##  2 9A191EB2C751… docked… 2020-06-26 07:26:10 2020-06-26 07:31:58 Federa…      41
##  3 F37D14B0B565… docked… 2020-06-23 17:12:41 2020-06-23 17:21:14 Daley …      81
##  4 C41237B506E8… docked… 2020-06-20 01:09:35 2020-06-20 01:28:24 Broadw…     303
##  5 4B51B3B0BDA7… docked… 2020-06-25 16:59:25 2020-06-25 17:08:48 Sheffi…     327
##  6 D50DF288196B… docked… 2020-06-17 18:07:18 2020-06-17 18:18:14 Sheffi…     327
##  7 165FA6D223E5… docked… 2020-06-25 07:24:33 2020-06-25 07:31:11 Federa…      41
##  8 D8236CFC050E… docked… 2020-06-19 00:00:56 2020-06-19 00:09:15 Sheffi…     115
##  9 9D82B9B53C37… docked… 2020-06-30 12:11:36 2020-06-30 12:32:43 Calume…     338
## 10 3DFF4AB10A68… docked… 2020-06-28 14:17:09 2020-06-28 14:27:51 Milwau…      84
## # … with 342,995 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q7_2020 <- rename(q7_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 551,480 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 762198876D69… docked… 2020-07-09 15:22:02 2020-07-09 15:25:52 Ritchi…     180
##  2 BEC9C9FBA0D4… docked… 2020-07-24 23:56:30 2020-07-25 00:20:17 Halste…     299
##  3 D2FD8EA432C7… docked… 2020-07-08 19:49:07 2020-07-08 19:56:22 Lake S…     329
##  4 54AE594E20B3… docked… 2020-07-17 19:06:42 2020-07-17 19:27:38 LaSall…     181
##  5 54025FDC7440… docked… 2020-07-04 10:39:57 2020-07-04 10:45:05 Lake S…     268
##  6 65636B619E24… docked… 2020-07-28 16:33:03 2020-07-28 16:49:10 Fairba…     635
##  7 22DB94283ECF… docked… 2020-07-30 11:58:12 2020-07-30 12:16:12 Bissel…     113
##  8 C9D789BEF899… docked… 2020-07-13 16:48:03 2020-07-13 16:57:00 St. Cl…     211
##  9 FBE24D943CDE… docked… 2020-07-30 11:00:12 2020-07-30 11:14:48 Clark …     176
## 10 8A2BBE457325… docked… 2020-07-06 18:05:29 2020-07-06 18:15:38 Frankl…      31
## # … with 551,470 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q8_2020 <- rename(q8_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 622,361 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 322BD23D2877… docked… 2020-08-20 18:08:14 2020-08-20 18:17:51 Lake S…     329
##  2 2A3AEF1AB905… electr… 2020-08-27 18:46:04 2020-08-27 19:54:51 Michig…     168
##  3 67DC1D133E8B… electr… 2020-08-26 19:44:14 2020-08-26 21:53:07 Columb…     195
##  4 C79FBBD412E5… electr… 2020-08-27 12:05:41 2020-08-27 12:53:45 Daley …      81
##  5 13814D3D661E… electr… 2020-08-27 16:49:02 2020-08-27 16:59:49 Leavit…     658
##  6 56349A5A42F0… electr… 2020-08-27 17:26:23 2020-08-27 18:07:50 Leavit…     658
##  7 EB6ABC5570C2… electr… 2020-08-26 20:14:02 2020-08-26 20:34:00 Cityfr…     196
##  8 B4ECE389A1DE… electr… 2020-08-26 21:59:50 2020-08-26 22:12:35 Sheffi…      67
##  9 0B355B0FE076… electr… 2020-08-26 19:17:42 2020-08-26 19:32:14 Southp…     153
## 10 1ECE04F779E9… electr… 2020-08-27 15:13:57 2020-08-27 15:41:59 Theate…     177
## # … with 622,351 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q9_2020 <- rename(q9_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 532,958 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 2B22BD5F95FB… electr… 2020-09-17 14:27:11 2020-09-17 14:44:24 Michig…      52
##  2 A7FB70B4AFC6… electr… 2020-09-17 15:07:31 2020-09-17 15:07:45 W Oakd…      NA
##  3 86057FA01BAC… electr… 2020-09-17 15:09:04 2020-09-17 15:09:35 W Oakd…      NA
##  4 57F6DC9A153D… electr… 2020-09-17 18:10:46 2020-09-17 18:35:49 Ashlan…     246
##  5 B9C4712F78C1… electr… 2020-09-17 15:16:13 2020-09-17 15:52:55 Fairba…      24
##  6 378BBCE1E444… electr… 2020-09-17 18:37:04 2020-09-17 19:23:28 Clark …      94
##  7 0CB5E2496B6F… electr… 2020-09-16 21:39:57 2020-09-16 21:53:44 Wells …     291
##  8 9673F5D39BDB… electr… 2020-09-17 12:18:06 2020-09-17 12:18:19 <NA>         NA
##  9 54B91F5C95B2… electr… 2020-09-17 17:09:17 2020-09-17 17:34:20 <NA>         NA
## 10 91CEBB66076D… electr… 2020-09-17 12:20:25 2020-09-17 12:29:47 <NA>         NA
## # … with 532,948 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q10_2020 <- rename(q10_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 388,653 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 ACB6B40CF5B9… electr… 2020-10-31 19:39:43 2020-10-31 19:57:12 Lakevi…     313
##  2 DF450C72FD10… electr… 2020-10-31 23:50:08 2020-11-01 00:04:16 Southp…     227
##  3 B6396B54A15A… electr… 2020-10-31 23:00:01 2020-10-31 23:08:22 Stony …     102
##  4 44A4AEE261B9… electr… 2020-10-31 22:16:43 2020-10-31 22:19:35 Clark …     165
##  5 10B7DD76A6A2… electr… 2020-10-31 19:38:19 2020-10-31 19:54:32 Southp…     190
##  6 DA6C37596601… electr… 2020-10-29 17:38:04 2020-10-29 17:45:43 Larrab…     359
##  7 C2F3808FD56B… electr… 2020-10-29 09:03:06 2020-10-29 09:17:56 Lakevi…     313
##  8 15B13B5A508B… electr… 2020-10-29 16:37:21 2020-10-29 16:52:40 Rush S…     125
##  9 285D224410C1… electr… 2020-10-28 23:12:03 2020-10-28 23:24:32 <NA>         NA
## 10 E1FB79FFE6DB… electr… 2020-10-29 16:38:44 2020-10-29 16:50:17 Canal …     174
## # … with 388,643 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q11_2020 <- rename(q11_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 259,716 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>     <dbl>
##  1 BD0A6FF6FFF9… electr… 2020-11-01 13:36:00 2020-11-01 13:45:40 Dearbo…     110
##  2 96A7A7A4BDE4… electr… 2020-11-01 10:03:26 2020-11-01 10:14:45 Frankl…     672
##  3 C61526D06582… electr… 2020-11-01 00:34:05 2020-11-01 01:03:06 Lake S…      76
##  4 E533E89C3208… electr… 2020-11-01 00:45:16 2020-11-01 00:54:31 Leavit…     659
##  5 1C9F4EF18C16… electr… 2020-11-01 15:43:25 2020-11-01 16:16:52 Buckin…       2
##  6 7259585D8276… electr… 2020-11-14 15:55:17 2020-11-14 16:44:38 Wabash…      72
##  7 91FE5C8F8A67… electr… 2020-11-14 16:47:29 2020-11-14 17:03:03 Lake S…      76
##  8 9E7A79ADA90C… electr… 2020-11-14 16:04:15 2020-11-14 16:19:33 <NA>         NA
##  9 A5B02C0D41DB… electr… 2020-11-14 16:24:09 2020-11-14 16:51:34 Marshf…      58
## 10 8234407C29FE… electr… 2020-11-14 01:24:22 2020-11-14 01:31:42 Clark …     394
## # … with 259,706 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <dbl>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

(q12_2020 <- rename(q12_2020
                   ,trip_id = ride_id
                   ,bike_id = rideable_type 
                   ,start_time = started_at  
                   ,end_time = ended_at  
                   ,from_station_name = start_station_name 
                   ,from_station_id = start_station_id 
                   ,to_station_name = end_station_name 
                   ,to_station_id = end_station_id 
                   ,usertype = member_casual))

## # A tibble: 131,573 × 13
##    trip_id       bike_id start_time          end_time            from_…¹ from_…²
##    <chr>         <chr>   <dttm>              <dttm>              <chr>   <chr>  
##  1 70B6A9A437D4… classi… 2020-12-27 12:44:29 2020-12-27 12:55:06 Aberde… 13157  
##  2 158A465D4E74… electr… 2020-12-18 17:37:15 2020-12-18 17:44:19 <NA>    <NA>   
##  3 5262016E0F1F… electr… 2020-12-15 15:04:33 2020-12-15 15:11:28 <NA>    <NA>   
##  4 BE119628E44F… electr… 2020-12-15 15:54:18 2020-12-15 16:00:11 <NA>    <NA>   
##  5 69AF78D57854… electr… 2020-12-22 12:08:17 2020-12-22 12:10:59 <NA>    <NA>   
##  6 C1DECC4AB488… electr… 2020-12-22 13:26:37 2020-12-22 13:34:50 <NA>    <NA>   
##  7 B014A60B856C… electr… 2020-12-03 16:23:48 2020-12-03 16:33:39 <NA>    <NA>   
##  8 1E127B1929C0… electr… 2020-12-03 15:03:38 2020-12-03 15:12:39 <NA>    <NA>   
##  9 05F41F5137B5… electr… 2020-12-12 09:26:17 2020-12-12 09:26:35 <NA>    <NA>   
## 10 BB807646588D… electr… 2020-12-18 12:52:06 2020-12-18 12:52:23 <NA>    <NA>   
## # … with 131,563 more rows, 7 more variables: to_station_name <chr>,
## #   to_station_id <chr>, start_lat <dbl>, start_lng <dbl>, end_lat <dbl>,
## #   end_lng <dbl>, usertype <chr>, and abbreviated variable names
## #   ¹from_station_name, ²from_station_id

Now Inspect the Dataframes

Inspect the dataframes and look for incongruencies

str(q1_2021)

## spc_tbl_ [96,834 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ bike_id          : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
##  $ end_time         : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
##  $ from_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ from_station_id  : chr [1:96834] "17660" "17660" "17660" "17660" ...
##  $ to_station_name  : chr [1:96834] NA NA NA NA ...
##  $ to_station_id    : chr [1:96834] NA NA NA NA ...
##  $ start_lat        : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat          : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng          : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ usertype         : chr [1:96834] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q2_2021)

## spc_tbl_ [49,622 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
##  $ bike_id          : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ start_time       : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" ...
##  $ end_time         : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" ...
##  $ from_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
##  $ from_station_id  : chr [1:49622] "525" "525" "KA1503000012" "637" ...
##  $ to_station_name  : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
##  $ to_station_id    : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
##  $ start_lat        : num [1:49622] 42 42 41.9 41.9 41.8 ...
##  $ start_lng        : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat          : num [1:49622] 42 42 41.9 41.9 41.8 ...
##  $ end_lng          : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ usertype         : chr [1:49622] "member" "casual" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q3_2021)

## spc_tbl_ [228,496 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:228496] "CFA86D4455AA1030" "30D9DC61227D1AF3" "846D87A15682A284" "994D05AA75A168F2" ...
##  $ bike_id          : chr [1:228496] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
##  $ start_time       : POSIXct[1:228496], format: "2021-03-16 08:32:30" "2021-03-28 01:26:28" ...
##  $ end_time         : POSIXct[1:228496], format: "2021-03-16 08:36:34" "2021-03-28 01:36:55" ...
##  $ from_station_name: chr [1:228496] "Humboldt Blvd & Armitage Ave" "Humboldt Blvd & Armitage Ave" "Shields Ave & 28th Pl" "Winthrop Ave & Lawrence Ave" ...
##  $ from_station_id  : chr [1:228496] "15651" "15651" "15443" "TA1308000021" ...
##  $ to_station_name  : chr [1:228496] "Stave St & Armitage Ave" "Central Park Ave & Bloomingdale Ave" "Halsted St & 35th St" "Broadway & Sheridan Rd" ...
##  $ to_station_id    : chr [1:228496] "13266" "18017" "TA1308000043" "13323" ...
##  $ start_lat        : num [1:228496] 41.9 41.9 41.8 42 42 ...
##  $ start_lng        : num [1:228496] -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ end_lat          : num [1:228496] 41.9 41.9 41.8 42 42.1 ...
##  $ end_lng          : num [1:228496] -87.7 -87.7 -87.6 -87.6 -87.7 ...
##  $ usertype         : chr [1:228496] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q4_2020)

## spc_tbl_ [84,776 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:84776] "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
##  $ bike_id          : chr [1:84776] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ start_time       : POSIXct[1:84776], format: "2020-04-26 17:45:14" "2020-04-17 17:08:54" ...
##  $ end_time         : POSIXct[1:84776], format: "2020-04-26 18:12:03" "2020-04-17 17:17:03" ...
##  $ from_station_name: chr [1:84776] "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
##  $ from_station_id  : num [1:84776] 86 503 142 216 125 173 35 434 627 377 ...
##  $ to_station_name  : chr [1:84776] "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
##  $ to_station_id    : num [1:84776] 152 499 255 657 323 35 635 382 359 508 ...
##  $ start_lat        : num [1:84776] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:84776] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat          : num [1:84776] 41.9 41.9 41.9 41.9 42 ...
##  $ end_lng          : num [1:84776] -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ usertype         : chr [1:84776] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q5_2020)

## spc_tbl_ [200,274 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:200274] "02668AD35674B983" "7A50CCAF1EDDB28F" "2FFCDFDB91FE9A52" "58991CF1DB75BA84" ...
##  $ bike_id          : chr [1:200274] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ start_time       : POSIXct[1:200274], format: "2020-05-27 10:03:52" "2020-05-25 10:47:11" ...
##  $ end_time         : POSIXct[1:200274], format: "2020-05-27 10:16:49" "2020-05-25 11:05:40" ...
##  $ from_station_name: chr [1:200274] "Franklin St & Jackson Blvd" "Clark St & Wrightwood Ave" "Kedzie Ave & Milwaukee Ave" "Clarendon Ave & Leland Ave" ...
##  $ from_station_id  : num [1:200274] 36 340 260 251 261 206 261 180 331 219 ...
##  $ to_station_name  : chr [1:200274] "Wabash Ave & Grand Ave" "Clark St & Leland Ave" "Kedzie Ave & Milwaukee Ave" "Lake Shore Dr & Wellington Ave" ...
##  $ to_station_id    : num [1:200274] 199 326 260 157 206 22 261 180 300 305 ...
##  $ start_lat        : num [1:200274] 41.9 41.9 41.9 42 41.9 ...
##  $ start_lng        : num [1:200274] -87.6 -87.6 -87.7 -87.7 -87.7 ...
##  $ end_lat          : num [1:200274] 41.9 42 41.9 41.9 41.8 ...
##  $ end_lng          : num [1:200274] -87.6 -87.7 -87.7 -87.6 -87.6 ...
##  $ usertype         : chr [1:200274] "member" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q6_2020)

## spc_tbl_ [343,005 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:343005] "8CD5DE2C2B6C4CFC" "9A191EB2C751D85D" "F37D14B0B5659BCF" "C41237B506E85FA1" ...
##  $ bike_id          : chr [1:343005] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ start_time       : POSIXct[1:343005], format: "2020-06-13 23:24:48" "2020-06-26 07:26:10" ...
##  $ end_time         : POSIXct[1:343005], format: "2020-06-13 23:36:55" "2020-06-26 07:31:58" ...
##  $ from_station_name: chr [1:343005] "Wilton Ave & Belmont Ave" "Federal St & Polk St" "Daley Center Plaza" "Broadway & Cornelia Ave" ...
##  $ from_station_id  : num [1:343005] 117 41 81 303 327 327 41 115 338 84 ...
##  $ to_station_name  : chr [1:343005] "Damen Ave & Clybourn Ave" "Daley Center Plaza" "State St & Harrison St" "Broadway & Berwyn Ave" ...
##  $ to_station_id    : num [1:343005] 163 81 5 294 117 117 81 303 164 53 ...
##  $ start_lat        : num [1:343005] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:343005] -87.7 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat          : num [1:343005] 41.9 41.9 41.9 42 41.9 ...
##  $ end_lng          : num [1:343005] -87.7 -87.6 -87.6 -87.7 -87.7 ...
##  $ usertype         : chr [1:343005] "casual" "member" "member" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q7_2020)

## spc_tbl_ [551,480 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:551480] "762198876D69004D" "BEC9C9FBA0D4CF1B" "D2FD8EA432C77EC1" "54AE594E20B35881" ...
##  $ bike_id          : chr [1:551480] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
##  $ start_time       : POSIXct[1:551480], format: "2020-07-09 15:22:02" "2020-07-24 23:56:30" ...
##  $ end_time         : POSIXct[1:551480], format: "2020-07-09 15:25:52" "2020-07-25 00:20:17" ...
##  $ from_station_name: chr [1:551480] "Ritchie Ct & Banks St" "Halsted St & Roscoe St" "Lake Shore Dr & Diversey Pkwy" "LaSalle St & Illinois St" ...
##  $ from_station_id  : num [1:551480] 180 299 329 181 268 635 113 211 176 31 ...
##  $ to_station_name  : chr [1:551480] "Wells St & Evergreen Ave" "Broadway & Ridge Ave" "Clark St & Wellington Ave" "Clark St & Armitage Ave" ...
##  $ to_station_id    : num [1:551480] 291 461 156 94 301 289 140 31 191 142 ...
##  $ start_lat        : num [1:551480] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:551480] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat          : num [1:551480] 41.9 42 41.9 41.9 41.9 ...
##  $ end_lng          : num [1:551480] -87.6 -87.7 -87.6 -87.6 -87.6 ...
##  $ usertype         : chr [1:551480] "member" "member" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q8_2020)

## spc_tbl_ [622,361 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:622361] "322BD23D287743ED" "2A3AEF1AB9054D8B" "67DC1D133E8B5816" "C79FBBD412E578A7" ...
##  $ bike_id          : chr [1:622361] "docked_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:622361], format: "2020-08-20 18:08:14" "2020-08-27 18:46:04" ...
##  $ end_time         : POSIXct[1:622361], format: "2020-08-20 18:17:51" "2020-08-27 19:54:51" ...
##  $ from_station_name: chr [1:622361] "Lake Shore Dr & Diversey Pkwy" "Michigan Ave & 14th St" "Columbus Dr & Randolph St" "Daley Center Plaza" ...
##  $ from_station_id  : num [1:622361] 329 168 195 81 658 658 196 67 153 177 ...
##  $ to_station_name  : chr [1:622361] "Clark St & Lincoln Ave" "Michigan Ave & 14th St" "State St & Randolph St" "State St & Kinzie St" ...
##  $ to_station_id    : num [1:622361] 141 168 44 47 658 658 49 229 225 305 ...
##  $ start_lat        : num [1:622361] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:622361] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat          : num [1:622361] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng          : num [1:622361] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ usertype         : chr [1:622361] "member" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q9_2020)

## spc_tbl_ [532,958 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:532958] "2B22BD5F95FB2629" "A7FB70B4AFC6CAF2" "86057FA01BAC778E" "57F6DC9A153DB98C" ...
##  $ bike_id          : chr [1:532958] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:532958], format: "2020-09-17 14:27:11" "2020-09-17 15:07:31" ...
##  $ end_time         : POSIXct[1:532958], format: "2020-09-17 14:44:24" "2020-09-17 15:07:45" ...
##  $ from_station_name: chr [1:532958] "Michigan Ave & Lake St" "W Oakdale Ave & N Broadway" "W Oakdale Ave & N Broadway" "Ashland Ave & Belle Plaine Ave" ...
##  $ from_station_id  : num [1:532958] 52 NA NA 246 24 94 291 NA NA NA ...
##  $ to_station_name  : chr [1:532958] "Green St & Randolph St" "W Oakdale Ave & N Broadway" "W Oakdale Ave & N Broadway" "Montrose Harbor" ...
##  $ to_station_id    : num [1:532958] 112 NA NA 249 24 NA 256 NA NA NA ...
##  $ start_lat        : num [1:532958] 41.9 41.9 41.9 42 41.9 ...
##  $ start_lng        : num [1:532958] -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ end_lat          : num [1:532958] 41.9 41.9 41.9 42 41.9 ...
##  $ end_lng          : num [1:532958] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ usertype         : chr [1:532958] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q10_2020)

## spc_tbl_ [388,653 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:388653] "ACB6B40CF5B9044C" "DF450C72FD109C01" "B6396B54A15AC0DF" "44A4AEE261B9E854" ...
##  $ bike_id          : chr [1:388653] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:388653], format: "2020-10-31 19:39:43" "2020-10-31 23:50:08" ...
##  $ end_time         : POSIXct[1:388653], format: "2020-10-31 19:57:12" "2020-11-01 00:04:16" ...
##  $ from_station_name: chr [1:388653] "Lakeview Ave & Fullerton Pkwy" "Southport Ave & Waveland Ave" "Stony Island Ave & 67th St" "Clark St & Grace St" ...
##  $ from_station_id  : num [1:388653] 313 227 102 165 190 359 313 125 NA 174 ...
##  $ to_station_name  : chr [1:388653] "Rush St & Hubbard St" "Kedzie Ave & Milwaukee Ave" "University Ave & 57th St" "Broadway & Sheridan Rd" ...
##  $ to_station_id    : num [1:388653] 125 260 423 256 185 53 125 313 199 635 ...
##  $ start_lat        : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
##  $ start_lng        : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
##  $ end_lat          : num [1:388653] 41.9 41.9 41.8 42 41.9 ...
##  $ end_lng          : num [1:388653] -87.6 -87.7 -87.6 -87.7 -87.7 ...
##  $ usertype         : chr [1:388653] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q11_2020)

## spc_tbl_ [259,716 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:259716] "BD0A6FF6FFF9B921" "96A7A7A4BDE4F82D" "C61526D06582BDC5" "E533E89C32080B9E" ...
##  $ bike_id          : chr [1:259716] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:259716], format: "2020-11-01 13:36:00" "2020-11-01 10:03:26" ...
##  $ end_time         : POSIXct[1:259716], format: "2020-11-01 13:45:40" "2020-11-01 10:14:45" ...
##  $ from_station_name: chr [1:259716] "Dearborn St & Erie St" "Franklin St & Illinois St" "Lake Shore Dr & Monroe St" "Leavitt St & Chicago Ave" ...
##  $ from_station_id  : num [1:259716] 110 672 76 659 2 72 76 NA 58 394 ...
##  $ to_station_name  : chr [1:259716] "St. Clair St & Erie St" "Noble St & Milwaukee Ave" "Federal St & Polk St" "Stave St & Armitage Ave" ...
##  $ to_station_id    : num [1:259716] 211 29 41 185 2 76 72 NA 288 273 ...
##  $ start_lat        : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng        : num [1:259716] -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ end_lat          : num [1:259716] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng          : num [1:259716] -87.6 -87.7 -87.6 -87.7 -87.6 ...
##  $ usertype         : chr [1:259716] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_double(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_double(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(q12_2020)

## spc_tbl_ [131,573 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:131573] "70B6A9A437D4C30D" "158A465D4E74C54A" "5262016E0F1F2F9A" "BE119628E44F871E" ...
##  $ bike_id          : chr [1:131573] "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:131573], format: "2020-12-27 12:44:29" "2020-12-18 17:37:15" ...
##  $ end_time         : POSIXct[1:131573], format: "2020-12-27 12:55:06" "2020-12-18 17:44:19" ...
##  $ from_station_name: chr [1:131573] "Aberdeen St & Jackson Blvd" NA NA NA ...
##  $ from_station_id  : chr [1:131573] "13157" NA NA NA ...
##  $ to_station_name  : chr [1:131573] "Desplaines St & Kinzie St" NA NA NA ...
##  $ to_station_id    : chr [1:131573] "TA1306000003" NA NA NA ...
##  $ start_lat        : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
##  $ start_lng        : num [1:131573] -87.7 -87.7 -87.7 -87.7 -87.6 ...
##  $ end_lat          : num [1:131573] 41.9 41.9 41.9 41.9 41.8 ...
##  $ end_lng          : num [1:131573] -87.6 -87.7 -87.7 -87.7 -87.6 ...
##  $ usertype         : chr [1:131573] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Converting Columns To Character

Convert trip_id, bike_id, from_station_id and to_station_id to character so that they can stack correctly.

q1_2021 <-  mutate(q1_2021, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id)) 
q2_2021 <-  mutate(q2_2021, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))  
q3_2021 <-  mutate(q3_2021, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q4_2020 <-  mutate(q4_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q5_2020 <-  mutate(q5_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q6_2020 <-  mutate(q6_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q7_2020 <-  mutate(q7_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q8_2020 <-  mutate(q8_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q9_2020 <-  mutate(q9_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q10_2020 <-  mutate(q10_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q11_2020 <-  mutate(q11_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))
q12_2020 <-  mutate(q12_2020, trip_id = as.character(trip_id)
                   ,bike_id = as.character(bike_id)
                   ,from_station_id = as.character(from_station_id)
                   ,to_station_id = as.character(to_station_id))

Combine the 12 Months Dataset into a Single Dataframe

Stack individual quarter’s data frames into one big data frame.

library(dplyr)
all_trips <- bind_rows(q1_2021, q2_2021, q3_2021, q4_2020, q5_2020, q6_2020, q7_2020, q8_2020, q9_2020, q10_2020, q11_2020, q12_2020)

View(all_trips)

Removing Unecessary Columns

Remove start_lat, start_long, end_lat, and end_long fields as this data were unnecessary for the calculations and visualization.

all_trips <- all_trips %>%  
  select(-c(start_lat, start_lng, end_lat, end_lng))

STEP 3:

DATA CLEANING AND MANIPULATIONS

This involves Cleaning Up and adding Data To Prepare for Analysis.

First, Inspect the new table that has been created

colnames(all_trips)  #List of column names

## [1] "trip_id"           "bike_id"           "start_time"       
## [4] "end_time"          "from_station_name" "from_station_id"  
## [7] "to_station_name"   "to_station_id"     "usertype"

nrow(all_trips)  #How many rows are in data frame?

## [1] 3489748

dim(all_trips)  #Dimensions of the data frame?

## [1] 3489748       9

head(all_trips)  #See the first 6 rows of data frame.  Also tail(all_trips)

## # A tibble: 6 × 9
##   trip_id        bike_id start_time          end_time            from_…¹ from_…²
##   <chr>          <chr>   <dttm>              <dttm>              <chr>   <chr>  
## 1 E19E6F1B8D4C4… electr… 2021-01-23 16:14:19 2021-01-23 16:24:44 Califo… 17660  
## 2 DC88F20C2C55F… electr… 2021-01-27 18:43:08 2021-01-27 18:47:12 Califo… 17660  
## 3 EC45C94683FE3… electr… 2021-01-21 22:35:54 2021-01-21 22:37:14 Califo… 17660  
## 4 4FA453A75AE37… electr… 2021-01-07 13:31:13 2021-01-07 13:42:55 Califo… 17660  
## 5 BE5E8EB4E7263… electr… 2021-01-23 02:24:02 2021-01-23 02:24:45 Califo… 17660  
## 6 5D8969F88C773… electr… 2021-01-09 14:24:07 2021-01-09 15:17:54 Califo… 17660  
## # … with 3 more variables: to_station_name <chr>, to_station_id <chr>,
## #   usertype <chr>, and abbreviated variable names ¹from_station_name,
## #   ²from_station_id

str(all_trips)  #See list of columns and data types (numeric, character, etc)

## tibble [3,489,748 × 9] (S3: tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:3489748] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ bike_id          : chr [1:3489748] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:3489748], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
##  $ end_time         : POSIXct[1:3489748], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
##  $ from_station_name: chr [1:3489748] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ from_station_id  : chr [1:3489748] "17660" "17660" "17660" "17660" ...
##  $ to_station_name  : chr [1:3489748] NA NA NA NA ...
##  $ to_station_id    : chr [1:3489748] NA NA NA NA ...
##  $ usertype         : chr [1:3489748] "member" "member" "member" "member" ...

summary(all_trips)  #Statistical summary of data. Mainly for numerics

##    trip_id            bike_id            start_time                    
##  Length:3489748     Length:3489748     Min.   :2020-04-01 00:00:30.00  
##  Class :character   Class :character   1st Qu.:2020-07-14 19:38:28.00  
##  Mode  :character   Mode  :character   Median :2020-08-29 14:50:36.50  
##                                        Mean   :2020-09-10 01:21:45.98  
##                                        3rd Qu.:2020-10-20 18:14:13.00  
##                                        Max.   :2021-03-31 23:59:08.00  
##     end_time                      from_station_name  from_station_id   
##  Min.   :2020-04-01 00:10:45.00   Length:3489748     Length:3489748    
##  1st Qu.:2020-07-14 20:13:07.75   Class :character   Class :character  
##  Median :2020-08-29 15:21:13.00   Mode  :character   Mode  :character  
##  Mean   :2020-09-10 01:46:31.98                                        
##  3rd Qu.:2020-10-20 18:28:46.25                                        
##  Max.   :2021-04-06 11:00:11.00                                        
##  to_station_name    to_station_id        usertype        
##  Length:3489748     Length:3489748     Length:3489748    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##

This displays the frequency counts of each unique value in the “usertype” column of the “all_trips” dataset.

table(all_trips$usertype)

## 
##  casual  member 
## 1430376 2059372

Reassign to the desired values.

Reassign to the desired values for member & casual

all_trips <-  all_trips %>% 
  mutate(usertype = recode(usertype
                                ,"member" = "Subscriber"
                                ,"casual" = "Customer"))

Check to make sure the proper number of observations were reassigned.

table(all_trips$usertype)

## 
##   Customer Subscriber 
##    1430376    2059372

Adding Columns

Add columns that list the date, month, day, and year of each ride.This will allow us to aggregate ride data for each month, day, or year.

Note: Before completing these operations we could only aggregate at the ride level: https://www.statmethods.net/input/dates.html more on date formats in R found at this link.

all_trips$date <- as.Date(all_trips$start_time) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

Add a “ride_length”

calculation to all_trips (in seconds) refer this link: https://stat.ethz.ch/R-manual/R-devel/library/base/html/difftime.html for more information.

all_trips$ride_length <- difftime(all_trips$end_time, all_trips$start_time)

Inspect the structure of the columns

str(all_trips)

## tibble [3,489,748 × 15] (S3: tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:3489748] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ bike_id          : chr [1:3489748] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ start_time       : POSIXct[1:3489748], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
##  $ end_time         : POSIXct[1:3489748], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
##  $ from_station_name: chr [1:3489748] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ from_station_id  : chr [1:3489748] "17660" "17660" "17660" "17660" ...
##  $ to_station_name  : chr [1:3489748] NA NA NA NA ...
##  $ to_station_id    : chr [1:3489748] NA NA NA NA ...
##  $ usertype         : chr [1:3489748] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ date             : Date[1:3489748], format: "2021-01-23" "2021-01-27" ...
##  $ month            : chr [1:3489748] "01" "01" "01" "01" ...
##  $ day              : chr [1:3489748] "23" "27" "21" "07" ...
##  $ year             : chr [1:3489748] "2021" "2021" "2021" "2021" ...
##  $ day_of_week      : chr [1:3489748] "Saturday" "Wednesday" "Thursday" "Thursday" ...
##  $ ride_length      : 'difftime' num [1:3489748] 625 244 80 702 ...
##   ..- attr(*, "units")= chr "secs"

Convert “ride_length” from Factor to numeric so we can run calculations on the data.

is.factor(all_trips$ride_length)

## [1] FALSE

all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)

## [1] TRUE

Removing “bad” data

The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative

Therefore, it is required to create a new version of the dataframe (v2) since data is being removed. https://www.datasciencemadesimple.com/delete-or-drop-rows-in-r-with-conditions-2/ for more information.

all_trips_v2 <- all_trips[!(all_trips$from_station_name == "HQ QR" | all_trips$ride_length<0),]

STEP 4: CONDUCT DESCRIPTIVE ANALYSIS

Descriptive analysis on ride_length (all figures in seconds).

mean(all_trips_v2$ride_length, na.rm = TRUE) #straight average (total ride length / rides)

## [1] 1704.123

median(all_trips_v2$ride_length, na.rm = TRUE) #midpoint number in the ascending array of ride lengths

## [1] 884

max(all_trips_v2$ride_length, na.rm = TRUE) #longest ride

## [1] 3523202

min(all_trips_v2$ride_length, na.rm = TRUE) #shortest ride

## [1] 0

You can condense the four lines above to one line using summary() on the specific attribute.

summary(all_trips_v2$ride_length)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       0     483     884    1704    1616 3523202  122128

REMOVE MISSING VALUES

Remove rows with missing values in the ride_length column using the na.omit() function.

all_trips_v2_clean <- na.omit(all_trips_v2)

View all_trips_v2_clean

View(all_trips_v2_clean)

Compare Subscriber and Customer users

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = mean)

##   all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1                    Customer                       2727.148
## 2                  Subscriber                        955.989

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = median)

##   all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1                    Customer                           1298
## 2                  Subscriber                            696

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = max)

##   all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1                    Customer                        3341033
## 2                  Subscriber                        3523202

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype, FUN = min)

##   all_trips_v2_clean$usertype all_trips_v2_clean$ride_length
## 1                    Customer                              0
## 2                  Subscriber                              0

Calculating the Average ride length

See the average ride time by each day for Subscriber vs customer users.

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)

##    all_trips_v2_clean$usertype all_trips_v2_clean$day_of_week
## 1                     Customer                         Friday
## 2                   Subscriber                         Friday
## 3                     Customer                         Monday
## 4                   Subscriber                         Monday
## 5                     Customer                       Saturday
## 6                   Subscriber                       Saturday
## 7                     Customer                         Sunday
## 8                   Subscriber                         Sunday
## 9                     Customer                       Thursday
## 10                  Subscriber                       Thursday
## 11                    Customer                        Tuesday
## 12                  Subscriber                        Tuesday
## 13                    Customer                      Wednesday
## 14                  Subscriber                      Wednesday
##    all_trips_v2_clean$ride_length
## 1                       2595.3146
## 2                        934.0045
## 3                       2729.2281
## 4                        905.6358
## 5                       2838.4543
## 6                       1061.0147
## 7                       3068.1051
## 8                       1088.6371
## 9                       2615.7031
## 10                       901.8053
## 11                      2455.6268
## 12                       898.2676
## 13                      2452.9892
## 14                       902.8526

Notice that the days of the week are out of order. Let’s fix that.

all_trips_v2_clean$day_of_week <- ordered(all_trips_v2_clean$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
all_trips_v2_clean$day_of_week <- ordered(all_trips_v2_clean$day_of_week, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

Now, let’s run the average ride time by each day for Subscriber vs customer users.

aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)

##    all_trips_v2_clean$usertype all_trips_v2_clean$day_of_week
## 1                     Customer                         Sunday
## 2                   Subscriber                         Sunday
## 3                     Customer                         Monday
## 4                   Subscriber                         Monday
## 5                     Customer                        Tuesday
## 6                   Subscriber                        Tuesday
## 7                     Customer                      Wednesday
## 8                   Subscriber                      Wednesday
## 9                     Customer                       Thursday
## 10                  Subscriber                       Thursday
## 11                    Customer                         Friday
## 12                  Subscriber                         Friday
## 13                    Customer                       Saturday
## 14                  Subscriber                       Saturday
##    all_trips_v2_clean$ride_length
## 1                       3068.1051
## 2                       1088.6371
## 3                       2729.2281
## 4                        905.6358
## 5                       2455.6268
## 6                        898.2676
## 7                       2452.9892
## 8                        902.8526
## 9                       2615.7031
## 10                       901.8053
## 11                      2595.3146
## 12                       934.0045
## 13                      2838.4543
## 14                      1061.0147

ANALYSIS

Analyze ridership data by type and weekday

all_trips_v2_clean %>% 
  mutate(weekday = wday(start_time, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(usertype, weekday) %>%  #groups by usertype and weekday
  summarise(number_of_rides = n()                           #calculates the number of rides and average duration 
            ,average_duration = mean(ride_length)) %>%      # calculates the average duration
  arrange(usertype, weekday)

## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.

## # A tibble: 14 × 4
## # Groups:   usertype [2]
##    usertype   weekday number_of_rides average_duration
##    <chr>      <ord>             <int>            <dbl>
##  1 Customer   Sun              249761            3068.
##  2 Customer   Mon              142042            2729.
##  3 Customer   Tue              136218            2456.
##  4 Customer   Wed              148352            2453.
##  5 Customer   Thu              156177            2616.
##  6 Customer   Fri              196451            2595.
##  7 Customer   Sat              319017            2838.
##  8 Subscriber Sun              250465            1089.
##  9 Subscriber Mon              251905             906.
## 10 Subscriber Tue              268223             898.
## 11 Subscriber Wed              288390             903.
## 12 Subscriber Thu              283731             902.
## 13 Subscriber Fri              288897             934.
## 14 Subscriber Sat              304608            1061.

VISUALIZATION

Now Let’s visualize the number of rides by rider type

all_trips_v2_clean %>% 
  mutate(weekday = wday(start_time, label = TRUE)) %>% 
  group_by(usertype, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(usertype, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = usertype)) +
  geom_col(position = "dodge")

## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.

Let’s create a visualization for average duration

all_trips_v2_clean %>% 
  mutate(weekday = wday(start_time, label = TRUE)) %>% 
  group_by(usertype, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>% 
  arrange(usertype, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = usertype)) +
  geom_col(position = "dodge")

## `summarise()` has grouped output by 'usertype'. You can override using the
## `.groups` argument.

STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS

Now, Create a csv file that we will visualize in Excel, Tableau, or my presentation software.

counts <- aggregate(all_trips_v2_clean$ride_length ~ all_trips_v2_clean$usertype + all_trips_v2_clean$day_of_week, FUN = mean)

Export files to your directory

write.csv(counts, file = "C:/Users/HP/Desktop/All Data/All Data/avg_ride_length2.csv")

write.csv(all_trips_v2_clean, file = "C:/Users/HP/Desktop/All Data/All Data/trips_clean_data2.csv")