Introduction

This is an analysis of bike trips taken by Cyclistic users from January 2023 to December 2023.

Loading packages

library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (lubridate)

Importing data for analysis

# Read the January 2023 CSV into its own table.
January <- read_csv(file = "January_2023.csv")
## Rows: 190301 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the February 2023 CSV into its own table.
February <- read_csv(file = "February_2023.csv")
## Rows: 190445 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the March 2023 CSV into its own table.
March <- read_csv(file = "March_2023.csv")
## Rows: 258678 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the April 2023 CSV into its own table.
April <- read_csv(file = "April_2023.csv")
## Rows: 426590 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the May 2023 CSV into its own table.
May <- read_csv(file = "May_2023.csv")
## Rows: 604827 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the June 2023 CSV into its own table.
June <- read_csv(file = "June_2023.csv")
## Rows: 719618 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the July 2023 CSV into its own table.
July <- read_csv(file = "July_2023.csv")
## Rows: 767650 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the August 2023 CSV into its own table.
August <- read_csv(file = "August_2023.csv")
## Rows: 771693 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the September 2023 CSV into its own table.
September <- read_csv(file = "September_2023.csv")
## Rows: 666371 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the October 2023 CSV into its own table.
October <- read_csv(file = "October_2023.csv")
## Rows: 537113 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the November 2023 CSV into its own table.
November <- read_csv(file = "November_2023.csv")
## Rows: 362518 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the December 2023 CSV into its own table.
December <- read_csv(file = "December_2023.csv")
## Rows: 224073 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Checking column names for inconsistencies

# List January's column names so they can be compared across months.
colnames(January)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List February's column names so they can be compared across months.
colnames(February)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List March's column names so they can be compared across months.
colnames(March)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List April's column names so they can be compared across months.
colnames(April)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List May's column names so they can be compared across months.
colnames(May)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List June's column names so they can be compared across months.
colnames(June)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List July's column names so they can be compared across months.
colnames(July)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List August's column names so they can be compared across months.
colnames(August)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List September's column names so they can be compared across months.
colnames(September)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List October's column names so they can be compared across months.
colnames(October)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List November's column names so they can be compared across months.
colnames(November)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
# List December's column names so they can be compared across months.
colnames(December)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Checking structure

# Print the dimensions, column types and leading values of the January table.
str(January)
## spc_tbl_ [190,301 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:190301] "F96D5A74A3E41399" "13CB7EB698CEDB88" "BD88A2E670661CE5" "C90792D034FED968" ...
##  $ rideable_type     : chr [1:190301] "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:190301], format: "2023-01-21 20:05:42" "2023-01-10 15:37:36" ...
##  $ ended_at          : POSIXct[1:190301], format: "2023-01-21 20:16:33" "2023-01-10 15:46:05" ...
##  $ start_station_name: chr [1:190301] "Lincoln Ave & Fullerton Ave" "Kimbark Ave & 53rd St" "Western Ave & Lunt Ave" "Kimbark Ave & 53rd St" ...
##  $ start_station_id  : chr [1:190301] "TA1309000058" "TA1309000037" "RP-005" "TA1309000037" ...
##  $ end_station_name  : chr [1:190301] "Hampden Ct & Diversey Ave" "Greenwood Ave & 47th St" "Valli Produce - Evanston Plaza" "Greenwood Ave & 47th St" ...
##  $ end_station_id    : chr [1:190301] "202480.0" "TA1308000002" "599" "TA1308000002" ...
##  $ start_lat         : num [1:190301] 41.9 41.8 42 41.8 41.8 ...
##  $ start_lng         : num [1:190301] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:190301] 41.9 41.8 42 41.8 41.8 ...
##  $ end_lng           : num [1:190301] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:190301] "member" "member" "casual" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the February table.
str(February)
## spc_tbl_ [190,445 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:190445] "CBCD0D7777F0E45F" "F3EC5FCE5FF39DE9" "E54C1F27FA9354FF" "3D561E04F739CC45" ...
##  $ rideable_type     : chr [1:190445] "classic_bike" "electric_bike" "classic_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:190445], format: "2023-02-14 11:59:42" "2023-02-15 13:53:48" ...
##  $ ended_at          : POSIXct[1:190445], format: "2023-02-14 12:13:38" "2023-02-15 13:59:08" ...
##  $ start_station_name: chr [1:190445] "Southport Ave & Clybourn Ave" "Clarendon Ave & Gordon Ter" "Southport Ave & Clybourn Ave" "Southport Ave & Clybourn Ave" ...
##  $ start_station_id  : chr [1:190445] "TA1309000030" "13379" "TA1309000030" "TA1309000030" ...
##  $ end_station_name  : chr [1:190445] "Clark St & Schiller St" "Sheridan Rd & Lawrence Ave" "Aberdeen St & Monroe St" "Franklin St & Adams St (Temp)" ...
##  $ end_station_id    : chr [1:190445] "TA1309000024" "TA1309000041" "13156" "TA1309000008" ...
##  $ start_lat         : num [1:190445] 41.9 42 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:190445] -87.7 -87.6 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num [1:190445] 41.9 42 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:190445] -87.6 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:190445] "casual" "casual" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the March table.
str(March)
## spc_tbl_ [258,678 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:258678] "6842AA605EE9FBB3" "F984267A75B99A8C" "FF7CF57CFE026D02" "6B61B916032CB6D6" ...
##  $ rideable_type     : chr [1:258678] "electric_bike" "electric_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:258678], format: "2023-03-16 08:20:34" "2023-03-04 14:07:06" ...
##  $ ended_at          : POSIXct[1:258678], format: "2023-03-16 08:22:52" "2023-03-04 14:15:31" ...
##  $ start_station_name: chr [1:258678] "Clark St & Armitage Ave" "Public Rack - Kedzie Ave & Argyle St" "Orleans St & Chestnut St (NEXT Apts)" "Desplaines St & Kinzie St" ...
##  $ start_station_id  : chr [1:258678] "13146" "491" "620" "TA1306000003" ...
##  $ end_station_name  : chr [1:258678] "Larrabee St & Webster Ave" NA "Clark St & Randolph St" "Sheffield Ave & Kingsbury St" ...
##  $ end_station_id    : chr [1:258678] "13193" NA "TA1305000030" "13154" ...
##  $ start_lat         : num [1:258678] 41.9 42 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:258678] -87.6 -87.7 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:258678] 41.9 42 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:258678] -87.6 -87.7 -87.6 -87.7 -87.7 ...
##  $ member_casual     : chr [1:258678] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the April table.
str(April)
## spc_tbl_ [426,590 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:426590] "8FE8F7D9C10E88C7" "34E4ED3ADF1D821B" "5296BF07A2F77CB5" "40759916B76D5D52" ...
##  $ rideable_type     : chr [1:426590] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:426590], format: "2023-04-02 08:37:28" "2023-04-19 11:29:02" ...
##  $ ended_at          : POSIXct[1:426590], format: "2023-04-02 08:41:37" "2023-04-19 11:52:12" ...
##  $ start_station_name: chr [1:426590] NA NA NA NA ...
##  $ start_station_id  : chr [1:426590] NA NA NA NA ...
##  $ end_station_name  : chr [1:426590] NA NA NA NA ...
##  $ end_station_id    : chr [1:426590] NA NA NA NA ...
##  $ start_lat         : num [1:426590] 41.8 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:426590] -87.6 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:426590] 41.8 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:426590] -87.6 -87.7 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr [1:426590] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the May table.
str(May)
## spc_tbl_ [604,827 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:604827] "0D9FA920C3062031" "92485E5FB5888ACD" "FB144B3FC8300187" "DDEB93BC2CE9AA77" ...
##  $ rideable_type     : chr [1:604827] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:604827], format: "2023-05-07 19:53:48" "2023-05-06 18:54:08" ...
##  $ ended_at          : POSIXct[1:604827], format: "2023-05-07 19:58:32" "2023-05-06 19:03:35" ...
##  $ start_station_name: chr [1:604827] "Southport Ave & Belmont Ave" "Southport Ave & Belmont Ave" "Halsted St & 21st St" "Carpenter St & Huron St" ...
##  $ start_station_id  : chr [1:604827] "13229" "13229" "13162" "13196" ...
##  $ end_station_name  : chr [1:604827] NA NA NA "Damen Ave & Cortland St" ...
##  $ end_station_id    : chr [1:604827] NA NA NA "13133" ...
##  $ start_lat         : num [1:604827] 41.9 41.9 41.9 41.9 42 ...
##  $ start_lng         : num [1:604827] -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ end_lat           : num [1:604827] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:604827] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:604827] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the June table.
str(June)
## spc_tbl_ [719,618 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:719618] "6F1682AC40EB6F71" "622A1686D64948EB" "3C88859D926253B4" "EAD8A5E0259DEC88" ...
##  $ rideable_type     : chr [1:719618] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:719618], format: "2023-06-05 13:34:12" "2023-06-05 01:30:22" ...
##  $ ended_at          : POSIXct[1:719618], format: "2023-06-05 14:31:56" "2023-06-05 01:33:06" ...
##  $ start_station_name: chr [1:719618] NA NA NA NA ...
##  $ start_station_id  : chr [1:719618] NA NA NA NA ...
##  $ end_station_name  : chr [1:719618] NA NA NA NA ...
##  $ end_station_id    : chr [1:719618] NA NA NA NA ...
##  $ start_lat         : num [1:719618] 41.9 41.9 42 42 42 ...
##  $ start_lng         : num [1:719618] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:719618] 41.9 41.9 41.9 42 42 ...
##  $ end_lng           : num [1:719618] -87.7 -87.7 -87.6 -87.7 -87.7 ...
##  $ member_casual     : chr [1:719618] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the July table.
str(July)
## spc_tbl_ [767,650 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:767650] "9340B064F0AEE130" "D1460EE3CE0D8AF8" "DF41BE31B895A25E" "9624A293749EF703" ...
##  $ rideable_type     : chr [1:767650] "electric_bike" "classic_bike" "classic_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:767650], format: "2023-07-23 20:06:14" "2023-07-23 17:05:07" ...
##  $ ended_at          : POSIXct[1:767650], format: "2023-07-23 20:22:44" "2023-07-23 17:18:37" ...
##  $ start_station_name: chr [1:767650] "Kedzie Ave & 110th St" "Western Ave & Walton St" "Western Ave & Walton St" "Racine Ave & Randolph St" ...
##  $ start_station_id  : chr [1:767650] "20204" "KA1504000103" "KA1504000103" "13155" ...
##  $ end_station_name  : chr [1:767650] "Public Rack - Racine Ave & 109th Pl" "Milwaukee Ave & Grand Ave" "Damen Ave & Pierce Ave" "Clinton St & Madison St" ...
##  $ end_station_id    : chr [1:767650] "877" "13033" "TA1305000041" "TA1305000032" ...
##  $ start_lat         : num [1:767650] 41.7 41.9 41.9 41.9 42 ...
##  $ start_lng         : num [1:767650] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:767650] 41.7 41.9 41.9 41.9 42 ...
##  $ end_lng           : num [1:767650] -87.7 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:767650] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the August table.
str(August)
## spc_tbl_ [771,693 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:771693] "903C30C2D810A53B" "F2FB18A98E110A2B" "D0DEC7C94E4663DA" "E0DDDC5F84747ED9" ...
##  $ rideable_type     : chr [1:771693] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:771693], format: "2023-08-19 15:41:53" "2023-08-18 15:30:18" ...
##  $ ended_at          : POSIXct[1:771693], format: "2023-08-19 15:53:36" "2023-08-18 15:45:25" ...
##  $ start_station_name: chr [1:771693] "LaSalle St & Illinois St" "Clark St & Randolph St" "Clark St & Randolph St" "Wells St & Elm St" ...
##  $ start_station_id  : chr [1:771693] "13430" "TA1305000030" "TA1305000030" "KA1504000135" ...
##  $ end_station_name  : chr [1:771693] "Clark St & Elm St" NA NA NA ...
##  $ end_station_id    : chr [1:771693] "TA1307000039" NA NA NA ...
##  $ start_lat         : num [1:771693] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:771693] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:771693] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:771693] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ member_casual     : chr [1:771693] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the September table.
str(September)
## spc_tbl_ [666,371 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:666371] "011C1903BF4E2E28" "87DB80E048A1BF9F" "7C2EB7AF669066E3" "57D197B010269CE3" ...
##  $ rideable_type     : chr [1:666371] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:666371], format: "2023-09-23 00:27:50" "2023-09-02 09:26:43" ...
##  $ ended_at          : POSIXct[1:666371], format: "2023-09-23 00:33:27" "2023-09-02 09:38:19" ...
##  $ start_station_name: chr [1:666371] "Halsted St & Wrightwood Ave" "Clark St & Drummond Pl" "Financial Pl & Ida B Wells Dr" "Clark St & Drummond Pl" ...
##  $ start_station_id  : chr [1:666371] "TA1309000061" "TA1307000142" "SL-010" "TA1307000142" ...
##  $ end_station_name  : chr [1:666371] "Sheffield Ave & Wellington Ave" "Racine Ave & Fullerton Ave" "Racine Ave & 15th St" "Racine Ave & Belmont Ave" ...
##  $ end_station_id    : chr [1:666371] "TA1307000052" "TA1306000026" "13304" "TA1308000019" ...
##  $ start_lat         : num [1:666371] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:666371] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:666371] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:666371] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:666371] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the October table.
str(October)
## spc_tbl_ [537,113 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:537113] "4449097279F8BBE7" "9CF060543CA7B439" "667F21F4D6BDE69C" "F92714CC6B019B96" ...
##  $ rideable_type     : chr [1:537113] "classic_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:537113], format: "2023-10-08 10:36:26" "2023-10-11 17:23:59" ...
##  $ ended_at          : POSIXct[1:537113], format: "2023-10-08 10:49:19" "2023-10-11 17:36:08" ...
##  $ start_station_name: chr [1:537113] "Orleans St & Chestnut St (NEXT Apts)" "Desplaines St & Kinzie St" "Orleans St & Chestnut St (NEXT Apts)" "Desplaines St & Kinzie St" ...
##  $ start_station_id  : chr [1:537113] "620" "TA1306000003" "620" "TA1306000003" ...
##  $ end_station_name  : chr [1:537113] "Sheffield Ave & Webster Ave" "Sheffield Ave & Webster Ave" "Franklin St & Lake St" "Franklin St & Lake St" ...
##  $ end_station_id    : chr [1:537113] "TA1309000033" "TA1309000033" "TA1307000111" "TA1307000111" ...
##  $ start_lat         : num [1:537113] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:537113] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:537113] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:537113] -87.7 -87.7 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:537113] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Print the dimensions, column types and leading values of the November table.
str(November)
## spc_tbl_ [362,518 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:362518] "4EAD8F1AD547356B" "6322270563BF5470" "B37BDE091ECA38E0" "CF0CA5DD26E4F90E" ...
##  $ rideable_type     : chr [1:362518] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:362518], format: "2023-11-30 21:50:05" "2023-11-03 09:44:02" ...
##  $ ended_at          : POSIXct[1:362518], format: "2023-11-30 22:13:27" "2023-11-03 10:17:15" ...
##  $ start_station_name: chr [1:362518] "Millennium Park" "Broadway & Sheridan Rd" "State St & Pearson St" "Theater on the Lake" ...
##  $ start_station_id  : chr [1:362518] "13008" "13323" "TA1307000061" "TA1308000001" ...
##  $ end_station_name  : chr [1:362518] "Pine Grove Ave & Waveland Ave" "Broadway & Sheridan Rd" "State St & Pearson St" "Theater on the Lake" ...
##  $ end_station_id    : chr [1:362518] "TA1307000150" "13323" "TA1307000061" "TA1308000001" ...
##  $ start_lat         : num [1:362518] 41.9 42 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:362518] -87.6 -87.7 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:362518] 41.9 42 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:362518] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:362518] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
str (December)
## spc_tbl_ [224,073 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:224073] "C9BD54F578F57246" "CDBD92F067FA620E" "ABC0858E52CBFC84" "F44B6F0E8F76DC90" ...
##  $ rideable_type     : chr [1:224073] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:224073], format: "2023-12-02 18:44:01" "2023-12-02 18:48:19" ...
##  $ ended_at          : POSIXct[1:224073], format: "2023-12-02 18:47:51" "2023-12-02 18:54:48" ...
##  $ start_station_name: chr [1:224073] NA NA NA NA ...
##  $ start_station_id  : chr [1:224073] NA NA NA NA ...
##  $ end_station_name  : chr [1:224073] NA NA NA NA ...
##  $ end_station_id    : chr [1:224073] NA NA NA NA ...
##  $ start_lat         : num [1:224073] 41.9 41.9 41.9 42 41.9 ...
##  $ start_lng         : num [1:224073] -87.7 -87.7 -87.6 -87.7 -87.6 ...
##  $ end_lat           : num [1:224073] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:224073] -87.7 -87.6 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:224073] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Aggregating and inspecting aggregated data

# Stack the twelve monthly tables into a single table of trips.
# bind_rows() matches columns by name across its inputs.
Annual_trips <- bind_rows(
  January, February, March,
  April, May, June,
  July, August, September,
  October, November, December
)
colnames (Annual_trips)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
nrow (Annual_trips)
## [1] 5719877
dim (Annual_trips)
## [1] 5719877      13
head (Annual_trips)
## # A tibble: 6 × 13
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 F96D5A74A3E41399 electric_bike 2023-01-21 20:05:42 2023-01-21 20:16:33
## 2 13CB7EB698CEDB88 classic_bike  2023-01-10 15:37:36 2023-01-10 15:46:05
## 3 BD88A2E670661CE5 electric_bike 2023-01-02 07:51:57 2023-01-02 08:05:11
## 4 C90792D034FED968 classic_bike  2023-01-22 10:52:58 2023-01-22 11:01:44
## 5 3397017529188E8A classic_bike  2023-01-12 13:58:01 2023-01-12 14:13:20
## 6 58E68156DAE3E311 electric_bike 2023-01-31 07:18:03 2023-01-31 07:21:16
## # ℹ 9 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## #   start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>
str (Annual_trips)
## spc_tbl_ [5,719,877 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5719877] "F96D5A74A3E41399" "13CB7EB698CEDB88" "BD88A2E670661CE5" "C90792D034FED968" ...
##  $ rideable_type     : chr [1:5719877] "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:5719877], format: "2023-01-21 20:05:42" "2023-01-10 15:37:36" ...
##  $ ended_at          : POSIXct[1:5719877], format: "2023-01-21 20:16:33" "2023-01-10 15:46:05" ...
##  $ start_station_name: chr [1:5719877] "Lincoln Ave & Fullerton Ave" "Kimbark Ave & 53rd St" "Western Ave & Lunt Ave" "Kimbark Ave & 53rd St" ...
##  $ start_station_id  : chr [1:5719877] "TA1309000058" "TA1309000037" "RP-005" "TA1309000037" ...
##  $ end_station_name  : chr [1:5719877] "Hampden Ct & Diversey Ave" "Greenwood Ave & 47th St" "Valli Produce - Evanston Plaza" "Greenwood Ave & 47th St" ...
##  $ end_station_id    : chr [1:5719877] "202480.0" "TA1308000002" "599" "TA1308000002" ...
##  $ start_lat         : num [1:5719877] 41.9 41.8 42 41.8 41.8 ...
##  $ start_lng         : num [1:5719877] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:5719877] 41.9 41.8 42 41.8 41.8 ...
##  $ end_lng           : num [1:5719877] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:5719877] "member" "member" "casual" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary (Annual_trips)
##    ride_id          rideable_type        started_at                    
##  Length:5719877     Length:5719877     Min.   :2023-01-01 00:01:58.00  
##  Class :character   Class :character   1st Qu.:2023-05-21 12:50:44.00  
##  Mode  :character   Mode  :character   Median :2023-07-20 18:02:50.00  
##                                        Mean   :2023-07-16 10:27:50.01  
##                                        3rd Qu.:2023-09-16 20:08:49.00  
##                                        Max.   :2023-12-31 23:59:38.00  
##                                                                        
##     ended_at                      start_station_name start_station_id  
##  Min.   :2023-01-01 00:02:41.00   Length:5719877     Length:5719877    
##  1st Qu.:2023-05-21 13:14:09.00   Class :character   Class :character  
##  Median :2023-07-20 18:19:47.00   Mode  :character   Mode  :character  
##  Mean   :2023-07-16 10:46:00.18                                        
##  3rd Qu.:2023-09-16 20:28:10.00                                        
##  Max.   :2024-01-01 23:50:51.00                                        
##                                                                        
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:5719877     Length:5719877     Min.   :41.63   Min.   :-87.94  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.65  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :42.07   Max.   :-87.46  
##                                                                        
##     end_lat         end_lng       member_casual     
##  Min.   : 0.00   Min.   :-88.16   Length:5719877    
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character  
##  Median :41.90   Median :-87.64   Mode  :character  
##  Mean   :41.90   Mean   :-87.65                     
##  3rd Qu.:41.93   3rd Qu.:-87.63                     
##  Max.   :42.18   Max.   :  0.00                     
##  NA's   :6990    NA's   :6990
n_distinct(Annual_trips$rideable_type)
## [1] 3
unique (Annual_trips$rideable_type)
## [1] "electric_bike" "classic_bike"  "docked_bike"
n_distinct(Annual_trips$member_casual)
## [1] 2
unique (Annual_trips$member_casual)
## [1] "member" "casual"

Adding more columns

# Derive calendar fields from the trip start time. `date` is a Date; the other
# five fields are stored as text produced by format().
Annual_trips <- Annual_trips %>%
  mutate(
    date = as.Date(started_at),               # calendar date of the start
    month = format(date, "%m"),               # two-digit month
    day = format(date, "%d"),                 # two-digit day of month
    year = format(date, "%Y"),                # four-digit year
    weekday = format(date, "%A"),             # full weekday name
    starting_hour = format(started_at, "%H")  # two-digit hour of the start
  )

# Adds season column, derived from the two-digit month text. A month value
# matching none of the four sets would yield NA.
Annual_trips <- mutate(
  Annual_trips,
  season = case_when(
    month %in% c("12", "01", "02") ~ "winter",
    month %in% c("03", "04", "05") ~ "spring",
    month %in% c("06", "07", "08") ~ "summer",
    month %in% c("09", "10", "11") ~ "fall"
  )
)

# Adds ride_length column: time from start to end of the trip, in minutes,
# rounded to two decimals. The column is a difftime object at this point.
Annual_trips <- Annual_trips %>%
  mutate(
    ride_length = round(difftime(ended_at, started_at, units = "mins"), 2)
  )

# Adds route column of the form "<start station> to <end station>".
# str_c() propagates missing values, so a trip lacking either station name
# gets route = NA rather than text such as "NA to NA", which paste() would
# produce by converting NA to the string "NA".
Annual_trips <- Annual_trips %>%
  mutate(route = str_c(start_station_name, " to ", end_station_name))

Inspecting columns

colnames (Annual_trips)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"      "date"               "month"             
## [16] "day"                "year"               "weekday"           
## [19] "starting_hour"      "season"             "ride_length"       
## [22] "route"
summary(Annual_trips)
##    ride_id          rideable_type        started_at                    
##  Length:5719877     Length:5719877     Min.   :2023-01-01 00:01:58.00  
##  Class :character   Class :character   1st Qu.:2023-05-21 12:50:44.00  
##  Mode  :character   Mode  :character   Median :2023-07-20 18:02:50.00  
##                                        Mean   :2023-07-16 10:27:50.01  
##                                        3rd Qu.:2023-09-16 20:08:49.00  
##                                        Max.   :2023-12-31 23:59:38.00  
##                                                                        
##     ended_at                      start_station_name start_station_id  
##  Min.   :2023-01-01 00:02:41.00   Length:5719877     Length:5719877    
##  1st Qu.:2023-05-21 13:14:09.00   Class :character   Class :character  
##  Median :2023-07-20 18:19:47.00   Mode  :character   Mode  :character  
##  Mean   :2023-07-16 10:46:00.18                                        
##  3rd Qu.:2023-09-16 20:28:10.00                                        
##  Max.   :2024-01-01 23:50:51.00                                        
##                                                                        
##  end_station_name   end_station_id       start_lat       start_lng     
##  Length:5719877     Length:5719877     Min.   :41.63   Min.   :-87.94  
##  Class :character   Class :character   1st Qu.:41.88   1st Qu.:-87.66  
##  Mode  :character   Mode  :character   Median :41.90   Median :-87.64  
##                                        Mean   :41.90   Mean   :-87.65  
##                                        3rd Qu.:41.93   3rd Qu.:-87.63  
##                                        Max.   :42.07   Max.   :-87.46  
##                                                                        
##     end_lat         end_lng       member_casual           date           
##  Min.   : 0.00   Min.   :-88.16   Length:5719877     Min.   :2023-01-01  
##  1st Qu.:41.88   1st Qu.:-87.66   Class :character   1st Qu.:2023-05-21  
##  Median :41.90   Median :-87.64   Mode  :character   Median :2023-07-20  
##  Mean   :41.90   Mean   :-87.65                      Mean   :2023-07-15  
##  3rd Qu.:41.93   3rd Qu.:-87.63                      3rd Qu.:2023-09-16  
##  Max.   :42.18   Max.   :  0.00                      Max.   :2023-12-31  
##  NA's   :6990    NA's   :6990                                            
##     month               day                year             weekday         
##  Length:5719877     Length:5719877     Length:5719877     Length:5719877    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  starting_hour         season          ride_length          route          
##  Length:5719877     Length:5719877     Length:5719877    Length:5719877    
##  Class :character   Class :character   Class :difftime   Class :character  
##  Mode  :character   Mode  :character   Mode  :numeric    Mode  :character  
##                                                                            
##                                                                            
##                                                                            
## 
str (Annual_trips)
## tibble [5,719,877 × 22] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5719877] "F96D5A74A3E41399" "13CB7EB698CEDB88" "BD88A2E670661CE5" "C90792D034FED968" ...
##  $ rideable_type     : chr [1:5719877] "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:5719877], format: "2023-01-21 20:05:42" "2023-01-10 15:37:36" ...
##  $ ended_at          : POSIXct[1:5719877], format: "2023-01-21 20:16:33" "2023-01-10 15:46:05" ...
##  $ start_station_name: chr [1:5719877] "Lincoln Ave & Fullerton Ave" "Kimbark Ave & 53rd St" "Western Ave & Lunt Ave" "Kimbark Ave & 53rd St" ...
##  $ start_station_id  : chr [1:5719877] "TA1309000058" "TA1309000037" "RP-005" "TA1309000037" ...
##  $ end_station_name  : chr [1:5719877] "Hampden Ct & Diversey Ave" "Greenwood Ave & 47th St" "Valli Produce - Evanston Plaza" "Greenwood Ave & 47th St" ...
##  $ end_station_id    : chr [1:5719877] "202480.0" "TA1308000002" "599" "TA1308000002" ...
##  $ start_lat         : num [1:5719877] 41.9 41.8 42 41.8 41.8 ...
##  $ start_lng         : num [1:5719877] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:5719877] 41.9 41.8 42 41.8 41.8 ...
##  $ end_lng           : num [1:5719877] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:5719877] "member" "member" "casual" "member" ...
##  $ date              : Date[1:5719877], format: "2023-01-21" "2023-01-10" ...
##  $ month             : chr [1:5719877] "01" "01" "01" "01" ...
##  $ day               : chr [1:5719877] "21" "10" "02" "22" ...
##  $ year              : chr [1:5719877] "2023" "2023" "2023" "2023" ...
##  $ weekday           : chr [1:5719877] "Saturday" "Tuesday" "Monday" "Sunday" ...
##  $ starting_hour     : chr [1:5719877] "20" "15" "07" "10" ...
##  $ season            : chr [1:5719877] "winter" "winter" "winter" "winter" ...
##  $ ride_length       : 'difftime' num [1:5719877] 10.85 8.48 13.23 8.77 ...
##   ..- attr(*, "units")= chr "mins"
##  $ route             : chr [1:5719877] "Lincoln Ave & Fullerton Ave to Hampden Ct & Diversey Ave" "Kimbark Ave & 53rd St to Greenwood Ave & 47th St" "Western Ave & Lunt Ave to Valli Produce - Evanston Plaza" "Kimbark Ave & 53rd St to Greenwood Ave & 47th St" ...
is.numeric(Annual_trips$ride_length)
## [1] FALSE
# Convert the difftime column to plain numeric minutes. Asking as.numeric()
# for the unit directly avoids a difftime -> text -> number round trip.
Annual_trips$ride_length <- as.numeric(Annual_trips$ride_length, units = "mins")
is.numeric(Annual_trips$ride_length)
## [1] TRUE

Removing bad data

# Keep only rides with a positive duration; filter() also discards rows where
# the comparison evaluates to NA.
Annual_trips_v2 <- filter(Annual_trips, ride_length > 0)

# With no columns named, drop_na() removes every row that has an NA in any
# column.
# NOTE(review): this also removes trips with no recorded start or end station
# name - confirm that is intended for the comparisons that follow.
Annual_trips_v3 <- Annual_trips_v2 %>%
  drop_na()
any(duplicated(Annual_trips_v3$ride_id))
## [1] FALSE

Summary of ride length

summary (Annual_trips_v3$ride_length)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.02     5.62     9.80    15.95    17.48 12136.30

Comparing ride lengths of members and casual users

# Each call below splits ride_length (minutes) by member_casual and applies
# one summary function per group. The formula is written with `$` columns, so
# the printed column headers carry the `Annual_trips_v3$` prefix.

# Mean ride length per rider type.
aggregate(Annual_trips_v3$ride_length ~ Annual_trips_v3$member_casual, FUN=mean)
##   Annual_trips_v3$member_casual Annual_trips_v3$ride_length
## 1                        casual                     22.9402
## 2                        member                     12.1326
# Median ride length per rider type.
aggregate(Annual_trips_v3$ride_length ~ Annual_trips_v3$member_casual, FUN=median)
##   Annual_trips_v3$member_casual Annual_trips_v3$ride_length
## 1                        casual                       12.75
## 2                        member                        8.62
# Longest ride per rider type.
aggregate(Annual_trips_v3$ride_length ~ Annual_trips_v3$member_casual, FUN=max)
##   Annual_trips_v3$member_casual Annual_trips_v3$ride_length
## 1                        casual                    12136.30
## 2                        member                     1497.87
# Shortest ride per rider type.
aggregate(Annual_trips_v3$ride_length ~ Annual_trips_v3$member_casual, FUN=min)
##   Annual_trips_v3$member_casual Annual_trips_v3$ride_length
## 1                        casual                        0.02
## 2                        member                        0.02

Analyzing total number of rides by members and casual users

# Total number of rides per rider type. count() groups, tallies and returns
# the groups in sorted order in one step.
Annual_trips_v3 %>%
  count(member_casual, name = "num_of_rides")
## # A tibble: 2 × 2
##   member_casual num_of_rides
##   <chr>                <int>
## 1 casual             1531530
## 2 member             2799608

Analyzing the number of rides and average ride lengths of members and casual users by weekday

# Ride count and mean ride length per rider type and day of the week.
# wday(label = TRUE) yields an ordered factor, so the rows sort in calendar
# order instead of alphabetically.
Annual_trips_v3 %>%
  group_by(member_casual, weekday = wday(started_at, label = TRUE)) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual weekday number_of_rides average_duration
##    <chr>         <ord>             <int>            <dbl>
##  1 casual        Sun              254711             26.6
##  2 casual        Mon              175382             22.5
##  3 casual        Tue              181510             20.5
##  4 casual        Wed              183065             19.6
##  5 casual        Thu              198905             20.0
##  6 casual        Fri              227828             22.3
##  7 casual        Sat              310129             25.9
##  8 member        Sun              307821             13.6
##  9 member        Mon              386651             11.6
## 10 member        Tue              448779             11.6
## 11 member        Wed              452679             11.6
## 12 member        Thu              452610             11.6
## 13 member        Fri              400471             12.0
## 14 member        Sat              350597             13.6

Analyzing the number of rides and average ride lengths of members and casual users by month

# Ride count and mean ride length per rider type and month.
# month(label = TRUE) yields an ordered factor, so the rows sort in calendar
# order instead of alphabetically.
Annual_trips_v3 %>%
  group_by(member_casual, month = month(started_at, label = TRUE)) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, month)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 24 × 4
## # Groups:   member_casual [2]
##    member_casual month number_of_rides average_duration
##    <chr>         <ord>           <int>            <dbl>
##  1 casual        Jan             29618             14.9
##  2 casual        Feb             32774             17.7
##  3 casual        Mar             46786             16.7
##  4 casual        Apr            110526             22.6
##  5 casual        May            177025             24.5
##  6 casual        Jun            219778             24.1
##  7 casual        Jul            245254             25.2
##  8 casual        Aug            233819             24.4
##  9 casual        Sep            196938             23.5
## 10 casual        Oct            130269             21.4
## # ℹ 14 more rows

Analyzing the most used type of bike by members and casual users

# Total number of rides per bike type, across all riders. count() returns the
# bike types in sorted order.
Annual_trips_v3 %>%
  count(rideable_type, name = "num_of_rides")
## # A tibble: 3 × 2
##   rideable_type num_of_rides
##   <chr>                <int>
## 1 classic_bike       2690570
## 2 docked_bike          76124
## 3 electric_bike      1564444
# Number of rides for every bike type / rider type combination that occurs in
# the data, ordered by bike type and then rider type.
Annual_trips_v3 %>%
  group_by(rideable_type, member_casual) %>%
  summarise(
    num_of_rides = n()
  ) %>%
  arrange(rideable_type, member_casual)
## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.
## # A tibble: 5 × 3
## # Groups:   rideable_type [3]
##   rideable_type member_casual num_of_rides
##   <chr>         <chr>                <int>
## 1 classic_bike  casual              872952
## 2 classic_bike  member             1817618
## 3 docked_bike   casual               76124
## 4 electric_bike casual              582454
## 5 electric_bike member              981990

Analyzing the most used routes by casual users

# Number of casual-rider trips per route. Filtering to casual riders before
# grouping avoids aggregating every member row only to discard it afterwards;
# the resulting table is the same.
Top_routes <- Annual_trips_v3 %>%
  filter(member_casual == "casual") %>%
  group_by(route, member_casual) %>%
  summarise(number_of_rides = n()) %>%
  arrange(route, member_casual)
## `summarise()` has grouped output by 'route'. You can override using the
## `.groups` argument.
# Show the ten routes with the most rides, busiest first.
Top_routes %>%
  arrange(desc(number_of_rides)) %>%
  head(10)
## # A tibble: 10 × 3
## # Groups:   route [10]
##    route                                           member_casual number_of_rides
##    <chr>                                           <chr>                   <int>
##  1 Streeter Dr & Grand Ave to Streeter Dr & Grand… casual                   8648
##  2 DuSable Lake Shore Dr & Monroe St to DuSable L… casual                   6730
##  3 DuSable Lake Shore Dr & Monroe St to Streeter … casual                   4626
##  4 Michigan Ave & Oak St to Michigan Ave & Oak St  casual                   4258
##  5 Millennium Park to Millennium Park              casual                   3419
##  6 Dusable Harbor to Dusable Harbor                casual                   2796
##  7 Montrose Harbor to Montrose Harbor              casual                   2528
##  8 Streeter Dr & Grand Ave to DuSable Lake Shore … casual                   2352
##  9 DuSable Lake Shore Dr & North Blvd to DuSable … casual                   2042
## 10 Ellis Ave & 60th St to Ellis Ave & 55th St      casual                   2037

Removing some columns and exporting to CSV

# Drop the station id, timestamp and coordinate columns before export.
Annual_trips_v3 <- Annual_trips_v3 %>%
  select(
    -start_station_id, -end_station_id,
    -started_at, -ended_at,
    -start_lat, -start_lng, -end_lat, -end_lng
  )

# Export the remaining columns. row.names = FALSE keeps write.csv() from
# adding an unnamed column of row numbers as the first field of the file.
Annual_Trips_2023 <- Annual_trips_v3
write.csv(Annual_Trips_2023, file = "Annual_Trips_2023.csv", row.names = FALSE)