Capstone Case Study

Cyclistic Tripdata

Processing the data: R studios was used for this phase of the analysis process to clean the data and confirm that it is complete, correct and relevant. First, the relevant packages have to be installed and loaded.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(skimr)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(readr)

Loading datasets into the environment with the correct naming conventions.

January_2024 <- read_csv("202401-divvy-tripdata.csv")

## Rows: 144873 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

February_2024 <- read_csv("202402-divvy-tripdata.csv")

## Rows: 223164 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

March_2024 <- read_csv("202403-divvy-tripdata.csv")

## Rows: 301687 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

April_2024 <- read_csv("202404-divvy-tripdata.csv")

## Rows: 415025 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

May_2024 <- read_csv("202405-divvy-tripdata.csv")

## Rows: 609493 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

June_2024 <- read_csv("202406-divvy-tripdata.csv")

## Rows: 710721 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

July_2024 <- read_csv("202407-divvy-tripdata.csv")

## Rows: 748962 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

August_2024 <- read_csv("202408-divvy-tripdata.csv")

## Rows: 755639 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

September_2024 <- read_csv("202409-divvy-tripdata.csv")

## Rows: 821276 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

October_2024 <- read_csv("202410-divvy-tripdata.csv")

## Rows: 616281 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

November_2024 <- read_csv("202411-divvy-tripdata.csv")

## Rows: 335075 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

December_2024 <- read_csv("202412-divvy-tripdata.csv")

## Rows: 178372 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

To facilitate processsing all the datasets would need to be combined into one single data frame. But first confirm that the data sets include the same column names:

colnames(January_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(February_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(March_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(April_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(May_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(June_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(July_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(August_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(September_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(October_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(November_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

colnames(December_2024)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Checking the overall structure of each data set and if they have the same data types:

str(January_2024)

## spc_tbl_ [144,873 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:144873] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
##  $ rideable_type     : chr [1:144873] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:144873], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
##  $ ended_at          : POSIXct[1:144873], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
##  $ start_station_name: chr [1:144873] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
##  $ start_station_id  : chr [1:144873] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
##  $ end_station_name  : chr [1:144873] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
##  $ end_station_id    : chr [1:144873] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
##  $ start_lat         : num [1:144873] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:144873] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:144873] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:144873] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:144873] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(February_2024)

## spc_tbl_ [223,164 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:223164] "FCB05EB1758F85E8" "7FB986AD5D3DE9D6" "40CA13E15B5B470D" "D47A1660919E8861" ...
##  $ rideable_type     : chr [1:223164] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:223164], format: "2024-02-03 14:14:18" "2024-02-05 21:10:06" ...
##  $ ended_at          : POSIXct[1:223164], format: "2024-02-03 14:21:00" "2024-02-05 21:15:44" ...
##  $ start_station_name: chr [1:223164] "Clark St & Newport St" "Michigan Ave & Washington St" "Leavitt St & Armitage Ave" "Southport Ave & Waveland Ave" ...
##  $ start_station_id  : chr [1:223164] "632" "13001" "TA1309000029" "13235" ...
##  $ end_station_name  : chr [1:223164] "Southport Ave & Waveland Ave" "Wabash Ave & Grand Ave" "Milwaukee Ave & Wabansia Ave" "Southport Ave & Belmont Ave" ...
##  $ end_station_id    : chr [1:223164] "13235" "TA1307000117" "13243" "13229" ...
##  $ start_lat         : num [1:223164] 41.9 41.9 41.9 41.9 41.8 ...
##  $ start_lng         : num [1:223164] -87.7 -87.6 -87.7 -87.7 -87.6 ...
##  $ end_lat           : num [1:223164] 41.9 41.9 41.9 41.9 41.8 ...
##  $ end_lng           : num [1:223164] -87.7 -87.6 -87.7 -87.7 -87.6 ...
##  $ member_casual     : chr [1:223164] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(March_2024)

## spc_tbl_ [301,687 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:301687] "64FBE3BAED5F29E6" "9991629435C5E20E" "E5C9FECD5B71BEBD" "4CEA3EC8906DAEA8" ...
##  $ rideable_type     : chr [1:301687] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:301687], format: "2024-03-05 18:33:11" "2024-03-06 17:15:14" ...
##  $ ended_at          : POSIXct[1:301687], format: "2024-03-05 18:51:48" "2024-03-06 17:16:04" ...
##  $ start_station_name: chr [1:301687] NA NA NA NA ...
##  $ start_station_id  : chr [1:301687] NA NA NA NA ...
##  $ end_station_name  : chr [1:301687] NA NA NA NA ...
##  $ end_station_id    : chr [1:301687] NA NA NA NA ...
##  $ start_lat         : num [1:301687] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:301687] -87.7 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:301687] 42 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:301687] -87.7 -87.6 -87.6 -87.6 -87.7 ...
##  $ member_casual     : chr [1:301687] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(April_2024)

## spc_tbl_ [415,025 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:415025] "743252713F32516B" "BE90D33D2240C614" "D47BBDDE7C40DD61" "6684E760BF9EA9B5" ...
##  $ rideable_type     : chr [1:415025] "classic_bike" "electric_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:415025], format: "2024-04-22 19:08:21" "2024-04-11 06:19:24" ...
##  $ ended_at          : POSIXct[1:415025], format: "2024-04-22 19:12:56" "2024-04-11 06:22:21" ...
##  $ start_station_name: chr [1:415025] "Aberdeen St & Jackson Blvd" "Aberdeen St & Jackson Blvd" "Sheridan Rd & Montrose Ave" "Aberdeen St & Jackson Blvd" ...
##  $ start_station_id  : chr [1:415025] "13157" "13157" "TA1307000107" "13157" ...
##  $ end_station_name  : chr [1:415025] "Desplaines St & Jackson Blvd" "Desplaines St & Jackson Blvd" "Ashland Ave & Belle Plaine Ave" "Desplaines St & Jackson Blvd" ...
##  $ end_station_id    : chr [1:415025] "15539" "15539" "13249" "15539" ...
##  $ start_lat         : num [1:415025] 41.9 41.9 42 41.9 42 ...
##  $ start_lng         : num [1:415025] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:415025] 41.9 41.9 42 41.9 41.9 ...
##  $ end_lng           : num [1:415025] -87.6 -87.6 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:415025] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(May_2024)

## spc_tbl_ [609,493 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:609493] "7D9F0CE9EC2A1297" "02EC47687411416F" "101370FB2D3402BE" "E97E396331ED6913" ...
##  $ rideable_type     : chr [1:609493] "classic_bike" "classic_bike" "classic_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:609493], format: "2024-05-25 15:52:42" "2024-05-14 15:11:51" ...
##  $ ended_at          : POSIXct[1:609493], format: "2024-05-25 16:11:50" "2024-05-14 15:22:00" ...
##  $ start_station_name: chr [1:609493] "Streeter Dr & Grand Ave" "Sheridan Rd & Greenleaf Ave" "Streeter Dr & Grand Ave" "Streeter Dr & Grand Ave" ...
##  $ start_station_id  : chr [1:609493] "13022" "KA1504000159" "13022" "13022" ...
##  $ end_station_name  : chr [1:609493] "Clark St & Elm St" "Sheridan Rd & Loyola Ave" "Wabash Ave & 9th St" "Sheffield Ave & Wellington Ave" ...
##  $ end_station_id    : chr [1:609493] "TA1307000039" "RP-009" "TA1309000010" "TA1307000052" ...
##  $ start_lat         : num [1:609493] 41.9 42 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:609493] -87.6 -87.7 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:609493] 41.9 42 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:609493] -87.6 -87.7 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:609493] "casual" "casual" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(June_2024)

## spc_tbl_ [710,721 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:710721] "CDE6023BE6B11D2F" "462B48CD292B6A18" "9CFB6A858D23ABF7" "6365EFEB64231153" ...
##  $ rideable_type     : chr [1:710721] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:710721], format: "2024-06-11 17:20:06" "2024-06-11 17:19:21" ...
##  $ ended_at          : POSIXct[1:710721], format: "2024-06-11 17:21:39" "2024-06-11 17:19:36" ...
##  $ start_station_name: chr [1:710721] NA NA NA NA ...
##  $ start_station_id  : chr [1:710721] NA NA NA NA ...
##  $ end_station_name  : chr [1:710721] NA NA NA NA ...
##  $ end_station_id    : chr [1:710721] NA NA NA NA ...
##  $ start_lat         : num [1:710721] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:710721] -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:710721] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:710721] -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:710721] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(July_2024)

## spc_tbl_ [748,962 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:748962] "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
##  $ rideable_type     : chr [1:748962] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:748962], format: "2024-07-11 08:15:14" "2024-07-11 15:45:07" ...
##  $ ended_at          : POSIXct[1:748962], format: "2024-07-11 08:17:56" "2024-07-11 16:06:04" ...
##  $ start_station_name: chr [1:748962] NA NA NA NA ...
##  $ start_station_id  : chr [1:748962] NA NA NA NA ...
##  $ end_station_name  : chr [1:748962] NA NA NA NA ...
##  $ end_station_id    : chr [1:748962] NA NA NA NA ...
##  $ start_lat         : num [1:748962] 41.8 41.8 41.8 41.9 42 ...
##  $ start_lng         : num [1:748962] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ end_lat           : num [1:748962] 41.8 41.8 41.8 41.9 41.9 ...
##  $ end_lng           : num [1:748962] -87.6 -87.6 -87.6 -87.7 -87.6 ...
##  $ member_casual     : chr [1:748962] "casual" "casual" "casual" "casual" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(August_2024)

## spc_tbl_ [755,639 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:755639] "BAA154388A869E64" "8752245932EFF67A" "44DDF9F57A9A161F" "44AAAF069B0C78C3" ...
##  $ rideable_type     : chr [1:755639] "classic_bike" "electric_bike" "classic_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:755639], format: "2024-08-02 13:35:14" "2024-08-02 15:33:13" ...
##  $ ended_at          : POSIXct[1:755639], format: "2024-08-02 13:48:24" "2024-08-02 15:55:23" ...
##  $ start_station_name: chr [1:755639] "State St & Randolph St" "Franklin St & Monroe St" "Franklin St & Monroe St" "Clark St & Elm St" ...
##  $ start_station_id  : chr [1:755639] "TA1305000029" "TA1309000007" "TA1309000007" "TA1307000039" ...
##  $ end_station_name  : chr [1:755639] "Wabash Ave & 9th St" "Damen Ave & Cortland St" "Clark St & Elm St" "McClurg Ct & Ohio St" ...
##  $ end_station_id    : chr [1:755639] "TA1309000010" "13133" "TA1307000039" "TA1306000029" ...
##  $ start_lat         : num [1:755639] 41.9 41.9 41.9 41.9 42 ...
##  $ start_lng         : num [1:755639] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:755639] 41.9 41.9 41.9 41.9 42 ...
##  $ end_lng           : num [1:755639] -87.6 -87.7 -87.6 -87.6 -87.7 ...
##  $ member_casual     : chr [1:755639] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(September_2024)

## spc_tbl_ [821,276 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:821276] "31D38723D5A8665A" "67CB39987F4E895B" "DA61204FD26EC681" "06F160D46AF235DD" ...
##  $ rideable_type     : chr [1:821276] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:821276], format: "2024-09-26 15:30:58" "2024-09-26 15:31:32" ...
##  $ ended_at          : POSIXct[1:821276], format: "2024-09-26 15:30:59" "2024-09-26 15:53:13" ...
##  $ start_station_name: chr [1:821276] NA NA NA NA ...
##  $ start_station_id  : chr [1:821276] NA NA NA NA ...
##  $ end_station_name  : chr [1:821276] NA NA NA NA ...
##  $ end_station_id    : chr [1:821276] NA NA NA NA ...
##  $ start_lat         : num [1:821276] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:821276] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:821276] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:821276] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:821276] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(October_2024)

## spc_tbl_ [616,281 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:616281] "4422E707103AA4FF" "19DB722B44CBE82F" "20AE2509FD68C939" "D0F17580AB9515A9" ...
##  $ rideable_type     : chr [1:616281] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:616281], format: "2024-10-14 03:26:04" "2024-10-13 19:33:38" ...
##  $ ended_at          : POSIXct[1:616281], format: "2024-10-14 03:32:56" "2024-10-13 19:39:04" ...
##  $ start_station_name: chr [1:616281] NA NA NA NA ...
##  $ start_station_id  : chr [1:616281] NA NA NA NA ...
##  $ end_station_name  : chr [1:616281] NA NA NA NA ...
##  $ end_station_id    : chr [1:616281] NA NA NA NA ...
##  $ start_lat         : num [1:616281] 42 42 42 42 42 ...
##  $ start_lng         : num [1:616281] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:616281] 42 42 42 42 42 ...
##  $ end_lng           : num [1:616281] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:616281] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(November_2024)

## spc_tbl_ [335,075 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:335075] "578DDD7CE1771FFA" "78B141C50102ABA6" "1E794CF36394E2D7" "E5DD2CAB58D73F98" ...
##  $ rideable_type     : chr [1:335075] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:335075], format: "2024-11-07 19:21:58" "2024-11-22 14:49:00" ...
##  $ ended_at          : POSIXct[1:335075], format: "2024-11-07 19:28:57" "2024-11-22 14:56:15" ...
##  $ start_station_name: chr [1:335075] "Walsh Park" "Walsh Park" "Walsh Park" "Clark St & Elm St" ...
##  $ start_station_id  : chr [1:335075] "18067" "18067" "18067" "TA1307000039" ...
##  $ end_station_name  : chr [1:335075] "Leavitt St & North Ave" "Leavitt St & Armitage Ave" "Damen Ave & Cortland St" "Clark St & Drummond Pl" ...
##  $ end_station_id    : chr [1:335075] "TA1308000005" "TA1309000029" "13133" "TA1307000142" ...
##  $ start_lat         : num [1:335075] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:335075] -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ end_lat           : num [1:335075] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:335075] -87.7 -87.7 -87.7 -87.6 -87.6 ...
##  $ member_casual     : chr [1:335075] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

str(December_2024)

## spc_tbl_ [178,372 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:178372] "6C960DEB4F78854E" "C0913EEB2834E7A2" "848A37DD4723078A" "3FA09C762ECB48BD" ...
##  $ rideable_type     : chr [1:178372] "electric_bike" "classic_bike" "classic_bike" "electric_bike" ...
##  $ started_at        : POSIXct[1:178372], format: "2024-12-31 01:38:35" "2024-12-21 18:41:26" ...
##  $ ended_at          : POSIXct[1:178372], format: "2024-12-31 01:48:45" "2024-12-21 18:47:33" ...
##  $ start_station_name: chr [1:178372] "Halsted St & Roscoe St" "Clark St & Wellington Ave" "Sheridan Rd & Montrose Ave" "Aberdeen St & Jackson Blvd" ...
##  $ start_station_id  : chr [1:178372] "TA1309000025" "TA1307000136" "TA1307000107" "13157" ...
##  $ end_station_name  : chr [1:178372] "Clark St & Winnemac Ave" "Halsted St & Roscoe St" "Broadway & Barry Ave" "Green St & Randolph St*" ...
##  $ end_station_id    : chr [1:178372] "TA1309000035" "TA1309000025" "13137" "chargingstx3" ...
##  $ start_lat         : num [1:178372] 41.9 41.9 42 41.9 41.9 ...
##  $ start_lng         : num [1:178372] -87.6 -87.6 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:178372] 42 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:178372] -87.7 -87.6 -87.6 -87.6 -87.7 ...
##  $ member_casual     : chr [1:178372] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Confirming that the datatypes are consistent across all datasets.

compare_df_cols(January_2024,February_2024,March_2024,April_2024,May_2024,June_2024,July_2024,August_2024,September_2024,October_2024,November_2024,December_2024,return = "mismatch")

##  [1] column_name    January_2024   February_2024  March_2024     April_2024    
##  [6] May_2024       June_2024      July_2024      August_2024    September_2024
## [11] October_2024   November_2024  December_2024 
## <0 rows> (or 0-length row.names)

The function above confirms that there are no mismatched datatypes across all 12 of the data sets. Next, combining these datasets into a single data frame by using the bind_rows() function, stacking them ontop of one another:

combined_trips <- bind_rows(January_2024,February_2024,March_2024,April_2024,May_2024,June_2024,July_2024,August_2024,September_2024,October_2024,November_2024,December_2024)

Running some functions to obtain quick overview of the data frame:

str(combined_trips)

## spc_tbl_ [5,860,568 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5860568] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
##  $ rideable_type     : chr [1:5860568] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:5860568], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
##  $ ended_at          : POSIXct[1:5860568], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
##  $ start_station_name: chr [1:5860568] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
##  $ start_station_id  : chr [1:5860568] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
##  $ end_station_name  : chr [1:5860568] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
##  $ end_station_id    : chr [1:5860568] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
##  $ start_lat         : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:5860568] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

head(combined_trips)

## # A tibble: 6 × 13
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike  2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike  2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike  2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 9 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## #   start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>

skim_without_charts(combined_trips)

Data summary
Name	combined_trips
Number of rows	5860568
Number of columns	13
_______________________
Column type frequency:
character	7
numeric	4
POSIXct	2
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ride_id	0	1.00	16	16	5860357
rideable_type	0	1.00	12	16	3
start_station_name	1073951	0.82	10	64	1808
start_station_id	1073951	0.82	3	35	1763
end_station_name	1104653	0.81	10	64	1815
end_station_id	1104653	0.81	3	35	1768
member_casual	0	1.00	6	6	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100
start_lat	0	1	41.90	0.04	41.64	41.88	41.90	41.93	42.07
start_lng	0	1	-87.65	0.03	-87.91	-87.66	-87.64	-87.63	-87.52
end_lat	7232	1	41.90	0.06	16.06	41.88	41.90	41.93	87.96
end_lng	7232	1	-87.65	0.11	-144.05	-87.66	-87.64	-87.63	152.53

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2024-01-01 00:00:39	2024-12-31 23:56:49	2024-07-22 20:36:16	5649600
ended_at	0	1	2024-01-01 00:04:20	2024-12-31 23:59:55	2024-07-22 20:53:59	5652165

dim(combined_trips)

## [1] 5860568      13

Creating columns that extract the day, month, year and day of the week of each trip using its recorded “started_at” time:

combined_trips$date <- as.Date(combined_trips$started_at)
combined_trips$month <- format(as.Date(combined_trips$date), "%B")
combined_trips$day <- format(as.Date(combined_trips$date), "%d")
combined_trips$year <- format(as.Date(combined_trips$date), "%Y")
combined_trips$day_of_the_week <- weekdays(combined_trips$date)

head(combined_trips)

## # A tibble: 6 × 18
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike  2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike  2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike  2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 14 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## #   start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>,
## #   date <date>, month <chr>, day <chr>, year <chr>, day_of_the_week <chr>

Adding another variable: the ride length; that is, the difference between the “started_at” and “ended_at” time of each trip.

combined_trips$ride_length <- difftime(combined_trips$ended_at, combined_trips$started_at, units = "min")

Reducing the decimal points to two places and changing the datatype to numeric so that arithmetic analysis can be done on the ride_length column:

combined_trips$ride_length <- round(combined_trips$ride_length, 2)

combined_trips$ride_length <- as.numeric(as.character(combined_trips$ride_length))

Checking the structure of the data frame for datatype consistency:

str(combined_trips)

## spc_tbl_ [5,860,568 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5860568] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
##  $ rideable_type     : chr [1:5860568] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:5860568], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
##  $ ended_at          : POSIXct[1:5860568], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
##  $ start_station_name: chr [1:5860568] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
##  $ start_station_id  : chr [1:5860568] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
##  $ end_station_name  : chr [1:5860568] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
##  $ end_station_id    : chr [1:5860568] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
##  $ start_lat         : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.7 ...
##  $ end_lat           : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ member_casual     : chr [1:5860568] "member" "member" "member" "member" ...
##  $ date              : Date[1:5860568], format: "2024-01-12" "2024-01-08" ...
##  $ month             : chr [1:5860568] "January" "January" "January" "January" ...
##  $ day               : chr [1:5860568] "12" "08" "27" "29" ...
##  $ year              : chr [1:5860568] "2024" "2024" "2024" "2024" ...
##  $ day_of_the_week   : chr [1:5860568] "Friday" "Monday" "Saturday" "Monday" ...
##  $ ride_length       : num [1:5860568] 7.53 7.22 8 29.82 26.2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_datetime(format = ""),
##   ..   ended_at = col_datetime(format = ""),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

##Cleaning

Normally, I would need to be advised on what specific data cleaning procedures I am permitted to carry out on the data. To retain the integrity of the original data frame, the original data set needs to be backed up and stored before performing all data cleaning procedures.

Deleting null values:

combined_trips_v2 <- drop_na(combined_trips)

Deleting the entries with negative ride_length since the end time should normally not be earlier than the start time:

combined_trips_v2 <- filter(combined_trips_v2, ride_length > 0)

Extracting the latitude and longitude data from the data set since it will not be needed in this phase of the analysis:

lat_lng1 <- select(combined_trips_v2,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual)

combined_trips_v2 <- combined_trips_v2 %>%
select (-c(start_lat, start_lng, end_lat, end_lng))

head (combined_trips_v2)

## # A tibble: 6 × 15
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike  2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike  2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike  2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 11 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, member_casual <chr>,
## #   date <date>, month <chr>, day <chr>, year <chr>, day_of_the_week <chr>,
## #   ride_length <dbl>

After concluding that the data is in the correct format and has been properly cleaned, moving on to the next phase: Analysis.

##Analysis

In this phase, patterns, trends, and relationships are detected in order to seek a resolution to the business task. Starting with descriptive analysis:

combined_trips_v2 %>%
  group_by (member_casual) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))

## # A tibble: 2 × 3
##   member_casual number_of_rides average_ride_length
##   <chr>                   <int>               <dbl>
## 1 casual                1521531                24.1
## 2 member                2686557                12.5

combined_trips_v2 %>%
  count(rideable_type)

## # A tibble: 3 × 2
##   rideable_type          n
##   <chr>              <int>
## 1 classic_bike     2728069
## 2 electric_bike    1432192
## 3 electric_scooter   47827

combined_trips_v2 %>%
  group_by(member_casual)%>%
  count (rideable_type)

## # A tibble: 6 × 3
## # Groups:   member_casual [2]
##   member_casual rideable_type          n
##   <chr>         <chr>              <int>
## 1 casual        classic_bike      968853
## 2 casual        electric_bike     526938
## 3 casual        electric_scooter   25740
## 4 member        classic_bike     1759216
## 5 member        electric_bike     905254
## 6 member        electric_scooter   22087

summary (combined_trips_v2$ride_length) #This shows a summary of the ride length.

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.01    5.83   10.14   16.66   18.22 1509.37

combined_trips_v2%>%
  group_by(member_casual)%>%
  summarise(number_of_rides=n(), min_ride_length=min(ride_length),max_ride_length=max(ride_length),avg_ride_length=mean(ride_length),median_ride_length=median(ride_length))

## # A tibble: 2 × 6
##   member_casual number_of_rides min_ride_length max_ride_length avg_ride_length
##   <chr>                   <int>           <dbl>           <dbl>           <dbl>
## 1 casual                1521531            0.01           1509.            24.1
## 2 member                2686557            0.01           1498.            12.5
## # ℹ 1 more variable: median_ride_length <dbl>

Ordering the months from January 2024 to December 2024 and the days of the week from Sunday to Saturday:

combined_trips_v2$month <- ordered(combined_trips_v2$month, levels=c( "January", "February", "March", "April", "May", "June", "July","August", "September", "October", "November", "December"))

combined_trips_v2$day_of_the_week <- ordered(combined_trips_v2$day_of_the_week, levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))

combined_trips_v2 %>%
  group_by(member_casual, month) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  arrange (month)

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 24 × 4
## # Groups:   member_casual [2]
##    member_casual month    number_of_rides average_ride_length
##    <chr>         <ord>              <int>               <dbl>
##  1 casual        January            17712                15.5
##  2 member        January            96076                11.6
##  3 casual        February           38168                19.8
##  4 member        February          146553                11.7
##  5 casual        March              62801                22.0
##  6 member        March             167436                11.4
##  7 casual        April              93927                24.8
##  8 member        April             203814                12.3
##  9 casual        May               167524                26.9
## 10 member        May               274711                13.3
## # ℹ 14 more rows

combined_trips_v2 %>%
  group_by(member_casual, day_of_the_week) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  arrange (day_of_the_week)

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual day_of_the_week number_of_rides average_ride_length
##    <chr>         <ord>                     <int>               <dbl>
##  1 casual        Sunday                   266167                27.6
##  2 member        Sunday                   295517                14.0
##  3 casual        Monday                   181652                23.2
##  4 member        Monday                   399237                11.9
##  5 casual        Tuesday                  162825                20.6
##  6 member        Tuesday                  421404                11.9
##  7 casual        Wednesday                187354                21.4
##  8 member        Wednesday                446875                12.2
##  9 casual        Thursday                 183071                20.9
## 10 member        Thursday                 414244                11.9
## 11 casual        Friday                   222632                23.2
## 12 member        Friday                   376721                12.1
## 13 casual        Saturday                 317830                27.3
## 14 member        Saturday                 332559                14.0

Most popular start stations for casual riders:

combined_trips_v2 %>%
  group_by(start_station_name, member_casual) %>%
  summarise(number_of_trips=n()) %>%
  arrange(desc (number_of_trips)) %>%
  filter(member_casual== "casual") %>%
  select(start_station_name, number_of_trips)

## `summarise()` has grouped output by 'start_station_name'. You can override
## using the `.groups` argument.

## # A tibble: 1,570 × 2
## # Groups:   start_station_name [1,570]
##    start_station_name                 number_of_trips
##    <chr>                                        <int>
##  1 Streeter Dr & Grand Ave                      48315
##  2 DuSable Lake Shore Dr & Monroe St            32194
##  3 Michigan Ave & Oak St                        23295
##  4 DuSable Lake Shore Dr & North Blvd           21359
##  5 Millennium Park                              20840
##  6 Shedd Aquarium                               20117
##  7 Dusable Harbor                               17263
##  8 Theater on the Lake                          15398
##  9 Michigan Ave & 8th St                        12556
## 10 Adler Planetarium                            12273
## # ℹ 1,560 more rows

Most popular end stations for casual riders:

combined_trips_v2 %>%
  group_by(end_station_name, member_casual) %>%
  summarise(number_of_trips=n()) %>%
  arrange(desc (number_of_trips)) %>%
  filter(member_casual== "casual") %>%
  select(end_station_name, number_of_trips)

## `summarise()` has grouped output by 'end_station_name'. You can override using
## the `.groups` argument.

## # A tibble: 1,613 × 2
## # Groups:   end_station_name [1,613]
##    end_station_name                   number_of_trips
##    <chr>                                        <int>
##  1 Streeter Dr & Grand Ave                      52435
##  2 DuSable Lake Shore Dr & Monroe St            30120
##  3 DuSable Lake Shore Dr & North Blvd           25129
##  4 Michigan Ave & Oak St                        24187
##  5 Millennium Park                              22912
##  6 Shedd Aquarium                               18307
##  7 Theater on the Lake                          16906
##  8 Dusable Harbor                               15674
##  9 Michigan Ave & 8th St                        11648
## 10 Indiana Ave & Roosevelt Rd                   10703
## # ℹ 1,603 more rows

Creating another version of the data set that combines the start_station_name and the end_station_name to serve as an aggregate representation of each ride route:

combined_trips_v3 <- (unite(combined_trips_v2, "ride_routes", start_station_name, end_station_name, sep= " to "))
head(combined_trips_v3)

## # A tibble: 6 × 14
##   ride_id      rideable_type started_at          ended_at            ride_routes
##   <chr>        <chr>         <dttm>              <dttm>              <chr>      
## 1 C1D650626C8… electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59 Wells St &…
## 2 EECD38BDB25… electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59 Wells St &…
## 3 F4A9CE78061… electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19 Wells St &…
## 4 0A0D9E15EE5… classic_bike  2024-01-29 16:26:17 2024-01-29 16:56:06 Wells St &…
## 5 33FFC9805E3… classic_bike  2024-01-31 05:43:23 2024-01-31 06:09:35 Lincoln Av…
## 6 C96080812CD… classic_bike  2024-01-07 11:21:24 2024-01-07 11:30:03 Wells St &…
## # ℹ 9 more variables: start_station_id <chr>, end_station_id <chr>,
## #   member_casual <chr>, date <date>, month <ord>, day <chr>, year <chr>,
## #   day_of_the_week <ord>, ride_length <dbl>

Extracting data about the most used ride routes and segregating them by rider type:

top_routes <- combined_trips_v3 %>%
  group_by(ride_routes) %>%
  summarise(number_of_rides=n()) %>%
  arrange (desc (number_of_rides))

head (top_routes,10)

## # A tibble: 10 × 2
##    ride_routes                                                   number_of_rides
##    <chr>                                                                   <int>
##  1 Streeter Dr & Grand Ave to Streeter Dr & Grand Ave                       9667
##  2 DuSable Lake Shore Dr & Monroe St to DuSable Lake Shore Dr &…            7917
##  3 Calumet Ave & 33rd St to State St & 33rd St                              5921
##  4 State St & 33rd St to Calumet Ave & 33rd St                              5913
##  5 DuSable Lake Shore Dr & Monroe St to Streeter Dr & Grand Ave             5692
##  6 Ellis Ave & 60th St to Ellis Ave & 55th St                               5437
##  7 Michigan Ave & Oak St to Michigan Ave & Oak St                           5261
##  8 Ellis Ave & 60th St to University Ave & 57th St                          5187
##  9 Ellis Ave & 55th St to Ellis Ave & 60th St                               5137
## 10 University Ave & 57th St to Ellis Ave & 60th St                          4827

top_routes_1 <- combined_trips_v3 %>%
  group_by(ride_routes, member_casual) %>%
  summarise(number_of_rides=n()) %>%
  arrange (desc(number_of_rides))

## `summarise()` has grouped output by 'ride_routes'. You can override using the
## `.groups` argument.

head (top_routes_1, 10)

## # A tibble: 10 × 3
## # Groups:   ride_routes [10]
##    ride_routes                                     member_casual number_of_rides
##    <chr>                                           <chr>                   <int>
##  1 Streeter Dr & Grand Ave to Streeter Dr & Grand… casual                   8871
##  2 DuSable Lake Shore Dr & Monroe St to DuSable L… casual                   7242
##  3 State St & 33rd St to Calumet Ave & 33rd St     member                   5587
##  4 Calumet Ave & 33rd St to State St & 33rd St     member                   5506
##  5 DuSable Lake Shore Dr & Monroe St to Streeter … casual                   5265
##  6 Michigan Ave & Oak St to Michigan Ave & Oak St  casual                   4619
##  7 Ellis Ave & 60th St to University Ave & 57th St member                   3988
##  8 Ellis Ave & 60th St to Ellis Ave & 55th St      member                   3976
##  9 University Ave & 57th St to Ellis Ave & 60th St member                   3942
## 10 Ellis Ave & 55th St to Ellis Ave & 60th St      member                   3837

##Visualizations

Facilitating the identification of patterns and trends in the data in order to gain useful insights:

Number of rides and average ride length by rider type:

combined_trips_v3 %>%
  group_by(member_casual) %>%
  summarise(Average_ride_length=mean(ride_length)) %>%
  ggplot(aes(x= member_casual, y=Average_ride_length, fill=member_casual)) + geom_col() + labs(title = "Average ride length by rider type", x="Rider type", y="Average ride length")

combined_trips_v3 %>%
  group_by(member_casual, month) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=month, y=number_of_rides, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Number of rides per month segmented by rider type", x= "Month", y="Number of rides")+ theme(axis.text.x = element_text(angle = 60, hjust=1))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

combined_trips_v3 %>%
  group_by(member_casual, month) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=month, y=average_ride_length, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Average ride length per month segmented by rider type", x= "Month", y="Average ride length")+ theme(axis.text.x = element_text(angle = 60, hjust=1))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

combined_trips_v3 %>%
  group_by(member_casual, day_of_the_week) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=day_of_the_week, y=number_of_rides, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Number of rides per day of the week", x= "Day of the week", y="Number of rides")

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

combined_trips_v3 %>%
  group_by(member_casual, day_of_the_week) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=day_of_the_week, y=average_ride_length, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Average ride length per week day segmented by rider type", x= "Day of the week", y="Average ride length")+ theme(axis.text.x = element_text(angle = 60, hjust=1))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Number of rides and Average ride length by rideable type:

combined_trips_v3 %>%
  group_by(rideable_type, member_casual) %>%
  summarise(number_of_rides=n()) %>%
  ggplot(aes(x=rideable_type, y=number_of_rides, fill=member_casual)) + geom_col(position = "dodge") + labs(title="Number of rides per rideable type" , x="Rideable Type", y="Number of rides")

## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.

combined_trips_v3 %>%
  group_by(rideable_type, member_casual) %>%
  summarise(Average_ride_length = mean(ride_length)) %>%
  ggplot(aes(x=rideable_type, y=Average_ride_length, fill=member_casual)) + geom_col(position = "dodge") + labs(title="Average ride length per rideable type" , x="Rideable Type", y="Average ride length")

## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.

combined_trips_v3 %>%
  group_by(member_casual, month, rideable_type) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=month, y=number_of_rides, fill=rideable_type)) + geom_col(position="dodge2") + facet_wrap(~member_casual) + labs(title = "Number of rides segmented by rideable types for each month", x="Month", y="Number of rides") + theme(axis.text.x = element_text(angle = 60, hjust=1))

## `summarise()` has grouped output by 'member_casual', 'month'. You can override
## using the `.groups` argument.

combined_trips_v3 %>%
  group_by(member_casual, day_of_the_week, rideable_type) %>%
  summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
  ggplot (aes(x=day_of_the_week, y=number_of_rides, fill=rideable_type)) + geom_col(position = "dodge") + facet_wrap(~member_casual) + labs(title = "Number of rides segmented by rideable types for each day of the week", x="Day of the week", y="Number of rides") + theme(axis.text.x = element_text(angle = 60, hjust=1))

## `summarise()` has grouped output by 'member_casual', 'day_of_the_week'. You can
## override using the `.groups` argument.

combined_trips_v2 %>%
  group_by(start_station_name, member_casual) %>%
  summarise(number_of_trips=n()) %>%
  arrange(desc (number_of_trips)) %>%
  filter(member_casual== "casual", number_of_trips >= 15460) %>%
  select(start_station_name, number_of_trips) %>%
  ggplot(aes(x=start_station_name, y=number_of_trips)) + geom_col(fill="blue") + coord_flip() + labs(title="Top 10 most popular start stations for casual riders", x="Start station name", y="Number of trips")

## `summarise()` has grouped output by 'start_station_name'. You can override
## using the `.groups` argument.

combined_trips_v2 %>%
  group_by(end_station_name, member_casual) %>%
  summarise(number_of_trips=n()) %>%
  arrange(desc (number_of_trips)) %>%
  filter(member_casual== "casual", number_of_trips >= 15596) %>%
  select(end_station_name, number_of_trips) %>%
  ggplot(aes(x=end_station_name, y=number_of_trips)) + geom_col(fill="red") + coord_flip() + labs(title="Top 10 most popular end stations for casual riders", x="End station name", y="Number of trips")

## `summarise()` has grouped output by 'end_station_name'. You can override using
## the `.groups` argument.

Saving these data frames as CSV files on my local desktop for further analysis and visualizations on other platforms.

write.csv(top_routes,"C:\\Users\\Gerald Ntune\\Desktop\\Cyclist project\\top_routes.csv", row.names=FALSE)

write.csv(combined_trips_v3,"C:\\Users\\Gerald Ntune\\Desktop\\Cyclist project\\combined_trips_v3.csv", row.names=FALSE)

write.csv(top_routes_1,"C:\\Users\\Temi\\Desktop\\Cyclist project\\top_routes_1.csv", row.names=FALSE)

write.csv(lat_lng1,"C:\\Users\\Temi\\Desktop\\Cyclist project\\lat_lng1.csv", row.names=FALSE)

Capstone Case Study

Gerald Ntune

2025-03-19

Cyclistic Tripdata