Processing the data: R studios was used for this phase of the analysis process to clean the data and confirm that it is complete, correct and relevant. First, the relevant packages have to be installed and loaded.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readr)
Loading datasets into the environment with the correct naming conventions.
January_2024 <- read_csv("202401-divvy-tripdata.csv")
## Rows: 144873 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
February_2024 <- read_csv("202402-divvy-tripdata.csv")
## Rows: 223164 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
March_2024 <- read_csv("202403-divvy-tripdata.csv")
## Rows: 301687 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
April_2024 <- read_csv("202404-divvy-tripdata.csv")
## Rows: 415025 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
May_2024 <- read_csv("202405-divvy-tripdata.csv")
## Rows: 609493 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
June_2024 <- read_csv("202406-divvy-tripdata.csv")
## Rows: 710721 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
July_2024 <- read_csv("202407-divvy-tripdata.csv")
## Rows: 748962 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
August_2024 <- read_csv("202408-divvy-tripdata.csv")
## Rows: 755639 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
September_2024 <- read_csv("202409-divvy-tripdata.csv")
## Rows: 821276 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
October_2024 <- read_csv("202410-divvy-tripdata.csv")
## Rows: 616281 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
November_2024 <- read_csv("202411-divvy-tripdata.csv")
## Rows: 335075 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
December_2024 <- read_csv("202412-divvy-tripdata.csv")
## Rows: 178372 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
To facilitate processsing all the datasets would need to be combined into one single data frame. But first confirm that the data sets include the same column names:
colnames(January_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(February_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(March_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(April_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(May_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(June_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(July_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(August_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(September_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(October_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(November_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(December_2024)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Checking the overall structure of each data set and if they have the same data types:
str(January_2024)
## spc_tbl_ [144,873 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:144873] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
## $ rideable_type : chr [1:144873] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:144873], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
## $ ended_at : POSIXct[1:144873], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
## $ start_station_name: chr [1:144873] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
## $ start_station_id : chr [1:144873] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
## $ end_station_name : chr [1:144873] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
## $ end_station_id : chr [1:144873] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
## $ start_lat : num [1:144873] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:144873] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:144873] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:144873] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr [1:144873] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(February_2024)
## spc_tbl_ [223,164 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:223164] "FCB05EB1758F85E8" "7FB986AD5D3DE9D6" "40CA13E15B5B470D" "D47A1660919E8861" ...
## $ rideable_type : chr [1:223164] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:223164], format: "2024-02-03 14:14:18" "2024-02-05 21:10:06" ...
## $ ended_at : POSIXct[1:223164], format: "2024-02-03 14:21:00" "2024-02-05 21:15:44" ...
## $ start_station_name: chr [1:223164] "Clark St & Newport St" "Michigan Ave & Washington St" "Leavitt St & Armitage Ave" "Southport Ave & Waveland Ave" ...
## $ start_station_id : chr [1:223164] "632" "13001" "TA1309000029" "13235" ...
## $ end_station_name : chr [1:223164] "Southport Ave & Waveland Ave" "Wabash Ave & Grand Ave" "Milwaukee Ave & Wabansia Ave" "Southport Ave & Belmont Ave" ...
## $ end_station_id : chr [1:223164] "13235" "TA1307000117" "13243" "13229" ...
## $ start_lat : num [1:223164] 41.9 41.9 41.9 41.9 41.8 ...
## $ start_lng : num [1:223164] -87.7 -87.6 -87.7 -87.7 -87.6 ...
## $ end_lat : num [1:223164] 41.9 41.9 41.9 41.9 41.8 ...
## $ end_lng : num [1:223164] -87.7 -87.6 -87.7 -87.7 -87.6 ...
## $ member_casual : chr [1:223164] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(March_2024)
## spc_tbl_ [301,687 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:301687] "64FBE3BAED5F29E6" "9991629435C5E20E" "E5C9FECD5B71BEBD" "4CEA3EC8906DAEA8" ...
## $ rideable_type : chr [1:301687] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:301687], format: "2024-03-05 18:33:11" "2024-03-06 17:15:14" ...
## $ ended_at : POSIXct[1:301687], format: "2024-03-05 18:51:48" "2024-03-06 17:16:04" ...
## $ start_station_name: chr [1:301687] NA NA NA NA ...
## $ start_station_id : chr [1:301687] NA NA NA NA ...
## $ end_station_name : chr [1:301687] NA NA NA NA ...
## $ end_station_id : chr [1:301687] NA NA NA NA ...
## $ start_lat : num [1:301687] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:301687] -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:301687] 42 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:301687] -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ member_casual : chr [1:301687] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(April_2024)
## spc_tbl_ [415,025 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:415025] "743252713F32516B" "BE90D33D2240C614" "D47BBDDE7C40DD61" "6684E760BF9EA9B5" ...
## $ rideable_type : chr [1:415025] "classic_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:415025], format: "2024-04-22 19:08:21" "2024-04-11 06:19:24" ...
## $ ended_at : POSIXct[1:415025], format: "2024-04-22 19:12:56" "2024-04-11 06:22:21" ...
## $ start_station_name: chr [1:415025] "Aberdeen St & Jackson Blvd" "Aberdeen St & Jackson Blvd" "Sheridan Rd & Montrose Ave" "Aberdeen St & Jackson Blvd" ...
## $ start_station_id : chr [1:415025] "13157" "13157" "TA1307000107" "13157" ...
## $ end_station_name : chr [1:415025] "Desplaines St & Jackson Blvd" "Desplaines St & Jackson Blvd" "Ashland Ave & Belle Plaine Ave" "Desplaines St & Jackson Blvd" ...
## $ end_station_id : chr [1:415025] "15539" "15539" "13249" "15539" ...
## $ start_lat : num [1:415025] 41.9 41.9 42 41.9 42 ...
## $ start_lng : num [1:415025] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:415025] 41.9 41.9 42 41.9 41.9 ...
## $ end_lng : num [1:415025] -87.6 -87.6 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:415025] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(May_2024)
## spc_tbl_ [609,493 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:609493] "7D9F0CE9EC2A1297" "02EC47687411416F" "101370FB2D3402BE" "E97E396331ED6913" ...
## $ rideable_type : chr [1:609493] "classic_bike" "classic_bike" "classic_bike" "electric_bike" ...
## $ started_at : POSIXct[1:609493], format: "2024-05-25 15:52:42" "2024-05-14 15:11:51" ...
## $ ended_at : POSIXct[1:609493], format: "2024-05-25 16:11:50" "2024-05-14 15:22:00" ...
## $ start_station_name: chr [1:609493] "Streeter Dr & Grand Ave" "Sheridan Rd & Greenleaf Ave" "Streeter Dr & Grand Ave" "Streeter Dr & Grand Ave" ...
## $ start_station_id : chr [1:609493] "13022" "KA1504000159" "13022" "13022" ...
## $ end_station_name : chr [1:609493] "Clark St & Elm St" "Sheridan Rd & Loyola Ave" "Wabash Ave & 9th St" "Sheffield Ave & Wellington Ave" ...
## $ end_station_id : chr [1:609493] "TA1307000039" "RP-009" "TA1309000010" "TA1307000052" ...
## $ start_lat : num [1:609493] 41.9 42 41.9 41.9 41.9 ...
## $ start_lng : num [1:609493] -87.6 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:609493] 41.9 42 41.9 41.9 41.9 ...
## $ end_lng : num [1:609493] -87.6 -87.7 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:609493] "casual" "casual" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(June_2024)
## spc_tbl_ [710,721 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:710721] "CDE6023BE6B11D2F" "462B48CD292B6A18" "9CFB6A858D23ABF7" "6365EFEB64231153" ...
## $ rideable_type : chr [1:710721] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:710721], format: "2024-06-11 17:20:06" "2024-06-11 17:19:21" ...
## $ ended_at : POSIXct[1:710721], format: "2024-06-11 17:21:39" "2024-06-11 17:19:36" ...
## $ start_station_name: chr [1:710721] NA NA NA NA ...
## $ start_station_id : chr [1:710721] NA NA NA NA ...
## $ end_station_name : chr [1:710721] NA NA NA NA ...
## $ end_station_id : chr [1:710721] NA NA NA NA ...
## $ start_lat : num [1:710721] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:710721] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ end_lat : num [1:710721] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:710721] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:710721] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(July_2024)
## spc_tbl_ [748,962 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:748962] "2658E319B13141F9" "B2176315168A47CE" "C2A9D33DF7EBB422" "8BFEA406DF01D8AD" ...
## $ rideable_type : chr [1:748962] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:748962], format: "2024-07-11 08:15:14" "2024-07-11 15:45:07" ...
## $ ended_at : POSIXct[1:748962], format: "2024-07-11 08:17:56" "2024-07-11 16:06:04" ...
## $ start_station_name: chr [1:748962] NA NA NA NA ...
## $ start_station_id : chr [1:748962] NA NA NA NA ...
## $ end_station_name : chr [1:748962] NA NA NA NA ...
## $ end_station_id : chr [1:748962] NA NA NA NA ...
## $ start_lat : num [1:748962] 41.8 41.8 41.8 41.9 42 ...
## $ start_lng : num [1:748962] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ end_lat : num [1:748962] 41.8 41.8 41.8 41.9 41.9 ...
## $ end_lng : num [1:748962] -87.6 -87.6 -87.6 -87.7 -87.6 ...
## $ member_casual : chr [1:748962] "casual" "casual" "casual" "casual" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(August_2024)
## spc_tbl_ [755,639 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:755639] "BAA154388A869E64" "8752245932EFF67A" "44DDF9F57A9A161F" "44AAAF069B0C78C3" ...
## $ rideable_type : chr [1:755639] "classic_bike" "electric_bike" "classic_bike" "electric_bike" ...
## $ started_at : POSIXct[1:755639], format: "2024-08-02 13:35:14" "2024-08-02 15:33:13" ...
## $ ended_at : POSIXct[1:755639], format: "2024-08-02 13:48:24" "2024-08-02 15:55:23" ...
## $ start_station_name: chr [1:755639] "State St & Randolph St" "Franklin St & Monroe St" "Franklin St & Monroe St" "Clark St & Elm St" ...
## $ start_station_id : chr [1:755639] "TA1305000029" "TA1309000007" "TA1309000007" "TA1307000039" ...
## $ end_station_name : chr [1:755639] "Wabash Ave & 9th St" "Damen Ave & Cortland St" "Clark St & Elm St" "McClurg Ct & Ohio St" ...
## $ end_station_id : chr [1:755639] "TA1309000010" "13133" "TA1307000039" "TA1306000029" ...
## $ start_lat : num [1:755639] 41.9 41.9 41.9 41.9 42 ...
## $ start_lng : num [1:755639] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:755639] 41.9 41.9 41.9 41.9 42 ...
## $ end_lng : num [1:755639] -87.6 -87.7 -87.6 -87.6 -87.7 ...
## $ member_casual : chr [1:755639] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(September_2024)
## spc_tbl_ [821,276 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:821276] "31D38723D5A8665A" "67CB39987F4E895B" "DA61204FD26EC681" "06F160D46AF235DD" ...
## $ rideable_type : chr [1:821276] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:821276], format: "2024-09-26 15:30:58" "2024-09-26 15:31:32" ...
## $ ended_at : POSIXct[1:821276], format: "2024-09-26 15:30:59" "2024-09-26 15:53:13" ...
## $ start_station_name: chr [1:821276] NA NA NA NA ...
## $ start_station_id : chr [1:821276] NA NA NA NA ...
## $ end_station_name : chr [1:821276] NA NA NA NA ...
## $ end_station_id : chr [1:821276] NA NA NA NA ...
## $ start_lat : num [1:821276] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:821276] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:821276] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:821276] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr [1:821276] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(October_2024)
## spc_tbl_ [616,281 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:616281] "4422E707103AA4FF" "19DB722B44CBE82F" "20AE2509FD68C939" "D0F17580AB9515A9" ...
## $ rideable_type : chr [1:616281] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : POSIXct[1:616281], format: "2024-10-14 03:26:04" "2024-10-13 19:33:38" ...
## $ ended_at : POSIXct[1:616281], format: "2024-10-14 03:32:56" "2024-10-13 19:39:04" ...
## $ start_station_name: chr [1:616281] NA NA NA NA ...
## $ start_station_id : chr [1:616281] NA NA NA NA ...
## $ end_station_name : chr [1:616281] NA NA NA NA ...
## $ end_station_id : chr [1:616281] NA NA NA NA ...
## $ start_lat : num [1:616281] 42 42 42 42 42 ...
## $ start_lng : num [1:616281] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:616281] 42 42 42 42 42 ...
## $ end_lng : num [1:616281] -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ member_casual : chr [1:616281] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(November_2024)
## spc_tbl_ [335,075 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:335075] "578DDD7CE1771FFA" "78B141C50102ABA6" "1E794CF36394E2D7" "E5DD2CAB58D73F98" ...
## $ rideable_type : chr [1:335075] "classic_bike" "classic_bike" "classic_bike" "classic_bike" ...
## $ started_at : POSIXct[1:335075], format: "2024-11-07 19:21:58" "2024-11-22 14:49:00" ...
## $ ended_at : POSIXct[1:335075], format: "2024-11-07 19:28:57" "2024-11-22 14:56:15" ...
## $ start_station_name: chr [1:335075] "Walsh Park" "Walsh Park" "Walsh Park" "Clark St & Elm St" ...
## $ start_station_id : chr [1:335075] "18067" "18067" "18067" "TA1307000039" ...
## $ end_station_name : chr [1:335075] "Leavitt St & North Ave" "Leavitt St & Armitage Ave" "Damen Ave & Cortland St" "Clark St & Drummond Pl" ...
## $ end_station_id : chr [1:335075] "TA1308000005" "TA1309000029" "13133" "TA1307000142" ...
## $ start_lat : num [1:335075] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:335075] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ end_lat : num [1:335075] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:335075] -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr [1:335075] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(December_2024)
## spc_tbl_ [178,372 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:178372] "6C960DEB4F78854E" "C0913EEB2834E7A2" "848A37DD4723078A" "3FA09C762ECB48BD" ...
## $ rideable_type : chr [1:178372] "electric_bike" "classic_bike" "classic_bike" "electric_bike" ...
## $ started_at : POSIXct[1:178372], format: "2024-12-31 01:38:35" "2024-12-21 18:41:26" ...
## $ ended_at : POSIXct[1:178372], format: "2024-12-31 01:48:45" "2024-12-21 18:47:33" ...
## $ start_station_name: chr [1:178372] "Halsted St & Roscoe St" "Clark St & Wellington Ave" "Sheridan Rd & Montrose Ave" "Aberdeen St & Jackson Blvd" ...
## $ start_station_id : chr [1:178372] "TA1309000025" "TA1307000136" "TA1307000107" "13157" ...
## $ end_station_name : chr [1:178372] "Clark St & Winnemac Ave" "Halsted St & Roscoe St" "Broadway & Barry Ave" "Green St & Randolph St*" ...
## $ end_station_id : chr [1:178372] "TA1309000035" "TA1309000025" "13137" "chargingstx3" ...
## $ start_lat : num [1:178372] 41.9 41.9 42 41.9 41.9 ...
## $ start_lng : num [1:178372] -87.6 -87.6 -87.7 -87.7 -87.7 ...
## $ end_lat : num [1:178372] 42 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:178372] -87.7 -87.6 -87.6 -87.6 -87.7 ...
## $ member_casual : chr [1:178372] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Confirming that the datatypes are consistent across all datasets.
compare_df_cols(January_2024,February_2024,March_2024,April_2024,May_2024,June_2024,July_2024,August_2024,September_2024,October_2024,November_2024,December_2024,return = "mismatch")
## [1] column_name January_2024 February_2024 March_2024 April_2024
## [6] May_2024 June_2024 July_2024 August_2024 September_2024
## [11] October_2024 November_2024 December_2024
## <0 rows> (or 0-length row.names)
The function above confirms that there are no mismatched datatypes across all 12 of the data sets. Next, combining these datasets into a single data frame by using the bind_rows() function, stacking them ontop of one another:
combined_trips <- bind_rows(January_2024,February_2024,March_2024,April_2024,May_2024,June_2024,July_2024,August_2024,September_2024,October_2024,November_2024,December_2024)
Running some functions to obtain quick overview of the data frame:
str(combined_trips)
## spc_tbl_ [5,860,568 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5860568] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
## $ rideable_type : chr [1:5860568] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:5860568], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
## $ ended_at : POSIXct[1:5860568], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
## $ start_station_name: chr [1:5860568] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
## $ start_station_id : chr [1:5860568] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
## $ end_station_name : chr [1:5860568] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
## $ end_station_id : chr [1:5860568] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
## $ start_lat : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr [1:5860568] "member" "member" "member" "member" ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
head(combined_trips)
## # A tibble: 6 × 13
## ride_id rideable_type started_at ended_at
## <chr> <chr> <dttm> <dttm>
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike 2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike 2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike 2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 9 more variables: start_station_name <chr>, start_station_id <chr>,
## # end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## # start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>
skim_without_charts(combined_trips)
Name | combined_trips |
Number of rows | 5860568 |
Number of columns | 13 |
_______________________ | |
Column type frequency: | |
character | 7 |
numeric | 4 |
POSIXct | 2 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
ride_id | 0 | 1.00 | 16 | 16 | 0 | 5860357 | 0 |
rideable_type | 0 | 1.00 | 12 | 16 | 0 | 3 | 0 |
start_station_name | 1073951 | 0.82 | 10 | 64 | 0 | 1808 | 0 |
start_station_id | 1073951 | 0.82 | 3 | 35 | 0 | 1763 | 0 |
end_station_name | 1104653 | 0.81 | 10 | 64 | 0 | 1815 | 0 |
end_station_id | 1104653 | 0.81 | 3 | 35 | 0 | 1768 | 0 |
member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
start_lat | 0 | 1 | 41.90 | 0.04 | 41.64 | 41.88 | 41.90 | 41.93 | 42.07 |
start_lng | 0 | 1 | -87.65 | 0.03 | -87.91 | -87.66 | -87.64 | -87.63 | -87.52 |
end_lat | 7232 | 1 | 41.90 | 0.06 | 16.06 | 41.88 | 41.90 | 41.93 | 87.96 |
end_lng | 7232 | 1 | -87.65 | 0.11 | -144.05 | -87.66 | -87.64 | -87.63 | 152.53 |
Variable type: POSIXct
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
started_at | 0 | 1 | 2024-01-01 00:00:39 | 2024-12-31 23:56:49 | 2024-07-22 20:36:16 | 5649600 |
ended_at | 0 | 1 | 2024-01-01 00:04:20 | 2024-12-31 23:59:55 | 2024-07-22 20:53:59 | 5652165 |
dim(combined_trips)
## [1] 5860568 13
Creating columns that extract the day, month, year and day of the week of each trip using its recorded “started_at” time:
combined_trips$date <- as.Date(combined_trips$started_at)
combined_trips$month <- format(as.Date(combined_trips$date), "%B")
combined_trips$day <- format(as.Date(combined_trips$date), "%d")
combined_trips$year <- format(as.Date(combined_trips$date), "%Y")
combined_trips$day_of_the_week <- weekdays(combined_trips$date)
head(combined_trips)
## # A tibble: 6 × 18
## ride_id rideable_type started_at ended_at
## <chr> <chr> <dttm> <dttm>
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike 2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike 2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike 2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 14 more variables: start_station_name <chr>, start_station_id <chr>,
## # end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## # start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>,
## # date <date>, month <chr>, day <chr>, year <chr>, day_of_the_week <chr>
Adding another variable: the ride length; that is, the difference between the “started_at” and “ended_at” time of each trip.
combined_trips$ride_length <- difftime(combined_trips$ended_at, combined_trips$started_at, units = "min")
Reducing the decimal points to two places and changing the datatype to numeric so that arithmetic analysis can be done on the ride_length column:
combined_trips$ride_length <- round(combined_trips$ride_length, 2)
combined_trips$ride_length <- as.numeric(as.character(combined_trips$ride_length))
Checking the structure of the data frame for datatype consistency:
str(combined_trips)
## spc_tbl_ [5,860,568 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5860568] "C1D650626C8C899A" "EECD38BDB25BFCB0" "F4A9CE78061F17F7" "0A0D9E15EE50B171" ...
## $ rideable_type : chr [1:5860568] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:5860568], format: "2024-01-12 15:30:27" "2024-01-08 15:45:46" ...
## $ ended_at : POSIXct[1:5860568], format: "2024-01-12 15:37:59" "2024-01-08 15:52:59" ...
## $ start_station_name: chr [1:5860568] "Wells St & Elm St" "Wells St & Elm St" "Wells St & Elm St" "Wells St & Randolph St" ...
## $ start_station_id : chr [1:5860568] "KA1504000135" "KA1504000135" "KA1504000135" "TA1305000030" ...
## $ end_station_name : chr [1:5860568] "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Kingsbury St & Kinzie St" "Larrabee St & Webster Ave" ...
## $ end_station_id : chr [1:5860568] "KA1503000043" "KA1503000043" "KA1503000043" "13193" ...
## $ start_lat : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.7 ...
## $ end_lat : num [1:5860568] 41.9 41.9 41.9 41.9 41.9 ...
## $ end_lng : num [1:5860568] -87.6 -87.6 -87.6 -87.6 -87.6 ...
## $ member_casual : chr [1:5860568] "member" "member" "member" "member" ...
## $ date : Date[1:5860568], format: "2024-01-12" "2024-01-08" ...
## $ month : chr [1:5860568] "January" "January" "January" "January" ...
## $ day : chr [1:5860568] "12" "08" "27" "29" ...
## $ year : chr [1:5860568] "2024" "2024" "2024" "2024" ...
## $ day_of_the_week : chr [1:5860568] "Friday" "Monday" "Saturday" "Monday" ...
## $ ride_length : num [1:5860568] 7.53 7.22 8 29.82 26.2 ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_datetime(format = ""),
## .. ended_at = col_datetime(format = ""),
## .. start_station_name = col_character(),
## .. start_station_id = col_character(),
## .. end_station_name = col_character(),
## .. end_station_id = col_character(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. member_casual = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
##Cleaning
Normally, I would need to be advised on what specific data cleaning procedures I am permitted to carry out on the data. To retain the integrity of the original data frame, the original data set needs to be backed up and stored before performing all data cleaning procedures.
Deleting null values:
combined_trips_v2 <- drop_na(combined_trips)
Deleting the entries with negative ride_length since the end time should normally not be earlier than the start time:
combined_trips_v2 <- filter(combined_trips_v2, ride_length > 0)
Extracting the latitude and longitude data from the data set since it will not be needed in this phase of the analysis:
lat_lng1 <- select(combined_trips_v2,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual)
combined_trips_v2 <- combined_trips_v2 %>%
select (-c(start_lat, start_lng, end_lat, end_lng))
head (combined_trips_v2)
## # A tibble: 6 × 15
## ride_id rideable_type started_at ended_at
## <chr> <chr> <dttm> <dttm>
## 1 C1D650626C8C899A electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59
## 2 EECD38BDB25BFCB0 electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59
## 3 F4A9CE78061F17F7 electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19
## 4 0A0D9E15EE50B171 classic_bike 2024-01-29 16:26:17 2024-01-29 16:56:06
## 5 33FFC9805E3EFF9A classic_bike 2024-01-31 05:43:23 2024-01-31 06:09:35
## 6 C96080812CD285C5 classic_bike 2024-01-07 11:21:24 2024-01-07 11:30:03
## # ℹ 11 more variables: start_station_name <chr>, start_station_id <chr>,
## # end_station_name <chr>, end_station_id <chr>, member_casual <chr>,
## # date <date>, month <chr>, day <chr>, year <chr>, day_of_the_week <chr>,
## # ride_length <dbl>
After concluding that the data is in the correct format and has been properly cleaned, moving on to the next phase: Analysis.
##Analysis
In this phase, patterns, trends, and relationships are detected in order to seek a resolution to the business task. Starting with descriptive analysis:
combined_trips_v2 %>%
group_by (member_casual) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))
## # A tibble: 2 × 3
## member_casual number_of_rides average_ride_length
## <chr> <int> <dbl>
## 1 casual 1521531 24.1
## 2 member 2686557 12.5
combined_trips_v2 %>%
count(rideable_type)
## # A tibble: 3 × 2
## rideable_type n
## <chr> <int>
## 1 classic_bike 2728069
## 2 electric_bike 1432192
## 3 electric_scooter 47827
combined_trips_v2 %>%
group_by(member_casual)%>%
count (rideable_type)
## # A tibble: 6 × 3
## # Groups: member_casual [2]
## member_casual rideable_type n
## <chr> <chr> <int>
## 1 casual classic_bike 968853
## 2 casual electric_bike 526938
## 3 casual electric_scooter 25740
## 4 member classic_bike 1759216
## 5 member electric_bike 905254
## 6 member electric_scooter 22087
summary (combined_trips_v2$ride_length) #This shows a summary of the ride length.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 5.83 10.14 16.66 18.22 1509.37
combined_trips_v2%>%
group_by(member_casual)%>%
summarise(number_of_rides=n(), min_ride_length=min(ride_length),max_ride_length=max(ride_length),avg_ride_length=mean(ride_length),median_ride_length=median(ride_length))
## # A tibble: 2 × 6
## member_casual number_of_rides min_ride_length max_ride_length avg_ride_length
## <chr> <int> <dbl> <dbl> <dbl>
## 1 casual 1521531 0.01 1509. 24.1
## 2 member 2686557 0.01 1498. 12.5
## # ℹ 1 more variable: median_ride_length <dbl>
Ordering the months from January 2024 to December 2024 and the days of the week from Sunday to Saturday:
combined_trips_v2$month <- ordered(combined_trips_v2$month, levels=c( "January", "February", "March", "April", "May", "June", "July","August", "September", "October", "November", "December"))
combined_trips_v2$day_of_the_week <- ordered(combined_trips_v2$day_of_the_week, levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))
combined_trips_v2 %>%
group_by(member_casual, month) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
arrange (month)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 24 × 4
## # Groups: member_casual [2]
## member_casual month number_of_rides average_ride_length
## <chr> <ord> <int> <dbl>
## 1 casual January 17712 15.5
## 2 member January 96076 11.6
## 3 casual February 38168 19.8
## 4 member February 146553 11.7
## 5 casual March 62801 22.0
## 6 member March 167436 11.4
## 7 casual April 93927 24.8
## 8 member April 203814 12.3
## 9 casual May 167524 26.9
## 10 member May 274711 13.3
## # ℹ 14 more rows
combined_trips_v2 %>%
group_by(member_casual, day_of_the_week) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
arrange (day_of_the_week)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups: member_casual [2]
## member_casual day_of_the_week number_of_rides average_ride_length
## <chr> <ord> <int> <dbl>
## 1 casual Sunday 266167 27.6
## 2 member Sunday 295517 14.0
## 3 casual Monday 181652 23.2
## 4 member Monday 399237 11.9
## 5 casual Tuesday 162825 20.6
## 6 member Tuesday 421404 11.9
## 7 casual Wednesday 187354 21.4
## 8 member Wednesday 446875 12.2
## 9 casual Thursday 183071 20.9
## 10 member Thursday 414244 11.9
## 11 casual Friday 222632 23.2
## 12 member Friday 376721 12.1
## 13 casual Saturday 317830 27.3
## 14 member Saturday 332559 14.0
Most popular start stations for casual riders:
combined_trips_v2 %>%
group_by(start_station_name, member_casual) %>%
summarise(number_of_trips=n()) %>%
arrange(desc (number_of_trips)) %>%
filter(member_casual== "casual") %>%
select(start_station_name, number_of_trips)
## `summarise()` has grouped output by 'start_station_name'. You can override
## using the `.groups` argument.
## # A tibble: 1,570 × 2
## # Groups: start_station_name [1,570]
## start_station_name number_of_trips
## <chr> <int>
## 1 Streeter Dr & Grand Ave 48315
## 2 DuSable Lake Shore Dr & Monroe St 32194
## 3 Michigan Ave & Oak St 23295
## 4 DuSable Lake Shore Dr & North Blvd 21359
## 5 Millennium Park 20840
## 6 Shedd Aquarium 20117
## 7 Dusable Harbor 17263
## 8 Theater on the Lake 15398
## 9 Michigan Ave & 8th St 12556
## 10 Adler Planetarium 12273
## # ℹ 1,560 more rows
Most popular end stations for casual riders:
combined_trips_v2 %>%
group_by(end_station_name, member_casual) %>%
summarise(number_of_trips=n()) %>%
arrange(desc (number_of_trips)) %>%
filter(member_casual== "casual") %>%
select(end_station_name, number_of_trips)
## `summarise()` has grouped output by 'end_station_name'. You can override using
## the `.groups` argument.
## # A tibble: 1,613 × 2
## # Groups: end_station_name [1,613]
## end_station_name number_of_trips
## <chr> <int>
## 1 Streeter Dr & Grand Ave 52435
## 2 DuSable Lake Shore Dr & Monroe St 30120
## 3 DuSable Lake Shore Dr & North Blvd 25129
## 4 Michigan Ave & Oak St 24187
## 5 Millennium Park 22912
## 6 Shedd Aquarium 18307
## 7 Theater on the Lake 16906
## 8 Dusable Harbor 15674
## 9 Michigan Ave & 8th St 11648
## 10 Indiana Ave & Roosevelt Rd 10703
## # ℹ 1,603 more rows
Creating another version of the data set that combines the start_station_name and the end_station_name to serve as an aggregate representation of each ride route:
combined_trips_v3 <- (unite(combined_trips_v2, "ride_routes", start_station_name, end_station_name, sep= " to "))
head(combined_trips_v3)
## # A tibble: 6 × 14
## ride_id rideable_type started_at ended_at ride_routes
## <chr> <chr> <dttm> <dttm> <chr>
## 1 C1D650626C8… electric_bike 2024-01-12 15:30:27 2024-01-12 15:37:59 Wells St &…
## 2 EECD38BDB25… electric_bike 2024-01-08 15:45:46 2024-01-08 15:52:59 Wells St &…
## 3 F4A9CE78061… electric_bike 2024-01-27 12:27:19 2024-01-27 12:35:19 Wells St &…
## 4 0A0D9E15EE5… classic_bike 2024-01-29 16:26:17 2024-01-29 16:56:06 Wells St &…
## 5 33FFC9805E3… classic_bike 2024-01-31 05:43:23 2024-01-31 06:09:35 Lincoln Av…
## 6 C96080812CD… classic_bike 2024-01-07 11:21:24 2024-01-07 11:30:03 Wells St &…
## # ℹ 9 more variables: start_station_id <chr>, end_station_id <chr>,
## # member_casual <chr>, date <date>, month <ord>, day <chr>, year <chr>,
## # day_of_the_week <ord>, ride_length <dbl>
Extracting data about the most used ride routes and segregating them by rider type:
top_routes <- combined_trips_v3 %>%
group_by(ride_routes) %>%
summarise(number_of_rides=n()) %>%
arrange (desc (number_of_rides))
head (top_routes,10)
## # A tibble: 10 × 2
## ride_routes number_of_rides
## <chr> <int>
## 1 Streeter Dr & Grand Ave to Streeter Dr & Grand Ave 9667
## 2 DuSable Lake Shore Dr & Monroe St to DuSable Lake Shore Dr &… 7917
## 3 Calumet Ave & 33rd St to State St & 33rd St 5921
## 4 State St & 33rd St to Calumet Ave & 33rd St 5913
## 5 DuSable Lake Shore Dr & Monroe St to Streeter Dr & Grand Ave 5692
## 6 Ellis Ave & 60th St to Ellis Ave & 55th St 5437
## 7 Michigan Ave & Oak St to Michigan Ave & Oak St 5261
## 8 Ellis Ave & 60th St to University Ave & 57th St 5187
## 9 Ellis Ave & 55th St to Ellis Ave & 60th St 5137
## 10 University Ave & 57th St to Ellis Ave & 60th St 4827
top_routes_1 <- combined_trips_v3 %>%
group_by(ride_routes, member_casual) %>%
summarise(number_of_rides=n()) %>%
arrange (desc(number_of_rides))
## `summarise()` has grouped output by 'ride_routes'. You can override using the
## `.groups` argument.
head (top_routes_1, 10)
## # A tibble: 10 × 3
## # Groups: ride_routes [10]
## ride_routes member_casual number_of_rides
## <chr> <chr> <int>
## 1 Streeter Dr & Grand Ave to Streeter Dr & Grand… casual 8871
## 2 DuSable Lake Shore Dr & Monroe St to DuSable L… casual 7242
## 3 State St & 33rd St to Calumet Ave & 33rd St member 5587
## 4 Calumet Ave & 33rd St to State St & 33rd St member 5506
## 5 DuSable Lake Shore Dr & Monroe St to Streeter … casual 5265
## 6 Michigan Ave & Oak St to Michigan Ave & Oak St casual 4619
## 7 Ellis Ave & 60th St to University Ave & 57th St member 3988
## 8 Ellis Ave & 60th St to Ellis Ave & 55th St member 3976
## 9 University Ave & 57th St to Ellis Ave & 60th St member 3942
## 10 Ellis Ave & 55th St to Ellis Ave & 60th St member 3837
##Visualizations
Facilitating the identification of patterns and trends in the data in order to gain useful insights:
Number of rides and average ride length by rider type:
combined_trips_v3 %>%
group_by(member_casual) %>%
summarise(Average_ride_length=mean(ride_length)) %>%
ggplot(aes(x= member_casual, y=Average_ride_length, fill=member_casual)) + geom_col() + labs(title = "Average ride length by rider type", x="Rider type", y="Average ride length")
combined_trips_v3 %>%
group_by(member_casual, month) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=month, y=number_of_rides, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Number of rides per month segmented by rider type", x= "Month", y="Number of rides")+ theme(axis.text.x = element_text(angle = 60, hjust=1))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
combined_trips_v3 %>%
group_by(member_casual, month) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=month, y=average_ride_length, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Average ride length per month segmented by rider type", x= "Month", y="Average ride length")+ theme(axis.text.x = element_text(angle = 60, hjust=1))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
combined_trips_v3 %>%
group_by(member_casual, day_of_the_week) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=day_of_the_week, y=number_of_rides, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Number of rides per day of the week", x= "Day of the week", y="Number of rides")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
combined_trips_v3 %>%
group_by(member_casual, day_of_the_week) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=day_of_the_week, y=average_ride_length, fill=member_casual)) + geom_col(position= "dodge2") + labs(title = "Average ride length per week day segmented by rider type", x= "Day of the week", y="Average ride length")+ theme(axis.text.x = element_text(angle = 60, hjust=1))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
Number of rides and Average ride length by rideable type:
combined_trips_v3 %>%
group_by(rideable_type, member_casual) %>%
summarise(number_of_rides=n()) %>%
ggplot(aes(x=rideable_type, y=number_of_rides, fill=member_casual)) + geom_col(position = "dodge") + labs(title="Number of rides per rideable type" , x="Rideable Type", y="Number of rides")
## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.
combined_trips_v3 %>%
group_by(rideable_type, member_casual) %>%
summarise(Average_ride_length = mean(ride_length)) %>%
ggplot(aes(x=rideable_type, y=Average_ride_length, fill=member_casual)) + geom_col(position = "dodge") + labs(title="Average ride length per rideable type" , x="Rideable Type", y="Average ride length")
## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.
combined_trips_v3 %>%
group_by(member_casual, month, rideable_type) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=month, y=number_of_rides, fill=rideable_type)) + geom_col(position="dodge2") + facet_wrap(~member_casual) + labs(title = "Number of rides segmented by rideable types for each month", x="Month", y="Number of rides") + theme(axis.text.x = element_text(angle = 60, hjust=1))
## `summarise()` has grouped output by 'member_casual', 'month'. You can override
## using the `.groups` argument.
combined_trips_v3 %>%
group_by(member_casual, day_of_the_week, rideable_type) %>%
summarise(number_of_rides=n(), average_ride_length=mean(ride_length))%>%
ggplot (aes(x=day_of_the_week, y=number_of_rides, fill=rideable_type)) + geom_col(position = "dodge") + facet_wrap(~member_casual) + labs(title = "Number of rides segmented by rideable types for each day of the week", x="Day of the week", y="Number of rides") + theme(axis.text.x = element_text(angle = 60, hjust=1))
## `summarise()` has grouped output by 'member_casual', 'day_of_the_week'. You can
## override using the `.groups` argument.
combined_trips_v2 %>%
group_by(start_station_name, member_casual) %>%
summarise(number_of_trips=n()) %>%
arrange(desc (number_of_trips)) %>%
filter(member_casual== "casual", number_of_trips >= 15460) %>%
select(start_station_name, number_of_trips) %>%
ggplot(aes(x=start_station_name, y=number_of_trips)) + geom_col(fill="blue") + coord_flip() + labs(title="Top 10 most popular start stations for casual riders", x="Start station name", y="Number of trips")
## `summarise()` has grouped output by 'start_station_name'. You can override
## using the `.groups` argument.
combined_trips_v2 %>%
group_by(end_station_name, member_casual) %>%
summarise(number_of_trips=n()) %>%
arrange(desc (number_of_trips)) %>%
filter(member_casual== "casual", number_of_trips >= 15596) %>%
select(end_station_name, number_of_trips) %>%
ggplot(aes(x=end_station_name, y=number_of_trips)) + geom_col(fill="red") + coord_flip() + labs(title="Top 10 most popular end stations for casual riders", x="End station name", y="Number of trips")
## `summarise()` has grouped output by 'end_station_name'. You can override using
## the `.groups` argument.
Saving these data frames as CSV files on my local desktop for further
analysis and visualizations on other platforms.
write.csv(top_routes,"C:\\Users\\Gerald Ntune\\Desktop\\Cyclist project\\top_routes.csv", row.names=FALSE)
write.csv(combined_trips_v3,"C:\\Users\\Gerald Ntune\\Desktop\\Cyclist project\\combined_trips_v3.csv", row.names=FALSE)
write.csv(top_routes_1,"C:\\Users\\Temi\\Desktop\\Cyclist project\\top_routes_1.csv", row.names=FALSE)
write.csv(lat_lng1,"C:\\Users\\Temi\\Desktop\\Cyclist project\\lat_lng1.csv", row.names=FALSE)