In my data analysis projects, I have worked extensively with date-time conversions to facilitate data cleaning, transformation, and visualization. One such project involved processing Divvy bike-sharing trip data for the year 2019. The dataset contained timestamps in different formats that required standardization for further analysis.
The project began with importing four quarterly datasets using
read_csv(). Since column names differed across datasets, I
standardized them using rename() from the
dplyr package.
library(here)
## here() starts at C:/Users/oscar
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(skimr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
q1_2019 <- read_csv("Divvy_Trips_2019_Q1.csv")
## Rows: 365069 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): from_station_name, to_station_name, usertype, gender
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## dttm (2): start_time, end_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q2_2019i <- read_csv("Divvy_Trips_2019_Q2.csv")
## Rows: 1108163 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): 03 - Rental Start Station Name, 02 - Rental End Station Name, User...
## dbl (5): 01 - Rental Details Rental ID, 01 - Rental Details Bike ID, 03 - R...
## num (1): 01 - Rental Details Duration In Seconds Uncapped
## dttm (2): 01 - Rental Details Local Start Time, 01 - Rental Details Local En...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q3_2019 <- read_csv("Divvy_Trips_2019_Q3.csv")
## Rows: 1640718 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): from_station_name, to_station_name, usertype, gender
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## dttm (2): start_time, end_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q4_2019 <- read_csv("Divvy_Trips_2019_Q4.csv")
## Rows: 704054 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): from_station_name, to_station_name, usertype, gender
## dbl (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## num (1): tripduration
## dttm (2): start_time, end_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(q1_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q2_2019i)
## [1] "01 - Rental Details Rental ID"
## [2] "01 - Rental Details Local Start Time"
## [3] "01 - Rental Details Local End Time"
## [4] "01 - Rental Details Bike ID"
## [5] "01 - Rental Details Duration In Seconds Uncapped"
## [6] "03 - Rental Start Station ID"
## [7] "03 - Rental Start Station Name"
## [8] "02 - Rental End Station ID"
## [9] "02 - Rental End Station Name"
## [10] "User Type"
## [11] "Member Gender"
## [12] "05 - Member Details Member Birthday Year"
colnames(q3_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q4_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
q2_2019 <- q2_2019i %>%
rename(trip_id = '01 - Rental Details Rental ID',
start_time = '01 - Rental Details Local Start Time',
end_time = '01 - Rental Details Local End Time',
bikeid = '01 - Rental Details Bike ID',
tripduration = '01 - Rental Details Duration In Seconds Uncapped',
from_station_id = '03 - Rental Start Station ID',
from_station_name = '03 - Rental Start Station Name',
to_station_id = '02 - Rental End Station ID',
to_station_name = '02 - Rental End Station Name',
usertype = 'User Type',
gender = 'Member Gender',
birthyear = '05 - Member Details Member Birthday Year')
str(q1_2019)
## spc_tbl_ [365,069 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : num [1:365069] 21742443 21742444 21742445 21742446 21742447 ...
## $ start_time : POSIXct[1:365069], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ end_time : POSIXct[1:365069], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ bikeid : num [1:365069] 2167 4386 1524 252 1170 ...
## $ tripduration : num [1:365069] 390 441 829 1783 364 ...
## $ from_station_id : num [1:365069] 199 44 15 123 173 98 98 211 150 268 ...
## $ from_station_name: chr [1:365069] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ to_station_id : num [1:365069] 84 624 644 176 35 49 49 142 148 141 ...
## $ to_station_name : chr [1:365069] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ usertype : chr [1:365069] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:365069] "Male" "Female" "Female" "Male" ...
## $ birthyear : num [1:365069] 1989 1990 1994 1993 1994 ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_datetime(format = ""),
## .. end_time = col_datetime(format = ""),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(q2_2019)
## spc_tbl_ [1,108,163 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : num [1:1108163] 22178529 22178530 22178531 22178532 22178533 ...
## $ start_time : POSIXct[1:1108163], format: "2019-04-01 00:02:22" "2019-04-01 00:03:02" ...
## $ end_time : POSIXct[1:1108163], format: "2019-04-01 00:09:48" "2019-04-01 00:20:30" ...
## $ bikeid : num [1:1108163] 6251 6226 5649 4151 3270 ...
## $ tripduration : num [1:1108163] 446 1048 252 357 1007 ...
## $ from_station_id : num [1:1108163] 81 317 283 26 202 420 503 260 211 211 ...
## $ from_station_name: chr [1:1108163] "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ to_station_id : num [1:1108163] 56 59 174 133 129 426 500 499 211 211 ...
## $ to_station_name : chr [1:1108163] "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ usertype : chr [1:1108163] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:1108163] "Male" "Female" "Male" "Male" ...
## $ birthyear : num [1:1108163] 1975 1984 1990 1993 1992 ...
## - attr(*, "spec")=
## .. cols(
## .. `01 - Rental Details Rental ID` = col_double(),
## .. `01 - Rental Details Local Start Time` = col_datetime(format = ""),
## .. `01 - Rental Details Local End Time` = col_datetime(format = ""),
## .. `01 - Rental Details Bike ID` = col_double(),
## .. `01 - Rental Details Duration In Seconds Uncapped` = col_number(),
## .. `03 - Rental Start Station ID` = col_double(),
## .. `03 - Rental Start Station Name` = col_character(),
## .. `02 - Rental End Station ID` = col_double(),
## .. `02 - Rental End Station Name` = col_character(),
## .. `User Type` = col_character(),
## .. `Member Gender` = col_character(),
## .. `05 - Member Details Member Birthday Year` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(q3_2019)
## spc_tbl_ [1,640,718 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : num [1:1640718] 23479388 23479389 23479390 23479391 23479392 ...
## $ start_time : POSIXct[1:1640718], format: "2019-07-01 00:00:27" "2019-07-01 00:01:16" ...
## $ end_time : POSIXct[1:1640718], format: "2019-07-01 00:20:41" "2019-07-01 00:18:44" ...
## $ bikeid : num [1:1640718] 3591 5353 6180 5540 6014 ...
## $ tripduration : num [1:1640718] 1214 1048 1554 1503 1213 ...
## $ from_station_id : num [1:1640718] 117 381 313 313 168 300 168 313 43 43 ...
## $ from_station_name: chr [1:1640718] "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
## $ to_station_id : num [1:1640718] 497 203 144 144 62 232 62 144 195 195 ...
## $ to_station_name : chr [1:1640718] "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
## $ usertype : chr [1:1640718] "Subscriber" "Customer" "Customer" "Customer" ...
## $ gender : chr [1:1640718] "Male" NA NA NA ...
## $ birthyear : num [1:1640718] 1992 NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_datetime(format = ""),
## .. end_time = col_datetime(format = ""),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(q4_2019)
## spc_tbl_ [704,054 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ trip_id : num [1:704054] 25223640 25223641 25223642 25223643 25223644 ...
## $ start_time : POSIXct[1:704054], format: "2019-10-01 00:01:39" "2019-10-01 00:02:16" ...
## $ end_time : POSIXct[1:704054], format: "2019-10-01 00:17:20" "2019-10-01 00:06:34" ...
## $ bikeid : num [1:704054] 2215 6328 3003 3275 5294 ...
## $ tripduration : num [1:704054] 940 258 850 2350 1867 ...
## $ from_station_id : num [1:704054] 20 19 84 313 210 156 84 156 156 336 ...
## $ from_station_name: chr [1:704054] "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
## $ to_station_id : num [1:704054] 309 241 199 290 382 226 142 463 463 336 ...
## $ to_station_name : chr [1:704054] "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
## $ usertype : chr [1:704054] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr [1:704054] "Male" "Male" "Female" "Male" ...
## $ birthyear : num [1:704054] 1987 1998 1991 1990 1987 ...
## - attr(*, "spec")=
## .. cols(
## .. trip_id = col_double(),
## .. start_time = col_datetime(format = ""),
## .. end_time = col_datetime(format = ""),
## .. bikeid = col_double(),
## .. tripduration = col_number(),
## .. from_station_id = col_double(),
## .. from_station_name = col_character(),
## .. to_station_id = col_double(),
## .. to_station_name = col_character(),
## .. usertype = col_character(),
## .. gender = col_character(),
## .. birthyear = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
q1_2019 <- mutate(q1_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q2_2019 <- mutate(q2_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q3_2019 <- mutate(q3_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
q4_2019 <- mutate(q4_2019, trip_id = as.character(trip_id), bikeid = as.character(bikeid))
TPdt_19 <- bind_rows(q1_2019, q2_2019, q3_2019, q4_2019)
The dataset contained start_time and
end_time columns, which were initially in string format. To
facilitate time-based calculations, I converted them into Date
format:
TPdt_19$date <- as.Date(TPdt_19$start_time)
Additionally, I extracted specific time-related attributes to improve analysis:
TPdt_19$month <- format(as.Date(TPdt_19$date), '%m')
TPdt_19$day <- format(as.Date(TPdt_19$date), '%d')
TPdt_19$year <- format(as.Date(TPdt_19$date), '%Y')
TPdt_19$day_of_week <- format(as.Date(TPdt_19$date), '%A')
This transformation allowed for flexible grouping and trend analysis based on weekdays, months, and years.
To measure trip durations, I computed ride_length using
the difftime() function:
TPdt_19$ride_length <- difftime(TPdt_19$end_time, TPdt_19$start_time)
str(TPdt_19$ride_length)
## 'difftime' num [1:3818004] 6.5 7.35 13.8166666666667 29.7166666666667 ...
## - attr(*, "units")= chr "mins"
Since ride_length was initially stored as a factor, I
converted it into a numeric format for proper calculations:
TPdt_19d$ride_length <- as.numeric(as.character(TPdt_19d$ride_length))
To ensure data integrity, I filtered out erroneous records, such as rides with negative durations and specific invalid station names:
skim_without_charts(TPdt_19)
| Name | TPdt_19 |
| Number of rows | 3818004 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| Date | 1 |
| difftime | 1 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| trip_id | 0 | 1.00 | 7 | 8 | 0 | 3818004 | 0 |
| bikeid | 0 | 1.00 | 1 | 4 | 0 | 6017 | 0 |
| from_station_name | 0 | 1.00 | 10 | 43 | 0 | 640 | 0 |
| to_station_name | 0 | 1.00 | 10 | 43 | 0 | 641 | 0 |
| usertype | 0 | 1.00 | 8 | 10 | 0 | 2 | 0 |
| gender | 559206 | 0.85 | 4 | 6 | 0 | 2 | 0 |
| month | 0 | 1.00 | 2 | 2 | 0 | 12 | 0 |
| day | 0 | 1.00 | 2 | 2 | 0 | 31 | 0 |
| year | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| day_of_week | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2019-01-01 | 2019-12-31 | 2019-07-25 | 365 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | -56.37 mins | 177200.4 mins | 11.82 mins | 21577 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| tripduration | 0 | 1.00 | 1450.20 | 29854.14 | 61 | 411 | 709 | 1283 | 10628400 |
| from_station_id | 0 | 1.00 | 201.67 | 156.08 | 1 | 77 | 174 | 289 | 673 |
| to_station_id | 0 | 1.00 | 202.64 | 156.24 | 1 | 77 | 174 | 291 | 673 |
| birthyear | 538751 | 0.86 | 1984.07 | 10.87 | 1759 | 1979 | 1987 | 1992 | 2014 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| start_time | 0 | 1 | 2019-01-01 00:04:37 | 2019-12-31 23:57:17 | 2019-07-25 17:50:54 | 3306090 |
| end_time | 0 | 1 | 2019-01-01 00:11:07 | 2020-01-21 13:54:35 | 2019-07-25 18:12:23 | 3238342 |
TPdt_19 <- TPdt_19[!(TPdt_19$from_station_name == "HQ QR" | TPdt_19$ride_length < 0),]
skim_without_charts(TPdt_19)
| Name | TPdt_19 |
| Number of rows | 3817991 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| Date | 1 |
| difftime | 1 |
| numeric | 4 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| trip_id | 0 | 1.00 | 7 | 8 | 0 | 3817991 | 0 |
| bikeid | 0 | 1.00 | 1 | 4 | 0 | 6017 | 0 |
| from_station_name | 0 | 1.00 | 10 | 43 | 0 | 640 | 0 |
| to_station_name | 0 | 1.00 | 10 | 43 | 0 | 641 | 0 |
| usertype | 0 | 1.00 | 8 | 10 | 0 | 2 | 0 |
| gender | 559201 | 0.85 | 4 | 6 | 0 | 2 | 0 |
| month | 0 | 1.00 | 2 | 2 | 0 | 12 | 0 |
| day | 0 | 1.00 | 2 | 2 | 0 | 31 | 0 |
| year | 0 | 1.00 | 4 | 4 | 0 | 1 | 0 |
| day_of_week | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2019-01-01 | 2019-12-31 | 2019-07-25 | 365 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 1.02 mins | 177200.4 mins | 11.82 mins | 21564 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| tripduration | 0 | 1.00 | 1450.21 | 29854.19 | 61 | 411 | 709 | 1283 | 10628400 |
| from_station_id | 0 | 1.00 | 201.67 | 156.08 | 1 | 77 | 174 | 289 | 673 |
| to_station_id | 0 | 1.00 | 202.64 | 156.24 | 1 | 77 | 174 | 291 | 673 |
| birthyear | 538746 | 0.86 | 1984.07 | 10.87 | 1759 | 1979 | 1987 | 1992 | 2014 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| start_time | 0 | 1 | 2019-01-01 00:04:37 | 2019-12-31 23:57:17 | 2019-07-25 17:50:41 | 3306077 |
| end_time | 0 | 1 | 2019-01-01 00:11:07 | 2020-01-21 13:54:35 | 2019-07-25 18:12:17 | 3238330 |
summary(TPdt_19$ride_length)
## Length Class Mode
## 3817991 difftime numeric
Once the data was clean, I performed descriptive statistics to examine trends based on user types(members and casual) and weekdays:
aggregate(TPdt_19$ride_length ~ TPdt_19$usertype, FUN = mean)
## TPdt_19$usertype TPdt_19$ride_length
## 1 Customer 57.01802 mins
## 2 Subscriber 14.32780 mins
aggregate(TPdt_19$ride_length ~ TPdt_19$usertype, FUN = median)
## TPdt_19$usertype TPdt_19$ride_length
## 1 Customer 25.83333 mins
## 2 Subscriber 9.80000 mins
aggregate(TPdt_19$ride_length ~ TPdt_19$usertype, FUN = max)
## TPdt_19$usertype TPdt_19$ride_length
## 1 Customer 177200.4 mins
## 2 Subscriber 150943.9 mins
aggregate(TPdt_19$ride_length ~ TPdt_19$usertype, FUN = min)
## TPdt_19$usertype TPdt_19$ride_length
## 1 Customer 1.016667 mins
## 2 Subscriber 1.016667 mins
To ensure chronological order, I reordered the weekdays:
TPdt_19dd$day_of_week <- ordered(TPdt_19dd$day_of_week, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
To gain insights into ridership trends, I created visualizations
comparing ride counts and average durations across different days of the
week using ggplot2:
library(ggplot2)
TPdt_19 %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(usertype, weekday) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length), .groups = "drop") %>%
arrange(usertype, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = usertype)) +
geom_col(position = "dodge")
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.
To gain insights into ridership trends, I created visualizations
comparing ride counts and average durations across different days of the
week using ggplot2:
TPdt_19d %>%
mutate(weekday = wday(start_time, label = TRUE)) %>%
group_by(usertype, weekday) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length)) %>%
arrange(usertype, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = usertype)) +
geom_col(position = "dodge")