The main objective of this cleaning process is to transform the raw, segmented temporal data of the NYC 2013 flights dataset into a continuous and usable timeline. By converting military-style integers into formal time objects and reconciling scheduled times with their respective delays, the script aims to create accurate, high-fidelity datetime columns. A critical component of this objective is the implementation of logical corrections for overnight flights, ensuring that arrivals occurring after midnight are correctly attributed to the following calendar day, thereby maintaining temporal integrity for subsequent analysis.
It is important to note that the cleaning process documented here is not intended to be exhaustive. The primary focus of this procedure was to handle and transform raw temporal data into meaningful datetime objects. Specifically, efforts were centered on rectifying ‘overnight flight’ inconsistencies — where arrival times numerically precede departure times — and consolidating separate year, month, and day columns into standardized POSIXct formats. While this provides a robust foundation for time-series analysis, other potential data quality issues, such as missing coordinate values or inconsistent carrier metadata, were not addressed in this specific workflow.
# data manipulation
library(tidyverse)
# working dataset
library(nycflights13)
# datetime management libraries
library(lubridate)
library(hms)
# plotting library
library(ggplot2)
flights |> sample_n(5)
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 8 3 932 945 -13 1050 1114
## 2 2013 6 14 1609 1615 -6 1835 1830
## 3 2013 2 18 1501 1505 -4 2002 2000
## 4 2013 1 17 712 705 7 1023 1021
## 5 2013 11 30 833 840 -7 1053 1115
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Columns dep_time, sched_dep_time,
arr_time, sched_arr_time represent a daytime,
using the military format. 520 means 05:20. I
wil convert these values into hms objects.
create_hm <- function(time) {
# %% 86400 converts 24:00:00 to 00:00:00
seconds <- ( (time %/% 100) * 3600 + (time %% 100) * 60) %% 86400
as_hms(seconds)
}
I will proceed with the conversion to hms objects. After that, I will carry out a relocation of some columns.
(flights <- flights |>
mutate(across(ends_with("_time") & !"air_time", create_hm)) |>
select(-time_hour) |>
select(flight, origin, dest,
year, month, day, hour, minute,
sched_dep_time, dep_delay, dep_time,
sched_arr_time, arr_delay, arr_time,
everything()
)) |>
select(flight, ends_with("time"), -air_time) |>
head(5)
## # A tibble: 5 × 5
## flight sched_dep_time dep_time sched_arr_time arr_time
## <int> <time> <time> <time> <time>
## 1 1545 05:15 05:17 08:19 08:30
## 2 1714 05:29 05:33 08:30 08:50
## 3 1141 05:40 05:42 08:50 09:23
## 4 725 05:45 05:44 10:22 10:04
## 5 461 06:00 05:54 08:37 08:12
I will create columns with both scheduled and actual datetime, for departure and arrival moments.
flights <- flights |>
mutate(
# scheduled departure
sched_dep_datetime = make_datetime(year, month, day,
hour(sched_dep_time),
minute(sched_dep_time)
),
# actual departure: scheduled departure + departure delay (duration)
actual_dep_datetime = sched_dep_datetime + dminutes(dep_delay),
# scheduled arrival
sched_arr_datetime = make_datetime(year, month, day,
hour(sched_arr_time),
minute(sched_arr_time)
),
# actual arrival: scheduled arrival + arrival delay (duration)
actual_arr_datetime = sched_arr_datetime + dminutes(arr_delay),
# place the new datetime columns at the beginning
.before=year) |>
# clean the time columns
select(-c(dep_time, sched_dep_time, arr_time, sched_arr_time)) |>
# relocate the delay columns
relocate(dep_delay, .after = sched_dep_datetime) |>
relocate(arr_delay, .after = sched_arr_datetime) |>
# sort the dataset by scheduled departure datetime
arrange(sched_dep_datetime)
flights |>
select(flight, ends_with("datetime"), ends_with("delay")) |>
sample_n(10)
## # A tibble: 10 × 7
## flight sched_dep_datetime actual_dep_datetime sched_arr_datetime
## <int> <dttm> <dttm> <dttm>
## 1 485 2013-02-27 06:41:00 2013-02-27 06:39:00 2013-02-27 07:55:00
## 2 43 2013-04-11 21:40:00 2013-04-12 00:29:00 2013-04-11 00:28:00
## 3 2163 2013-04-03 07:00:00 2013-04-03 06:55:00 2013-04-03 08:08:00
## 4 1733 2013-08-19 07:53:00 2013-08-19 07:52:00 2013-08-19 09:50:00
## 5 67 2013-09-14 17:40:00 2013-09-14 17:40:00 2013-09-14 20:35:00
## 6 4300 2013-05-27 16:50:00 2013-05-27 16:48:00 2013-05-27 18:17:00
## 7 2181 2013-10-15 15:05:00 2013-10-15 15:04:00 2013-10-15 18:02:00
## 8 1270 2013-11-13 14:56:00 2013-11-13 14:58:00 2013-11-13 17:49:00
## 9 279 2013-12-05 07:51:00 2013-12-05 07:43:00 2013-12-05 09:28:00
## 10 3439 2013-01-29 20:00:00 2013-01-29 19:50:00 2013-01-29 22:24:00
## # ℹ 3 more variables: actual_arr_datetime <dttm>, dep_delay <dbl>,
## # arr_delay <dbl>
If the arrival time is lower than the departure time, it means the flight landed the following day. In those cases, I need to update the arrival datetime, adding a duration of 1 day.
Prior to applying the modification, I will test it.
flights |>
mutate(
# logical vector to check if flight landed the following day
overnight_flight = sched_arr_datetime < sched_dep_datetime,
# add 1 day duration if is overnight flight
sched_arr_datetime = if_else(overnight_flight,
# add 1 day to current value
sched_arr_datetime + ddays(1),
# if not, leave it as it is
sched_arr_datetime),
# update the actual arrival datetime based on
# the updated scheduled arrival datetime
actual_arr_datetime = sched_arr_datetime + minutes(arr_delay)
) |>
# check the flights which should have been modified
filter(overnight_flight) |>
# get only the interesting columns
select(overnight_flight, ends_with("datetime")) |>
# get a sample
sample_n(10)
## # A tibble: 10 × 5
## overnight_flight sched_dep_datetime actual_dep_datetime sched_arr_datetime
## <lgl> <dttm> <dttm> <dttm>
## 1 TRUE 2013-04-10 23:59:00 NA 2013-04-11 03:43:00
## 2 TRUE 2013-06-10 23:59:00 2013-06-10 23:52:00 2013-06-11 03:50:00
## 3 TRUE 2013-12-22 20:23:00 2013-12-22 22:19:00 2013-12-23 01:22:00
## 4 TRUE 2013-05-03 21:00:00 2013-05-03 20:53:00 2013-05-04 00:30:00
## 5 TRUE 2013-01-10 21:30:00 2013-01-10 21:29:00 2013-01-11 00:25:00
## 6 TRUE 2013-04-14 21:50:00 2013-04-14 21:47:00 2013-04-15 00:43:00
## 7 TRUE 2013-08-17 23:59:00 2013-08-18 00:07:00 2013-08-18 03:44:00
## 8 TRUE 2013-08-24 21:30:00 2013-08-24 21:20:00 2013-08-25 00:14:00
## 9 TRUE 2013-12-28 21:45:00 2013-12-28 21:35:00 2013-12-29 00:19:00
## 10 TRUE 2013-12-16 21:30:00 2013-12-16 21:25:00 2013-12-17 00:33:00
## # ℹ 1 more variable: actual_arr_datetime <dttm>
Once I’ve verified that the modification works as expected, I will implement the changes.
flights <- flights |>
mutate(
# logical vector to check if flight landed the following day
overnight_flight = sched_arr_datetime < sched_dep_datetime,
# update the scheduled arrival datetime
sched_arr_datetime = if_else(overnight_flight,
sched_arr_datetime + ddays(1),
sched_arr_datetime),
# update the actual arrival datetime
actual_arr_datetime = sched_arr_datetime + minutes(arr_delay)
) |>
# remove year, month, day, hour and minute
select(-c(year, month, day, hour, minute))
After the cleaning process and all the modifications, let’s have a look at the processed dataset:
rows_overnight_flights <- flights |>
filter(overnight_flight & arr_delay > 0) |> sample_n(1)
rows_same_day_flights <- flights |>
filter(! overnight_flight) |> sample_n(1)
sample <- rbind(rows_same_day_flights, rows_overnight_flights) |> t()
colnames(sample) <- c("same day flight", "overnight flight")
sample
## same day flight overnight flight
## flight "1473" " 515"
## origin "LGA" "EWR"
## dest "MEM" "FLL"
## sched_dep_datetime "2013-06-28 16:51:00" "2013-03-30 21:55:00"
## dep_delay "55" "25"
## actual_dep_datetime "2013-06-28 17:46:00" "2013-03-30 22:20:00"
## sched_arr_datetime "2013-06-28 19:00:00" "2013-03-31 00:48:00"
## arr_delay "35" "12"
## actual_arr_datetime "2013-06-28 19:35:00" "2013-03-31 01:00:00"
## carrier "DL" "B6"
## tailnum "N340NB" "N283JB"
## air_time "143" "142"
## distance " 963" "1065"
## overnight_flight "FALSE" "TRUE"
Finally, a small plot showing the top 5 carriers with the largest departure delay.
flights |>
left_join(airlines, join_by(carrier)) |>
rename(carrier_code = carrier,
carrier = name) |>
group_by(carrier) |>
summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) |>
arrange(-avg_dep_delay) |>
head(5) |>
ggplot(aes(x=reorder(carrier, - avg_dep_delay), y=avg_dep_delay)) +
geom_col(fill="lightblue") +
geom_text(aes(label = round(avg_dep_delay, 1)),
vjust = -0.5,
size = 3.5) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
scale_y_continuous(limits = c(0,25)) +
labs(x="", y="Average departure delay, minutes",
title="Top 5 carriers with highest average departure delay")