Objective of the Data Cleaning Process

The main objective of this cleaning process is to transform the raw, segmented temporal data of the NYC 2013 flights dataset into a continuous and usable timeline. By converting military-style integers into formal time objects and reconciling scheduled times with their respective delays, the script aims to create accurate, high-fidelity datetime columns. A critical component of this objective is the implementation of logical corrections for overnight flights, ensuring that arrivals occurring after midnight are correctly attributed to the following calendar day, thereby maintaining temporal integrity for subsequent analysis.

Data Cleaning Scope Disclaimer

It is important to note that the cleaning process documented here is not intended to be exhaustive. The primary focus of this procedure was to handle and transform raw temporal data into meaningful datetime objects. Specifically, efforts were centered on rectifying ‘overnight flight’ inconsistencies — where arrival times numerically precede departure times — and consolidating separate year, month, and day columns into standardized POSIXct formats. While this provides a robust foundation for time-series analysis, other potential data quality issues, such as missing coordinate values or inconsistent carrier metadata, were not addressed in this specific workflow.

Import of necessary libraries

# data manipulation
library(tidyverse)

# working dataset
library(nycflights13)

# datetime management libraries
library(lubridate)
library(hms)

# plotting library
library(ggplot2)

First glance at the NYC 2013 Flights dataset

flights |> sample_n(5)
## # A tibble: 5 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     8     3      932            945       -13     1050           1114
## 2  2013     6    14     1609           1615        -6     1835           1830
## 3  2013     2    18     1501           1505        -4     2002           2000
## 4  2013     1    17      712            705         7     1023           1021
## 5  2013    11    30      833            840        -7     1053           1115
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Format times

Columns dep_time, sched_dep_time, arr_time, sched_arr_time represent a daytime, using the military format. 520 means 05:20. I wil convert these values into hms objects.

create_hm <- function(time) {
  # %% 86400 converts 24:00:00 to 00:00:00
  seconds <- ( (time %/% 100) * 3600 + (time %% 100) * 60) %% 86400 
  as_hms(seconds)
}

I will proceed with the conversion to hms objects. After that, I will carry out a relocation of some columns.

(flights <- flights |> 
  mutate(across(ends_with("_time") & !"air_time", create_hm)) |>
  select(-time_hour) |>
  select(flight, origin, dest,
         year, month, day, hour, minute,
         sched_dep_time, dep_delay, dep_time,
         sched_arr_time, arr_delay, arr_time,
         everything()
  )) |>
  select(flight, ends_with("time"), -air_time) |>
  head(5)
## # A tibble: 5 × 5
##   flight sched_dep_time dep_time sched_arr_time arr_time
##    <int> <time>         <time>   <time>         <time>  
## 1   1545 05:15          05:17    08:19          08:30   
## 2   1714 05:29          05:33    08:30          08:50   
## 3   1141 05:40          05:42    08:50          09:23   
## 4    725 05:45          05:44    10:22          10:04   
## 5    461 06:00          05:54    08:37          08:12

Create datetime columns

I will create columns with both scheduled and actual datetime, for departure and arrival moments.

flights <- flights |> 
  mutate(
    
    # scheduled departure
    sched_dep_datetime = make_datetime(year, month, day,
                                   hour(sched_dep_time),
                                   minute(sched_dep_time)
                                   ),
    
    # actual departure: scheduled departure + departure delay (duration)
    actual_dep_datetime = sched_dep_datetime + dminutes(dep_delay),
    
    # scheduled arrival
    sched_arr_datetime = make_datetime(year, month, day,
                                   hour(sched_arr_time),
                                   minute(sched_arr_time)
                                   ),
    
    # actual arrival: scheduled arrival + arrival delay (duration)
    actual_arr_datetime = sched_arr_datetime + dminutes(arr_delay),
    
    # place the new datetime columns at the beginning
    .before=year) |>
  
  # clean the time columns
  select(-c(dep_time, sched_dep_time, arr_time, sched_arr_time)) |>
  
  # relocate the delay columns
  relocate(dep_delay, .after = sched_dep_datetime) |> 
  relocate(arr_delay, .after = sched_arr_datetime) |>
  
  # sort the dataset by scheduled departure datetime
  arrange(sched_dep_datetime)

Sample of dataset with datetime columns

flights |>
  select(flight, ends_with("datetime"), ends_with("delay")) |> 
  sample_n(10)
## # A tibble: 10 × 7
##    flight sched_dep_datetime  actual_dep_datetime sched_arr_datetime 
##     <int> <dttm>              <dttm>              <dttm>             
##  1    485 2013-02-27 06:41:00 2013-02-27 06:39:00 2013-02-27 07:55:00
##  2     43 2013-04-11 21:40:00 2013-04-12 00:29:00 2013-04-11 00:28:00
##  3   2163 2013-04-03 07:00:00 2013-04-03 06:55:00 2013-04-03 08:08:00
##  4   1733 2013-08-19 07:53:00 2013-08-19 07:52:00 2013-08-19 09:50:00
##  5     67 2013-09-14 17:40:00 2013-09-14 17:40:00 2013-09-14 20:35:00
##  6   4300 2013-05-27 16:50:00 2013-05-27 16:48:00 2013-05-27 18:17:00
##  7   2181 2013-10-15 15:05:00 2013-10-15 15:04:00 2013-10-15 18:02:00
##  8   1270 2013-11-13 14:56:00 2013-11-13 14:58:00 2013-11-13 17:49:00
##  9    279 2013-12-05 07:51:00 2013-12-05 07:43:00 2013-12-05 09:28:00
## 10   3439 2013-01-29 20:00:00 2013-01-29 19:50:00 2013-01-29 22:24:00
## # ℹ 3 more variables: actual_arr_datetime <dttm>, dep_delay <dbl>,
## #   arr_delay <dbl>

Fix arrival datetimes

If the arrival time is lower than the departure time, it means the flight landed the following day. In those cases, I need to update the arrival datetime, adding a duration of 1 day.

Test the solution

Prior to applying the modification, I will test it.

flights |> 
  mutate(
    # logical vector to check if flight landed the following day
    overnight_flight = sched_arr_datetime < sched_dep_datetime,
    
    # add 1 day duration if is overnight flight
    sched_arr_datetime = if_else(overnight_flight,
                                 # add 1 day to current value
                                 sched_arr_datetime + ddays(1),
                                 # if not, leave it as it is
                                 sched_arr_datetime),
    
    # update the actual arrival datetime based on
    # the updated scheduled arrival datetime
    actual_arr_datetime = sched_arr_datetime + minutes(arr_delay)
  ) |>
  
  # check the flights which should have been modified
  filter(overnight_flight) |> 
  
  # get only the interesting columns
  select(overnight_flight, ends_with("datetime")) |> 
  
  # get a sample
  sample_n(10)
## # A tibble: 10 × 5
##    overnight_flight sched_dep_datetime  actual_dep_datetime sched_arr_datetime 
##    <lgl>            <dttm>              <dttm>              <dttm>             
##  1 TRUE             2013-04-10 23:59:00 NA                  2013-04-11 03:43:00
##  2 TRUE             2013-06-10 23:59:00 2013-06-10 23:52:00 2013-06-11 03:50:00
##  3 TRUE             2013-12-22 20:23:00 2013-12-22 22:19:00 2013-12-23 01:22:00
##  4 TRUE             2013-05-03 21:00:00 2013-05-03 20:53:00 2013-05-04 00:30:00
##  5 TRUE             2013-01-10 21:30:00 2013-01-10 21:29:00 2013-01-11 00:25:00
##  6 TRUE             2013-04-14 21:50:00 2013-04-14 21:47:00 2013-04-15 00:43:00
##  7 TRUE             2013-08-17 23:59:00 2013-08-18 00:07:00 2013-08-18 03:44:00
##  8 TRUE             2013-08-24 21:30:00 2013-08-24 21:20:00 2013-08-25 00:14:00
##  9 TRUE             2013-12-28 21:45:00 2013-12-28 21:35:00 2013-12-29 00:19:00
## 10 TRUE             2013-12-16 21:30:00 2013-12-16 21:25:00 2013-12-17 00:33:00
## # ℹ 1 more variable: actual_arr_datetime <dttm>

Apply the fix

Once I’ve verified that the modification works as expected, I will implement the changes.

flights <- flights |> 
  mutate(
    # logical vector to check if flight landed the following day
    overnight_flight = sched_arr_datetime < sched_dep_datetime,
    
    # update the scheduled arrival datetime
    sched_arr_datetime = if_else(overnight_flight,
                                 sched_arr_datetime + ddays(1),
                                 sched_arr_datetime),
    
    # update the actual arrival datetime
    actual_arr_datetime = sched_arr_datetime + minutes(arr_delay)
  ) |> 
  # remove year, month, day, hour and minute
  select(-c(year, month, day, hour, minute))

Final overview of the clean dataset

After the cleaning process and all the modifications, let’s have a look at the processed dataset:

rows_overnight_flights <- flights |>
  filter(overnight_flight & arr_delay > 0) |> sample_n(1)

rows_same_day_flights <- flights |>
  filter(! overnight_flight) |> sample_n(1)

sample <- rbind(rows_same_day_flights, rows_overnight_flights) |> t()
colnames(sample) <- c("same day flight", "overnight flight")
sample
##                     same day flight       overnight flight     
## flight              "1473"                " 515"               
## origin              "LGA"                 "EWR"                
## dest                "MEM"                 "FLL"                
## sched_dep_datetime  "2013-06-28 16:51:00" "2013-03-30 21:55:00"
## dep_delay           "55"                  "25"                 
## actual_dep_datetime "2013-06-28 17:46:00" "2013-03-30 22:20:00"
## sched_arr_datetime  "2013-06-28 19:00:00" "2013-03-31 00:48:00"
## arr_delay           "35"                  "12"                 
## actual_arr_datetime "2013-06-28 19:35:00" "2013-03-31 01:00:00"
## carrier             "DL"                  "B6"                 
## tailnum             "N340NB"              "N283JB"             
## air_time            "143"                 "142"                
## distance            " 963"                "1065"               
## overnight_flight    "FALSE"               "TRUE"

Finally, a small plot showing the top 5 carriers with the largest departure delay.

flights |>
  left_join(airlines, join_by(carrier)) |>
  rename(carrier_code = carrier,
         carrier = name) |> 
  group_by(carrier) |> 
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) |>
  arrange(-avg_dep_delay) |> 
  head(5) |> 
  ggplot(aes(x=reorder(carrier, - avg_dep_delay), y=avg_dep_delay)) +
  geom_col(fill="lightblue") +
  geom_text(aes(label = round(avg_dep_delay, 1)), 
            vjust = -0.5,
            size = 3.5) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  scale_y_continuous(limits = c(0,25)) +
  labs(x="", y="Average departure delay, minutes",
       title="Top 5 carriers with highest average departure delay")