library(dplyr)
library(tidyr)
This data set was downloaded from the Bureau of Transportation Statistics website. A number of variables were filtered to reduce the size of the file. The file was then downloaded to GITHUB. Later the file was read and assigned to ‘airline_data’.
airline_data <- read.csv("https://raw.githubusercontent.com/RGreen-sps/scaling-umbrella/refs/heads/main/T_ONTIME_REPORTING.csv")
str(airline_data)
## 'data.frame': 54092 obs. of 37 variables:
## $ YEAR : int 2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_WEEK : int 1 1 1 1 1 1 1 1 1 1 ...
## $ FL_DATE : chr "1/1/2024 12:00:00 AM" "1/1/2024 12:00:00 AM" "1/1/2024 12:00:00 AM" "1/1/2024 12:00:00 AM" ...
## $ OP_UNIQUE_CARRIER : chr "9E" "9E" "9E" "9E" ...
## $ OP_CARRIER_AIRLINE_ID: int 20363 20363 20363 20363 20363 20363 20363 20363 19805 19805 ...
## $ ORIGIN_AIRPORT_ID : int 10397 10397 10397 10397 10685 10685 13367 13367 10140 10397 ...
## $ ORIGIN_AIRPORT_SEQ_ID: int 1039707 1039707 1039707 1039707 1068503 1068503 1336707 1336707 1014005 1039707 ...
## $ ORIGIN_CITY_MARKET_ID: int 30397 30397 30397 30397 30685 30685 33367 33367 30140 30397 ...
## $ ORIGIN : chr "ATL" "ATL" "ATL" "ATL" ...
## $ ORIGIN_CITY_NAME : chr "Atlanta, GA" "Atlanta, GA" "Atlanta, GA" "Atlanta, GA" ...
## $ ORIGIN_STATE_ABR : chr "GA" "GA" "GA" "GA" ...
## $ ORIGIN_STATE_NM : chr "Georgia" "Georgia" "Georgia" "Georgia" ...
## $ DEST_AIRPORT_ID : int 10685 10685 13367 13367 10397 10397 10397 10397 13930 13930 ...
## $ DEST_AIRPORT_SEQ_ID : int 1068503 1068503 1336707 1336707 1039707 1039707 1039707 1039707 1393008 1393008 ...
## $ DEST_CITY_MARKET_ID : int 30685 30685 33367 33367 30397 30397 30397 30397 30977 30977 ...
## $ DEST : chr "BMI" "BMI" "MLI" "MLI" ...
## $ DEST_CITY_NAME : chr "Bloomington/Normal, IL" "Bloomington/Normal, IL" "Moline, IL" "Moline, IL" ...
## $ DEST_STATE_ABR : chr "IL" "IL" "IL" "IL" ...
## $ DEST_STATE_NM : chr "Illinois" "Illinois" "Illinois" "Illinois" ...
## $ DEP_TIME : int 1514 2138 1505 2108 550 1630 555 1651 1349 619 ...
## $ DEP_DELAY : num 15 -4 -5 -13 -10 -4 -2 -10 -9 -12 ...
## $ DEP_DELAY_NEW : num 15 0 0 0 0 0 0 0 0 0 ...
## $ DEP_DEL15 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ ARR_TIME : int 1548 2221 1610 2157 836 1901 915 1933 1736 714 ...
## $ ARR_DELAY : num -1 -14 -6 -28 -21 -23 10 -35 -9 -26 ...
## $ ARR_DELAY_NEW : num 0 0 0 0 0 0 10 0 0 0 ...
## $ ARR_DEL15 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ACTUAL_ELAPSED_TIME : num 94 103 125 109 106 91 140 102 167 115 ...
## $ AIR_TIME : num 79 85 95 86 77 72 87 82 147 93 ...
## $ DISTANCE : num 533 533 633 633 533 ...
## $ CARRIER_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
## $ WEATHER_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
## $ NAS_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
## $ SECURITY_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
## $ LATE_AIRCRAFT_DELAY : num NA NA NA NA NA NA NA NA NA NA ...
The data set was filter further for ‘UA’, United Air Lines Inc., and ‘WN’, Southwest Airlines Co. Additional filters includes the destination city names, and the origin city Chicago, IL. New columns were created, ‘on_time’, ‘delayed’, and the carrier column renamed to ‘airline.’
Pivot longer is used to combine the ‘on_time’ and ‘delayed’, into one column, the status column.
airline_add_status <- airline_col_filter |>
pivot_longer(
cols = on_time:delayed,
names_to = "status",
values_to = "NAME"
)
airline_add_status
## # A tibble: 20 × 4
## # Groups: airline [2]
## airline DEST_CITY_NAME status NAME
## <chr> <chr> <chr> <int>
## 1 UA Denver, CO on_time 136
## 2 UA Denver, CO delayed 83
## 3 WN Denver, CO on_time 220
## 4 WN Denver, CO delayed 124
## 5 UA New York, NY on_time 165
## 6 UA New York, NY delayed 106
## 7 WN New York, NY on_time 91
## 8 WN New York, NY delayed 97
## 9 UA Portland, OR on_time 40
## 10 UA Portland, OR delayed 22
## 11 WN Portland, OR on_time 14
## 12 WN Portland, OR delayed 15
## 13 UA Sacramento, CA on_time 41
## 14 UA Sacramento, CA delayed 22
## 15 WN Sacramento, CA on_time 13
## 16 WN Sacramento, CA delayed 14
## 17 UA St. Louis, MO on_time 31
## 18 UA St. Louis, MO delayed 13
## 19 WN St. Louis, MO on_time 76
## 20 WN St. Louis, MO delayed 55
pivot wider is used here to turn the values in the DEST_CITY_NAME column to now act as column names for the new format.
airline_compare <- airline_add_status |>
pivot_wider(
names_from = DEST_CITY_NAME,
values_from = NAME
)
airline_compare
## # A tibble: 4 × 7
## # Groups: airline [2]
## airline status `Denver, CO` `New York, NY` `Portland, OR` `Sacramento, CA`
## <chr> <chr> <int> <int> <int> <int>
## 1 UA on_time 136 165 40 41
## 2 UA delayed 83 106 22 22
## 3 WN on_time 220 91 14 13
## 4 WN delayed 124 97 15 14
## # ℹ 1 more variable: `St. Louis, MO` <int>
Finally the data frame is written to a CSV file.
write.csv(airline_compare, "airline_arr_delay_comp", row.names = FALSE)