# Load the raw flight records, then keep only the columns needed for the
# hour-of-departure analysis.
flights <- read.csv("flights.csv")

flight_data <- flights %>%
  select(CRS_DEP_TIME, ARR_DELAY, CANCELLED) %>%
  mutate(
    # CRS_DEP_TIME is zero-padded to four characters and the first two
    # characters are kept as the hour (presumably an HHMM clock value --
    # confirm against the data dictionary).
    hour_of_departure = CRS_DEP_TIME %>%
      sprintf(fmt = "%04.0f") %>%
      substr(start = 1, stop = 2) %>%
      as.numeric()
  )
# Aggregate data by hour of departure
# One row per hour_of_departure with:
#   avg_arrival_delay  - mean ARR_DELAY, ignoring missing values
#   delay_count        - number of flights whose ARR_DELAY is positive
#   cancellation_count - number of flights flagged as cancelled
flight_summary <- flight_data %>%
  group_by(hour_of_departure) %>%
  summarise(
    avg_arrival_delay = mean(ARR_DELAY, na.rm = TRUE),
    delay_count = sum(ARR_DELAY > 0, na.rm = TRUE),
    # CANCELLED is a 0/1 flag where 1 means the flight was cancelled;
    # comparing against 0 would count the flights that actually operated.
    cancellation_count = sum(CANCELLED == 1, na.rm = TRUE)
  )
# Average arrival delay by hour of departure
# Line chart of avg_arrival_delay against hour_of_departure from flight_summary.
ggplot(flight_summary, aes(hour_of_departure, avg_arrival_delay)) +
  geom_line() +
  ggtitle("Average Arrival Delay Time by Hour of Departure") +
  xlab("Hour of Departure") +
  ylab("Average Arrival Delay (minutes)") +
  theme_minimal()

# Frequency of delays
# Line chart of delay_count (flights with a positive arrival delay) per
# departure hour.
flight_summary %>%
  ggplot(aes(x = hour_of_departure, y = delay_count)) +
  geom_line(color = "blue") +
  theme_minimal() +
  labs(
    x = "Hour of Departure",
    y = "Frequency of Delays",
    title = "Frequency of Delays by Hour From 2019-2023"
  )

# Cancellations
# Line chart of the cancellation_count column of flight_summary per
# departure hour.
ggplot(flight_summary) +
  aes(x = hour_of_departure, y = cancellation_count) +
  geom_line(color = "red") +
  ggtitle("Frequency of Cancellations by Hour From 2019-2023") +
  xlab("Hour of Departure") +
  ylab("Frequency of Cancellations") +
  theme_minimal()

# Summary statistics for the numeric columns of `flights`.
# DOT_CODE and FL_NUMBER are excluded by name even though they are stored as
# numbers. vapply() guarantees a logical mask whatever the input, and
# drop = FALSE keeps the result a data frame even if only one column matches.
numerical_flights <- flights[
  ,
  !(names(flights) %in% c("DOT_CODE", "FL_NUMBER")) &
    vapply(flights, is.numeric, logical(1)),
  drop = FALSE
]
summary(numerical_flights)
## CRS_DEP_TIME DEP_TIME DEP_DELAY TAXI_OUT
## Min. : 1 Min. : 1 Min. : -90.00 Min. : 1.00
## 1st Qu.: 915 1st Qu.: 916 1st Qu.: -6.00 1st Qu.: 11.00
## Median :1320 Median :1323 Median : -2.00 Median : 14.00
## Mean :1327 Mean :1330 Mean : 10.12 Mean : 16.64
## 3rd Qu.:1730 3rd Qu.:1739 3rd Qu.: 6.00 3rd Qu.: 19.00
## Max. :2359 Max. :2400 Max. :2966.00 Max. :184.00
## NA's :77615 NA's :77644 NA's :78806
## WHEELS_OFF WHEELS_ON TAXI_IN CRS_ARR_TIME
## Min. : 1 Min. : 1 Min. : 1.00 Min. : 1
## 1st Qu.: 931 1st Qu.:1049 1st Qu.: 4.00 1st Qu.:1107
## Median :1336 Median :1501 Median : 6.00 Median :1516
## Mean :1352 Mean :1462 Mean : 7.68 Mean :1491
## 3rd Qu.:1752 3rd Qu.:1908 3rd Qu.: 9.00 3rd Qu.:1919
## Max. :2400 Max. :2400 Max. :249.00 Max. :2400
## NA's :78806 NA's :79944 NA's :79944
## ARR_TIME ARR_DELAY CANCELLED DIVERTED
## Min. : 1 Min. : -96.00 Min. :0.00000 Min. :0.000000
## 1st Qu.:1053 1st Qu.: -16.00 1st Qu.:0.00000 1st Qu.:0.000000
## Median :1505 Median : -7.00 Median :0.00000 Median :0.000000
## Mean :1467 Mean : 4.26 Mean :0.02638 Mean :0.002352
## 3rd Qu.:1913 3rd Qu.: 7.00 3rd Qu.:0.00000 3rd Qu.:0.000000
## Max. :2400 Max. :2934.00 Max. :1.00000 Max. :1.000000
## NA's :79942 NA's :86198
## CRS_ELAPSED_TIME ELAPSED_TIME AIR_TIME DISTANCE
## Min. : 1.0 Min. : 15.0 Min. : 8.0 Min. : 29.0
## 1st Qu.: 90.0 1st Qu.: 84.0 1st Qu.: 61.0 1st Qu.: 377.0
## Median :125.0 Median :120.0 Median : 95.0 Median : 651.0
## Mean :142.3 Mean :136.6 Mean :112.3 Mean : 809.4
## 3rd Qu.:172.0 3rd Qu.:167.0 3rd Qu.:142.0 3rd Qu.:1046.0
## Max. :705.0 Max. :739.0 Max. :692.0 Max. :5812.0
## NA's :14 NA's :86198 NA's :86198
## DELAY_DUE_CARRIER DELAY_DUE_WEATHER DELAY_DUE_NAS DELAY_DUE_SECURITY
## Min. : 0.0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 4.0 Median : 0 Median : 0.0 Median : 0.0
## Mean : 24.8 Mean : 4 Mean : 13.2 Mean : 0.1
## 3rd Qu.: 23.0 3rd Qu.: 0 3rd Qu.: 17.0 3rd Qu.: 0.0
## Max. :2934.0 Max. :1653 Max. :1741.0 Max. :1185.0
## NA's :2466137 NA's :2466137 NA's :2466137 NA's :2466137
## DELAY_DUE_LATE_AIRCRAFT
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 0.0
## Mean : 25.5
## 3rd Qu.: 30.0
## Max. :2557.0
## NA's :2466137
# Airline Data
# Bar chart counting the rows of `flights` for each AIRLINE value.
ggplot(flights, aes(AIRLINE)) +
  geom_bar(fill = "skyblue") +
  labs(x = "Airline", y = "Frequency", title = "Frequency of Airlines") +
  scale_y_continuous(labels = scales::comma) +
  theme(
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

# Flight delays and cancellations across months
# Bar chart of flight counts per calendar month, with each bar split by the
# CANCELLED flag.
# NOTE(review): the month.abb labels are applied positionally, so this assumes
# all twelve months occur in FL_DATE -- confirm.
flights %>%
  ggplot(aes(x = factor(month(FL_DATE)), fill = factor(CANCELLED))) +
  geom_bar() +
  scale_x_discrete(labels = month.abb) +
  scale_fill_discrete(name = "Cancelled", labels = c("No", "Yes")) +
  labs(
    x = "Month",
    y = "Frequency",
    title = "Flight Delays and Cancellations Across Months"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Reasons for flight delays
# Total the minutes attributed to each delay cause per calendar month;
# missing values are ignored in every sum.
delay_data <- flights %>%
  group_by(month = factor(month(FL_DATE))) %>%
  summarise(
    carrier_delay = sum(DELAY_DUE_CARRIER, na.rm = TRUE),
    weather_delay = sum(DELAY_DUE_WEATHER, na.rm = TRUE),
    nas_delay = sum(DELAY_DUE_NAS, na.rm = TRUE),
    security_delay = sum(DELAY_DUE_SECURITY, na.rm = TRUE),
    late_aircraft_delay = sum(DELAY_DUE_LATE_AIRCRAFT, na.rm = TRUE)
  )

# Reshape to one row per (month, delay_type) so each cause can be drawn in
# its own facet. Every column except `month` is a delay total.
delay_data_long <- pivot_longer(
  delay_data,
  cols = -month,
  names_to = "delay_type",
  values_to = "delay_minutes"
)

# One panel per delay cause; y axes are freed so that small categories are
# not flattened by the largest ones.
ggplot(delay_data_long, aes(month, delay_minutes, fill = delay_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~delay_type, scales = "free_y", ncol = 2) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Distribution of Flight Delays Across Months From 2019-2023",
    x = "Month",
    y = "Delay in Minutes",
    fill = "Delay Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Average departure delay time
# Adds HOUR_DEP (hour taken from the actual departure time DEP_TIME) to
# `flights`, then plots the mean departure delay for each hour.
# NOTE(review): a DEP_TIME of 2400 yields hour 24 rather than 0 -- decide
# whether that bucket should be folded into hour 0.
flights <- flights %>%
  mutate(HOUR_DEP = as.numeric(substr(sprintf("%04.0f", DEP_TIME), 1, 2)))

average_delay_by_hour <- flights %>%
  # Flights with no recorded DEP_TIME have no departure hour; drop them so
  # they do not form an NA bucket in the summary and plot.
  filter(!is.na(HOUR_DEP)) %>%
  group_by(HOUR_DEP) %>%
  # The chart reports departure delay, so average DEP_DELAY (not ARR_DELAY).
  summarise(AVG_DELAY = mean(DEP_DELAY, na.rm = TRUE))

ggplot(average_delay_by_hour, aes(x = HOUR_DEP, y = AVG_DELAY)) +
  geom_line(color = "purple") +
  labs(
    x = "Hour of Departure",
    y = "Average Departure Delay (minutes)",
    title = "Average Departure Delay Time by Hour of Departure"
  ) +
  theme_minimal()
