# Read the raw flight records from flights.csv (path relative to the
# working directory).
flights <- read.csv("flights.csv")

# Keep only the three columns used by the hourly analysis and derive
# hour_of_departure: CRS_DEP_TIME is zero-padded to four digits and its
# first two characters are taken as the hour.
flight_data <- mutate(
  select(flights, CRS_DEP_TIME, ARR_DELAY, CANCELLED),
  hour_of_departure = as.numeric(
    substr(sprintf("%04.0f", CRS_DEP_TIME), 1, 2)
  )
)
# Aggregate data by hour of departure.
# Produces one row per hour_of_departure with:
#   avg_arrival_delay  - mean of ARR_DELAY, ignoring missing values
#   delay_count        - number of flights with ARR_DELAY > 0
#   cancellation_count - number of flights with CANCELLED == 1
flight_summary <- flight_data %>%
  group_by(hour_of_departure) %>%
  summarise(
    avg_arrival_delay = mean(ARR_DELAY, na.rm = TRUE),
    delay_count = sum(ARR_DELAY > 0, na.rm = TRUE),
    # CANCELLED is a 0/1 flag where 1 marks a cancelled flight. Comparing
    # against 0 counted the flights that were NOT cancelled.
    cancellation_count = sum(CANCELLED == 1, na.rm = TRUE)
  )

# Line chart of flight_summary: avg_arrival_delay against hour_of_departure
ggplot(flight_summary, aes(hour_of_departure, avg_arrival_delay)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Average Arrival Delay Time by Hour of Departure",
    x = "Hour of Departure",
    y = "Average Arrival Delay (minutes)"
  )

# Line chart of flight_summary: delay_count against hour_of_departure
ggplot(flight_summary, aes(hour_of_departure, delay_count)) +
  geom_line(colour = "blue") +
  theme_minimal() +
  labs(
    title = "Frequency of Delays by Hour From 2019-2023",
    x = "Hour of Departure",
    y = "Frequency of Delays"
  )

# Line chart of flight_summary: cancellation_count against hour_of_departure
ggplot(flight_summary, aes(hour_of_departure, cancellation_count)) +
  geom_line(colour = "red") +
  theme_minimal() +
  labs(
    title = "Frequency of Cancellations by Hour From 2019-2023",
    x = "Hour of Departure",
    y = "Frequency of Cancellations"
  )

# Numeric columns of `flights`, leaving out DOT_CODE and FL_NUMBER.
# vapply() pins the per-column test to a single logical (sapply() lets the
# return type vary with its input), and drop = FALSE keeps the result a
# data frame even if only one column qualifies.
numerical_flights <- flights[
  ,
  !(names(flights) %in% c("DOT_CODE", "FL_NUMBER")) &
    vapply(flights, is.numeric, logical(1)),
  drop = FALSE
]

# Per-column summary statistics of the selected columns
summary(numerical_flights)
##   CRS_DEP_TIME     DEP_TIME       DEP_DELAY          TAXI_OUT     
##  Min.   :   1   Min.   :   1    Min.   : -90.00   Min.   :  1.00  
##  1st Qu.: 915   1st Qu.: 916    1st Qu.:  -6.00   1st Qu.: 11.00  
##  Median :1320   Median :1323    Median :  -2.00   Median : 14.00  
##  Mean   :1327   Mean   :1330    Mean   :  10.12   Mean   : 16.64  
##  3rd Qu.:1730   3rd Qu.:1739    3rd Qu.:   6.00   3rd Qu.: 19.00  
##  Max.   :2359   Max.   :2400    Max.   :2966.00   Max.   :184.00  
##                 NA's   :77615   NA's   :77644     NA's   :78806   
##    WHEELS_OFF      WHEELS_ON        TAXI_IN        CRS_ARR_TIME 
##  Min.   :   1    Min.   :   1    Min.   :  1.00   Min.   :   1  
##  1st Qu.: 931    1st Qu.:1049    1st Qu.:  4.00   1st Qu.:1107  
##  Median :1336    Median :1501    Median :  6.00   Median :1516  
##  Mean   :1352    Mean   :1462    Mean   :  7.68   Mean   :1491  
##  3rd Qu.:1752    3rd Qu.:1908    3rd Qu.:  9.00   3rd Qu.:1919  
##  Max.   :2400    Max.   :2400    Max.   :249.00   Max.   :2400  
##  NA's   :78806   NA's   :79944   NA's   :79944                  
##     ARR_TIME       ARR_DELAY         CANCELLED          DIVERTED       
##  Min.   :   1    Min.   : -96.00   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:1053    1st Qu.: -16.00   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :1505    Median :  -7.00   Median :0.00000   Median :0.000000  
##  Mean   :1467    Mean   :   4.26   Mean   :0.02638   Mean   :0.002352  
##  3rd Qu.:1913    3rd Qu.:   7.00   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :2400    Max.   :2934.00   Max.   :1.00000   Max.   :1.000000  
##  NA's   :79942   NA's   :86198                                         
##  CRS_ELAPSED_TIME  ELAPSED_TIME      AIR_TIME        DISTANCE     
##  Min.   :  1.0    Min.   : 15.0   Min.   :  8.0   Min.   :  29.0  
##  1st Qu.: 90.0    1st Qu.: 84.0   1st Qu.: 61.0   1st Qu.: 377.0  
##  Median :125.0    Median :120.0   Median : 95.0   Median : 651.0  
##  Mean   :142.3    Mean   :136.6   Mean   :112.3   Mean   : 809.4  
##  3rd Qu.:172.0    3rd Qu.:167.0   3rd Qu.:142.0   3rd Qu.:1046.0  
##  Max.   :705.0    Max.   :739.0   Max.   :692.0   Max.   :5812.0  
##  NA's   :14       NA's   :86198   NA's   :86198                   
##  DELAY_DUE_CARRIER DELAY_DUE_WEATHER DELAY_DUE_NAS     DELAY_DUE_SECURITY
##  Min.   :   0.0    Min.   :   0      Min.   :   0.0    Min.   :   0.0    
##  1st Qu.:   0.0    1st Qu.:   0      1st Qu.:   0.0    1st Qu.:   0.0    
##  Median :   4.0    Median :   0      Median :   0.0    Median :   0.0    
##  Mean   :  24.8    Mean   :   4      Mean   :  13.2    Mean   :   0.1    
##  3rd Qu.:  23.0    3rd Qu.:   0      3rd Qu.:  17.0    3rd Qu.:   0.0    
##  Max.   :2934.0    Max.   :1653      Max.   :1741.0    Max.   :1185.0    
##  NA's   :2466137   NA's   :2466137   NA's   :2466137   NA's   :2466137   
##  DELAY_DUE_LATE_AIRCRAFT
##  Min.   :   0.0         
##  1st Qu.:   0.0         
##  Median :   0.0         
##  Mean   :  25.5         
##  3rd Qu.:  30.0         
##  Max.   :2557.0         
##  NA's   :2466137
# Bar chart: number of rows in `flights` per AIRLINE value
ggplot(flights, aes(x = AIRLINE)) +
  geom_bar(fill = "skyblue") +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Frequency of Airlines", x = "Airline", y = "Frequency") +
  # Small, slanted tick labels so the airline names fit on the x axis
  theme(
    axis.text.x = element_text(size = 6, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 6)
  )

# Stacked bar chart: number of flights per month of FL_DATE, with each bar
# split by the CANCELLED flag (legend shows it as "No" / "Yes")
ggplot(
  flights,
  aes(x = factor(month(FL_DATE)), fill = factor(CANCELLED))
) +
  geom_bar() +
  scale_fill_discrete(name = "Cancelled", labels = c("No", "Yes")) +
  # month.abb relabels the month levels as "Jan", "Feb", ... in order
  scale_x_discrete(labels = month.abb) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Flight Delays and Cancellations Across Months",
    x = "Month",
    y = "Frequency"
  )

# Total of each DELAY_DUE_* column per month of FL_DATE.
# Missing values are skipped when summing.
delay_data <- flights %>%
  mutate(month = factor(month(FL_DATE))) %>%
  group_by(month) %>%
  summarise(
    carrier_delay       = sum(DELAY_DUE_CARRIER, na.rm = TRUE),
    weather_delay       = sum(DELAY_DUE_WEATHER, na.rm = TRUE),
    nas_delay           = sum(DELAY_DUE_NAS, na.rm = TRUE),
    security_delay      = sum(DELAY_DUE_SECURITY, na.rm = TRUE),
    late_aircraft_delay = sum(DELAY_DUE_LATE_AIRCRAFT, na.rm = TRUE)
  )


# Reshape delay_data to long form: one row per (month, delay_type) pair,
# with the summed value stored in delay_minutes.
delay_data_long <- pivot_longer(
  delay_data,
  cols = all_of(c(
    "carrier_delay",
    "weather_delay",
    "nas_delay",
    "security_delay",
    "late_aircraft_delay"
  )),
  names_to = "delay_type",
  values_to = "delay_minutes"
)


# One panel per delay_type showing delay_minutes by month. Each panel gets
# its own y scale so a large delay type does not flatten the smaller ones.
ggplot(delay_data_long, aes(month, delay_minutes, fill = delay_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~delay_type, scales = "free_y", ncol = 2) +
  scale_y_continuous(labels = scales::comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Distribution of Flight Delays Across Months From 2019-2023",
    x = "Month",
    y = "Delay in Minutes",
    fill = "Delay Type"
  )

# Average departure delay time
# Adds HOUR_DEP (hour derived from DEP_TIME) to `flights`, averages
# DEP_DELAY per hour, and plots the result as a line chart.
flights <- flights %>%
  # DEP_TIME is treated as an HHMM number, so integer division by 100
  # yields the hour. The data contains DEP_TIME == 2400; `%% 24` folds it
  # into hour 0 instead of creating a separate hour-24 bucket.
  mutate(HOUR_DEP = as.numeric(DEP_TIME %/% 100) %% 24)

average_delay_by_hour <- flights %>%
  # Rows without a DEP_TIME have no hour and would form an NA group
  filter(!is.na(HOUR_DEP)) %>%
  group_by(HOUR_DEP) %>%
  # The chart is labelled as departure delay, so average DEP_DELAY
  # (ARR_DELAY was averaged here before).
  summarise(AVG_DELAY = mean(DEP_DELAY, na.rm = TRUE))

ggplot(average_delay_by_hour, aes(x = HOUR_DEP, y = AVG_DELAY)) +
  geom_line(color = "purple") +
  labs(
    x = "Hour of Departure",
    y = "Average Departure Delay (minutes)",
    title = "Average Departure Delay Time by Hour of Departure"
  ) +
  theme_minimal()