Step 0: Load required packages

library(data.table)   # for fast CSV reading
library(tidyverse)    # for dplyr, ggplot2, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)       # for interactive plots
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(flexdashboard) # for dashboard layout

Step 1: Load CSV Data

# Load flight data
flights <- fread("activity01_airline/flights.csv")

# Inspect first few rows and column names
head(flights)
colnames(flights)
##  [1] "V1"                  "YEAR"                "MONTH"              
##  [4] "DAY"                 "DAY_OF_WEEK"         "AIRLINE"            
##  [7] "FLIGHT_NUMBER"       "TAIL_NUMBER"         "ORIGIN_AIRPORT"     
## [10] "DESTINATION_AIRPORT" "SCHEDULED_DEPARTURE" "DEPARTURE_TIME"     
## [13] "DEPARTURE_DELAY"     "TAXI_OUT"            "WHEELS_OFF"         
## [16] "SCHEDULED_TIME"      "ELAPSED_TIME"        "AIR_TIME"           
## [19] "DISTANCE"            "WHEELS_ON"           "TAXI_IN"            
## [22] "SCHEDULED_ARRIVAL"   "ARRIVAL_TIME"        "ARRIVAL_DELAY"      
## [25] "DIVERTED"            "CANCELLED"           "CANCELLATION_REASON"
## [28] "AIR_SYSTEM_DELAY"    "SECURITY_DELAY"      "AIRLINE_DELAY"      
## [31] "LATE_AIRCRAFT_DELAY" "WEATHER_DELAY"

Step 2: Clean the Data

# Create a new Status column based on cancellation and departure delay
flights[, Status := fifelse(CANCELLED == 1, "Canceled",
                            fifelse(DEPARTURE_DELAY > 0, "Delayed", "On-Time"))]

# Inspect the first few rows
head(flights)
# Check column names
colnames(flights)
##  [1] "V1"                  "YEAR"                "MONTH"              
##  [4] "DAY"                 "DAY_OF_WEEK"         "AIRLINE"            
##  [7] "FLIGHT_NUMBER"       "TAIL_NUMBER"         "ORIGIN_AIRPORT"     
## [10] "DESTINATION_AIRPORT" "SCHEDULED_DEPARTURE" "DEPARTURE_TIME"     
## [13] "DEPARTURE_DELAY"     "TAXI_OUT"            "WHEELS_OFF"         
## [16] "SCHEDULED_TIME"      "ELAPSED_TIME"        "AIR_TIME"           
## [19] "DISTANCE"            "WHEELS_ON"           "TAXI_IN"            
## [22] "SCHEDULED_ARRIVAL"   "ARRIVAL_TIME"        "ARRIVAL_DELAY"      
## [25] "DIVERTED"            "CANCELLED"           "CANCELLATION_REASON"
## [28] "AIR_SYSTEM_DELAY"    "SECURITY_DELAY"      "AIRLINE_DELAY"      
## [31] "LATE_AIRCRAFT_DELAY" "WEATHER_DELAY"       "Status"

Step 3: Summarize the Data

# Total flights by Status
status_summary <- flights[, .N, by = Status]
setnames(status_summary, "N", "Count")
status_summary
# Average departure delay by Airline
airline_delay <- flights[, .(AvgDepartureDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]
airline_delay
# Optional: Flights by Origin Airport
origin_summary <- flights[, .N, by = ORIGIN_AIRPORT]
setnames(origin_summary, "N", "Count")
origin_summary

Step 4a: Flight Counts by Status

library(plotly)

status_plot <- plot_ly(
  data = status_summary,
  x = ~Status,
  y = ~Count,
  type = 'bar',
  marker = list(color = c('green', 'orange', 'red'))
) %>%
  layout(
    title = "Flight Status Counts",
    xaxis = list(title = "Status"),
    yaxis = list(title = "Number of Flights")
  )

status_plot

Step 4b: Average Departure Delay by Airline

delay_plot <- plot_ly(
  data = airline_delay,
  x = ~AIRLINE,
  y = ~AvgDepartureDelay,
  type = 'bar',
  marker = list(color = 'blue')
) %>%
  layout(
    title = "Average Departure Delay by Airline",
    xaxis = list(title = "Airline"),
    yaxis = list(title = "Average Departure Delay (minutes)")
  )

delay_plot

Step 5: Top 10 busiest origin airports

top_airports <- flights[, .N, by = ORIGIN_AIRPORT][order(-N)][1:10] %>%
  setnames("N", "Count")

airport_plot <- plot_ly(
  data = top_airports,
  x = ~ORIGIN_AIRPORT,
  y = ~Count,
  type = 'bar',
  marker = list(color = 'purple')
) %>%
  layout(
    title = "Top 10 Busiest Origin Airports",
    xaxis = list(title = "Airport"),
    yaxis = list(title = "Number of Flights")
)

airport_plot

Step 6a: Average departure delay by airline

airline_delay <- flights[, .(AvgDepartureDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]

delay_plot <- plot_ly(
  data = airline_delay,
  x = ~AIRLINE,
  y = ~AvgDepartureDelay,
  type = 'bar',
  marker = list(color = 'blue')
) %>%
  layout(
    title = "Average Departure Delay by Airline",
    xaxis = list(title = "Airline"),
    yaxis = list(title = "Average Departure Delay (minutes)")
)

delay_plot
# Step 6b: Flight status summary
status_summary <- flights[, .N, by = Status] %>%
  setnames("N", "Count")

status_plot <- plot_ly(
  data = status_summary,
  x = ~Status,
  y = ~Count,
  type = 'bar',
  marker = list(color = c('green', 'orange', 'red'))
) %>%
  layout(
    title = "Flight Status Counts",
    xaxis = list(title = "Status"),
    yaxis = list(title = "Number of Flights")
)

status_plot
# Step 7: Top 10 routes by average departure delay
top_routes <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), 
                      by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)][order(-AvgDelay)][1:10]

route_plot <- plot_ly(
  data = top_routes,
  x = ~paste(ORIGIN_AIRPORT, DESTINATION_AIRPORT, sep = " → "),
  y = ~AvgDelay,
  type = 'bar',
  marker = list(color = 'darkred')
) %>%
  layout(
    title = "Top 10 Routes by Average Departure Delay",
    xaxis = list(title = "Route"),
    yaxis = list(title = "Average Delay (minutes)")
)

route_plot

Step 7: Top 10 Busiest Origin Airports

# Summarize by origin airport
top_airports <- flights[, .N, by = ORIGIN_AIRPORT][order(-N)][1:10] 
setnames(top_airports, "N", "Count")

# Plot top 10 busiest origin airports
airport_plot <- plot_ly(
  data = top_airports,
  x = ~ORIGIN_AIRPORT,
  y = ~Count,
  type = 'bar',
  marker = list(color = 'purple')
) %>%
  layout(
    title = "Top 10 Busiest Origin Airports",
    xaxis = list(title = "Airport"),
    yaxis = list(title = "Number of Flights")
)

airport_plot

Step 8: Top 10 Routes by Average Departure Delay

# Summarize average delay by route
top_routes <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), 
                      by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)][order(-AvgDelay)][1:10]

# Plot top 10 delayed routes
route_plot <- plot_ly(
  data = top_routes,
  x = ~paste(ORIGIN_AIRPORT, DESTINATION_AIRPORT, sep = " → "),
  y = ~AvgDelay,
  type = 'bar',
  marker = list(color = 'darkred')
) %>%
  layout(
    title = "Top 10 Routes by Average Departure Delay",
    xaxis = list(title = "Route"),
    yaxis = list(title = "Average Delay (minutes)")
)

route_plot

##Step 9: Interactive Delay Analysis by Month, Airline, and Day of Week

# Average departure delay by month
monthly_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = MONTH]

monthly_plot <- plot_ly(
  data = monthly_delay,
  x = ~MONTH,
  y = ~AvgDelay,
  type = 'bar',
  marker = list(color = 'steelblue')
) %>%
  layout(
    title = "Average Departure Delay by Month",
    xaxis = list(title = "Month"),
    yaxis = list(title = "Average Delay (minutes)")
)

monthly_plot
# Average departure delay by day of week
day_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = DAY_OF_WEEK]

day_plot <- plot_ly(
  data = day_delay,
  x = ~DAY_OF_WEEK,
  y = ~AvgDelay,
  type = 'bar',
  marker = list(color = 'orange')
) %>%
  layout(
    title = "Average Departure Delay by Day of Week",
    xaxis = list(title = "Day of Week (1=Mon, 7=Sun)"),
    yaxis = list(title = "Average Delay (minutes)")
)

day_plot
# Average departure delay by airline
airline_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]

airline_plot <- plot_ly(
  data = airline_delay,
  x = ~AIRLINE,
  y = ~AvgDelay,
  type = 'bar',
  marker = list(color = 'darkgreen')
) %>%
  layout(
    title = "Average Departure Delay by Airline",
    xaxis = list(title = "Airline"),
    yaxis = list(title = "Average Delay (minutes)")
)

airline_plot