library(data.table) # for fast CSV reading
library(tidyverse) # for dplyr, ggplot2, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly) # for interactive plots
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(flexdashboard) # for dashboard layout
# Load flight data
flights <- fread("activity01_airline/flights.csv")
# Inspect first few rows and column names
head(flights)
colnames(flights)
## [1] "V1" "YEAR" "MONTH"
## [4] "DAY" "DAY_OF_WEEK" "AIRLINE"
## [7] "FLIGHT_NUMBER" "TAIL_NUMBER" "ORIGIN_AIRPORT"
## [10] "DESTINATION_AIRPORT" "SCHEDULED_DEPARTURE" "DEPARTURE_TIME"
## [13] "DEPARTURE_DELAY" "TAXI_OUT" "WHEELS_OFF"
## [16] "SCHEDULED_TIME" "ELAPSED_TIME" "AIR_TIME"
## [19] "DISTANCE" "WHEELS_ON" "TAXI_IN"
## [22] "SCHEDULED_ARRIVAL" "ARRIVAL_TIME" "ARRIVAL_DELAY"
## [25] "DIVERTED" "CANCELLED" "CANCELLATION_REASON"
## [28] "AIR_SYSTEM_DELAY" "SECURITY_DELAY" "AIRLINE_DELAY"
## [31] "LATE_AIRCRAFT_DELAY" "WEATHER_DELAY"
# Create a new Status column based on cancellation and departure delay
flights[, Status := fifelse(CANCELLED == 1, "Canceled",
fifelse(DEPARTURE_DELAY > 0, "Delayed", "On-Time"))]
# Inspect the first few rows
head(flights)
# Check column names
colnames(flights)
## [1] "V1" "YEAR" "MONTH"
## [4] "DAY" "DAY_OF_WEEK" "AIRLINE"
## [7] "FLIGHT_NUMBER" "TAIL_NUMBER" "ORIGIN_AIRPORT"
## [10] "DESTINATION_AIRPORT" "SCHEDULED_DEPARTURE" "DEPARTURE_TIME"
## [13] "DEPARTURE_DELAY" "TAXI_OUT" "WHEELS_OFF"
## [16] "SCHEDULED_TIME" "ELAPSED_TIME" "AIR_TIME"
## [19] "DISTANCE" "WHEELS_ON" "TAXI_IN"
## [22] "SCHEDULED_ARRIVAL" "ARRIVAL_TIME" "ARRIVAL_DELAY"
## [25] "DIVERTED" "CANCELLED" "CANCELLATION_REASON"
## [28] "AIR_SYSTEM_DELAY" "SECURITY_DELAY" "AIRLINE_DELAY"
## [31] "LATE_AIRCRAFT_DELAY" "WEATHER_DELAY" "Status"
# Total flights by Status
status_summary <- flights[, .N, by = Status]
setnames(status_summary, "N", "Count")
status_summary
# Average departure delay by Airline
airline_delay <- flights[, .(AvgDepartureDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]
airline_delay
# Optional: Flights by Origin Airport
origin_summary <- flights[, .N, by = ORIGIN_AIRPORT]
setnames(origin_summary, "N", "Count")
origin_summary
library(plotly)
status_plot <- plot_ly(
data = status_summary,
x = ~Status,
y = ~Count,
type = 'bar',
marker = list(color = c('green', 'orange', 'red'))
) %>%
layout(
title = "Flight Status Counts",
xaxis = list(title = "Status"),
yaxis = list(title = "Number of Flights")
)
status_plot
delay_plot <- plot_ly(
data = airline_delay,
x = ~AIRLINE,
y = ~AvgDepartureDelay,
type = 'bar',
marker = list(color = 'blue')
) %>%
layout(
title = "Average Departure Delay by Airline",
xaxis = list(title = "Airline"),
yaxis = list(title = "Average Departure Delay (minutes)")
)
delay_plot
top_airports <- flights[, .N, by = ORIGIN_AIRPORT][order(-N)][1:10] %>%
setnames("N", "Count")
airport_plot <- plot_ly(
data = top_airports,
x = ~ORIGIN_AIRPORT,
y = ~Count,
type = 'bar',
marker = list(color = 'purple')
) %>%
layout(
title = "Top 10 Busiest Origin Airports",
xaxis = list(title = "Airport"),
yaxis = list(title = "Number of Flights")
)
airport_plot
airline_delay <- flights[, .(AvgDepartureDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]
delay_plot <- plot_ly(
data = airline_delay,
x = ~AIRLINE,
y = ~AvgDepartureDelay,
type = 'bar',
marker = list(color = 'blue')
) %>%
layout(
title = "Average Departure Delay by Airline",
xaxis = list(title = "Airline"),
yaxis = list(title = "Average Departure Delay (minutes)")
)
delay_plot
# Step 6b: Flight status summary
status_summary <- flights[, .N, by = Status] %>%
setnames("N", "Count")
status_plot <- plot_ly(
data = status_summary,
x = ~Status,
y = ~Count,
type = 'bar',
marker = list(color = c('green', 'orange', 'red'))
) %>%
layout(
title = "Flight Status Counts",
xaxis = list(title = "Status"),
yaxis = list(title = "Number of Flights")
)
status_plot
# Step 7: Top 10 routes by average departure delay
top_routes <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)),
by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)][order(-AvgDelay)][1:10]
route_plot <- plot_ly(
data = top_routes,
x = ~paste(ORIGIN_AIRPORT, DESTINATION_AIRPORT, sep = " → "),
y = ~AvgDelay,
type = 'bar',
marker = list(color = 'darkred')
) %>%
layout(
title = "Top 10 Routes by Average Departure Delay",
xaxis = list(title = "Route"),
yaxis = list(title = "Average Delay (minutes)")
)
route_plot
# Summarize by origin airport
top_airports <- flights[, .N, by = ORIGIN_AIRPORT][order(-N)][1:10]
setnames(top_airports, "N", "Count")
# Plot top 10 busiest origin airports
airport_plot <- plot_ly(
data = top_airports,
x = ~ORIGIN_AIRPORT,
y = ~Count,
type = 'bar',
marker = list(color = 'purple')
) %>%
layout(
title = "Top 10 Busiest Origin Airports",
xaxis = list(title = "Airport"),
yaxis = list(title = "Number of Flights")
)
airport_plot
# Summarize average delay by route
top_routes <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)),
by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)][order(-AvgDelay)][1:10]
# Plot top 10 delayed routes
route_plot <- plot_ly(
data = top_routes,
x = ~paste(ORIGIN_AIRPORT, DESTINATION_AIRPORT, sep = " → "),
y = ~AvgDelay,
type = 'bar',
marker = list(color = 'darkred')
) %>%
layout(
title = "Top 10 Routes by Average Departure Delay",
xaxis = list(title = "Route"),
yaxis = list(title = "Average Delay (minutes)")
)
route_plot
##Step 9: Interactive Delay Analysis by Month, Airline, and Day of Week
# Average departure delay by month
monthly_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = MONTH]
monthly_plot <- plot_ly(
data = monthly_delay,
x = ~MONTH,
y = ~AvgDelay,
type = 'bar',
marker = list(color = 'steelblue')
) %>%
layout(
title = "Average Departure Delay by Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Average Delay (minutes)")
)
monthly_plot
# Average departure delay by day of week
day_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = DAY_OF_WEEK]
day_plot <- plot_ly(
data = day_delay,
x = ~DAY_OF_WEEK,
y = ~AvgDelay,
type = 'bar',
marker = list(color = 'orange')
) %>%
layout(
title = "Average Departure Delay by Day of Week",
xaxis = list(title = "Day of Week (1=Mon, 7=Sun)"),
yaxis = list(title = "Average Delay (minutes)")
)
day_plot
# Average departure delay by airline
airline_delay <- flights[, .(AvgDelay = mean(DEPARTURE_DELAY, na.rm = TRUE)), by = AIRLINE]
airline_plot <- plot_ly(
data = airline_delay,
x = ~AIRLINE,
y = ~AvgDelay,
type = 'bar',
marker = list(color = 'darkgreen')
) %>%
layout(
title = "Average Departure Delay by Airline",
xaxis = list(title = "Airline"),
yaxis = list(title = "Average Delay (minutes)")
)
airline_plot