Data Preparation

library(readr)
#link to github data file 
flights_data_url <- 'https://raw.githubusercontent.com/mehreengillani/DATA606/refs/heads/main/flights_data.csv'


# read
flights <- read_csv(flights_data_url)
## Rows: 144875 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): UNIQUE_CARRIER, UNIQUE_CARRIER_NAME, ORIGIN, ORIGIN_CITY_NAME, ORIG...
## dbl (8): PASSENGERS, FREIGHT, MAIL, DISTANCE, CARRIER_GROUP, MONTH, YEAR, DI...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data cleaning and preparation for domestic flights only
flights_clean <- flights %>%
  # Convert all column names to lowercase first
  rename_with(tolower) %>%
  filter(!is.na(passengers), !is.na(distance))

# Create variables using only existing categories in the data
flights_clean <- flights_clean %>%
  mutate(
    # Use only distance groups that actually exist in the data
    distance_cat = case_when(
      distance_group == 1 ~ "less than 500 miles",
      distance_group == 2 ~ "500-999 miles", 
      distance_group == 3 ~ "1000-1499 miles",
      distance_group == 4 ~ "1500-1999 miles",
      distance_group == 5 ~ "2000-2499 miles",
      distance_group == 6 ~ "2500-2999 miles",
      distance_group == 7 ~ "3000-3499 miles",
      distance_group == 8 ~ "3500-3999 miles",
      distance_group == 9 ~ "4000-4499 miles",
      distance_group == 10 ~ "4500-4999 miles",
      distance_group == 11 ~ "5000-5499 miles",
      distance_group == 12 ~ "5500-5999 miles",
      distance_group == 13 ~ "6000-6499 miles", 
      distance_group == 15 ~ "7000-7499 miles",
      TRUE ~ NA_character_  # Fixed: TRUE in uppercase
    ),
    
    # Use only carrier groups that actually exist in domestic data
    carrier_type = case_when(
      carrier_group == 1 ~ "regional carriers",
      carrier_group == 2 ~ "national carriers", 
      carrier_group == 3 ~ "major carriers",
      TRUE ~ NA_character_  # Fixed: TRUE in uppercase
    ),
    
    # Create month names for better visualization
    month_name = factor(month, 
                       levels = 1:7,
                       labels = c("jan", "feb", "mar", "apr", "may", "jun", "jul")),
    
    # Create route identifier
    route = paste(origin, dest, sep = " - "),
    
    # Calculate simple load metrics
    load_factor = passengers / 180
  )

# Remove rows with NA in critical categorical variables
flights_clean <- flights_clean %>%
  filter(!is.na(distance_cat), !is.na(carrier_type))

# Factor the categorical variables with only existing levels
existing_distance_cats <- c("less than 500 miles", "500-999 miles", "1000-1499 miles",
                           "1500-1999 miles", "2000-2499 miles", "2500-2999 miles",
                           "3000-3499 miles", "3500-3999 miles", "4000-4499 miles",
                           "4500-4999 miles", "5000-5499 miles", "5500-5999 miles",
                           "6000-6499 miles", "7000-7499 miles")

existing_carrier_cats <- c("major carriers", "national carriers", "regional carriers")

flights_clean$distance_cat <- factor(flights_clean$distance_cat,
                                    levels = existing_distance_cats)

flights_clean$carrier_type <- factor(flights_clean$carrier_type,
                                    levels = existing_carrier_cats)

# For analysis, create simplified distance categories based on actual data distribution
flights_clean <- flights_clean %>%
  mutate(
    distance_simple = case_when(
      distance_group %in% 1:2 ~ "short (<1000 mi)",
      distance_group %in% 3:5 ~ "medium (1000-2499 mi)", 
      distance_group %in% 6:10 ~ "long (2500-4999 mi)",
      distance_group >= 11 ~ "very long (5000+ mi)"
    ),
    distance_simple = factor(distance_simple,
                            levels = c("short (<1000 mi)", "medium (1000-2499 mi)",
                                      "long (2500-4999 mi)", "very long (5000+ mi)"))
  )




kable(head(flights_clean)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "400px")
passengers freight mail distance unique_carrier unique_carrier_name carrier_group origin origin_city_name origin_state_abr dest dest_city_name dest_state_abr month year distance_group class distance_cat carrier_type month_name route load_factor distance_simple
0 NA 0 24 YV Mesa Airlines Inc.  2 HOU Houston, TX TX IAH Houston, TX TX 1 2025 1 F less than 500 miles national carriers jan HOU - IAH 0 short (<1000 mi)
0 NA 0 141 YV Mesa Airlines Inc.  2 DEN Denver, CO CO HDN Hayden, CO CO 1 2025 1 F less than 500 miles national carriers jan DEN - HDN 0 short (<1000 mi)
0 NA 0 152 YV Mesa Airlines Inc.  2 DEN Denver, CO CO GUC Gunnison, CO CO 1 2025 1 F less than 500 miles national carriers jan DEN - GUC 0 short (<1000 mi)
0 NA 0 156 YV Mesa Airlines Inc.  2 OKC Oklahoma City, OK OK ICT Wichita, KS KS 1 2025 1 F less than 500 miles national carriers jan OKC - ICT 0 short (<1000 mi)
0 NA 0 167 YV Mesa Airlines Inc.  2 CRP Corpus Christi, TX TX AUS Austin, TX TX 1 2025 1 F less than 500 miles national carriers jan CRP - AUS 0 short (<1000 mi)
0 0 0 0 1BQ Eastern Airlines f/k/a Dynamic Airways, LLC 1 GPT Gulfport/Biloxi, MS MS GPT Gulfport/Biloxi, MS MS 2 2025 1 L less than 500 miles regional carriers feb GPT - GPT 0 short (<1000 mi)

Research Questions

1. Seasonal Flight Operations: How do flight frequencies vary across months and carrier types? Hypothesis: Summer months (June-July) will have the highest flight frequencies, with major carriers showing the most significant seasonal increases. * Response: flight_count (Numerical) * Explanatory: carrier_type (Categorical), month (Categorical)

2. Carrier Operational Strategies: How do flight frequency patterns differ between carrier types? Hypothesis: Major carriers maintain more consistent flight frequencies year-round, while regional carriers show greater seasonal variation. * Response: flight_count (Numerical) * Explanatory: carrier_type (Categorical), month (Categorical)

3. Market Dominance: Which carriers dominate specific months in terms of flight operations? Hypothesis: A few major carriers dominate flight operations across all months, with seasonal carriers emerging in peak months. * Response: flight_count (Numerical), market_share (Numerical) * Explanatory: unique_carrier_name (Categorical), month (Categorical)

Cases

What are the cases, and how many are there?

Each case represents a domestic flight segment operated by a U.S. carrier between two airports in a specific time period. There are 144875 observations in the cleaned dataset.

Data Collection

Data is collected by the Bureau of Transportation Statistics (BTS) through mandatory reporting by all U.S. certified air carriers. Airlines submit Form 41 Traffic T-100 data, which includes detailed information about their scheduled domestic operations for 1-7 months in 2025, including passenger counts, freight, mail, and flight distances.

Type of Study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

data is downloaded from: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FIL&QO_fu146_anzr=Nv4+Pn44vr45 U.S. Department of Transportation, Bureau of Transportation Statistics, T-100 Domestic Segment Data.

Response

What is the response variable, and what type is it (numerical/categorical)?

The response variable is passengers (number of passengers on the flight segment) and is numerical.

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)?

The primary explanatory variable is distance (flight distance in miles) and is numerical. Additional exploratory variables include month (categorical) and carrier_type (categorical).

Relevant Summary Statistics

Provide summary statistics relevant to your research question.

# Summary statistics for main variable passengers
summary(flights_clean$passengers)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       7     729    3393    3864   87551
# Summary statistics for distance
summary(flights_clean$distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   295.0   641.0   797.5  1085.0  7360.0
# Correlation between distance and passengers
cor(flights_clean$distance, flights_clean$passengers, use = "complete.obs")
## [1] 0.1147395
# Summary by distance categories
table(flights_clean$distance_cat, useNA = 'ifany')
## 
## less than 500 miles       500-999 miles     1000-1499 miles     1500-1999 miles 
##               57712               44518               22533               10499 
##     2000-2499 miles     2500-2999 miles     3000-3499 miles     3500-3999 miles 
##                6366                2355                 394                 175 
##     4000-4499 miles     4500-4999 miles     5000-5499 miles     5500-5999 miles 
##                 176                 117                  23                   4 
##     6000-6499 miles     7000-7499 miles 
##                   2                   1
prop.table(table(flights_clean$distance_cat, useNA = 'ifany')) * 100
## 
## less than 500 miles       500-999 miles     1000-1499 miles     1500-1999 miles 
##        3.983572e+01        3.072856e+01        1.555341e+01        7.246937e+00 
##     2000-2499 miles     2500-2999 miles     3000-3499 miles     3500-3999 miles 
##        4.394133e+00        1.625539e+00        2.719586e-01        1.207938e-01 
##     4000-4499 miles     4500-4999 miles     5000-5499 miles     5500-5999 miles 
##        1.214840e-01        8.075928e-02        1.587575e-02        2.761001e-03 
##     6000-6499 miles     7000-7499 miles 
##        1.380500e-03        6.902502e-04
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Passenger statistics by distance category
describeBy(flights_clean$passengers, group = flights_clean$distance_cat, mat = TRUE)
##      item              group1 vars     n     mean       sd median   trimmed
## X11     1 less than 500 miles    1 57712 2336.390 5449.044     78 1052.9110
## X12     2       500-999 miles    1 44518 3961.071 7125.351   1444 2288.5592
## X13     3     1000-1499 miles    1 22533 4195.229 7216.493   1490 2523.9133
## X14     4     1500-1999 miles    1 10499 4157.890 7703.065    730 2336.9527
## X15     5     2000-2499 miles    1  6366 4257.254 7627.221    554 2484.2456
## X16     6     2500-2999 miles    1  2355 5459.390 7721.722   4358 3939.2382
## X17     7     3000-3499 miles    1   394 1934.584 4211.608      0  838.8038
## X18     8     3500-3999 miles    1   175 3645.040 4630.472     23 2967.8723
## X19     9     4000-4499 miles    1   176 1850.835 3341.212      0 1099.3662
## X110   10     4500-4999 miles    1   117 3018.453 3174.957   1005 2770.9263
## X111   11     5000-5499 miles    1    23 3983.783 1199.914   4052 4104.8947
## X112   12     5500-5999 miles    1     4    0.000    0.000      0    0.0000
## X113   13     6000-6499 miles    1     2    0.000    0.000      0    0.0000
## X114   14     7000-7499 miles    1     1    0.000       NA      0    0.0000
##            mad min   max range       skew  kurtosis        se
## X11   115.6428   0 87551 87551  4.9850211 37.291706  22.68231
## X12  2136.4266   0 74489 74489  3.6786401 17.681779  33.77057
## X13  2204.6262   0 74771 74771  3.4266294 15.863313  48.07471
## X14  1082.2980   0 68153 68153  3.4297137 15.125169  75.17780
## X15   821.3604   0 65811 65811  3.2952780 14.173821  95.59452
## X16  6401.8668   0 69764 69764  3.4279848 18.800397 159.11777
## X17     0.0000   0 21515 21515  2.5242622  6.361321 212.17772
## X18    34.0998   0 15999 15999  0.8162341 -0.659084 350.03077
## X19     0.0000   0 15381 15381  1.7738349  2.308881 251.85331
## X110 1490.0130   0  8701  8701  0.3949784 -1.498958 293.52487
## X111 1233.5232  47  5368  5321 -1.3237703  2.451216 250.19942
## X112    0.0000   0     0     0        NaN       NaN   0.00000
## X113    0.0000   0     0     0        NaN       NaN   0.00000
## X114    0.0000   0     0     0         NA        NA        NA

Visualizations

# Visualizations
ggplot(flights_clean, aes(x = passengers)) + 
  geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
  labs(title = "Distribution of passengers per Flight Segment",
       x = "Number of passengers", y = "Frequency") +
  theme_minimal()

ggplot(flights_clean, aes(x = distance)) + 
  geom_histogram(bins = 30, fill = "lightgreen", alpha = 0.7) +
  labs(title = "Distribution of Flight distances",
       x = "distance (miles)", y = "Frequency") +
  theme_minimal()

Flight distance vs No. of passengers

# Scatter plot with transparency to handle overlapping points
p_main <- ggplot(flights_clean, aes(x = distance, y = passengers)) +
  geom_point(alpha = 0.1, size = 0.5, color = "darkblue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Flight distance vs. Number \n of passengers",
       subtitle = paste("Correlation:", round(cor(flights_clean$distance, flights_clean$passengers, 
                                                 use = "complete.obs"), 3)),
       x = "distance (miles)",
       y = "Number of passengers") +
  theme_minimal()

# Hexbin plot for better visualization of dense data
p_hex <- ggplot(flights_clean, aes(x = distance, y = passengers)) +
  geom_hex(bins = 50) +
  scale_fill_viridis_c(option = "plasma") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(title = "distance vs passengers \n (Hexbin Density)",
       x = "distance (miles)",
       y = "Number of passengers",
       fill = "Count") +
  theme_minimal()

# Display both versions
p_main | p_hex
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Total Flights by month

# Calculate total flights by month
monthly_flights <- flights_clean %>%
  group_by(month) %>%
  summarise(
    total_flights = n(),
    .groups = 'drop'
  ) %>%
  arrange(desc(total_flights))

# Create month labels
month_labels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul")

# Create proper month order while showing highest to lowest
monthly_flights_ordered <- monthly_flights %>%
  mutate(
    month_name = factor(month, 
                       levels = 1:7,
                       labels = month_labels[1:7]),
    rank = rank(-total_flights)
  ) %>%
  arrange(desc(total_flights))

# Create the plot
ggplot(monthly_flights_ordered, aes(x = reorder(month_name, -total_flights), 
                                   y = total_flights,
                                   fill = total_flights)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = paste0(format(total_flights, big.mark = ","), 
                              "\n(", rank, ")")), 
            vjust = -0.08, size = 1.9, fontface = "bold") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", 
                     name = "Total Flights") +
  labs(title = "monthly Flight Volume (January - July)",
       subtitle = "Ranked from highest to lowest, with rank numbers in parentheses",
       x = "month",
       y = "Total Number of Flights") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        plot.subtitle = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 0, hjust = 0.5))

# Correlation between distance and month
cor(monthly_flights$month, monthly_flights$total_flights, use = "complete.obs")
## [1] 0.7411118

There is a strong correlation between month and total flights

Passenger by carrier

# Top 10 carriers by average passengers
top_carriers <- flights_clean %>%
  group_by(unique_carrier_name) %>%
  summarise(avg_passengers = mean(passengers, na.rm = TRUE),
            n_flights = n()) %>%
  filter(n_flights > 100) %>%  # Only carriers with sufficient data
  arrange(desc(avg_passengers)) %>%
  head(10)

ggplot(top_carriers, aes(x = reorder(unique_carrier_name, avg_passengers), 
                        y = avg_passengers, 
                        fill = avg_passengers)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 10 Carriers by Average passengers per Flight",
       x = "Carrier",
       y = "Average Number of passengers") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal()