library(readr)
#link to github data file
flights_data_url <- 'https://raw.githubusercontent.com/mehreengillani/DATA606/refs/heads/main/flights_data.csv'
# read
flights <- read_csv(flights_data_url)
## Rows: 144875 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): UNIQUE_CARRIER, UNIQUE_CARRIER_NAME, ORIGIN, ORIGIN_CITY_NAME, ORIG...
## dbl (8): PASSENGERS, FREIGHT, MAIL, DISTANCE, CARRIER_GROUP, MONTH, YEAR, DI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data cleaning and preparation for domestic flights only
flights_clean <- flights %>%
# Convert all column names to lowercase first
rename_with(tolower) %>%
filter(!is.na(passengers), !is.na(distance))
# Create variables using only existing categories in the data
flights_clean <- flights_clean %>%
mutate(
# Use only distance groups that actually exist in the data
distance_cat = case_when(
distance_group == 1 ~ "less than 500 miles",
distance_group == 2 ~ "500-999 miles",
distance_group == 3 ~ "1000-1499 miles",
distance_group == 4 ~ "1500-1999 miles",
distance_group == 5 ~ "2000-2499 miles",
distance_group == 6 ~ "2500-2999 miles",
distance_group == 7 ~ "3000-3499 miles",
distance_group == 8 ~ "3500-3999 miles",
distance_group == 9 ~ "4000-4499 miles",
distance_group == 10 ~ "4500-4999 miles",
distance_group == 11 ~ "5000-5499 miles",
distance_group == 12 ~ "5500-5999 miles",
distance_group == 13 ~ "6000-6499 miles",
distance_group == 15 ~ "7000-7499 miles",
TRUE ~ NA_character_ # Fixed: TRUE in uppercase
),
# Use only carrier groups that actually exist in domestic data
carrier_type = case_when(
carrier_group == 1 ~ "regional carriers",
carrier_group == 2 ~ "national carriers",
carrier_group == 3 ~ "major carriers",
TRUE ~ NA_character_ # Fixed: TRUE in uppercase
),
# Create month names for better visualization
month_name = factor(month,
levels = 1:7,
labels = c("jan", "feb", "mar", "apr", "may", "jun", "jul")),
# Create route identifier
route = paste(origin, dest, sep = " - "),
# Calculate simple load metrics
load_factor = passengers / 180
)
# Remove rows with NA in critical categorical variables
flights_clean <- flights_clean %>%
filter(!is.na(distance_cat), !is.na(carrier_type))
# Factor the categorical variables with only existing levels
existing_distance_cats <- c("less than 500 miles", "500-999 miles", "1000-1499 miles",
"1500-1999 miles", "2000-2499 miles", "2500-2999 miles",
"3000-3499 miles", "3500-3999 miles", "4000-4499 miles",
"4500-4999 miles", "5000-5499 miles", "5500-5999 miles",
"6000-6499 miles", "7000-7499 miles")
existing_carrier_cats <- c("major carriers", "national carriers", "regional carriers")
flights_clean$distance_cat <- factor(flights_clean$distance_cat,
levels = existing_distance_cats)
flights_clean$carrier_type <- factor(flights_clean$carrier_type,
levels = existing_carrier_cats)
# For analysis, create simplified distance categories based on actual data distribution
flights_clean <- flights_clean %>%
mutate(
distance_simple = case_when(
distance_group %in% 1:2 ~ "short (<1000 mi)",
distance_group %in% 3:5 ~ "medium (1000-2499 mi)",
distance_group %in% 6:10 ~ "long (2500-4999 mi)",
distance_group >= 11 ~ "very long (5000+ mi)"
),
distance_simple = factor(distance_simple,
levels = c("short (<1000 mi)", "medium (1000-2499 mi)",
"long (2500-4999 mi)", "very long (5000+ mi)"))
)
kable(head(flights_clean)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "400px")
| passengers | freight | distance | unique_carrier | unique_carrier_name | carrier_group | origin | origin_city_name | origin_state_abr | dest | dest_city_name | dest_state_abr | month | year | distance_group | class | distance_cat | carrier_type | month_name | route | load_factor | distance_simple | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NA | 0 | 24 | YV | Mesa Airlines Inc. | 2 | HOU | Houston, TX | TX | IAH | Houston, TX | TX | 1 | 2025 | 1 | F | less than 500 miles | national carriers | jan | HOU - IAH | 0 | short (<1000 mi) |
| 0 | NA | 0 | 141 | YV | Mesa Airlines Inc. | 2 | DEN | Denver, CO | CO | HDN | Hayden, CO | CO | 1 | 2025 | 1 | F | less than 500 miles | national carriers | jan | DEN - HDN | 0 | short (<1000 mi) |
| 0 | NA | 0 | 152 | YV | Mesa Airlines Inc. | 2 | DEN | Denver, CO | CO | GUC | Gunnison, CO | CO | 1 | 2025 | 1 | F | less than 500 miles | national carriers | jan | DEN - GUC | 0 | short (<1000 mi) |
| 0 | NA | 0 | 156 | YV | Mesa Airlines Inc. | 2 | OKC | Oklahoma City, OK | OK | ICT | Wichita, KS | KS | 1 | 2025 | 1 | F | less than 500 miles | national carriers | jan | OKC - ICT | 0 | short (<1000 mi) |
| 0 | NA | 0 | 167 | YV | Mesa Airlines Inc. | 2 | CRP | Corpus Christi, TX | TX | AUS | Austin, TX | TX | 1 | 2025 | 1 | F | less than 500 miles | national carriers | jan | CRP - AUS | 0 | short (<1000 mi) |
| 0 | 0 | 0 | 0 | 1BQ | Eastern Airlines f/k/a Dynamic Airways, LLC | 1 | GPT | Gulfport/Biloxi, MS | MS | GPT | Gulfport/Biloxi, MS | MS | 2 | 2025 | 1 | L | less than 500 miles | regional carriers | feb | GPT - GPT | 0 | short (<1000 mi) |
1. Seasonal Flight Operations: How do flight
frequencies vary across months and carrier types? Hypothesis: Summer
months (June-July) will have the highest flight frequencies, with major
carriers showing the most significant seasonal increases. * Response:
flight_count (Numerical) * Explanatory: carrier_type (Categorical),
month (Categorical)
2. Carrier Operational Strategies: How do flight
frequency patterns differ between carrier types? Hypothesis: Major
carriers maintain more consistent flight frequencies year-round, while
regional carriers show greater seasonal variation. * Response:
flight_count (Numerical) * Explanatory: carrier_type (Categorical),
month (Categorical)
3. Market Dominance: Which carriers dominate specific months in terms of flight operations? Hypothesis: A few major carriers dominate flight operations across all months, with seasonal carriers emerging in peak months. * Response: flight_count (Numerical), market_share (Numerical) * Explanatory: unique_carrier_name (Categorical), month (Categorical)
What are the cases, and how many are there?
Each case represents a domestic flight segment operated by a U.S. carrier between two airports in a specific time period. There are 144875 observations in the cleaned dataset.
Data is collected by the Bureau of Transportation Statistics (BTS) through mandatory reporting by all U.S. certified air carriers. Airlines submit Form 41 Traffic T-100 data, which includes detailed information about their scheduled domestic operations for 1-7 months in 2025, including passenger counts, freight, mail, and flight distances.
What type of study is this (observational/experiment)?
This is an observational study.
data is downloaded from: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FIL&QO_fu146_anzr=Nv4+Pn44vr45 U.S. Department of Transportation, Bureau of Transportation Statistics, T-100 Domestic Segment Data.
Response
What is the response variable, and what type is it (numerical/categorical)?
The response variable is passengers (number of passengers on the flight segment) and is numerical.
Explanatory
What is the explanatory variable, and what type is it (numerical/categorical)?
The primary explanatory variable is distance (flight distance in miles) and is numerical. Additional exploratory variables include month (categorical) and carrier_type (categorical).
Provide summary statistics relevant to your research question.
# Summary statistics for main variable passengers
summary(flights_clean$passengers)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 7 729 3393 3864 87551
# Summary statistics for distance
summary(flights_clean$distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 295.0 641.0 797.5 1085.0 7360.0
# Correlation between distance and passengers
cor(flights_clean$distance, flights_clean$passengers, use = "complete.obs")
## [1] 0.1147395
# Summary by distance categories
table(flights_clean$distance_cat, useNA = 'ifany')
##
## less than 500 miles 500-999 miles 1000-1499 miles 1500-1999 miles
## 57712 44518 22533 10499
## 2000-2499 miles 2500-2999 miles 3000-3499 miles 3500-3999 miles
## 6366 2355 394 175
## 4000-4499 miles 4500-4999 miles 5000-5499 miles 5500-5999 miles
## 176 117 23 4
## 6000-6499 miles 7000-7499 miles
## 2 1
prop.table(table(flights_clean$distance_cat, useNA = 'ifany')) * 100
##
## less than 500 miles 500-999 miles 1000-1499 miles 1500-1999 miles
## 3.983572e+01 3.072856e+01 1.555341e+01 7.246937e+00
## 2000-2499 miles 2500-2999 miles 3000-3499 miles 3500-3999 miles
## 4.394133e+00 1.625539e+00 2.719586e-01 1.207938e-01
## 4000-4499 miles 4500-4999 miles 5000-5499 miles 5500-5999 miles
## 1.214840e-01 8.075928e-02 1.587575e-02 2.761001e-03
## 6000-6499 miles 7000-7499 miles
## 1.380500e-03 6.902502e-04
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Passenger statistics by distance category
describeBy(flights_clean$passengers, group = flights_clean$distance_cat, mat = TRUE)
## item group1 vars n mean sd median trimmed
## X11 1 less than 500 miles 1 57712 2336.390 5449.044 78 1052.9110
## X12 2 500-999 miles 1 44518 3961.071 7125.351 1444 2288.5592
## X13 3 1000-1499 miles 1 22533 4195.229 7216.493 1490 2523.9133
## X14 4 1500-1999 miles 1 10499 4157.890 7703.065 730 2336.9527
## X15 5 2000-2499 miles 1 6366 4257.254 7627.221 554 2484.2456
## X16 6 2500-2999 miles 1 2355 5459.390 7721.722 4358 3939.2382
## X17 7 3000-3499 miles 1 394 1934.584 4211.608 0 838.8038
## X18 8 3500-3999 miles 1 175 3645.040 4630.472 23 2967.8723
## X19 9 4000-4499 miles 1 176 1850.835 3341.212 0 1099.3662
## X110 10 4500-4999 miles 1 117 3018.453 3174.957 1005 2770.9263
## X111 11 5000-5499 miles 1 23 3983.783 1199.914 4052 4104.8947
## X112 12 5500-5999 miles 1 4 0.000 0.000 0 0.0000
## X113 13 6000-6499 miles 1 2 0.000 0.000 0 0.0000
## X114 14 7000-7499 miles 1 1 0.000 NA 0 0.0000
## mad min max range skew kurtosis se
## X11 115.6428 0 87551 87551 4.9850211 37.291706 22.68231
## X12 2136.4266 0 74489 74489 3.6786401 17.681779 33.77057
## X13 2204.6262 0 74771 74771 3.4266294 15.863313 48.07471
## X14 1082.2980 0 68153 68153 3.4297137 15.125169 75.17780
## X15 821.3604 0 65811 65811 3.2952780 14.173821 95.59452
## X16 6401.8668 0 69764 69764 3.4279848 18.800397 159.11777
## X17 0.0000 0 21515 21515 2.5242622 6.361321 212.17772
## X18 34.0998 0 15999 15999 0.8162341 -0.659084 350.03077
## X19 0.0000 0 15381 15381 1.7738349 2.308881 251.85331
## X110 1490.0130 0 8701 8701 0.3949784 -1.498958 293.52487
## X111 1233.5232 47 5368 5321 -1.3237703 2.451216 250.19942
## X112 0.0000 0 0 0 NaN NaN 0.00000
## X113 0.0000 0 0 0 NaN NaN 0.00000
## X114 0.0000 0 0 0 NA NA NA
# Visualizations
ggplot(flights_clean, aes(x = passengers)) +
geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
labs(title = "Distribution of passengers per Flight Segment",
x = "Number of passengers", y = "Frequency") +
theme_minimal()
ggplot(flights_clean, aes(x = distance)) +
geom_histogram(bins = 30, fill = "lightgreen", alpha = 0.7) +
labs(title = "Distribution of Flight distances",
x = "distance (miles)", y = "Frequency") +
theme_minimal()
# Scatter plot with transparency to handle overlapping points
p_main <- ggplot(flights_clean, aes(x = distance, y = passengers)) +
geom_point(alpha = 0.1, size = 0.5, color = "darkblue") +
geom_smooth(method = "lm", color = "red", se = TRUE) +
labs(title = "Flight distance vs. Number \n of passengers",
subtitle = paste("Correlation:", round(cor(flights_clean$distance, flights_clean$passengers,
use = "complete.obs"), 3)),
x = "distance (miles)",
y = "Number of passengers") +
theme_minimal()
# Hexbin plot for better visualization of dense data
p_hex <- ggplot(flights_clean, aes(x = distance, y = passengers)) +
geom_hex(bins = 50) +
scale_fill_viridis_c(option = "plasma") +
geom_smooth(method = "lm", color = "red", se = FALSE) +
labs(title = "distance vs passengers \n (Hexbin Density)",
x = "distance (miles)",
y = "Number of passengers",
fill = "Count") +
theme_minimal()
# Display both versions
p_main | p_hex
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Calculate total flights by month
monthly_flights <- flights_clean %>%
group_by(month) %>%
summarise(
total_flights = n(),
.groups = 'drop'
) %>%
arrange(desc(total_flights))
# Create month labels
month_labels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul")
# Create proper month order while showing highest to lowest
monthly_flights_ordered <- monthly_flights %>%
mutate(
month_name = factor(month,
levels = 1:7,
labels = month_labels[1:7]),
rank = rank(-total_flights)
) %>%
arrange(desc(total_flights))
# Create the plot
ggplot(monthly_flights_ordered, aes(x = reorder(month_name, -total_flights),
y = total_flights,
fill = total_flights)) +
geom_col(alpha = 0.8) +
geom_text(aes(label = paste0(format(total_flights, big.mark = ","),
"\n(", rank, ")")),
vjust = -0.08, size = 1.9, fontface = "bold") +
scale_fill_gradient(low = "lightblue", high = "darkblue",
name = "Total Flights") +
labs(title = "monthly Flight Volume (January - July)",
subtitle = "Ranked from highest to lowest, with rank numbers in parentheses",
x = "month",
y = "Total Number of Flights") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 10),
plot.subtitle = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 0, hjust = 0.5))
# Correlation between distance and month
cor(monthly_flights$month, monthly_flights$total_flights, use = "complete.obs")
## [1] 0.7411118
There is a strong correlation between month and total flights
# Top 10 carriers by average passengers
top_carriers <- flights_clean %>%
group_by(unique_carrier_name) %>%
summarise(avg_passengers = mean(passengers, na.rm = TRUE),
n_flights = n()) %>%
filter(n_flights > 100) %>% # Only carriers with sufficient data
arrange(desc(avg_passengers)) %>%
head(10)
ggplot(top_carriers, aes(x = reorder(unique_carrier_name, avg_passengers),
y = avg_passengers,
fill = avg_passengers)) +
geom_col() +
coord_flip() +
labs(title = "Top 10 Carriers by Average passengers per Flight",
x = "Carrier",
y = "Average Number of passengers") +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_minimal()