setwd("/Users/seanhiggins/R studio directory")
db1b_sample1m <- read.csv("db1b_sample1m.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
This chunk loads the dataset into R for analysis.
summary(db1b_sample1m$ItinFare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 249.0 389.0 455.7 567.0 19080.0
summary(db1b_sample1m$Distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 62 1256 1970 2296 2918 17263
summary(db1b_sample1m$FarePerMile)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0006 0.1142 0.1902 0.2539 0.3214 9.3767
These summaries give an overview of fare, distance, and fare-per-mile distributions.
ggplot(db1b_sample1m, aes(x = ItinFare)) +
geom_histogram(binwidth = 50, fill = "steelblue", color = "black") +
labs(title = "Histogram of ItinFare",
x = "Itinerary Fare",
y = "Count") +
theme_minimal()
This histogram shows how itinerary fares are distributed across all
routes.
ggplot(db1b_sample1m, aes(x = Distance)) +
geom_histogram(binwidth = 300, fill = "orange", color = "black") +
labs(title = "Histogram of Distance",
x = "Distance (miles)",
y = "Count") +
theme_minimal()
This plot displays the distribution of route distances in the dataset.
summary(db1b_sample1m$FarePerMile)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0006 0.1142 0.1902 0.2539 0.3214 9.3767
This summary shows central tendencies and spread for fare-per-mile values.
Q1 <- quantile(db1b_sample1m$FarePerMile, 0.25, na.rm = TRUE)
Q3 <- quantile(db1b_sample1m$FarePerMile, 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- db1b_sample1m %>%
filter(FarePerMile < lower_bound | FarePerMile > upper_bound)
nrow(outliers)
## [1] 60799
head(outliers)
## ItinID Coupons Year Quarter Origin OriginAirportID OriginAirportSeqID
## 1 202523326483 2 2025 2 HNL 12173 1217305
## 2 202523326484 2 2025 2 HNL 12173 1217305
## 3 202523325268 2 2025 2 HNL 12173 1217305
## 4 202523353945 2 2025 2 OGG 13830 1383002
## 5 202523347412 2 2025 2 LIH 12982 1298202
## 6 202524180976 2 2025 2 PDX 14057 1405702
## OriginCityMarketID OriginCountry OriginStateFips OriginState OriginStateName
## 1 32134 US 15 HI Hawaii
## 2 32134 US 15 HI Hawaii
## 3 32134 US 15 HI Hawaii
## 4 33830 US 15 HI Hawaii
## 5 32982 US 15 HI Hawaii
## 6 34057 US 41 OR Oregon
## OriginWac RoundTrip OnLine DollarCred FarePerMile RPCarrier Passengers
## 1 2 1 1 1 0.8800 HA 1
## 2 2 1 1 1 0.8800 HA 1
## 3 2 1 1 1 0.8627 HA 2
## 4 2 1 1 1 0.8800 HA 5
## 5 2 1 1 1 0.8627 HA 2
## 6 92 1 1 1 0.6822 OO 1
## ItinFare BulkFare Distance DistanceGroup MilesFlown ItinGeoType X
## 1 176 0 200 1 200 1 NA
## 2 176 0 200 1 200 1 NA
## 3 176 0 204 1 204 1 NA
## 4 176 0 200 1 200 1 NA
## 5 176 0 204 1 204 1 NA
## 6 176 0 258 1 258 2 NA
This code identifies observations with unusually high or low fare-per-mile values.
ggplot(db1b_sample1m, aes(y = FarePerMile)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Boxplot of FarePerMile",
y = "Fare per Mile") +
theme_minimal()
This boxplot visualizes overall fare-per-mile and highlights
outliers.
correlation <- cor(db1b_sample1m$ItinFare, db1b_sample1m$Distance, use = "complete.obs")
correlation
## [1] 0.3764296
This correlation measures how fare and distance move together.
ggplot(db1b_sample1m, aes(x = Distance, y = ItinFare)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Relationship Between ItinFare and Distance",
x = "Distance (miles)",
y = "Itinerary Fare ($)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
This scatterplot shows the relationship between route distance and
fare.
market_passengers <- db1b_sample1m %>%
group_by(OriginCityMarketID) %>%
summarise(total_passengers = sum(Passengers, na.rm = TRUE)) %>%
arrange(desc(total_passengers))
head(market_passengers)
## # A tibble: 6 × 2
## OriginCityMarketID total_passengers
## <int> <int>
## 1 31703 92544
## 2 32575 81720
## 3 30852 59832
## 4 30977 58063
## 5 32457 57847
## 6 30194 52313
This code calculates total passengers originating from each market.
top10_markets <- market_passengers %>%
slice_head(n = 10)
top10_markets
## # A tibble: 10 × 2
## OriginCityMarketID total_passengers
## <int> <int>
## 1 31703 92544
## 2 32575 81720
## 3 30852 59832
## 4 30977 58063
## 5 32457 57847
## 6 30194 52313
## 7 30721 46160
## 8 30397 43939
## 9 30325 40903
## 10 31453 40184
This extracts the top 10 origin markets by total passengers.
ggplot(top10_markets, aes(x = reorder(OriginCityMarketID, total_passengers),
y = total_passengers)) +
geom_col() +
coord_flip() +
labs(title = "Top 10 Origin Markets by Passenger Volume",
x = "OriginCityMarketID",
y = "Total Passengers") +
theme_minimal()
This bar chart visualizes the highest-volume markets.
marketA <- 35096
marketB <- 32211
market_compare <- db1b_sample1m %>%
filter(OriginCityMarketID %in% c(marketA, marketB)) %>%
mutate(FarePerMile = ItinFare / Distance) %>%
group_by(OriginCityMarketID) %>%
summarise(
avg_fare = mean(ItinFare, na.rm = TRUE),
avg_distance = mean(Distance, na.rm = TRUE),
avg_fare_per_mile = mean(FarePerMile, na.rm = TRUE),
roundtrip_share = mean(RoundTrip == 1, na.rm = TRUE)
)
market_compare
## # A tibble: 2 × 5
## OriginCityMarketID avg_fare avg_distance avg_fare_per_mile roundtrip_share
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 32211 357. 2451. 0.185 0.549
## 2 35096 434. 2154. 0.263 0.642
This chunk computes key metrics for each of the two selected markets, 35096 (NY) and 32211 (NV)
db1b_sample1m %>%
filter(OriginCityMarketID %in% c(35096, 32211)) %>%
mutate(FarePerMile = ItinFare / Distance) %>%
ggplot(aes(x = factor(OriginCityMarketID), y = FarePerMile)) +
geom_boxplot() +
labs(title = "Fare Per Mile Comparison: 35096 vs 32211",
x = "OriginCityMarketID",
y = "Fare Per Mile") +
theme_minimal()
This boxplot compares fare-per-mile distributions between the two
markets.
The comparison between OriginCityMarketID 32211 and 35096 shows noticeable differences in both pricing and travel patterns. Market 32211 has a lower average fare ($357.47) and a much lower fare per mile (0.185), which aligns with its longer average distance of about 2,451 miles. In contrast, market 35096 has a higher average fare ($434.00) and a significantly higher fare per mile (0.263), even though its routes are shorter on average (about 2,154 miles). This suggests that flights out of 35096 are priced more aggressively on a per-mile basis, possibly due to lower competition or higher demand. Additionally, 35096 has a higher round-trip share (0.642 vs. 0.549), indicating more round-trip leisure or planned travel compared to 32211. Overall, the two markets show clear differences in distance, fare structure, and traveler behavior.
df_two_markets <- db1b_sample1m %>%
filter(OriginCityMarketID %in% c(35096, 32211)) %>%
mutate(FarePerMile = ItinFare / Distance)
t_test_result <- t.test(FarePerMile ~ OriginCityMarketID, data = df_two_markets)
t_test_result
##
## Welch Two Sample t-test
##
## data: FarePerMile by OriginCityMarketID
## t = -17.749, df = 4939, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 32211 and group 35096 is not equal to 0
## 95 percent confidence interval:
## -0.08691126 -0.06962160
## sample estimates:
## mean in group 32211 mean in group 35096
## 0.1848608 0.2631272
This t-test evaluates whether mean fare-per-mile differs significantly between markets.
t_test_result$p.value
## [1] 2.242424e-68
This line extracts only the p-value for reporting.
Markets 35096 and 32211 likely differ due to a combination of competition, route distance, and passenger demand. Market 35096 shows a higher fare per mile even though its routes are shorter, which often signals limited airline competition or stronger demand from travelers who are less price-sensitive, such as business travelers. Its higher round-trip share also suggests more planned, recurring travel, which can allow airlines to charge higher fares. In contrast, market 32211 has longer average distances and a much lower fare per mile. This is typical on long-haul routes because fares do not rise proportionally with distance—airlines benefit from economies of scale, fuller planes, and more pricing pressure from competing carriers. Additionally, the slightly lower round-trip share could indicate a different mix of travelers, such as more one-way or flexible-schedule passengers. Overall, the differences between these two markets reflect underlying variations in route length, competitive environment, and traveler demand—factors that naturally lead to different fare structures and pricing behaviors.
Given the two markets I’ve analyzed, I’d probably start with deploying at least one of the Boeing 737s to 35096 (NY), and use 32211 (NV) more selectively or make it more of a secondary focus. From a data perspective, I’d say the NY market has a higher average fare (~$434) and a higher fare per mile (~0.26) than 32211 (about $357 and 0.18 per mile). It also has a higher round-trip share (~0.64 vs ~0.55), which suggests more stable, planned, possibly repeat travel—great for building reliable revenue and schedule consistency. Even though 35096’s routes are slightly shorter on average, that’s actually an operational advantage: shorter stage lengths let you schedule more daily rotations per aircraft, using each 737 more intensively while earning higher revenue per mile. Market 32211, with longer distances and lower fare per mile, looks more like a volume/long-haul market where margins might be thinner on a per-mile basis. To make a truly confident fleet deployment decision, I’d want more data on both demand and costs. On the demand side, I’d want: historical load factors by route and season, time-of-day demand patterns, forecast growth, and how sensitive each market is to fare changes. On the cost/operations side, I’d need: airport fees, turnaround times, fuel burn by stage length, slot or congestion constraints, and reliability/on-time performance at each airport. I’d also want competitive data (number of carriers, existing frequencies, low-cost carrier presence) and network value (how well each market feeds our other routes via connections). Combining that with our initial fare-per-mile and round-trip analysis would let us deploy the two 737s where they generate the best mix of high yield, strong demand, and operational efficiency.