flights_data “C:/Users/barry/OneDrive/Project2 Flight Data.xlsx”
These data were pulled on December 1st, 2024. The websites used were: delta.com, southwest.com, united.com, and jetblue.com. I looked up the price of a one-way flight from Detroit, MI to Boston, MA for the dates of Jan 1, 2025 through May 31, 2025. All data were pulled from the websites and entered into an excel file which was read into RStudio. The goal of the following analysis was to help consumers and companies understand the cost in dollars as well as points/miles (i.e. frequent flyer program ecosystem). There were two airlines that had nonstop flights and two airlines that had one layover. Categories like price, points/miles, nonstop, & day were used to help build predictive models to estimate potential cost for each carrier.
library(openxlsx)
sheet1 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 1)
sheet1$Carrier <- "Delta"
sheet2 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 2)
sheet2$Carrier <- "Southwest"
sheet3 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 3)
sheet3$Carrier <- "United"
sheet4 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 4)
sheet4$Carrier <- "JetBlue"
flights_data <- rbind(sheet1, sheet2, sheet3, sheet4)
str(flights_data)
## 'data.frame': 599 obs. of 7 variables:
## $ Date : num 45658 45659 45660 45661 45662 ...
## $ Price : num 169 244 314 139 139 244 244 244 244 244 ...
## $ Points/Miles: num 14500 21500 21500 11500 14500 21500 21500 21500 21500 21500 ...
## $ Cents/Point : num 1.17 1.13 1.46 1.21 0.91 1.13 1.13 1.13 1.13 1.13 ...
## $ Non-Stop : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : chr "Wednesday" "Thursday" "Friday" "Saturday" ...
## $ Carrier : chr "Delta" "Delta" "Delta" "Delta" ...
summary(flights_data)
## Date Price Points/Miles Cents/Point
## Min. :45658 Min. : 69.0 Min. : 5900 Min. :0.850
## 1st Qu.:45696 1st Qu.:139.0 1st Qu.:10500 1st Qu.:1.230
## Median :45734 Median :163.0 Median :11500 Median :1.300
## Mean :45733 Mean :189.5 Mean :14527 Mean :1.323
## 3rd Qu.:45771 3rd Qu.:244.0 3rd Qu.:19101 3rd Qu.:1.465
## Max. :45808 Max. :626.0 Max. :45969 Max. :1.650
## Non-Stop Day Carrier
## Min. :0.0000 Length:599 Length:599
## 1st Qu.:0.0000 Class :character Class :character
## Median :0.0000 Mode :character Mode :character
## Mean :0.4958
## 3rd Qu.:1.0000
## Max. :1.0000
head(flights_data)
## Date Price Points/Miles Cents/Point Non-Stop Day Carrier
## 1 45658 169 14500 1.17 1 Wednesday Delta
## 2 45659 244 21500 1.13 1 Thursday Delta
## 3 45660 314 21500 1.46 1 Friday Delta
## 4 45661 139 11500 1.21 1 Saturday Delta
## 5 45662 139 14500 0.91 1 Sunday Delta
## 6 45663 244 21500 1.13 1 Monday Delta
The following graph displays the average price, in dollars, for the referenced one-way flight for each airline.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
avg_cost <- flights_data %>%
group_by(Carrier) %>%
summarize(Average_Cost = mean(`Price`))
print(avg_cost)
## # A tibble: 4 × 2
## Carrier Average_Cost
## <chr> <dbl>
## 1 Delta 227.
## 2 JetBlue 136.
## 3 Southwest 241.
## 4 United 152.
# Bar graph for average cost per carrier
ggplot(avg_cost, aes(x = Carrier, y = Average_Cost, fill = Carrier)) +
geom_bar(stat = "identity", color = "black") + # `stat = "identity"` uses the values directly
theme_minimal() +
labs(
title = "Average Cost per Carrier",
x = "Carrier",
y = "Average Cost ($)"
) +
theme(legend.position = "none")
This next graph shows the average cost of flights in airline-specific points/miles.
avg_points_miles <- flights_data %>%
group_by(Carrier) %>%
summarize(Average_points_miles = mean(`Points/Miles`))
print(avg_points_miles)
## # A tibble: 4 × 2
## Carrier Average_points_miles
## <chr> <dbl>
## 1 Delta 19073.
## 2 JetBlue 10012.
## 3 Southwest 16681.
## 4 United 12195.
# Bar graph for average cost per carrier
ggplot(avg_points_miles, aes(x = Carrier, y = Average_points_miles, fill = Carrier)) +
geom_bar(stat = "identity", color = "black") +
theme_minimal() +
labs(
title = "Average Points/Miles per Carrier",
x = "Carrier",
y = "Average Points/Miles"
) +
theme(legend.position = "none")
This line graph displays which airline has the most valuable airline currency in terms of cents-per-point.
avg_cents_point <- flights_data %>%
group_by(Carrier) %>%
summarize(Average_Cents_Per_Point = mean(`Cents/Point`))
print(avg_cents_point)
## # A tibble: 4 × 2
## Carrier Average_Cents_Per_Point
## <chr> <dbl>
## 1 Delta 1.19
## 2 JetBlue 1.36
## 3 Southwest 1.47
## 4 United 1.26
ggplot(avg_cents_point, aes(x = Carrier, y = Average_Cents_Per_Point, group = 1)) +
geom_line(color = "blue", linewidth = 1.2) +
geom_point(size = 3, color = "red") +
theme_minimal() +
labs(
title = "Average Cents/Point per Carrier",
x = "Carrier",
y = "Average Cents/Point"
)
This is probably my favorite graph from the statistical analysis portion of this project. Here we can see how day of the week impacts average flight cost for each carrier.
flights_data$Day <- factor(
flights_data$Day,
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)
avg_cost_by_day <- flights_data %>%
group_by(Carrier, Day) %>%
summarize(Average_Cost = mean(`Price`), .groups = "drop")
print(avg_cost_by_day)
## # A tibble: 28 × 3
## Carrier Day Average_Cost
## <chr> <fct> <dbl>
## 1 Delta Monday 258.
## 2 Delta Tuesday 244
## 3 Delta Wednesday 242.
## 4 Delta Thursday 259.
## 5 Delta Friday 263.
## 6 Delta Saturday 160.
## 7 Delta Sunday 162.
## 8 JetBlue Monday 141.
## 9 JetBlue Tuesday 93.5
## 10 JetBlue Wednesday 112.
## # ℹ 18 more rows
ggplot(avg_cost_by_day, aes(x = Day, y = Average_Cost, color = Carrier, group = Carrier)) +
geom_line(size = 1.2) +
geom_point(size = 3) +
theme_minimal() +
labs(
title = "Average Cost for Each Day of the Week by Carrier",
x = "Day of the Week",
y = "Average Cost ($)"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The following graph is a cluster analysis where I attempt to look for a pattern in the relationship between cost in dollars and cents/point. Carriers were converted to numeric data points, with Delta = 1, Southwest = 2, United = 3, & JetBlue = 4. After transformation, the data were clustered together using the “cbind” function, and then standardized to ensure a mean of zero and standard deviation of one was used for consistency for all airlines. “kmeans” was used to ensure there were four separate clusters generated. In summary, this was used to see if cheaper or more expensive flights were better for consumers or companies redeeming points to pay for flights.
library(caret)
## Loading required package: lattice
set.seed(123)
flights_data <- data.frame(
Carrier = rep(c("Delta", "Southwest", "United", "JetBlue"), each = 14),
Day = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), times = 8),
Price = sample(100:500, 56, replace = TRUE),
Cents_Per_Point = runif(56, 1, 1.6),
Non_Stop = sample(c(0, 1), 56, replace = TRUE)
)
dummy_vars <- dummyVars("~ Carrier + Day", data = flights_data)
dummy_data <- predict(dummy_vars, newdata = flights_data)
clustering_data <- cbind(flights_data %>% select(Price, Cents_Per_Point, Non_Stop), dummy_data)
head(clustering_data)
## Price Cents_Per_Point Non_Stop CarrierDelta CarrierJetBlue CarrierSouthwest
## 1 278 1.426109 0 1 0 0
## 2 113 1.000375 0 1 0 0
## 3 294 1.285190 1 1 0 0
## 4 405 1.132071 0 1 0 0
## 5 217 1.227890 1 1 0 0
## 6 398 1.367663 1 1 0 0
## CarrierUnited DayFriday DayMonday DaySaturday DaySunday DayThursday
## 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 1
## 5 0 1 0 0 0 0
## 6 0 0 0 1 0 0
## DayTuesday DayWednesday
## 1 0 0
## 2 1 0
## 3 0 1
## 4 0 0
## 5 0 0
## 6 0 0
clustering_data_scaled <- scale(clustering_data)
head(clustering_data_scaled)
## Price Cents_Per_Point Non_Stop CarrierDelta CarrierJetBlue
## 1 -0.1376655189 0.78228837 -1.1035656 1.716516 -0.5721721
## 2 -1.5621105296 -1.67656426 -1.1035656 1.716516 -0.5721721
## 3 0.0004624821 -0.03159966 0.8899723 1.716516 -0.5721721
## 4 0.9587254894 -0.91594433 -1.1035656 1.716516 -0.5721721
## 5 -0.6642785229 -0.36253897 0.8899723 1.716516 -0.5721721
## 6 0.8982944889 0.44472555 0.8899723 1.716516 -0.5721721
## CarrierSouthwest CarrierUnited DayFriday DayMonday DaySaturday DaySunday
## 1 -0.5721721 -0.5721721 -0.4045868 2.4275208 -0.4045868 -0.4045868
## 2 -0.5721721 -0.5721721 -0.4045868 -0.4045868 -0.4045868 -0.4045868
## 3 -0.5721721 -0.5721721 -0.4045868 -0.4045868 -0.4045868 -0.4045868
## 4 -0.5721721 -0.5721721 -0.4045868 -0.4045868 -0.4045868 -0.4045868
## 5 -0.5721721 -0.5721721 2.4275208 -0.4045868 -0.4045868 -0.4045868
## 6 -0.5721721 -0.5721721 -0.4045868 -0.4045868 2.4275208 -0.4045868
## DayThursday DayTuesday DayWednesday
## 1 -0.4045868 -0.4045868 -0.4045868
## 2 -0.4045868 2.4275208 -0.4045868
## 3 -0.4045868 -0.4045868 2.4275208
## 4 2.4275208 -0.4045868 -0.4045868
## 5 -0.4045868 -0.4045868 -0.4045868
## 6 -0.4045868 -0.4045868 -0.4045868
set.seed(123)
kmeans_result <- kmeans(clustering_data_scaled, centers = 4)
flights_data$Cluster <- kmeans_result$cluster
table(flights_data$Cluster)
##
## 1 2 3 4
## 8 24 16 8
# Scatter plot of Price vs. Cents_Per_Point, colored by cluster
ggplot(flights_data, aes(x = Price, y = Cents_Per_Point, color = as.factor(Cluster))) +
geom_point(size = 3) +
labs(
title = "Cluster Analysis of Flight Data",
x = "Price ($)",
y = "Cents per Point",
color = "Cluster"
) +
theme_minimal()
Here, the max and min for Price and Cents/Point were used to make a prediction model. The data were split to 80% training and 20% testing. “library(randomforest)” was used to predict price to the other variables, and to see how well each variable predicted price. MSE was calculated to measure the average squared difference between predicted and actual prices. The logistics regression model was created to determine if the other variables could predict whether or not a flight was nonstop. We can see in the graph that these variables put together do not particularly draw an accurate assessment of predicting prices.
set.seed(123)
flights_data <- data.frame(
Carrier = rep(c("Delta", "Southwest", "United", "JetBlue"), each = 14),
Day = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), times = 8),
Price = sample(69:626, 56, replace = TRUE),
Cents_Per_Point = runif(56, 0.85, 1.65),
Non_Stop = sample(c(0, 1), 56, replace = TRUE)
)
dummy_vars <- dummyVars("~ Carrier + Day", data = flights_data)
dummy_data <- predict(dummy_vars, newdata = flights_data)
ml_data <- cbind(flights_data %>% select(Price, Cents_Per_Point, Non_Stop), dummy_data)
set.seed(123)
train_index <- createDataPartition(ml_data$Price, p = 0.8, list = FALSE)
train_data <- ml_data[train_index, ]
test_data <- ml_data[-train_index, ]
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(123)
rf_model <- randomForest(Price ~ ., data = train_data, ntree = 500, importance = TRUE)
importance(rf_model)
## %IncMSE IncNodePurity
## Cents_Per_Point -4.3462997 320568.55
## Non_Stop -4.1776868 48327.74
## CarrierDelta 1.2609724 46798.12
## CarrierJetBlue -3.4855684 56057.60
## CarrierSouthwest -5.1276605 52167.15
## CarrierUnited -2.2424099 41368.46
## DayFriday -4.3415137 29874.96
## DayMonday -1.7411197 73895.56
## DaySaturday -0.1941850 23044.91
## DaySunday -2.2873820 37690.85
## DayThursday -3.6114006 30035.79
## DayTuesday -7.1655005 46817.83
## DayWednesday 0.4379939 92161.84
rf_predictions <- predict(rf_model, newdata = test_data)
actual <- test_data$Price
mean_squared_error <- mean((rf_predictions - actual)^2)
cat("Mean Squared Error:", mean_squared_error, "\n")
## Mean Squared Error: 22523.9
logistic_model <- glm(Non_Stop ~ ., data = train_data, family = "binomial")
logistic_probs <- predict(logistic_model, newdata = test_data, type = "response")
logistic_predictions <- ifelse(logistic_probs > 0.5, 1, 0)
confusionMatrix(as.factor(logistic_predictions), as.factor(test_data$Non_Stop))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1 4
## 1 2 1
##
## Accuracy : 0.25
## 95% CI : (0.0319, 0.6509)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 0.9944
##
## Kappa : -0.4118
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.3333
## Specificity : 0.2000
## Pos Pred Value : 0.2000
## Neg Pred Value : 0.3333
## Prevalence : 0.3750
## Detection Rate : 0.1250
## Detection Prevalence : 0.6250
## Balanced Accuracy : 0.2667
##
## 'Positive' Class : 0
##
varImpPlot(rf_model)
library(ggplot2)
# Scatter plot for predictions vs actual
ggplot(data.frame(Predicted = rf_predictions, Actual = actual), aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(
title = "Predicted vs Actual Prices",
x = "Actual Price",
y = "Predicted Price"
) +
theme_minimal()
A similar model was used to predict cost in terms of points or miles.
set.seed(123)
flights_data <- data.frame(
Carrier = rep(c("Delta", "Southwest", "United", "JetBlue"), each = 14),
Day = rep(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), times = 8),
Points_Miles = sample(5900:45969, 56, replace = TRUE),
Cents_Per_Point = runif(56, 0.85, 1.65),
Non_Stop = sample(c(0, 1), 56, replace = TRUE)
)
dummy_vars <- dummyVars("~ Carrier + Day", data = flights_data)
dummy_data <- predict(dummy_vars, newdata = flights_data)
ml_data <- cbind(flights_data %>% select(Points_Miles, Cents_Per_Point, Non_Stop), dummy_data)
set.seed(123)
train_index <- createDataPartition(ml_data$Points_Miles, p = 0.8, list = FALSE)
train_data <- ml_data[train_index, ]
test_data <- ml_data[-train_index, ]
library(randomForest)
set.seed(123)
rf_model <- randomForest(Points_Miles ~ ., data = train_data, ntree = 500, importance = TRUE)
importance(rf_model)
## %IncMSE IncNodePurity
## Cents_Per_Point -5.83486551 1319066787
## Non_Stop -2.85293881 223786902
## CarrierDelta -3.28316293 160357601
## CarrierJetBlue -0.59319835 140244495
## CarrierSouthwest 0.01698004 176662860
## CarrierUnited -3.75446974 167668620
## DayFriday -3.99750741 133176525
## DayMonday -5.95003189 161718630
## DaySaturday -1.41512086 97228182
## DaySunday -4.71267065 120041688
## DayThursday -4.95171248 235623959
## DayTuesday -2.50780391 212690645
## DayWednesday 1.74018684 181900191
rf_predictions <- predict(rf_model, newdata = test_data)
actual <- test_data$Points_Miles
mean_squared_error <- mean((rf_predictions - actual)^2)
cat("Mean Squared Error:", mean_squared_error, "\n")
## Mean Squared Error: 129053908
logistic_model <- glm(Non_Stop ~ ., data = train_data, family = "binomial")
logistic_probs <- predict(logistic_model, newdata = test_data, type = "response")
logistic_predictions <- ifelse(logistic_probs > 0.5, 1, 0)
confusionMatrix(as.factor(logistic_predictions), as.factor(test_data$Non_Stop))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2 5
## 1 1 0
##
## Accuracy : 0.25
## 95% CI : (0.0319, 0.6509)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 0.9944
##
## Kappa : -0.2632
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.6667
## Specificity : 0.0000
## Pos Pred Value : 0.2857
## Neg Pred Value : 0.0000
## Prevalence : 0.3750
## Detection Rate : 0.2500
## Detection Prevalence : 0.8750
## Balanced Accuracy : 0.3333
##
## 'Positive' Class : 0
##
varImpPlot(rf_model)
library(ggplot2)
# Scatter plot for predictions vs actual
ggplot(data.frame(Predicted = rf_predictions, Actual = actual), aes(x = Actual, y = Predicted)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "green") +
labs(
title = "Predicted vs Actual Points/Miles",
x = "Actual Points/Miles",
y = "Predicted Points/Miles"
) +
theme_minimal()
Here, “nnet” was used to build a neural network a normalized “Price” variable with the other variables in the “train_data”. MSE and RMSE were calculated to gauge how far the model was from predicting prices. Based on the outcome of the code, we can say that the model’s predictions, on average, deviate by roughly 12% of the normalized range.
sheet1 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 1)
sheet1$Carrier <- "Delta"
sheet2 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 2)
sheet2$Carrier <- "Southwest"
sheet3 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 3)
sheet3$Carrier <- "United"
sheet4 <- read.xlsx("C:/Users/barry/OneDrive/Project2 Flight Data.xlsx", sheet = 4)
sheet4$Carrier <- "JetBlue"
flights_data <- rbind(sheet1, sheet2, sheet3, sheet4)
library(nnet)
day_encoded <- model.matrix(~ Day - 1, data = flights_data)
carrier_encoded <- model.matrix(~ Carrier - 1, data = flights_data)
data_encoded <- cbind(flights_data[, "Price" , drop = FALSE], day_encoded, carrier_encoded)
head(data_encoded)
## Price DayFriday DayMonday DaySaturday DaySunday DayThursday DayTuesday
## 1 169 0 0 0 0 0 0
## 2 244 0 0 0 0 1 0
## 3 314 1 0 0 0 0 0
## 4 139 0 0 1 0 0 0
## 5 139 0 0 0 1 0 0
## 6 244 0 1 0 0 0 0
## DayWednesday CarrierDelta CarrierJetBlue CarrierSouthwest CarrierUnited
## 1 1 1 0 0 0
## 2 0 1 0 0 0
## 3 0 1 0 0 0
## 4 0 1 0 0 0
## 5 0 1 0 0 0
## 6 0 1 0 0 0
normalize <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
data_encoded$`Price` <- normalize(data_encoded$`Price`)
set.seed(123)
trainIndex <- createDataPartition(data_encoded$`Price`, p = 0.8, list = FALSE)
train_data <- data_encoded[trainIndex, ]
test_data <- data_encoded[-trainIndex, ]
predictors <- setdiff(names(train_data), "Price")
price_nn <- nnet(
`Price` ~ .,
data = train_data,
size = 10,
linout = TRUE,
maxit = 500
)
## # weights: 131
## initial value 1116.947771
## iter 10 value 8.186071
## iter 20 value 7.558340
## iter 30 value 7.368296
## iter 40 value 7.352639
## iter 50 value 7.349053
## iter 60 value 7.348997
## final value 7.348995
## converged
price_pred <- predict(price_nn, newdata = test_data)
mse_price <- mean((test_data$`Price` - price_pred)^2)
rmse_price <- sqrt(mse_price)
cat("RMSE for Price Prediction:", rmse_price, "\n")
## RMSE for Price Prediction: 0.1209665
This nnet model was built to account only for nonstop flights. For example, a company does not want to risk their employees having delayed flights from origin to destination, so the company only books flight with airlines that have nonstop routes; or a family traveling with young children may not want to de-board and re-board multiple planes during a trip for convenience. With an RMSE of 56.8, we can say that the model is able to predict nonstop flights with a high variance in Price.
non_stop_data <- flights_data[flights_data$`Non-Stop` == 1, ]
non_stop_data$Carrier <- as.factor(non_stop_data$Carrier)
non_stop_data$Day <- as.factor(non_stop_data$Day)
non_stop_data$Price <- scale(non_stop_data$Price)
set.seed(123)
sample_size <- floor(0.7 * nrow(non_stop_data))
train_indices <- sample(seq_len(nrow(non_stop_data)), size = sample_size)
train_data <- non_stop_data[train_indices, ]
test_data <- non_stop_data[-train_indices, ]
nnet_model <- nnet(
Price ~ Carrier + Day,
data = train_data,
size = 5,
linout = TRUE,
decay = 0.01,
maxit = 500
)
## # weights: 46
## initial value 361.653345
## iter 10 value 77.277366
## iter 20 value 66.862924
## iter 30 value 66.318476
## iter 40 value 66.214250
## iter 50 value 66.172174
## iter 60 value 66.145594
## iter 70 value 66.117871
## iter 80 value 66.093385
## iter 90 value 66.078242
## iter 100 value 66.068642
## iter 110 value 66.042292
## iter 120 value 66.019631
## iter 130 value 66.002277
## iter 140 value 65.985837
## iter 150 value 65.974179
## iter 160 value 65.962457
## iter 170 value 65.956805
## iter 180 value 65.954779
## iter 190 value 65.952846
## iter 200 value 65.950592
## iter 210 value 65.947383
## iter 220 value 65.945269
## iter 230 value 65.944463
## iter 240 value 65.943884
## iter 250 value 65.943630
## iter 260 value 65.943554
## iter 270 value 65.943497
## iter 280 value 65.943451
## final value 65.943447
## converged
predictions <- predict(nnet_model, newdata = test_data)
rmse <- sqrt(mean((test_data$Price - predictions)^2))
print(paste("RMSE for Non-Stop Flights:", rmse))
## [1] "RMSE for Non-Stop Flights: 0.567915376164214"
Here, nonstop data were kept, but another layer was added to assist the model with its predictions. Since the earlier graphs showed a drastic range in pricing depending on which day of the week the flight was, I accounted for weekday flights (Monday-Thursday) and weekend flights (Friday-Sunday). Since workdays are typically Monday-Thursday, the “mon_thu_data” variable will assist companies with predicting costs when booking employees flights. Weekends were accounted for with the “fri_sun_data” variable to help leisure travelers estimate their out-of-pocket costs for weekend getaways.
non_stop_data <- flights_data[flights_data$`Non-Stop` == 1, ]
mon_thu_data <- non_stop_data %>%
filter(Day %in% c("Monday", "Tuesday", "Wednesday", "Thursday"))
fri_sun_data <- non_stop_data %>%
filter(Day %in% c("Friday", "Saturday", "Sunday"))
normalize <- function(x) { (x - min(x)) / (max(x) - min(x)) }
mon_thu_data <- mon_thu_data %>%
mutate(Price = normalize(Price), `Cents/Point` = normalize(`Cents/Point`))
fri_sun_data <- fri_sun_data %>%
mutate(Price = normalize(Price), `Cents/Point` = normalize(`Cents/Point`))
mon_thu_data$Carrier <- as.factor(mon_thu_data$Carrier)
fri_sun_data$Carrier <- as.factor(fri_sun_data$Carrier)
mon_thu_ann <- nnet(Price ~ Carrier + `Cents/Point`, data = mon_thu_data, size = 5, linout = TRUE)
## # weights: 21
## initial value 23.180512
## iter 10 value 2.123733
## iter 20 value 2.032262
## iter 30 value 1.862442
## iter 40 value 1.820176
## iter 50 value 1.815708
## iter 60 value 1.803206
## iter 70 value 1.786357
## iter 80 value 1.780970
## iter 90 value 1.777819
## iter 100 value 1.772666
## final value 1.772666
## stopped after 100 iterations
fri_sun_ann <- nnet(Price ~ Carrier + `Cents/Point`, data = fri_sun_data, size = 5, linout = TRUE)
## # weights: 21
## initial value 165.662679
## iter 10 value 5.217479
## iter 20 value 4.923784
## iter 30 value 4.685620
## iter 40 value 4.423452
## iter 50 value 4.400913
## iter 60 value 4.372437
## iter 70 value 4.333972
## iter 80 value 4.230130
## iter 90 value 4.176527
## iter 100 value 4.150904
## final value 4.150904
## stopped after 100 iterations
mon_thu_preds <- predict(mon_thu_ann, mon_thu_data)
mon_thu_rmse <- sqrt(mean((mon_thu_preds - mon_thu_data$Price)^2))
fri_sun_preds <- predict(fri_sun_ann, fri_sun_data)
fri_sun_rmse <- sqrt(mean((fri_sun_preds - fri_sun_data$Price)^2))
print(paste("RMSE for Monday-Thursday flights:", mon_thu_rmse))
## [1] "RMSE for Monday-Thursday flights: 0.101519453019988"
print(paste("RMSE for Friday-Sunday flights:", fri_sun_rmse))
## [1] "RMSE for Friday-Sunday flights: 0.182228504924789"
With an RMSE of .102 for weekdays and .182 for weekends, we can see that the model performs much better than when “Day” is not accounted for. Overall, this project should assist those looking to estimate flight costs. Companies may want to consider JetBlue or United for weekday flights, whereas weekend flyers may choose Delta because prices drop. Southwest may have the highest average pricing, but they also have the most valuable frequent flyer points, and the price to redeem a flight with Southwest with points is actually cheaper than booking Delta with points. The predictive models were not overly accurate, but we did see vast improvement when accounting for weekdays versus weekends. Perhaps adding more variables to the equation can help with the model’s predictions, such as checked-bag fees, the number of flights per day for each airline (including morning, afternoon, and red eye flights), or seat-location (economy, main, business, first, etc.).