This phase delivers predictive modeling using LASSO, GAM, and Random Forests, with cross‑validation and feature evaluation.
Reading the data, loading required libraries, and setting the seed.
library(readxl)
data<-read_xlsx("all_data.xlsx")
library(dplyr)
library(lubridate)
library(fastDummies)
library(glmnet)
library(mgcv)
library(randomForest)
library(Metrics)
set.seed(1542)
The variables id,flight_code,destination were eliminated since they actually behave as identifiers. The raw original timestamp variable was converted to useful hour and weekday. Categorical columns that will be used were transformed to factors for safety.
data<-data %>%
mutate(timestamp=ymd_hms(timestamp),
hour=hour(timestamp),
weekday=wday(timestamp,label = TRUE),
weekday=factor(weekday,ordered = FALSE),
carrier=factor(carrier),
wind=factor(wind)) %>%
select(-c(id,timestamp,flight_code,
destination,taxi_10tile))
For efficiency and reproducibility two functions were created:
evaluation which given the model and the test_data it returns basic metrics
lasso which given x and y returns LASSO coefficient table and basic metrics.
evaluation<-function(model,test_data){
f_pred<-predict(model,newdata = test_data)
f_mae<-mae(test_data$taxi_out,f_pred)
f_rmse<-rmse(test_data$taxi_out,f_pred)
f_ss_res<-sum((test_data$taxi_out-f_pred)^2)
f_ss_tot<-sum((test_data$taxi_out-mean(test_data$taxi_out))^2)
f_r2<-1-f_ss_res/f_ss_tot
return(list(mae=f_mae,rmse=f_rmse,r2=f_r2))
}
lasso<-function(x,y){
n<-nrow(x)
set.seed(1542)
train_idx<-sample(seq_len(n),size = 0.8*n)
x_train<-x[train_idx,]
y_train<-y[train_idx]
x_test<-x[-train_idx,]
y_test<-y[-train_idx]
cv_model<-cv.glmnet(x_train,y_train)
lambda<-cv_model$lambda.min
model<-glmnet(x_train,y_train,lambda=lambda)
f_pred<-predict(model,newx = x_test,s=lambda)
f_mae<-mae(y_test,f_pred)
f_rmse<-rmse(y_test,f_pred)
f_ss_res<-sum((y_test-f_pred)^2)
f_ss_tot<-sum((y_test-mean(y_test))^2)
f_r2<-1-f_ss_res/f_ss_tot
coeftable<-data.frame(variabes=row.names(coef(model)),
values=as.numeric(round(coef(model),4)))
coeftable<-coeftable[order(abs(coeftable$values),
decreasing = TRUE),]
return(list(mae=f_mae,rmse=f_rmse,r2=f_r2,coef=head(coeftable,6)))
}
The key focus is the LASSO coefficient table, with each run limited to one or two categorical factors to prevent overloading and reveal where meaningful variable weight appears.
y1<-data$taxi_out
lasso_data1<- data %>%
dummy_cols(select_columns = "weekday") %>%
select(-c(carrier,wind,taxi_out))
x1<-model.matrix(~.,data = lasso_data1)
lasso(x1,y1)
## $mae
## [1] 5.285366
##
## $rmse
## [1] 6.640551
##
## $r2
## [1] 0.07195242
##
## $coef
## variabes values
## 1 (Intercept) 35.8855
## 15 weekdayTue -0.6391
## 12 pressure -0.5702
## 18 weekdayFri -0.5457
## 16 weekdayWed -0.5274
## 17 weekdayThu 0.4670
y2<-data$taxi_out
lasso_data2<-data %>%
dummy_cols(select_columns = "carrier") %>%
select(-c(wind,weekday,taxi_out))
x2<-model.matrix(~.,data = lasso_data2)
lasso(x2,y2)
## $mae
## [1] 5.205609
##
## $rmse
## [1] 6.576505
##
## $r2
## [1] 0.08976735
##
## $coef
## variabes values
## 1 (Intercept) 36.0369
## 4 carrierAS 3.9640
## 5 carrierB6 -1.3453
## 7 carrierHA -1.2101
## 20 pressure -0.5542
## 27 carrier_HA -0.5308
y3<-data$taxi_out
lasso_data3<-data%>%
dummy_cols(select_columns = "wind") %>%
select(-c(carrier,weekday,taxi_out))
x3<-model.matrix(~.,data=lasso_data3)
lasso(x3,y3)
## $mae
## [1] 5.2526
##
## $rmse
## [1] 6.619856
##
## $r2
## [1] 0.07772764
##
## $coef
## variabes values
## 1 (Intercept) 47.9262
## 11 windENE 1.5738
## 29 pressure -0.9722
## 16 windNNW -0.9020
## 12 windESE 0.8842
## 26 windWSW -0.7812
y4<-data$taxi_out
lasso_data4<-data%>%
dummy_cols(select_columns = c("wind","carrier")) %>%
select(-c(weekday,taxi_out))
x4<-model.matrix(~.,data=lasso_data4)
lasso(x4,y4)
## $mae
## [1] 5.161294
##
## $rmse
## [1] 6.542623
##
## $r2
## [1] 0.09912215
##
## $coef
## variabes values
## 1 (Intercept) 48.6340
## 4 carrierAS 3.9576
## 19 windENE 1.4590
## 5 carrierB6 -1.3364
## 7 carrierHA -1.2343
## 37 pressure -0.9690
y5<-data$taxi_out
lasso_data5<-data %>%
select(departures,wind)
x5<-model.matrix(~.,data = lasso_data5)
lasso(x5,y5)
## $mae
## [1] 5.337542
##
## $rmse
## [1] 6.739177
##
## $r2
## [1] 0.04418083
##
## $coef
## variabes values
## 1 (Intercept) 16.6175
## 5 windENE 2.1540
## 17 windVAR -1.1943
## 20 windWSW -1.1349
## 8 windNE 0.8932
## 13 windSE -0.8437
To capture potential non‑linear relationships, several GAM combinations were tested.
n<-nrow(data)
train_idx<-sample(seq_len(n),size = 0.8*n)
gam_data1<- data %>%
select(dep_delay,departures,arrivals,temperature,wind,taxi_out)
train_data1<-gam_data1[train_idx,]
test_data1<-gam_data1[-train_idx,]
gam_model1<-gam(taxi_out~s(departures) +wind,data = train_data1,
method = "REML")
evaluation(gam_model1,test_data1)
## $mae
## [1] 5.294927
##
## $rmse
## [1] 6.647421
##
## $r2
## [1] 0.05170853
gam_model2<-gam(taxi_out~s(dep_delay)+s(departures)+s(arrivals)
+s(temperature)+wind, data = train_data1,
method = "REML")
evaluation(gam_model2,test_data1)
## $mae
## [1] 5.25571
##
## $rmse
## [1] 6.596801
##
## $r2
## [1] 0.06609611
gam_data2<- data %>%
select(dep_delay,departures,arrivals,temperature,carrier,taxi_out)
train_data2<-gam_data2[train_idx,]
test_data2<-gam_data2[-train_idx,]
gam_model3<-gam(taxi_out~s(departures) +carrier+s(dep_delay)
+s(arrivals)+s(temperature),data = train_data2,
method = "REML")
evaluation(gam_model3,test_data2)
## $mae
## [1] 5.21823
##
## $rmse
## [1] 6.566307
##
## $r2
## [1] 0.07471013
In order to detect more complex interactions, Random Forests were used.
rf1_train<-data[train_idx,] %>%
select(c(departures,wind,taxi_out))
rf1_test<-data[-train_idx,] %>%
select(c(departures,wind,taxi_out))
rf1<-randomForest(taxi_out~.,
data=rf1_train,
mtry=2,
maxnodes=30)
evaluation(rf1,rf1_test)
## $mae
## [1] 5.287931
##
## $rmse
## [1] 6.638225
##
## $r2
## [1] 0.05433067
rf2_train<-data[train_idx,]
rf2_test<-data[-train_idx,]
rf2<-randomForest(taxi_out~.,
data=rf2_train,
mtry=6,
importance=TRUE,
maxnodes=500,
nodesize=150)
evaluation(rf2,rf2_test)
## $mae
## [1] 4.97365
##
## $rmse
## [1] 6.245785
##
## $r2
## [1] 0.1628381
The Random Forest model including all usable variables delivers the best results:
evaluation(rf2,rf2_test)
## $mae
## [1] 4.97365
##
## $rmse
## [1] 6.245785
##
## $r2
## [1] 0.1628381
imp<- importance(rf2)
imp_table<-data.frame(variable=rownames(imp),importance=imp[,1],
row.names = NULL)
imp_table <- imp_table[order(imp_table$importance, decreasing = TRUE), ]
imp_table
## variable importance
## 1 carrier 80.07580
## 4 departures 74.57532
## 9 wind 60.86407
## 2 dep_delay 42.39487
## 12 pressure 40.71500
## 6 temperature 40.09590
## 14 weekday 39.51985
## 11 wind_gust 33.06490
## 8 humidity 32.98834
## 5 arrivals 32.11200
## 13 hour 30.35654
## 7 dew_point 29.48188
## 10 wind_speed 28.25649
## 3 distance 21.59177
With these foundations in place, stakeholders now have a clearer view of the main drivers of taxi_out time and a starting point for further development of operational improvements at JFK. From an analytical perspective, departures appear to play a meaningful role in taxi_out performance and represent a sensible direction for further investigation.