Predicting Delayed Flights.
library(caret)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
set.seed(120)
# load data
data("FlightDelays", package = "mlba")
# skim the data
skimr::skim(FlightDelays)
| Name | FlightDelays |
| Number of rows | 2201 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| CARRIER | 0 | 1 | 2 | 2 | 0 | 8 | 0 |
| DEST | 0 | 1 | 3 | 3 | 0 | 3 | 0 |
| FL_DATE | 0 | 1 | 9 | 10 | 0 | 31 | 0 |
| ORIGIN | 0 | 1 | 3 | 3 | 0 | 3 | 0 |
| TAIL_NUM | 0 | 1 | 6 | 6 | 0 | 549 | 0 |
| Flight.Status | 0 | 1 | 6 | 7 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CRS_DEP_TIME | 0 | 1 | 1371.94 | 432.70 | 600 | 1000 | 1455 | 1710 | 2130 | ▇▃▇▇▅ |
| DEP_TIME | 0 | 1 | 1369.30 | 442.46 | 10 | 1004 | 1450 | 1709 | 2330 | ▁▅▅▇▂ |
| DISTANCE | 0 | 1 | 211.87 | 13.32 | 169 | 213 | 214 | 214 | 229 | ▁▁▂▇▂ |
| FL_NUM | 0 | 1 | 3815.09 | 2409.75 | 746 | 2156 | 2385 | 6155 | 7924 | ▇▅▃▁▅ |
| Weather | 0 | 1 | 0.01 | 0.12 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| DAY_WEEK | 0 | 1 | 3.91 | 1.90 | 1 | 2 | 4 | 5 | 7 | ▇▅▅▅▆ |
| DAY_OF_MONTH | 0 | 1 | 16.02 | 8.68 | 1 | 8 | 16 | 23 | 31 | ▇▇▇▇▇ |
# show head of data
head(FlightDelays)
## CRS_DEP_TIME CARRIER DEP_TIME DEST DISTANCE FL_DATE FL_NUM ORIGIN Weather
## 1 1455 OH 1455 JFK 184 01/01/2004 5935 BWI 0
## 2 1640 DH 1640 JFK 213 01/01/2004 6155 DCA 0
## 3 1245 DH 1245 LGA 229 01/01/2004 7208 IAD 0
## 4 1715 DH 1709 LGA 229 01/01/2004 7215 IAD 0
## 5 1039 DH 1035 LGA 229 01/01/2004 7792 IAD 0
## 6 840 DH 839 JFK 228 01/01/2004 7800 IAD 0
## DAY_WEEK DAY_OF_MONTH TAIL_NUM Flight.Status
## 1 4 1 N940CA ontime
## 2 4 1 N405FJ ontime
## 3 4 1 N695BR ontime
## 4 4 1 N662BR ontime
## 5 4 1 N698BR ontime
## 6 4 1 N687BR ontime
Data Pre-processing.
# Remove unnecessary columns (DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)
colnames(FlightDelays)
## [1] "CRS_DEP_TIME" "CARRIER" "DEP_TIME" "DEST"
## [5] "DISTANCE" "FL_DATE" "FL_NUM" "ORIGIN"
## [9] "Weather" "DAY_WEEK" "DAY_OF_MONTH" "TAIL_NUM"
## [13] "Flight.Status"
FlightDelays.preprocessed <- FlightDelays |> select(-c(DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)) # |>, %>% are pipe operators, used to chain functions together
# transform DAY_WEEK into categorical variable
FlightDelays.preprocessed$DAY_WEEK <- as.factor(FlightDelays.preprocessed$DAY_WEEK)
FlightDelays.preprocessed$Weather <- as.factor(FlightDelays.preprocessed$Weather)
# bin the scheduled departure time into eight bins
FlightDelays.preprocessed$CRS_DEP_TIME <- cut(FlightDelays.preprocessed$CRS_DEP_TIME/100, breaks = 8) # cut() function is used to bin continuous variables into discrete intervals, breaks = 8 means 8 bins with the same length, divide by 100 to get rid of the last two digits
unique(FlightDelays.preprocessed$CRS_DEP_TIME) # unique() function is used to get unique values of a variable
## [1] (13.7,15.6] (15.6,17.5] (11.7,13.7] (9.82,11.7] (7.91,9.82] (19.4,21.3]
## [7] (5.98,7.91] (17.5,19.4]
## 8 Levels: (5.98,7.91] (7.91,9.82] (9.82,11.7] (11.7,13.7] ... (19.4,21.3]
# partition the data into training and validation sets
trainIndex <- sample(1:nrow(FlightDelays.preprocessed), 0.6*nrow(FlightDelays.preprocessed))
train.df <- FlightDelays.preprocessed[trainIndex,]
test.df <- FlightDelays.preprocessed[-trainIndex,]
Fit a classification tree to the flight delay variable using all the relevant predictors.
# train model
# model <- train(Flight.Status ~ ., data = train.df, method = "rpart")
delays.dt <- rpart(Flight.Status ~ ., data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE) # model = TRUE to save the model, method = "class" for classification and for regression use "anova"
# plot tree with colored nodes red for delayed and green for ontime
prp(delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.4)
# prune tree
prune.delays.dt <- prune(delays.dt, cp = delays.dt$cptable[which.min(delays.dt$cptable[,"xerror"]),"CP"])
# explain the code above: cptable is a table of complexity parameters (cp), xerror is the cross-validated error, and CP is the complexity parameter with the minimum cross-validated error
prp(prune.delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, varlen = -10)
If you needed to fly between DCA and EWR on a Monday at 7:00 AM, would you be able to use this tree? What other information would you need? Is it available in practice? What information is redundant?
# train model
delays.dt2 <- rpart(Flight.Status ~ . - Weather, data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE)
# plot tree with colored nodes red for delayed and green for ontime, with numbers of observations in each node
prp(delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.45)
# prune tree
prune.delays.dt2 <- prune(delays.dt2, cp = delays.dt2$cptable[which.min(delays.dt2$cptable[,"xerror"]),"CP"])
prp(prune.delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, faclen = 0, cex = 0.8)
…
# load libraries
library(knitr)
library(kableExtra)
# train model
delays.glm <- train(Flight.Status ~ . , data = train.df, method = "glm", family = "binomial")
# predict 2 models, glm and decision tree
delays.glm.pred.prob <- predict(delays.glm, test.df, type = "prob")
delays.glm.pred <- ifelse(delays.glm.pred.prob[,2] > 0.65, "ontime", "delayed") # threshold at 0.65 to increase sensitivity
delays.dt.pred <- predict(prune.delays.dt, test.df, type = "class")
# compare accuracy, sensitivity, specificity, threshold at optimal accuracy
accuracy.glm <- confusionMatrix(as.factor(delays.glm.pred), as.factor(test.df$Flight.Status), positive = "delayed")
accuracy.dt <- confusionMatrix(delays.dt.pred, as.factor(test.df$Flight.Status))
data.frame(
model = c("Logistic Regression", "Decision Tree"),
accuracy = c(accuracy.glm$overall["Accuracy"], accuracy.dt$overall["Accuracy"]),
sensitivity = c(accuracy.glm$byClass["Sensitivity"], accuracy.dt$byClass["Sensitivity"]),
specificity = c(accuracy.glm$byClass["Specificity"], accuracy.dt$byClass["Specificity"])
) |>
kable(caption = "Accuracy of Logistic Regression and Decision Tree Models") |>
kable_styling()
| model | accuracy | sensitivity | specificity |
|---|---|---|---|
| Logistic Regression | 0.7900114 | 0.2275449 | 0.9215686 |
| Decision Tree | 0.8217934 | 0.0598802 | 1.0000000 |
keywords: overfitting, threshold
Car Sales. Consider the data on used cars (ToyotaCorolla.csv) with 1436 records and details on 38 attributes, including Price, Age, KM, HP, and other specifications. The goal is to predict the price of a used Toyota Corolla based on its specifications.
# load data
data("ToyotaCorolla", package = "mlba")
Fit a neural network model to the data. Use a single hidden layer with 2 nodes.
# load libraries
library(caret)
library(nnet)
library(neuralnet)
library(forecast)
# Prepare data
Cars.df <- ToyotaCorolla |>
select(Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Price)
# Filter only character columns
character_columns <- Cars.df |> select(where(is.character))
# Create the dummy data frame
dummy_data <- dummyVars(" ~ .", data = character_columns) |>
predict(character_columns)
# Remove the original character variables from Cars.df
Cars.df <- Cars.df |> select(-where(is.character))
# Bind the data frames by columns
Cars.df <- cbind(Cars.df, dummy_data)
# Scale except Price to 0-1 scale
Cars.df <- Cars.df |>
select_if(is.numeric) |> # select numeric columns
# select(-Price) |> # remove Price column
preProcess(method = "range") |> # need to load library caret
predict(Cars.df)
# Split data into training and test sets
train.index <- createDataPartition(Cars.df$Price, p = 0.6, list = FALSE)
train.df <- Cars.df[train.index, ]
test.df <- Cars.df[-train.index, ]
# Create nn1a model with 1 hidden layer and 2 nodes
nn1a <- neuralnet(Price ~ ., data = train.df, hidden = 2, linear.output = TRUE)
# Plot the neural network
plot(nn1a, rep = "best")
# Predict the training and test data of the neuralnet model
training.prediction <- compute(nn1a, train.df)
test.prediction <- compute(nn1a, test.df)
# Calculate RMSE step by step
RMSE(training.prediction$net.result, train.df$Price)
## [1] 0.03635764
RMSE(test.prediction$net.result, test.df$Price)
## [1] 0.0406216
Optional
# Create function to process neural network as previous
process.nn <- function(train.df, test.df, hidden = 2, linear.output = TRUE) {
nn <- neuralnet(Price ~ ., data = train.df, hidden = hidden, linear.output = linear.output)
plot(nn, rep = "best")
training.prediction <- compute(nn, train.df)
test.prediction <- compute(nn, test.df)
data.frame(
data = c("Training", "Test"),
RMSE = c(RMSE(training.prediction$net.result, train.df$Price), RMSE(test.prediction$net.result, test.df$Price))
) |>
kable(caption = "RMSE table", digits = 4) |>
kable_styling(full_width = FALSE, position = "left")
}
# apply the function with 1 hidden layers: 5 nodes
process.nn(train.df, test.df, hidden = 5)
| data | RMSE |
|---|---|
| Training | 0.0335 |
| Test | 0.0405 |
# apply the function with 2 hidden layers: 5 nodes and 5 nodes
process.nn(train.df, test.df, hidden = c(5,5))
| data | RMSE |
|---|---|
| Training | 0.0343 |
| Test | 0.0406 |
# apply the function with 1 hidden layers: 1 nodes
process.nn(train.df, test.df, hidden = 1)
| data | RMSE |
|---|---|
| Training | 0.0380 |
| Test | 0.0395 |
Compare the RMS error for the training data as the number of layers and nodes increases:
compare the RMS error for the validation data as the number of layers and nodes increases:
The optimal number of layers and nodes is …