9.2

Predicting Delayed Flights.

library(caret)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)

set.seed(120)

# load data
data("FlightDelays", package = "mlba")

# skim the data
skimr::skim(FlightDelays)
Data summary
Name FlightDelays
Number of rows 2201
Number of columns 13
_______________________
Column type frequency:
character 6
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CARRIER 0 1 2 2 0 8 0
DEST 0 1 3 3 0 3 0
FL_DATE 0 1 9 10 0 31 0
ORIGIN 0 1 3 3 0 3 0
TAIL_NUM 0 1 6 6 0 549 0
Flight.Status 0 1 6 7 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CRS_DEP_TIME 0 1 1371.94 432.70 600 1000 1455 1710 2130 ▇▃▇▇▅
DEP_TIME 0 1 1369.30 442.46 10 1004 1450 1709 2330 ▁▅▅▇▂
DISTANCE 0 1 211.87 13.32 169 213 214 214 229 ▁▁▂▇▂
FL_NUM 0 1 3815.09 2409.75 746 2156 2385 6155 7924 ▇▅▃▁▅
Weather 0 1 0.01 0.12 0 0 0 0 1 ▇▁▁▁▁
DAY_WEEK 0 1 3.91 1.90 1 2 4 5 7 ▇▅▅▅▆
DAY_OF_MONTH 0 1 16.02 8.68 1 8 16 23 31 ▇▇▇▇▇
# show head of data
head(FlightDelays)
##   CRS_DEP_TIME CARRIER DEP_TIME DEST DISTANCE    FL_DATE FL_NUM ORIGIN Weather
## 1         1455      OH     1455  JFK      184 01/01/2004   5935    BWI       0
## 2         1640      DH     1640  JFK      213 01/01/2004   6155    DCA       0
## 3         1245      DH     1245  LGA      229 01/01/2004   7208    IAD       0
## 4         1715      DH     1709  LGA      229 01/01/2004   7215    IAD       0
## 5         1039      DH     1035  LGA      229 01/01/2004   7792    IAD       0
## 6          840      DH      839  JFK      228 01/01/2004   7800    IAD       0
##   DAY_WEEK DAY_OF_MONTH TAIL_NUM Flight.Status
## 1        4            1   N940CA        ontime
## 2        4            1   N405FJ        ontime
## 3        4            1   N695BR        ontime
## 4        4            1   N662BR        ontime
## 5        4            1   N698BR        ontime
## 6        4            1   N687BR        ontime

Data Pre-processing.

# Remove unnecessary columns (DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)
colnames(FlightDelays)
##  [1] "CRS_DEP_TIME"  "CARRIER"       "DEP_TIME"      "DEST"         
##  [5] "DISTANCE"      "FL_DATE"       "FL_NUM"        "ORIGIN"       
##  [9] "Weather"       "DAY_WEEK"      "DAY_OF_MONTH"  "TAIL_NUM"     
## [13] "Flight.Status"
FlightDelays.preprocessed <- FlightDelays |> select(-c(DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)) # |>, %>% are pipe operators, used to chain functions together

# transform DAY_WEEK into categorical variable
FlightDelays.preprocessed$DAY_WEEK <- as.factor(FlightDelays.preprocessed$DAY_WEEK)
FlightDelays.preprocessed$Weather <- as.factor(FlightDelays.preprocessed$Weather)

# bin the scheduled departure time into eight bins
FlightDelays.preprocessed$CRS_DEP_TIME <- cut(FlightDelays.preprocessed$CRS_DEP_TIME/100, breaks = 8) # cut() function is used to bin continuous variables into discrete intervals, breaks = 8 means 8 bins with the same length, divide by 100 to get rid of the last two digits
unique(FlightDelays.preprocessed$CRS_DEP_TIME) # unique() function is used to get unique values of a variable
## [1] (13.7,15.6] (15.6,17.5] (11.7,13.7] (9.82,11.7] (7.91,9.82] (19.4,21.3]
## [7] (5.98,7.91] (17.5,19.4]
## 8 Levels: (5.98,7.91] (7.91,9.82] (9.82,11.7] (11.7,13.7] ... (19.4,21.3]
# partition the data into training and validation sets
trainIndex <- sample(1:nrow(FlightDelays.preprocessed), 0.6*nrow(FlightDelays.preprocessed))
train.df <- FlightDelays.preprocessed[trainIndex,]
test.df <- FlightDelays.preprocessed[-trainIndex,]

a.

Fit a classification tree to the flight delay variable using all the relevant predictors.

  • Do not include DEP_TIME (actual departure time) in the model because it is unknown at the time of prediction (unless we are generating our predictions of delays after the plane takes off, which is unlikely).
  • Use a pruned tree with maximum of 8 levels, setting cp = 0.001.
  • Express the resulting tree as a set of rules.
# train model
# model <- train(Flight.Status ~ ., data = train.df, method = "rpart")
delays.dt <- rpart(Flight.Status ~ ., data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE) # model = TRUE to save the model, method = "class" for classification and for regression use "anova"

# plot tree with colored nodes red for delayed and green for ontime
prp(delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.4)

# prune tree
prune.delays.dt <- prune(delays.dt, cp = delays.dt$cptable[which.min(delays.dt$cptable[,"xerror"]),"CP"])
# explain the code above: cptable is a table of complexity parameters (cp), xerror is the cross-validated error, and CP is the complexity parameter with the minimum cross-validated error

prp(prune.delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, varlen = -10)

Answer

b.

If you needed to fly between DCA and EWR on a Monday at 7:00 AM, would you be able to use this tree? What other information would you need? Is it available in practice? What information is redundant?

Answer

c.

  • Fit the same tree as in (a), this time excluding the Weather predictor.
  • Display both the pruned and unpruned tree. You will find that the pruned tree contains a single terminal node.
# train model
delays.dt2 <- rpart(Flight.Status ~ . - Weather, data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE)

# plot tree with colored nodes red for delayed and green for ontime, with numbers of observations in each node
prp(delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.45)

# prune tree
prune.delays.dt2 <- prune(delays.dt2, cp = delays.dt2$cptable[which.min(delays.dt2$cptable[,"xerror"]),"CP"])
prp(prune.delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, faclen = 0, cex = 0.8)

Question

  1. How is the pruned tree used for classification? (What is the rule for classifying?)
  2. To what is this rule equivalent?
  3. Examine the unpruned tree. What are the top three predictors according to this tree?
  4. Why, technically, does the pruned tree result in a single node?
  5. What is the disadvantage of using the top levels of the unpruned tree as opposed to the pruned tree?
  6. Compare this general result to that from logistic regression in the example in Chapter 10. What are possible reasons for the classification tree’s failure to find a good predictive model?

Answer

  1. The logistic regression model example in Chapter 10:
# load libraries
library(knitr)
library(kableExtra)

# train model
delays.glm <- train(Flight.Status ~ . , data = train.df, method = "glm", family = "binomial")

# predict 2 models, glm and decision tree
delays.glm.pred.prob <- predict(delays.glm, test.df, type = "prob")
delays.glm.pred <- ifelse(delays.glm.pred.prob[,2] > 0.65, "ontime", "delayed") # threshold at 0.65 to increase sensitivity
delays.dt.pred <- predict(prune.delays.dt, test.df, type = "class")

# compare accuracy, sensitivity, specificity, threshold at optimal accuracy
accuracy.glm <- confusionMatrix(as.factor(delays.glm.pred), as.factor(test.df$Flight.Status), positive = "delayed")
accuracy.dt <- confusionMatrix(delays.dt.pred, as.factor(test.df$Flight.Status))

data.frame(
  model = c("Logistic Regression", "Decision Tree"),
  accuracy = c(accuracy.glm$overall["Accuracy"], accuracy.dt$overall["Accuracy"]),
  sensitivity = c(accuracy.glm$byClass["Sensitivity"], accuracy.dt$byClass["Sensitivity"]),
  specificity = c(accuracy.glm$byClass["Specificity"], accuracy.dt$byClass["Specificity"])
) |>
  kable(caption = "Accuracy of Logistic Regression and Decision Tree Models") |> 
  kable_styling()
Accuracy of Logistic Regression and Decision Tree Models
model accuracy sensitivity specificity
Logistic Regression 0.7900114 0.2275449 0.9215686
Decision Tree 0.8217934 0.0598802 1.0000000

Answer

keywords: overfitting, threshold


11.3

Car Sales. Consider the data on used cars (ToyotaCorolla.csv) with 1436 records and details on 38 attributes, including Price, Age, KM, HP, and other specifications. The goal is to predict the price of a used Toyota Corolla based on its specifications.

# load data
data("ToyotaCorolla", package = "mlba")

a.

Fit a neural network model to the data. Use a single hidden layer with 2 nodes.

  • Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.
  • Remember to first scale the numerical predictor and outcome variables to a 0–1 scale (use function preprocess() with method = “range”—see Chapter 7) and convert categorical predictors to dummies.
  • Record the RMS error for the training data and the validation data. Repeat the process, changing the number of hidden layers and nodes to single layer with 5 nodes; two layers, 5 nodes in each layer.

Answer

# load libraries
library(caret)
library(nnet)
library(neuralnet)
library(forecast)

# Prepare data
Cars.df <- ToyotaCorolla |> 
  select(Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Price)

# Filter only character columns
character_columns <- Cars.df |> select(where(is.character))

# Create the dummy data frame
dummy_data <- dummyVars(" ~ .", data = character_columns) |>
  predict(character_columns)

# Remove the original character variables from Cars.df
Cars.df <- Cars.df |> select(-where(is.character))

# Bind the data frames by columns
Cars.df <- cbind(Cars.df, dummy_data)

# Scale except Price to 0-1 scale 
Cars.df <- Cars.df |> 
  select_if(is.numeric) |> # select numeric columns
  # select(-Price) |> # remove Price column
  preProcess(method = "range") |> # need to load library caret
  predict(Cars.df)

# Split data into training and test sets
train.index <- createDataPartition(Cars.df$Price, p = 0.6, list = FALSE)
train.df <- Cars.df[train.index, ]
test.df <- Cars.df[-train.index, ]
# Create nn1a model with 1 hidden layer and 2 nodes
nn1a <- neuralnet(Price ~ ., data = train.df, hidden = 2, linear.output = TRUE)

# Plot the neural network
plot(nn1a, rep = "best")

# Predict the training and test data of the neuralnet model
training.prediction <- compute(nn1a, train.df)
test.prediction <- compute(nn1a, test.df)


# Calculate RMSE step by step
RMSE(training.prediction$net.result, train.df$Price)
## [1] 0.03635764
RMSE(test.prediction$net.result, test.df$Price)
## [1] 0.0406216

Optional

# Create function to process neural network as previous
process.nn <- function(train.df, test.df, hidden = 2, linear.output = TRUE) {
  nn <- neuralnet(Price ~ ., data = train.df, hidden = hidden, linear.output = linear.output)
  plot(nn, rep = "best")
  training.prediction <- compute(nn, train.df)
  test.prediction <- compute(nn, test.df)
  data.frame(
    data = c("Training", "Test"),
    RMSE = c(RMSE(training.prediction$net.result, train.df$Price), RMSE(test.prediction$net.result, test.df$Price))
  ) |>
    kable(caption = "RMSE table", digits = 4) |>
    kable_styling(full_width = FALSE, position = "left")
}

# apply the function with 1 hidden layers: 5 nodes
process.nn(train.df, test.df, hidden = 5)
RMSE table
data RMSE
Training 0.0335
Test 0.0405
# apply the function with 2 hidden layers: 5 nodes and 5 nodes
process.nn(train.df, test.df, hidden = c(5,5))
RMSE table
data RMSE
Training 0.0343
Test 0.0406
# apply the function with 1 hidden layers: 1 nodes
process.nn(train.df, test.df, hidden = 1)
RMSE table
data RMSE
Training 0.0380
Test 0.0395
  1. What happens to the RMS error for the training data as the number of layers and nodes increases?
  2. What happens to the RMS error for the validation data?
  3. Comment on the appropriate number of layers and nodes for this application.

Answer

  1. Compare the RMS error for the training data as the number of layers and nodes increases:

  2. compare the RMS error for the validation data as the number of layers and nodes increases:

  3. The optimal number of layers and nodes is …