Data Mining Homework 4

9.2

Predicting Delayed Flights.

library(caret)
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)

set.seed(120)

# load data
data("FlightDelays", package = "mlba")

# skim the data
skimr::skim(FlightDelays)

Data summary
Name	FlightDelays
Number of rows	2201
Number of columns	13
_______________________
Column type frequency:
character	6
numeric	7
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
CARRIER	1	2	2	8
DEST	1	3	3	3
FL_DATE	1	9	10	31
ORIGIN	1	3	3	3
TAIL_NUM	1	6	6	549
Flight.Status	1	6	7	2

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
CRS_DEP_TIME	1	1371.94	432.70	600	1000	1455	1710	2130	▇▃▇▇▅
DEP_TIME	1	1369.30	442.46	10	1004	1450	1709	2330	▁▅▅▇▂
DISTANCE	1	211.87	13.32	169	213	214	214	229	▁▁▂▇▂
FL_NUM	1	3815.09	2409.75	746	2156	2385	6155	7924	▇▅▃▁▅
Weather	1	0.01	0.12	0	0	0	0	1	▇▁▁▁▁
DAY_WEEK	1	3.91	1.90	1	2	4	5	7	▇▅▅▅▆
DAY_OF_MONTH	1	16.02	8.68	1	8	16	23	31	▇▇▇▇▇

# show head of data
head(FlightDelays)

##   CRS_DEP_TIME CARRIER DEP_TIME DEST DISTANCE    FL_DATE FL_NUM ORIGIN Weather
## 1         1455      OH     1455  JFK      184 01/01/2004   5935    BWI       0
## 2         1640      DH     1640  JFK      213 01/01/2004   6155    DCA       0
## 3         1245      DH     1245  LGA      229 01/01/2004   7208    IAD       0
## 4         1715      DH     1709  LGA      229 01/01/2004   7215    IAD       0
## 5         1039      DH     1035  LGA      229 01/01/2004   7792    IAD       0
## 6          840      DH      839  JFK      228 01/01/2004   7800    IAD       0
##   DAY_WEEK DAY_OF_MONTH TAIL_NUM Flight.Status
## 1        4            1   N940CA        ontime
## 2        4            1   N405FJ        ontime
## 3        4            1   N695BR        ontime
## 4        4            1   N662BR        ontime
## 5        4            1   N698BR        ontime
## 6        4            1   N687BR        ontime

Data Pre-processing.

Transform variable day of week (DAY_WEEK) info a categorical variable.
Bin the scheduled departure time into eight bins (in R use function cut()).
Use these and all other columns as predictors (excluding DAY_OF_MONTH).
Partition the data into training and validation sets.

# Remove unnecessary columns (DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)
colnames(FlightDelays)

##  [1] "CRS_DEP_TIME"  "CARRIER"       "DEP_TIME"      "DEST"         
##  [5] "DISTANCE"      "FL_DATE"       "FL_NUM"        "ORIGIN"       
##  [9] "Weather"       "DAY_WEEK"      "DAY_OF_MONTH"  "TAIL_NUM"     
## [13] "Flight.Status"

FlightDelays.preprocessed <- FlightDelays |> select(-c(DEP_TIME, FL_DATE, FL_NUM, DAY_OF_MONTH, TAIL_NUM)) # |>, %>% are pipe operators, used to chain functions together

# transform DAY_WEEK into categorical variable
FlightDelays.preprocessed$DAY_WEEK <- as.factor(FlightDelays.preprocessed$DAY_WEEK)
FlightDelays.preprocessed$Weather <- as.factor(FlightDelays.preprocessed$Weather)

# bin the scheduled departure time into eight bins
FlightDelays.preprocessed$CRS_DEP_TIME <- cut(FlightDelays.preprocessed$CRS_DEP_TIME/100, breaks = 8) # cut() function is used to bin continuous variables into discrete intervals, breaks = 8 means 8 bins with the same length, divide by 100 to get rid of the last two digits
unique(FlightDelays.preprocessed$CRS_DEP_TIME) # unique() function is used to get unique values of a variable

## [1] (13.7,15.6] (15.6,17.5] (11.7,13.7] (9.82,11.7] (7.91,9.82] (19.4,21.3]
## [7] (5.98,7.91] (17.5,19.4]
## 8 Levels: (5.98,7.91] (7.91,9.82] (9.82,11.7] (11.7,13.7] ... (19.4,21.3]

# partition the data into training and validation sets
trainIndex <- sample(1:nrow(FlightDelays.preprocessed), 0.6*nrow(FlightDelays.preprocessed))
train.df <- FlightDelays.preprocessed[trainIndex,]
test.df <- FlightDelays.preprocessed[-trainIndex,]

a.

Fit a classification tree to the flight delay variable using all the relevant predictors.

Do not include DEP_TIME (actual departure time) in the model because it is unknown at the time of prediction (unless we are generating our predictions of delays after the plane takes off, which is unlikely).
Use a pruned tree with maximum of 8 levels, setting cp = 0.001.
Express the resulting tree as a set of rules.

# train model
# model <- train(Flight.Status ~ ., data = train.df, method = "rpart")
delays.dt <- rpart(Flight.Status ~ ., data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE) # model = TRUE to save the model, method = "class" for classification and for regression use "anova"

# plot tree with colored nodes red for delayed and green for ontime
prp(delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.4)

# prune tree
prune.delays.dt <- prune(delays.dt, cp = delays.dt$cptable[which.min(delays.dt$cptable[,"xerror"]),"CP"])
# explain the code above: cptable is a table of complexity parameters (cp), xerror is the cross-validated error, and CP is the complexity parameter with the minimum cross-validated error

prp(prune.delays.dt, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, varlen = -10)

Answer

b.

If you needed to fly between DCA and EWR on a Monday at 7:00 AM, would you be able to use this tree? What other information would you need? Is it available in practice? What information is redundant?

Answer

c.

Fit the same tree as in (a), this time excluding the Weather predictor.
Display both the pruned and unpruned tree. You will find that the pruned tree contains a single terminal node.

# train model
delays.dt2 <- rpart(Flight.Status ~ . - Weather, data = train.df, method = "class", cp = 0.001, maxdepth = 8, model = TRUE)

# plot tree with colored nodes red for delayed and green for ontime, with numbers of observations in each node
prp(delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, cex = 0.45)

# prune tree
prune.delays.dt2 <- prune(delays.dt2, cp = delays.dt2$cptable[which.min(delays.dt2$cptable[,"xerror"]),"CP"])
prp(prune.delays.dt2, box.palette = c("lightpink", "lightgreen"), type = 1, extra = 1, faclen = 0, cex = 0.8)

Question

How is the pruned tree used for classification? (What is the rule for classifying?)
To what is this rule equivalent?
Examine the unpruned tree. What are the top three predictors according to this tree?
Why, technically, does the pruned tree result in a single node?
What is the disadvantage of using the top levels of the unpruned tree as opposed to the pruned tree?
Compare this general result to that from logistic regression in the example in Chapter 10. What are possible reasons for the classification tree’s failure to find a good predictive model?

Answer

…

The logistic regression model example in Chapter 10:

# load libraries
library(knitr)
library(kableExtra)

# train model
delays.glm <- train(Flight.Status ~ . , data = train.df, method = "glm", family = "binomial")

# predict 2 models, glm and decision tree
delays.glm.pred.prob <- predict(delays.glm, test.df, type = "prob")
delays.glm.pred <- ifelse(delays.glm.pred.prob[,2] > 0.65, "ontime", "delayed") # threshold at 0.65 to increase sensitivity
delays.dt.pred <- predict(prune.delays.dt, test.df, type = "class")

# compare accuracy, sensitivity, specificity, threshold at optimal accuracy
accuracy.glm <- confusionMatrix(as.factor(delays.glm.pred), as.factor(test.df$Flight.Status), positive = "delayed")
accuracy.dt <- confusionMatrix(delays.dt.pred, as.factor(test.df$Flight.Status))

data.frame(
  model = c("Logistic Regression", "Decision Tree"),
  accuracy = c(accuracy.glm$overall["Accuracy"], accuracy.dt$overall["Accuracy"]),
  sensitivity = c(accuracy.glm$byClass["Sensitivity"], accuracy.dt$byClass["Sensitivity"]),
  specificity = c(accuracy.glm$byClass["Specificity"], accuracy.dt$byClass["Specificity"])
) |>
  kable(caption = "Accuracy of Logistic Regression and Decision Tree Models") |> 
  kable_styling()

Accuracy of Logistic Regression and Decision Tree Models
model	accuracy	sensitivity	specificity
Logistic Regression	0.7900114	0.2275449	0.9215686
Decision Tree	0.8217934	0.0598802	1.0000000

Answer

keywords: overfitting, threshold

11.3

Car Sales. Consider the data on used cars (ToyotaCorolla.csv) with 1436 records and details on 38 attributes, including Price, Age, KM, HP, and other specifications. The goal is to predict the price of a used Toyota Corolla based on its specifications.

# load data
data("ToyotaCorolla", package = "mlba")

a.

Fit a neural network model to the data. Use a single hidden layer with 2 nodes.

Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.
Remember to first scale the numerical predictor and outcome variables to a 0–1 scale (use function preprocess() with method = “range”—see Chapter 7) and convert categorical predictors to dummies.
Record the RMS error for the training data and the validation data. Repeat the process, changing the number of hidden layers and nodes to single layer with 5 nodes; two layers, 5 nodes in each layer.

Answer

# load libraries
library(caret)
library(nnet)
library(neuralnet)
library(forecast)

# Prepare data
Cars.df <- ToyotaCorolla |> 
  select(Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar, Price)

# Filter only character columns
character_columns <- Cars.df |> select(where(is.character))

# Create the dummy data frame
dummy_data <- dummyVars(" ~ .", data = character_columns) |>
  predict(character_columns)

# Remove the original character variables from Cars.df
Cars.df <- Cars.df |> select(-where(is.character))

# Bind the data frames by columns
Cars.df <- cbind(Cars.df, dummy_data)

# Scale except Price to 0-1 scale 
Cars.df <- Cars.df |> 
  select_if(is.numeric) |> # select numeric columns
  # select(-Price) |> # remove Price column
  preProcess(method = "range") |> # need to load library caret
  predict(Cars.df)

# Split data into training and test sets
train.index <- createDataPartition(Cars.df$Price, p = 0.6, list = FALSE)
train.df <- Cars.df[train.index, ]
test.df <- Cars.df[-train.index, ]

# Create nn1a model with 1 hidden layer and 2 nodes
nn1a <- neuralnet(Price ~ ., data = train.df, hidden = 2, linear.output = TRUE)

# Plot the neural network
plot(nn1a, rep = "best")

# Predict the training and test data of the neuralnet model
training.prediction <- compute(nn1a, train.df)
test.prediction <- compute(nn1a, test.df)


# Calculate RMSE step by step
RMSE(training.prediction$net.result, train.df$Price)

## [1] 0.03635764

RMSE(test.prediction$net.result, test.df$Price)

## [1] 0.0406216

Optional

# Create function to process neural network as previous
process.nn <- function(train.df, test.df, hidden = 2, linear.output = TRUE) {
  nn <- neuralnet(Price ~ ., data = train.df, hidden = hidden, linear.output = linear.output)
  plot(nn, rep = "best")
  training.prediction <- compute(nn, train.df)
  test.prediction <- compute(nn, test.df)
  data.frame(
    data = c("Training", "Test"),
    RMSE = c(RMSE(training.prediction$net.result, train.df$Price), RMSE(test.prediction$net.result, test.df$Price))
  ) |>
    kable(caption = "RMSE table", digits = 4) |>
    kable_styling(full_width = FALSE, position = "left")
}

# apply the function with 1 hidden layers: 5 nodes
process.nn(train.df, test.df, hidden = 5)

RMSE table
data	RMSE
Training	0.0335
Test	0.0405

# apply the function with 2 hidden layers: 5 nodes and 5 nodes
process.nn(train.df, test.df, hidden = c(5,5))

RMSE table
data	RMSE
Training	0.0343
Test	0.0406

# apply the function with 1 hidden layers: 1 nodes
process.nn(train.df, test.df, hidden = 1)

RMSE table
data	RMSE
Training	0.0380
Test	0.0395

What happens to the RMS error for the training data as the number of layers and nodes increases?
What happens to the RMS error for the validation data?
Comment on the appropriate number of layers and nodes for this application.

Answer

Compare the RMS error for the training data as the number of layers and nodes increases:
compare the RMS error for the validation data as the number of layers and nodes increases:
The optimal number of layers and nodes is …

Data Mining Homework 4

Van Tien Le

2023-11-22

9.2

a.

Answer

b.

Answer

c.

Question

Answer

Answer

11.3

a.

Answer

Answer