Decision Tree Example using the Boston Housing Dataset

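This decision tree model uses the Boston Housing dataset. The data are also published on Kaggle (https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset/input?select=housing.csv), but the code below reads the copy shipped with the R package MASS as the `Boston` data frame.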

# Install MASS, the package that provides the `Boston` data frame used below
install.packages("MASS")
## The following package(s) will be installed:
## - MASS [7.3-60]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing MASS ...                           OK [linked from cache]
## Successfully installed 1 package in 9.3 milliseconds.
# Loading MASS makes `Boston` available by name. The data frame is
# deliberately not attach()-ed: every later step passes it explicitly, and
# attach() would put its columns on the search path, where they can silently
# mask (or be masked by) other objects.
library(MASS)


# Install the modelling and reshaping packages used in the rest of the script
install.packages("tidymodels")
## The following package(s) will be installed:
## - tidymodels [1.1.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing tidymodels ...                     OK [linked from cache]
## Successfully installed 1 package in 2.4 milliseconds.
install.packages("tidyr")
## The following package(s) will be installed:
## - tidyr [1.3.0]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing tidyr ...                          OK [linked from cache]
## Successfully installed 1 package in 2.4 milliseconds.
# Load the libraries. The startup listing below shows that attaching
# tidymodels already attaches tidyr, so the later library(tidyr) call is
# redundant but harmless. Note the reported conflict: dplyr::select() masks
# MASS::select() from this point on.
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ recipes      1.0.8
## ✔ dials        1.2.0     ✔ rsample      1.2.0
## ✔ dplyr        1.1.2     ✔ tibble       3.2.1
## ✔ ggplot2      3.4.3     ✔ tidyr        1.3.0
## ✔ infer        1.0.4     ✔ tune         1.1.2
## ✔ modeldata    1.2.0     ✔ workflows    1.1.3
## ✔ parsnip      1.1.1     ✔ workflowsets 1.0.1
## ✔ purrr        1.0.2     ✔ yardstick    1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::select()  masks MASS::select()
## ✖ recipes::step()  masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyr)

# Reshape Boston to long format: every column is stacked into a
# (variable, value) pair so that ggplot2 can facet on the variable name
boston_data_long <- pivot_longer(
  Boston,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# Build a single faceted figure: one 30-bin histogram per variable, each
# panel with its own axis scales, laid out four panels per row
boston_histograms <- ggplot(data = boston_data_long, mapping = aes(x = value)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  facet_wrap(vars(variable), ncol = 4, scales = "free") +
  ggtitle("Histograms of Numeric Variables in the Boston Housing Dataset") +
  xlab("Value") +
  ylab("Frequency") +
  theme_minimal()

# Render the figure
print(boston_histograms)

# Randomly assign 75% of the rows to training and the rest to testing;
# the fixed seed makes the split reproducible
set.seed(123)
data_split <- Boston %>% initial_split(prop = 0.75)
train_data <- data_split %>% training()
test_data <- data_split %>% testing()

# Specify a regression tree that is fitted by the rpart engine
tree_spec <- decision_tree(mode = "regression", engine = "rpart")

# Train on the training set, regressing medv on every other column
tree_fit <- fit(tree_spec, medv ~ ., data = train_data)

# Predict medv for the held-out rows and keep the result as a plain vector
predictions <- predict(tree_fit, new_data = test_data)[[".pred"]]

# Score the predictions against the observed medv with RMSE and R-squared
metrics <- metric_set(rmse, rsq)
model_performance <- metrics(
  mutate(test_data, predictions = predictions),
  truth = medv,
  estimate = predictions
)

print(model_performance)
## # A tibble: 2 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       5.22 
## 2 rsq     standard       0.689
# Predict medv for one hand-written observation.
# Note: this reuses the name `predictions`, replacing the test-set vector
# created earlier with a one-row tibble.
new_data <- tibble(
  crim = 0.03237, zn = 0, indus = 2.18, chas = 0, nox = 0.458, rm = 6.998,
  age = 45.8, dis = 6.0622, rad = 3, tax = 222, ptratio = 18.7,
  black = 394.63, lstat = 2.94
)

predictions <- tree_fit %>% predict(new_data)
print(predictions)
## # A tibble: 1 × 1
##   .pred
##   <dbl>
## 1  34.3
# Install and load rpart.plot, which provides the rpart.plot() and
# rpart.rules() functions used to display the fitted tree. Loading it also
# attaches rpart, whose prune() masks dials::prune() (see the message below).
install.packages("rpart.plot")
## The following package(s) will be installed:
## - rpart.plot [3.1.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing rpart.plot ...                     OK [linked from cache]
## Successfully installed 1 package in 2.3 milliseconds.
library(rpart.plot)
## Loading required package: rpart
## 
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
## 
##     prune
# Plot the decision tree and list its rules.
#
# extract_fit_engine() is the supported accessor for the underlying rpart
# object stored inside the parsnip fit (instead of reaching into `$fit`).
tree_engine <- extract_fit_engine(tree_fit)

# rpart.plot cannot retrieve the training data from this rpart object, so it
# cannot decide whether split values should be rounded to integers and warns
# on every call. Passing roundint = FALSE states that choice explicitly, which
# is the remedy the warning itself recommends.
rpart.plot(
  tree_engine,
  type = 4,
  extra = 101,
  under = TRUE,
  cex = 0.8,
  box.palette = "auto",
  roundint = FALSE
)

# One row per leaf: the predicted medv and the split conditions leading to it
rules <- rpart.rules(tree_engine, roundint = FALSE)
print(rules)
##  medv                                                               
##    11 when rm <  6.9        & lstat >= 15 & crim >= 7.5             
##    17 when rm <  6.9        & lstat >= 15 & crim <  7.5             
##    22 when rm <  6.5        & lstat <  15               & dis >= 1.6
##    26 when rm is 6.9 to 7.4 & lstat >=  7                           
##    27 when rm is 6.5 to 6.9 & lstat <  15                           
##    34 when rm <  6.5        & lstat <  15               & dis <  1.6
##    34 when rm is 6.9 to 7.4 & lstat <   7                           
##    47 when rm >=        7.4

Plot the most important variables

## The following package(s) will be installed:
## - vip [0.4.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing vip ...                            OK [linked from cache]
## Successfully installed 1 package in 2.6 milliseconds.
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

Note that the `echo = FALSE` option was set on the code chunk above, so the R code that generated the variable importance plot is not printed; only its output is shown.