This decision tree model is based on the Boston Housing Dataset, which is available on Kaggle (https://www.kaggle.com/code/prasadperera/the-boston-housing-dataset/input?select=housing.csv). The code below loads the data as the `Boston` data frame from the MASS package.
install.packages("MASS")
## The following package(s) will be installed:
## - MASS [7.3-60]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing MASS ... OK [linked from cache]
## Successfully installed 1 package in 9.3 milliseconds.
# Load MASS, which provides the `Boston` data frame used throughout.
# `Boston` is deliberately NOT attach()ed: every later step refers to the data
# frame explicitly (`Boston %>% ...`, `data = train_data`), and attaching would
# put all of its columns on the search path where they can silently shadow or
# be shadowed by other objects.
library(MASS)
#Install Packages
install.packages("tidymodels")
## The following package(s) will be installed:
## - tidymodels [1.1.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing tidymodels ... OK [linked from cache]
## Successfully installed 1 package in 2.4 milliseconds.
install.packages("tidyr")
## The following package(s) will be installed:
## - tidyr [1.3.0]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing tidyr ... OK [linked from cache]
## Successfully installed 1 package in 2.4 milliseconds.
# Load the library
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ recipes 1.0.8
## ✔ dials 1.2.0 ✔ rsample 1.2.0
## ✔ dplyr 1.1.2 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.3 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.2
## ✔ modeldata 1.2.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.1 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.2 ✔ yardstick 1.2.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(tidyr)
# Reshape Boston to long form: one row per (variable, value) pair, so that
# ggplot2 can facet on the variable name.
boston_data_long <- pivot_longer(
  Boston,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# Build one histogram panel per variable. `scales = "free"` gives every panel
# its own x and y axes, since the variables are on very different scales.
boston_histograms <- ggplot(data = boston_data_long, mapping = aes(x = value)) +
  geom_histogram(bins = 30, color = "black", fill = "lightblue") +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(
    title = "Histograms of Numeric Variables in the Boston Housing Dataset",
    x = "Value",
    y = "Frequency"
  ) +
  theme_minimal()

# Render the faceted histograms
print(boston_histograms)
# Reproducible 75% / 25% train/test partition of Boston.
# The seed is fixed immediately before the random split so reruns give the
# same partition.
set.seed(123)
data_split <- Boston %>%
  initial_split(prop = 0.75)
train_data <- data_split %>%
  training()
test_data <- data_split %>%
  testing()
# Regression tree specification backed by the rpart engine
tree_spec <- decision_tree(mode = "regression", engine = "rpart")

# Train on the training partition, regressing medv on every other column
tree_fit <- fit(tree_spec, medv ~ ., data = train_data)
# Score the held-out rows and keep the predicted medv values as a bare vector
predictions <- predict(tree_fit, new_data = test_data)[[".pred"]]

# Evaluate on the test set with RMSE and R-squared: attach the predictions as
# a column next to the observed medv, then compute both metrics in one call.
metrics <- metric_set(rmse, rsq)
model_performance <- metrics(
  mutate(test_data, predictions = predictions),
  truth = medv,
  estimate = predictions
)
print(model_performance)
## # A tibble: 2 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 5.22
## 2 rsq standard 0.689
# Predict medv for a single hand-entered observation.
# The row supplies one value for each of the 13 predictor columns.
new_data <- tibble(
  crim = 0.03237,
  zn = 0,
  indus = 2.18,
  chas = 0,
  nox = 0.458,
  rm = 6.998,
  age = 45.8,
  dis = 6.0622,
  rad = 3,
  tax = 222,
  ptratio = 18.7,
  black = 394.63,
  lstat = 2.94
)

# Note: this reassigns `predictions`, which now holds the tibble returned by
# predict() for the new row rather than the earlier test-set vector.
predictions <- predict(tree_fit, new_data = new_data)
print(predictions)
## # A tibble: 1 × 1
## .pred
## <dbl>
## 1 34.3
# Install and Load the library
install.packages("rpart.plot")
## The following package(s) will be installed:
## - rpart.plot [3.1.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing rpart.plot ... OK [linked from cache]
## Successfully installed 1 package in 2.3 milliseconds.
library(rpart.plot)
## Loading required package: rpart
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
# Plot the decision tree
# extract_fit_engine() is the supported parsnip accessor for the underlying
# rpart object (instead of reaching into tree_fit$fit directly).
tree_engine <- extract_fit_engine(tree_fit)

# The rpart object built through parsnip does not carry the training data, so
# rpart.plot cannot decide whether split values should be rounded to integers
# and warns about it. roundint = FALSE states that choice explicitly and
# silences the warning, as the warning message itself recommends.
rpart.plot(
  tree_engine,
  type = 4,
  extra = 101,
  under = TRUE,
  cex = 0.8,
  box.palette = "auto",
  roundint = FALSE
)

# Extract the tree as human-readable rules (same roundint reasoning as above)
rules <- rpart.rules(tree_engine, roundint = FALSE)
print(rules)
## medv
## 11 when rm < 6.9 & lstat >= 15 & crim >= 7.5
## 17 when rm < 6.9 & lstat >= 15 & crim < 7.5
## 22 when rm < 6.5 & lstat < 15 & dis >= 1.6
## 26 when rm is 6.9 to 7.4 & lstat >= 7
## 27 when rm is 6.5 to 6.9 & lstat < 15
## 34 when rm < 6.5 & lstat < 15 & dis < 1.6
## 34 when rm is 6.9 to 7.4 & lstat < 7
## 47 when rm >= 7.4
## The following package(s) will be installed:
## - vip [0.4.1]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing vip ... OK [linked from cache]
## Successfully installed 1 package in 2.6 milliseconds.
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
Note that the `echo = FALSE` parameter was added to the
code chunk to prevent the R code that generated the plot
from being printed.