# Create a sequence of probability values from 0 to 1
p_m1 <- seq(0, 1, by = 0.01)
# Calculate metrics for each probability value
# Gini index: 2*p*(1-p)
gini <- 2 * p_m1 * (1 - p_m1)
# Classification error: min(p, 1-p)
class_error <- pmin(p_m1, 1 - p_m1)
# Entropy: -p*log2(p) - (1-p)*log2(1-p)
# Handle edge cases where p=0 or p=1 (log(0) is undefined)
entropy <- numeric(length(p_m1))
for (i in seq_along(p_m1)) {
  p <- p_m1[i]
  if (p == 0 || p == 1) {
    entropy[i] <- 0  # Entropy is defined as 0 when p = 0 or p = 1
  } else {
    entropy[i] <- -p * log2(p) - (1 - p) * log2(1 - p)
  }
}
# Create the plot
plot(p_m1, gini, type = "l", col = "blue", lwd = 2,
     xlab = "Probability of Class 1 (p_m1)",
     ylab = "Value",
     main = "Comparison of Impurity Measures",
     ylim = c(0, 1))
# Add lines for classification error and entropy
lines(p_m1, class_error, col = "red", lwd = 2)
lines(p_m1, entropy, col = "green", lwd = 2)
# Add a legend
legend("topright",
legend = c("Gini Index", "Classification Error", "Entropy"),
col = c("blue", "red", "green"),
lwd = 2)
Interpretation:
All three measures reach zero at p_m1 = 0 and p_m1 = 1, indicating perfect purity when all observations belong to a single class. All three peak when the classes are most mixed, at p_m1 = 0.5, but with different maximum values and curve shapes:
Entropy (green) reaches the highest maximum (1.0) and rises most steeply near the endpoints, making it the most sensitive of the three to small departures from purity.
The Gini index (blue) is a gentler parabola that peaks at 0.5.
Classification error (red) increases linearly up to p_m1 = 0.5 and then decreases linearly.
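As a quick numerical check of these maxima, the three formulas above can be evaluated directly at p = 0.5:
# Evaluate each impurity measure at p = 0.5, where the classes are most mixed
p <- 0.5
2 * p * (1 - p)                       # Gini index: 0.5
pmin(p, 1 - p)                        # classification error: 0.5
-p * log2(p) - (1 - p) * log2(1 - p)  # entropy: 1.0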
library(rpart)
library(rpart.plot)
# Create a synthetic dataset that will result in the desired tree structure
# Based on the partition in the left panel of Figure 8.14
set.seed(123)
n <- 1000
x1 <- runif(n, -0.5, 1.5)
x2 <- runif(n, -0.5, 1.5)
# Create response variable based on the regions in the figure
y <- numeric(n)
for (i in 1:n) {
  if (x1[i] < 0) {
    y[i] <- 3   # bottom-left region
  } else if (x1[i] < 1 && x2[i] < 0) {
    y[i] <- 10  # bottom-middle region
  } else if (x1[i] < 1 && x2[i] >= 0) {
    y[i] <- 15  # top-left region
  } else {
    y[i] <- 5   # right region
  }
}
# Add some noise to make it more realistic
y <- y + rnorm(n, 0, 0.1)
# Create data frame
data <- data.frame(x1 = x1, x2 = x2, y = y)
# Fit a regression tree to the synthetic data
tree <- rpart(y ~ x1 + x2, data = data,
              control = rpart.control(minsplit = 5, cp = 0.001))
# Prune the tree to get the desired structure
pruned_tree <- prune(tree, cp = 0.01)
# Plot the tree
rpart.plot(pruned_tree, type = 4, extra = 101,
           main = "Tree Corresponding to Left Panel Partition",
           box.palette = "auto")
# Reference table of the region boundaries and mean responses from the right-panel tree
# (the plot itself is drawn with base R below)
regions <- data.frame(
  x1_min = c(0, 1, 0, 1),
  x1_max = c(1, 2, 1, 2),
  x2_min = c(0, 0, 1, 1),
  x2_max = c(1, 1, 2, 2),
  mean_value = c(-1.80, 0.63, -1.06, 2.49)
)
# Create the plot using base R
plot_partition <- function() {
  # Set up the plot
  plot(c(-0.5, 2.5), c(-0.5, 2.5), type = "n", xlab = "X1", ylab = "X2",
       main = "Partition Based on Right Panel Tree")
  # Draw the partition lines
  abline(h = 1, lty = 2)  # x2 = 1 split
  abline(v = 1, lty = 2)  # x1 = 1 split
  abline(v = 0, lty = 2)  # x1 = 0 split
  # Add shaded rectangles for each region
  rect(0, 0, 1, 1, col = rgb(0.9, 0.9, 0.9, 0.5), border = NA)
  rect(1, 0, 2, 1, col = rgb(0.8, 0.8, 0.9, 0.5), border = NA)
  rect(0, 1, 1, 2, col = rgb(0.9, 0.8, 0.8, 0.5), border = NA)
  rect(1, 1, 2, 2, col = rgb(0.8, 0.9, 0.8, 0.5), border = NA)
  # Add the mean response to each region
  text(0.5, 0.5, "-1.80", cex = 1.2)  # x2 < 1 and x1 < 1
  text(1.5, 0.5, "0.63", cex = 1.2)   # x2 < 1 and x1 >= 1
  text(0.5, 1.5, "-1.06", cex = 1.2)  # x2 >= 1 and x1 < 1
  text(1.5, 1.5, "2.49", cex = 1.2)   # x2 >= 1 and x1 >= 1
}
# Create the partition plot
plot_partition()
# Given probabilities from 10 bootstrapped samples
probabilities <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
# Convert probabilities to class predictions (TRUE if Red, FALSE if Green)
class_predictions <- probabilities > 0.5
# Count votes for Red class
votes_red <- sum(class_predictions)
# Count votes for Green class
votes_green <- length(probabilities) - votes_red
# Determine final classification based on majority vote
final_class_majority_vote <- ifelse(votes_red > votes_green, "Red", "Green")
final_class_majority_vote
## [1] "Red"
# Approach 2: Average Probability
# Calculate the average probability
average_probability <- mean(probabilities)
# Determine final classification based on average probability
final_class_avg_prob <- ifelse(average_probability > 0.5, "Red", "Green")
# Print results
cat("Approach 1 (Majority Vote):\n")
## Approach 1 (Majority Vote):
cat("Votes for Red:", votes_red, "\n")
## Votes for Red: 6
cat("Votes for Green:", votes_green, "\n")
## Votes for Green: 4
cat("Final Classification:", final_class_majority_vote, "\n\n")
## Final Classification: Red
cat("Approach 2 (Average Probability):\n")
## Approach 2 (Average Probability):
cat("Average Probability:", average_probability, "\n")
## Average Probability: 0.45
cat("Final Classification:", final_class_avg_prob, "\n")
## Final Classification: Green
The algorithm used to fit a regression tree is called recursive binary splitting. It begins by considering all possible splits across all features and chooses the split that divides the data into two regions so that the total residual sum of squares (RSS) across the two resulting regions is minimized.
Once the best split is made, the process is repeated recursively on each resulting region. This continues until a stopping criterion is met, such as a minimum number of observations in a node or a maximum tree depth.
The final prediction for each terminal node (leaf) is the average of the response values in that node. This greedy, top-down approach builds the tree by choosing the locally best split at each step.
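To make the split-selection step concrete, the sketch below scans candidate cutpoints for a single predictor and returns the one that minimizes the total RSS of the two resulting regions. Note that best_split is an illustrative helper written for this explanation, not a function from rpart.
# Illustrative helper: find the best split point for one predictor by minimizing RSS
best_split <- function(x, y) {
  xs <- sort(unique(x))
  cutpoints <- (head(xs, -1) + tail(xs, -1)) / 2  # midpoints between adjacent observed values
  rss <- sapply(cutpoints, function(s) {
    left  <- y[x <  s]
    right <- y[x >= s]
    sum((left - mean(left))^2) + sum((right - mean(right))^2)
  })
  list(cutpoint = cutpoints[which.min(rss)], rss = min(rss))
}
# Example: the best first split on x1 for the synthetic data generated above
best_split(data$x1, data$y)
Recursive binary splitting applies this same search to every predictor, keeps the single best split found, and then repeats the search separately within each of the two resulting regions.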