Exercise 6

3.

# Create a sequence of probability values from 0 to 1
p_m1 <- seq(0, 1, by = 0.01)

# Calculate metrics for each probability value
# Gini index: 2*p*(1-p)
gini <- 2 * p_m1 * (1 - p_m1)

# Classification error: min(p, 1-p)
class_error <- pmin(p_m1, 1 - p_m1)
# Entropy: -p*log2(p) - (1-p)*log2(1-p)
# Handle edge cases where p=0 or p=1 (log(0) is undefined)
entropy <- numeric(length(p_m1))
for (i in 1:length(p_m1)) {
  p <- p_m1[i]
  if (p == 0 || p == 1) {
    entropy[i] <- 0  # Entropy is 0 when p=0 or p=1
  } else {
    entropy[i] <- -p * log2(p) - (1-p) * log2(1-p)
  }
}
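
Equivalently, the edge cases can be handled without an explicit loop. The following vectorized sketch is an alternative formulation, not part of the original solution (the helper name ent_term is illustrative); it should reproduce the entropy values computed above:

# Vectorized alternative: define each term so that 0 * log2(0) counts as 0
ent_term <- function(p) ifelse(p > 0, -p * log2(p), 0)
entropy_vec <- ent_term(p_m1) + ent_term(1 - p_m1)
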
# Create the plot
plot(p_m1, gini, type = "l", col = "blue", lwd = 2,
     xlab = "Probability of Class 1 (p_m1)",
     ylab = "Value",
     main = "Comparison of Impurity Measures",
     ylim = c(0, 1))

# Add lines for classification error and entropy
lines(p_m1, class_error, col = "red", lwd = 2)
lines(p_m1, entropy, col = "green", lwd = 2)

# Add a legend
legend("topright", 
       legend = c("Gini Index", "Classification Error", "Entropy"),
       col = c("blue", "red", "green"),
       lwd = 2)

Interpretation:

All three measures reach zero at p_m1 = 0 and p_m1 = 1, indicating perfect purity when all observations belong to a single class. All three peak where the classes are most evenly mixed, at p_m1 = 0.5, but with different maximum values and curve shapes (a quick numeric check follows the list):

  • Entropy (green) attains the largest maximum (1.0, using base-2 logarithms) and is steepest near the endpoints, making it the most sensitive to changes in the class distribution

  • Gini index (blue) is a parabola that peaks at a value of 0.5, giving a more moderate measure

  • Classification error (red) increases linearly to its maximum of 0.5 at p_m1 = 0.5, then decreases linearly
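
As a quick numeric check of these maxima (an addition, not part of the original plot), evaluating the three measures at p_m1 = 0.5 gives 0.5, 0.5 and 1.0 respectively:

# Values of the three measures at the point of maximal impurity, p = 0.5
p <- 0.5
c(gini = 2 * p * (1 - p),
  class_error = min(p, 1 - p),
  entropy = -p * log2(p) - (1 - p) * log2(1 - p))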


4.

a. Creating the Tree from the Left Panel Partition

library(rpart)
library(rpart.plot)
# Create a synthetic dataset that will result in the desired tree structure
# Based on the partition in the left panel of Figure 8.14
set.seed(123)
n <- 1000
x1 <- runif(n, -0.5, 1.5)
x2 <- runif(n, -0.5, 1.5)

# Create response variable based on the regions in the figure
y <- numeric(n)
for (i in 1:n) {
  if (x1[i] < 0) {
    y[i] <- 3   # Left region (x1 < 0, any x2)
  } else if (x1[i] < 1 && x2[i] < 0) {
    y[i] <- 10  # Bottom middle region (0 <= x1 < 1, x2 < 0)
  } else if (x1[i] < 1 && x2[i] >= 0) {
    y[i] <- 15  # Top middle region (0 <= x1 < 1, x2 >= 0)
  } else {
    y[i] <- 5   # Right region (x1 >= 1, any x2)
  }
}

# Add some noise to make it more realistic
y <- y + rnorm(n, 0, 0.1)

# Create data frame
data <- data.frame(x1 = x1, x2 = x2, y = y)
# Fit a deep initial tree; it will be pruned back to the desired structure
tree <- rpart(y ~ x1 + x2, data = data, 
              control = rpart.control(minsplit = 5, cp = 0.001))

# Prune the tree to get the desired structure
pruned_tree <- prune(tree, cp = 0.01)

# Plot the tree
rpart.plot(pruned_tree, type = 4, extra = 101, 
           main = "Tree Corresponding to Left Panel Partition",
           box.palette = "auto")
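
As an optional sanity check (an addition, not part of the original output), the printed splits of the pruned tree should fall close to x1 = 0, x1 = 1 and x2 = 0, with leaf means near the region values 3, 10, 15 and 5 used to generate y:

# Optional check: inspect the split points and approximate leaf means
print(pruned_tree)
round(sort(unique(predict(pruned_tree))), 2)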

b. Creating the Partition Diagram from the Right Panel Tree

# Create a data frame for the regions
regions <- data.frame(
  x1_min = c(0, 1, 0, 1),
  x1_max = c(1, 2, 1, 2),
  x2_min = c(0, 0, 1, 1),
  x2_max = c(1, 1, 2, 2),
  mean_value = c(-1.80, 0.63, -1.06, 2.49)
)
# Create the plot using base R, shading and labelling each region in 'regions'
plot_partition <- function() {
  # Set up an empty plotting window covering the predictor space
  plot(c(-0.5, 2.5), c(-0.5, 2.5), type = "n", xlab = "X1", ylab = "X2",
       main = "Partition Based on Right Panel Tree")
  
  # Draw the partition lines
  abline(h = 1, lty = 2)  # x2 = 1 split
  abline(v = 1, lty = 2)  # x1 = 1 split
  abline(v = 0, lty = 2)  # x1 = 0 split
  
  # Shade each region and label it with its leaf mean
  # (rows of 'regions': x2 < 1 & x1 < 1, x2 < 1 & x1 >= 1,
  #  x2 >= 1 & x1 < 1, x2 >= 1 & x1 >= 1)
  shades <- c(rgb(0.9, 0.9, 0.9, 0.5), rgb(0.8, 0.8, 0.9, 0.5),
              rgb(0.9, 0.8, 0.8, 0.5), rgb(0.8, 0.9, 0.8, 0.5))
  for (i in seq_len(nrow(regions))) {
    rect(regions$x1_min[i], regions$x2_min[i],
         regions$x1_max[i], regions$x2_max[i],
         col = shades[i], border = NA)
    text((regions$x1_min[i] + regions$x1_max[i]) / 2,
         (regions$x2_min[i] + regions$x2_max[i]) / 2,
         sprintf("%.2f", regions$mean_value[i]), cex = 1.2)
  }
}
# Create the partition plot
plot_partition()
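
To make the correspondence between the tree and the partition explicit, the same four regions can be written as a small prediction rule. This is a minimal sketch based only on the region means shown above (the function name predict_right_tree is illustrative):

# Sketch: map a point (x1, x2) to the mean of its region under the right-panel tree
predict_right_tree <- function(x1, x2) {
  if (x2 < 1) {
    if (x1 < 1) -1.80 else 0.63
  } else {
    if (x1 < 1) -1.06 else 2.49
  }
}
predict_right_tree(0.5, 1.5)  # x1 < 1 and x2 >= 1, so the predicted mean is -1.06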


5.

# Estimates of P(Class is Red | X) from 10 bootstrapped samples
probabilities <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
# Convert probabilities to class predictions (TRUE if Red, FALSE if Green)
class_predictions <- probabilities > 0.5
# Count votes for Red class
votes_red <- sum(class_predictions)
# Count votes for Green class
votes_green <- length(probabilities) - votes_red
# Determine final classification based on majority vote
final_class_majority_vote <- ifelse(votes_red > votes_green, "Red", "Green")
final_class_majority_vote
## [1] "Red"
# Approach 2: Average Probability
# Calculate the average probability
average_probability <- mean(probabilities)
# Determine final classification based on average probability
final_class_avg_prob <- ifelse(average_probability > 0.5, "Red", "Green")

# Print results
cat("Approach 1 (Majority Vote):\n")
## Approach 1 (Majority Vote):
cat("Votes for Red:", votes_red, "\n")
## Votes for Red: 6
cat("Votes for Green:", votes_green, "\n")
## Votes for Green: 4
cat("Final Classification:", final_class_majority_vote, "\n\n")
## Final Classification: Red
cat("Approach 2 (Average Probability):\n")
## Approach 2 (Average Probability):
cat("Average Probability:", average_probability, "\n")
## Average Probability: 0.45
cat("Final Classification:", final_class_avg_prob, "\n")
## Final Classification: Green

Interpretation:

The two approaches disagree: the majority vote classifies the observation as Red because six of the ten estimated probabilities exceed 0.5, whereas the average of the ten probabilities is 0.45 < 0.5, so the averaging approach classifies it as Green.
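
The two rules can also be wrapped in a small helper for reuse. This is an illustrative sketch (the name bag_classify and the fixed 0.5 threshold are assumptions, not part of the original solution):

# Illustrative helper: classify from bootstrap probabilities by either rule
bag_classify <- function(probs, method = c("vote", "average")) {
  method <- match.arg(method)
  score <- if (method == "vote") mean(probs > 0.5) else mean(probs)
  if (score > 0.5) "Red" else "Green"
}
bag_classify(probabilities, "vote")     # "Red"
bag_classify(probabilities, "average")  # "Green"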

6.