library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(tidyverse)
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(dplyr)
mushrooms <- read.csv("C:/Users/MSKR/Downloads/mushrooms.csv", header = TRUE)
str(mushrooms)
## 'data.frame': 8124 obs. of 23 variables:
## $ class : chr "p" "e" "e" "p" ...
## $ cap.shape : chr "x" "x" "b" "x" ...
## $ cap.surface : chr "s" "s" "s" "y" ...
## $ cap.color : chr "n" "y" "w" "w" ...
## $ bruises : chr "t" "t" "t" "t" ...
## $ odor : chr "p" "a" "l" "p" ...
## $ gill.attachment : chr "f" "f" "f" "f" ...
## $ gill.spacing : chr "c" "c" "c" "c" ...
## $ gill.size : chr "n" "b" "b" "n" ...
## $ gill.color : chr "k" "k" "n" "n" ...
## $ stalk.shape : chr "e" "e" "e" "e" ...
## $ stalk.root : chr "e" "c" "c" "e" ...
## $ stalk.surface.above.ring: chr "s" "s" "s" "s" ...
## $ stalk.surface.below.ring: chr "s" "s" "s" "s" ...
## $ stalk.color.above.ring : chr "w" "w" "w" "w" ...
## $ stalk.color.below.ring : chr "w" "w" "w" "w" ...
## $ veil.type : chr "p" "p" "p" "p" ...
## $ veil.color : chr "w" "w" "w" "w" ...
## $ ring.number : chr "o" "o" "o" "o" ...
## $ ring.type : chr "p" "p" "p" "p" ...
## $ spore.print.color : chr "k" "n" "n" "k" ...
## $ population : chr "s" "n" "n" "s" ...
## $ habitat : chr "u" "g" "m" "u" ...
# Convert character columns to factors (all predictors and the class label are categorical)
mushrooms <- mushrooms %>%
mutate_if(is.character, as.factor)
# Split data randomly into training and test sets (50% each)
set.seed(1234)
train_index <- sample(1:nrow(mushrooms), size = 0.5 * nrow(mushrooms))
train_data <- mushrooms[train_index, ]
test_data <- mushrooms[-train_index, ]
# Verify splits
cat("Training set dimensions:", dim(train_data), "\n")
## Training set dimensions: 4062 23
cat("Test set dimensions:", dim(test_data), "\n")
## Test set dimensions: 4062 23
# Build full classification tree
tree_model <- rpart(class ~ ., data = train_data, method = "class")
# Examine complexity parameter table
printcp(tree_model)
##
## Classification tree:
## rpart(formula = class ~ ., data = train_data, method = "class")
##
## Variables actually used in tree construction:
## [1] odor spore.print.color
##
## Root node error: 1953/4062 = 0.4808
##
## n= 4062
##
## CP nsplit rel error xerror xstd
## 1 0.970302 0 1.000000 1.000000 0.0163049
## 2 0.016385 1 0.029698 0.029698 0.0038716
## 3 0.010000 2 0.013313 0.013313 0.0026025
plotcp(tree_model)
# Find optimal complexity parameter
opt_cp <- tree_model$cptable[which.min(tree_model$cptable[,"xerror"]), "CP"]
cat("Optimal complexity parameter:", opt_cp, "\n")
## Optimal complexity parameter: 0.01
# Prune the tree
pruned_tree <- prune(tree_model, cp = opt_cp)
# Visualize the pruned tree
rpart.plot(pruned_tree, extra = 104, box.palette = "RdBu",
main = "Classification Tree for Mushroom Edibility")
# Test model performance
tree_pred <- predict(pruned_tree, newdata = test_data, type = "class")
tree_accuracy <- mean(tree_pred == test_data$class)
cat("CART accuracy on test set:", tree_accuracy, "\n")
## CART accuracy on test set: 0.9945839
Insight:
The classification tree shows that odor is the most important feature for determining whether a mushroom is edible (e) or poisonous (p). If a mushroom's odor is almond, anise, or none (a, l, n), it is almost always edible (left side of the tree). Among those, spore print color (specifically being b, h, k, n, o, u, w, or y) provides further refinement, but most still end up edible. If the odor is anything else, the mushroom is classified as poisonous (right side of the tree). Overall, odor alone separates most edible mushrooms from poisonous ones with very high accuracy, which suggests that odor is a dominant predictor of mushroom edibility in this dataset.
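As a quick sanity check on this insight (a sketch added here, not part of the original output), the odor/class relationship can be cross-tabulated directly on the training data:
# Cross-tabulate odor against class in the training set (output not shown)
table(odor = train_data$odor, class = train_data$class)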
# Try different values for B (number of trees)
b_values <- c(50, 100, 200, 500)
bagging_results <- data.frame(B = integer(), Accuracy = numeric())
for (b in b_values) {
# Build bagging model (mtry = p for bagging)
bag_model <- randomForest(class ~ ., data = train_data,
mtry = ncol(train_data) - 1, # All predictors
ntree = b,
importance = TRUE)
# Test performance
bag_pred <- predict(bag_model, newdata = test_data)
bag_accuracy <- mean(bag_pred == test_data$class)
bagging_results <- rbind(bagging_results,
data.frame(B = b, Accuracy = bag_accuracy))
cat("Bagging with B =", b, "- Accuracy:", bag_accuracy, "\n")
}
## Bagging with B = 50 - Accuracy: 0.9995076
## Bagging with B = 100 - Accuracy: 0.9995076
## Bagging with B = 200 - Accuracy: 0.9995076
## Bagging with B = 500 - Accuracy: 0.9995076
# Plot accuracy vs number of trees
plot(bagging_results$B, bagging_results$Accuracy,
type = "b", xlab = "Number of Trees (B)", ylab = "Accuracy",
main = "Bagging: Accuracy vs Number of Trees")
# Find optimal B
best_b <- bagging_results$B[which.max(bagging_results$Accuracy)]
cat("Best number of trees for bagging:", best_b, "\n")
## Best number of trees for bagging: 50
# Build final bagging model with optimal B
final_bag <- randomForest(class ~ ., data = train_data,
mtry = ncol(train_data) - 1,
ntree = best_b,
importance = TRUE)
# Variable importance plot
varImpPlot(final_bag, main = "Variable Importance - Bagging",
n.var = 10)
Insight:
Using Bagging (Bootstrap Aggregating) with different numbers of trees (B = 50, 100, 200, 500) produced a very high and consistent accuracy of ~99.95%.
The odor feature is by far the most important predictor, followed by spore print color and stalk color below the ring.
This confirms that a few strong features drive almost perfect classification performance in this dataset.
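To look beyond a single accuracy number (a sketch, relying only on the caret package already loaded above; output not shown), a confusion matrix for the final bagging model could be inspected:
# Confusion matrix for the final bagging model on the test set
final_bag_pred <- predict(final_bag, newdata = test_data)
confusionMatrix(final_bag_pred, test_data$class)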
# Try different combinations of mtry and B
mtry_values <- c(2, 4, floor(sqrt(ncol(train_data) - 1)))
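# Note: floor(sqrt(ncol(train_data) - 1)) = floor(sqrt(22)) = 4, so mtry = 4 is
# included twice here, which is why the mtry = 4 runs repeat in the output below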
b_values <- c(100, 300, 500)
rf_results <- data.frame(mtry = integer(), B = integer(), Accuracy = numeric())
for (mtry in mtry_values) {
for (b in b_values) {
# Build random forest model
rf_model <- randomForest(class ~ ., data = train_data,
mtry = mtry,
ntree = b,
importance = TRUE)
# Test performance
rf_pred <- predict(rf_model, newdata = test_data)
rf_accuracy <- mean(rf_pred == test_data$class)
rf_results <- rbind(rf_results,
data.frame(mtry = mtry, B = b, Accuracy = rf_accuracy))
cat("Random Forest with mtry =", mtry, "and B =", b, "- Accuracy:", rf_accuracy, "\n")
}
}
## Random Forest with mtry = 2 and B = 100 - Accuracy: 1
## Random Forest with mtry = 2 and B = 300 - Accuracy: 1
## Random Forest with mtry = 2 and B = 500 - Accuracy: 1
## Random Forest with mtry = 4 and B = 100 - Accuracy: 1
## Random Forest with mtry = 4 and B = 300 - Accuracy: 1
## Random Forest with mtry = 4 and B = 500 - Accuracy: 1
## Random Forest with mtry = 4 and B = 100 - Accuracy: 1
## Random Forest with mtry = 4 and B = 300 - Accuracy: 1
## Random Forest with mtry = 4 and B = 500 - Accuracy: 1
# Find optimal combination
best_rf <- rf_results[which.max(rf_results$Accuracy), ]
cat("Best Random Forest parameters: mtry =", best_rf$mtry, "and B =", best_rf$B, "\n")
## Best Random Forest parameters: mtry = 2 and B = 100
# Build final RF model with optimal parameters
final_rf <- randomForest(class ~ ., data = train_data,
mtry = best_rf$mtry,
ntree = best_rf$B,
importance = TRUE)
# Variable importance plot
varImpPlot(final_rf, main = "Variable Importance - Random Forest",
n.var = 10)
Insight:
The random forest perfectly classifies mushrooms as edible or poisonous regardless of the number of trees (B) or the number of features considered at each split (mtry).
Parameter tuning therefore has minimal impact: since all configurations achieve perfect test accuracy, fewer trees (B = 100) and fewer variables per split (mtry = 2) can be used for computational efficiency without sacrificing performance.
The variable importance plot confirms that odor is by far the most significant predictor, followed by spore print color and gill characteristics.
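One way to see how quickly the ensemble converges (a sketch, not part of the original analysis; output not shown) is to plot the out-of-bag error curves stored in the fitted randomForest object:
# Plot OOB error (overall and per class) against the number of trees
plot(final_rf, main = "Random Forest: OOB Error vs Number of Trees")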
# Try different combinations of lambda (learning rate) and B
lambda_values <- c(0.001, 0.01, 0.1)
b_values <- c(100, 500, 1000)
boost_results <- data.frame(lambda = numeric(), B = integer(), Accuracy = numeric())
for (lambda in lambda_values) {
for (b in b_values) {
# Prepare data for gbm (need numeric response)
train_boost <- train_data
train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)
# Build boosting model
boost_model <- gbm(class_num ~ . - class, data = train_boost,
distribution = "bernoulli",
n.trees = b,
interaction.depth = 1, # Using stumps as requested
shrinkage = lambda,
verbose = FALSE)
# Test performance
test_boost <- test_data
boost_pred_prob <- predict(boost_model, newdata = test_boost,
n.trees = b, type = "response")
boost_pred_class <- ifelse(boost_pred_prob > 0.5, "p", "e")
boost_accuracy <- mean(boost_pred_class == test_data$class)
boost_results <- rbind(boost_results,
data.frame(lambda = lambda, B = b, Accuracy = boost_accuracy))
cat("Boosting with lambda =", lambda, "and B =", b, "- Accuracy:", boost_accuracy, "\n")
}
}
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 100 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 500 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.001 and B = 1000 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 100 - Accuracy: 0.9847366
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 500 - Accuracy: 0.9945839
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.01 and B = 1000 - Accuracy: 0.9975382
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 100 - Accuracy: 0.9975382
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 500 - Accuracy: 1
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
## Boosting with lambda = 0.1 and B = 1000 - Accuracy: 1
# Find optimal combination
best_boost <- boost_results[which.max(boost_results$Accuracy), ]
cat("Best Boosting parameters: lambda =", best_boost$lambda, "and B =", best_boost$B, "\n")
## Best Boosting parameters: lambda = 0.1 and B = 500
# Visualize relationship with heatmap
library(ggplot2)
ggplot(boost_results, aes(x = factor(B), y = factor(lambda), fill = Accuracy)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(x = "Number of Trees (B)", y = "Learning Rate (lambda)",
title = "Boosting: Accuracy by lambda and B")
# Build final boosting model with optimal parameters
train_boost <- train_data
train_boost$class_num <- ifelse(train_data$class == "p", 1, 0)
final_boost <- gbm(class_num ~ . - class, data = train_boost,
distribution = "bernoulli",
n.trees = best_boost$B,
interaction.depth = 1,
shrinkage = best_boost$lambda,
verbose = FALSE)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution,
## : variable 16: veil.type has no variation.
# Variable importance
summary(final_boost, plotit = TRUE, n.trees = best_boost$B, cBars = 10)
## var rel.inf
## odor odor 9.492480e+01
## spore.print.color spore.print.color 4.263208e+00
## stalk.surface.below.ring stalk.surface.below.ring 2.411991e-01
## gill.size gill.size 2.389130e-01
## stalk.color.below.ring stalk.color.below.ring 1.942312e-01
## population population 4.641570e-02
## cap.color cap.color 3.233962e-02
## stalk.surface.above.ring stalk.surface.above.ring 2.365999e-02
## ring.number ring.number 1.405065e-02
## cap.shape cap.shape 6.622187e-03
## habitat habitat 6.403945e-03
## ring.type ring.type 5.463953e-03
## gill.color gill.color 1.727668e-03
## stalk.root stalk.root 8.790685e-04
## stalk.color.above.ring stalk.color.above.ring 8.690701e-05
## cap.surface cap.surface 0.000000e+00
## bruises bruises 0.000000e+00
## gill.attachment gill.attachment 0.000000e+00
## gill.spacing gill.spacing 0.000000e+00
## stalk.shape stalk.shape 0.000000e+00
## veil.type veil.type 0.000000e+00
## veil.color veil.color 0.000000e+00
Insight:
The boosting results show that high accuracy (up to 100%) is achieved with the largest learning rate (λ = 0.1) and a sufficient number of trees (B ≥ 500), while lower learning rates require more trees to reach similar performance.
The variable importance plot highlights odor as overwhelmingly the most influential feature in predicting mushroom edibility, with the remaining variables contributing minimally.
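The number of trees could also be chosen less manually (a sketch under the assumption that refitting with cv.folds is acceptable; output not shown) by letting gbm's built-in gbm.perf() pick the iteration count via cross-validation:
# Refit the best configuration with 5-fold CV and let gbm.perf() choose n.trees
cv_boost <- gbm(class_num ~ . - class, data = train_boost,
                distribution = "bernoulli",
                n.trees = best_boost$B,
                interaction.depth = 1,
                shrinkage = best_boost$lambda,
                cv.folds = 5,
                verbose = FALSE)
best_iter <- gbm.perf(cv_boost, method = "cv")  # CV-selected number of trees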
# Compare all models
all_results <- data.frame(
Model = c("CART", "Bagging", "Random Forest", "Boosting"),
Accuracy = c(tree_accuracy,
max(bagging_results$Accuracy),
max(rf_results$Accuracy),
max(boost_results$Accuracy))
)
# Sort by accuracy
all_results <- all_results[order(-all_results$Accuracy), ]
print(all_results)
## Model Accuracy
## 3 Random Forest 1.0000000
## 4 Boosting 1.0000000
## 2 Bagging 0.9995076
## 1 CART 0.9945839
# Identify best model
best_model <- all_results$Model[1]
best_accuracy <- all_results$Accuracy[1]
cat("The best model is", best_model, "with accuracy", round(best_accuracy, 4), "\n")
## The best model is Random Forest with accuracy 1
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
data(Auto)
# Define function to calculate difference in medians
diff_median_mpg <- function(data, indices) {
sample_data <- data[indices, ]
median_4cyl <- median(sample_data$mpg[sample_data$cylinders == 4])
median_8cyl <- median(sample_data$mpg[sample_data$cylinders == 8])
return(median_4cyl - median_8cyl)
}
# Set seed for reproducibility
set.seed(1234) # for reproducibility
boot_result <- boot(data = Auto, statistic = diff_median_mpg, R = 1000)
print(boot_result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = diff_median_mpg, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 14.4 -0.30745 0.797002
# Extract standard error estimate
boot_result$t0 # Original estimate
## [1] 14.4
sd(boot_result$t) # Bootstrap estimated standard error
## [1] 0.797002
Interpretation:
Estimated difference in median mpg between 4-cylinder and 8-cylinder cars = 14.4 mpg.
Bootstrap-estimated standard error = 0.797 mpg.
In other words, *4-cylinder cars have a median mpg about 14.4 units higher than 8-cylinder cars*, with a **standard error** of about **0.797**.
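A natural follow-up (sketched here, not part of the original output) is a bootstrap confidence interval for the same statistic:
# 95% percentile bootstrap CI for the difference in medians (output not shown)
boot.ci(boot_result, conf = 0.95, type = "perc")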
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
data(cpus)
# Create folds using createFolds() from caret package
set.seed(1234) # For reproducibility
folds <- createFolds(cpus$perf, k = 10, list = TRUE, returnTrain = FALSE)
# Initialize vector to store MSE values
mse_values <- numeric(10)
# Perform 10-fold cross validation
for(i in 1:10) {
# Split data into training and test sets
test_indices <- folds[[i]]
train_data <- cpus[-test_indices, ]
test_data <- cpus[test_indices, ]
# Fit linear regression model
model <- lm(perf ~ mmin + mmax, data = train_data)
# Make predictions on test set
predictions <- predict(model, newdata = test_data)
# Calculate MSE for this fold
mse_values[i] <- mean((test_data$perf - predictions)^2)
}
# Calculate average test MSE and standard error
avg_mse <- mean(mse_values)
se_mse <- sd(mse_values)/sqrt(10)
cat("10-Fold Cross-Validation Results:\n")
## 10-Fold Cross-Validation Results:
cat("Average Test MSE:", avg_mse, "\n")
## Average Test MSE: 6315.865
cat("Standard Error:", se_mse)
## Standard Error: 1815.464
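As a cross-check (a sketch, assuming a Gaussian glm() fit is equivalent to the lm() above; output not shown), boot::cv.glm() computes the same kind of 10-fold estimate in one call:
# 10-fold CV estimate of test MSE via cv.glm (default gaussian family matches lm)
glm_fit <- glm(perf ~ mmin + mmax, data = cpus)
set.seed(1234)
cv_est <- cv.glm(cpus, glm_fit, K = 10)
cv_est$delta[1]  # raw cross-validation estimate of prediction error (MSE)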