Assignment5

Pre_loading

# Session setup.
# The workspace is deliberately not wiped with rm(list = ls()): that call only
# removes global objects (attached packages and options are left as they are),
# and every object used below is created by this script before it is read.
set.seed(1)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(leaps)
# Read the regression data, discard the total_UPDRS column so it can never be
# offered to a model as a predictor, and keep only rows without missing values.
# Column names are then made formula-friendly: every non-alphanumeric character
# becomes "_", and names that collide after that substitution are
# disambiguated by make.unique() using "_" as the separator.
UPDRS_df <- read_csv(
  "./Datasets/Regression/parkinsons_updrs.data",
  show_col_types = FALSE
) %>%
  select(-total_UPDRS) %>%
  drop_na() %>%
  set_names(function(raw_names) {
    make.unique(gsub("[^[:alnum:]]", "_", raw_names), sep = "_")
  })

1

This assignment will cover tree-based and ensemble methods. Divide the datasets into 80% for training and 20% for testing.

Regression problem: build regression models to predict motor_UPDRS (reminder: do not include total_UPDRS as input to the model) and measure their performance utilizing the test set.

Build a regression model using bagging (m=p).
How does the model perform?
Build a regression model using random forests (m ≠ p).
How does the model perform?
Build a regression model using boosting.
How does the model perform?
Compare and comment on the error obtained with each approach. Which model seems to perform the best?

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(gbm)

## Loaded gbm 2.1.9

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

# Hold out 20% of the rows for testing; the other 80% are used for fitting.
set.seed(123) # fixed seed so the partition is reproducible
num_samples <- nrow(UPDRS_df)
train_indices <- sample.int(num_samples, size = floor(0.8 * num_samples))
train <- UPDRS_df[train_indices, ]
test <- UPDRS_df[-train_indices, ]

# Every column other than the response counts as a predictor.
# NOTE(review): only total_UPDRS was dropped at load time, so any subject
# identifier column in the file is still among the predictors -- confirm that
# this is intended, since the split is by row rather than by subject.
predictors <- names(UPDRS_df)[names(UPDRS_df) != "motor_UPDRS"]

# ---- Bagging ----
# Fitted as a random forest in which every split may consider all predictors
# (mtry = p).
cat("Building Bagging Model...\n")

## Building Bagging Model...

bag_model <- randomForest(
  motor_UPDRS ~ .,
  data = train,
  mtry = length(predictors),
  importance = TRUE
)
predictions_bag <- predict(bag_model, newdata = test)

# Mean squared error on the held-out rows.
mse_bag <- mean((test$motor_UPDRS - predictions_bag)^2)
cat("Bagging Model MSE:", mse_bag, "\n")

## Bagging Model MSE: 0.1179848

# ---- Random forest ----
# Each split considers floor(sqrt(p)) randomly chosen predictors, and never
# fewer than one.
mtry_rf <- max(1, floor(sqrt(length(predictors))))
cat("Building Random Forest Model with mtry =", mtry_rf, "...\n")

## Building Random Forest Model with mtry = 4 ...

rf_model <- randomForest(
  motor_UPDRS ~ .,
  data = train,
  mtry = mtry_rf,
  importance = TRUE
)
predictions_rf <- predict(rf_model, newdata = test)

# Mean squared error on the held-out rows.
mse_rf <- mean((test$motor_UPDRS - predictions_rf)^2)
cat("Random Forest Model MSE:", mse_rf, "\n")

## Random Forest Model MSE: 5.607907

# ---- Boosting ----
# Gradient boosting with the "gaussian" distribution: 5000 trees with
# interaction depth 4 and a shrinkage of 0.01.
cat("Building Boosting Model...\n")

## Building Boosting Model...

boost_model <- gbm(
  motor_UPDRS ~ .,
  data = train,
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.01,
  verbose = FALSE
)

# Predict with the full ensemble (all 5000 trees).
predictions_boost <- predict(boost_model, newdata = test, n.trees = 5000)

# Mean squared error on the held-out rows.
mse_boost <- mean((test$motor_UPDRS - predictions_boost)^2)
cat("Boosting Model MSE:", mse_boost, "\n")

## Boosting Model MSE: 1.58763

# ---- Model comparison ----
# Gather the three test-set MSEs in one named vector; the entry with the
# smallest MSE is reported as the best model.
mse_values <- c(
  bagging = mse_bag,
  random_forest = mse_rf,
  boosting = mse_boost
)
best_model <- which.min(mse_values) # named position of the smallest MSE
cat("Model Comparison Results:\n")

## Model Comparison Results:

print(mse_values)

##       bagging random_forest      boosting 
##     0.1179848     5.6079066     1.5876300

cat(
  "Best performing model:", names(best_model),
  "with MSE:", mse_values[best_model], "\n"
)

## Best performing model: bagging with MSE: 0.1179848

2

Classification problem: build classification models to predict the class variable and measure their performance utilizing the test set.

Build a classification model using decision trees.
How does the model perform?
Prune the tree obtained in “a”.
Use cross-validation to determine the optimal level of tree complexity.

How does the model perform?

# Column names for the header-less breast-cancer file, in file order.
column_names <- c(
  "SampleCodeNumber", "ClumpThickness", "UniformityOfCellSize",
  "UniformityOfCellShape", "MarginalAdhesion", "SingleEpithelialCellSize",
  "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "Class"
)

# Load the data, force every column to numeric, drop incomplete rows, and
# recode the response as a 0/1 factor (original code 2 -> 0, anything else -> 1).
# Missing values are declared at read time instead of being produced as a side
# effect of as.numeric() coercion.
# NOTE(review): "?" is assumed to be the file's missing-value marker -- confirm
# against the data set description. The as.numeric() pass is kept so that any
# other non-numeric token still ends up as NA and is dropped, exactly as before.
cancer_df <- read_csv(
  "./Datasets/Classification/breast-cancer-wisconsin.data",
  col_names = column_names,
  na = c("", "NA", "?"),
  show_col_types = FALSE
) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na() %>%
  mutate(
    Class = if_else(Class == 2, 0, 1),
    Class = as.factor(Class)
  )

# Splitting the dataset into training and testing sets (80% / 20%).
set.seed(123) # fixed seed so the partition is reproducible
num_samples <- nrow(cancer_df)
# seq_len() stays empty for a zero-row table (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the non-integer 80% count explicit.
train_indices <- sample(seq_len(num_samples), size = floor(num_samples * 0.8))

train <- cancer_df[train_indices, ]
test <- cancer_df[-train_indices, ]


library(tree)
# ---- Unpruned classification tree ----
# NOTE(review): `Class ~ .` makes every other column eligible for splits,
# including SampleCodeNumber -- confirm that this column should be a predictor.
cat("Building Decision Tree Model...\n")

## Building Decision Tree Model...

tree_cancer <- tree(Class ~ ., data = train)

cat("Model Summary:\n")

## Model Summary:

summary(tree_cancer)

## 
## Classification tree:
## tree(formula = Class ~ ., data = train)
## Variables actually used in tree construction:
## [1] "UniformityOfCellSize" "BareNuclei"           "ClumpThickness"      
## [4] "MarginalAdhesion"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.1257 = 67.52 / 537 
## Misclassification error rate: 0.02747 = 15 / 546

# Draw the tree and label its nodes.
plot(tree_cancer)
text(tree_cancer, pretty = 0)

# Predicted class labels for the held-out rows.
tree_pred <- predict(tree_cancer, newdata = test, type = "class")

# Confusion matrix: rows are predicted classes, columns are the true classes.
conf_matrix <- table(tree_pred, test$Class)
cat("Confusion Matrix:\n")

## Confusion Matrix:

print(conf_matrix)

##          
## tree_pred  0  1
##         0 87  2
##         1  3 45

# Accuracy = correctly classified rows (the diagonal) over all test rows.
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy on Test Set:", accuracy, "\n")

## Accuracy on Test Set: 0.9635036

## Prune the Tree
# Cross-validation picks the tree size, and the full tree is then pruned back
# to that size using misclassification error as the pruning criterion.
# NOTE(review): the recorded outputs in this section (reported size, confusion
# matrix, accuracy) were produced while the *position* of the best CV result
# was being passed to prune.misclass() as the tree size; re-knit to refresh.
set.seed(123) # fixed seed so the cross-validation result is reproducible
cv_tree_cancer <- cv.tree(tree_cancer, FUN = prune.misclass)
cat("Optimal Number of Terminal Nodes:\n")

## Optimal Number of Terminal Nodes:

plot(cv_tree_cancer$size, cv_tree_cancer$dev, type = "b")

# which.min() yields the position of the smallest CV error within the result
# vectors, not a number of terminal nodes, so it must be mapped through $size
# before it can be used as `best`.
optimal_size <- cv_tree_cancer$size[which.min(cv_tree_cancer$dev)]
cat("Optimal size based on CV:", optimal_size, "\n")

## Optimal size based on CV: 3

pruned_tree_cancer <- prune.misclass(tree_cancer, best = optimal_size)
plot(pruned_tree_cancer)
text(pruned_tree_cancer, pretty = 0)

# Evaluate the pruned tree on the held-out rows; rows of the table are the
# predicted classes, columns are the true classes.
tree_pred_pruned <- predict(pruned_tree_cancer, newdata = test, type = "class")
conf_matrix_pruned <- table(tree_pred_pruned, test$Class)
cat("Confusion Matrix for Pruned Tree:\n")

## Confusion Matrix for Pruned Tree:

print(conf_matrix_pruned)

##                 
## tree_pred_pruned  0  1
##                0 88  6
##                1  2 41

# Accuracy = correctly classified rows (the diagonal) over all test rows.
accuracy_pruned <- sum(diag(conf_matrix_pruned)) / sum(conf_matrix_pruned)
cat("Accuracy on Test Set for Pruned Tree:", accuracy_pruned, "\n")

## Accuracy on Test Set for Pruned Tree: 0.9416058

Assignment5

Ao (Alan) Huang

2024-03-12

Pre_loading

1

2

The code is also publicly available at https://rpubs.com/AlanHuang/CSC642-R_Assignment5