# Seed the RNG so the run is reproducible.
# The workspace is deliberately not cleared here: rm(list = ls()) would wipe
# the caller's objects when this script is sourced, and it does not detach
# packages or reset options, so it gives no real isolation anyway.
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaps)
# Load the Parkinson's UPDRS regression data, discard the total_UPDRS column,
# and keep only the rows that have no missing values.
UPDRS_df <- read_csv(
  "./Datasets/Regression/parkinsons_updrs.data",
  show_col_types = FALSE
) %>%
  select(-total_UPDRS) %>%
  drop_na()
# Clean the column names in one step: every non-alphanumeric character becomes
# "_", then names that collide after that substitution get a "_<n>" suffix.
names(UPDRS_df) <- make.unique(
  gsub("[^[:alnum:]]", "_", names(UPDRS_df)),
  sep = "_"
)
This assignment will cover tree-based and ensemble methods. Divide the datasets into 80% for training and 20% for testing.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
# Split the data into training (80%) and testing (20%) sets
num_samples <- nrow(UPDRS_df)
set.seed(123) # fixed seed so the partition is reproducible
# sample.int() draws row positions from 1..num_samples directly, avoiding
# `1:num_samples` (which is c(1, 0) for an empty data frame); floor() makes
# the truncation of the fractional 80% count explicit.
train_indices <- sample.int(num_samples, size = floor(num_samples * 0.8))
train <- UPDRS_df[train_indices, ]
# setdiff() instead of negative indexing: `df[-integer(0), ]` selects no rows,
# so an empty training draw would otherwise leave the test set empty too.
test <- UPDRS_df[setdiff(seq_len(num_samples), train_indices), ]
# Every column except the response is treated as a predictor.
# NOTE(review): this includes any subject/identifier column present in the
# file -- confirm that is intended.
predictors <- setdiff(names(UPDRS_df), "motor_UPDRS")
# Bagging Model
# Bagging is fitted as a random forest that may consider every predictor at
# each split (mtry equals the number of predictors).
cat("Building Bagging Model...\n")
## Building Bagging Model...
bag_model <- randomForest(
  motor_UPDRS ~ .,
  data = train,
  mtry = length(predictors),
  importance = TRUE
)
predictions_bag <- predict(bag_model, newdata = test)
# Test-set mean squared error
bag_errors <- predictions_bag - test$motor_UPDRS
mse_bag <- mean(bag_errors^2)
cat("Bagging Model MSE:", mse_bag, "\n")
## Bagging Model MSE: 0.1179848
# Random Forest Model
# Consider floor(sqrt(p)) predictors per split, but never fewer than one.
mtry_rf <- max(1, floor(sqrt(length(predictors))))
cat("Building Random Forest Model with mtry =", mtry_rf, "...\n")
## Building Random Forest Model with mtry = 4 ...
rf_model <- randomForest(
  motor_UPDRS ~ .,
  data = train,
  mtry = mtry_rf,
  importance = TRUE
)
predictions_rf <- predict(rf_model, newdata = test)
# Test-set mean squared error
rf_errors <- predictions_rf - test$motor_UPDRS
mse_rf <- mean(rf_errors^2)
cat("Random Forest Model MSE:", mse_rf, "\n")
## Random Forest Model MSE: 5.607907
# Boosting Model
cat("Building Boosting Model...\n")
## Building Boosting Model...
# Gradient boosting with the "gaussian" distribution: 5000 trees of
# interaction depth 4, shrinkage 0.01.
boost_model <- gbm(
  motor_UPDRS ~ .,
  data = train,
  distribution = "gaussian",
  n.trees = 5000,
  interaction.depth = 4,
  shrinkage = 0.01,
  verbose = FALSE
)
# Predict with all 5000 trees, then compute the test-set mean squared error
predictions_boost <- predict(boost_model, newdata = test, n.trees = 5000)
boost_errors <- predictions_boost - test$motor_UPDRS
mse_boost <- mean(boost_errors^2)
cat("Boosting Model MSE:", mse_boost, "\n")
## Boosting Model MSE: 1.58763
# Compare and Determine the Best Model
# Collect the three test-set MSEs under readable labels; which.min() keeps the
# label of the smallest entry, so it can be reported by name below.
mse_values <- c(
  bagging = mse_bag,
  random_forest = mse_rf,
  boosting = mse_boost
)
best_model <- which.min(mse_values)
cat("Model Comparison Results:\n")
## Model Comparison Results:
print(mse_values)
## bagging random_forest boosting
## 0.1179848 5.6079066 1.5876300
cat(
  "Best performing model:", names(best_model),
  "with MSE:", mse_values[best_model], "\n"
)
## Best performing model: bagging with MSE: 0.1179848
# The data file is read without a header row, so supply the column names.
column_names <- c(
  "SampleCodeNumber", "ClumpThickness", "UniformityOfCellSize",
  "UniformityOfCellShape", "MarginalAdhesion", "SingleEpithelialCellSize",
  "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "Class"
)
# Read the breast-cancer data, coerce every column to numeric, drop rows with
# any missing value, and recode Class as a two-level factor: 2 -> 0,
# anything else -> 1.
# NOTE(review): SampleCodeNumber looks like an identifier and stays in the
# data frame -- confirm whether it should be available as a predictor.
cancer_df <- read_csv(
  "./Datasets/Classification/breast-cancer-wisconsin.data",
  col_names = column_names,
  show_col_types = FALSE
) %>%
  mutate(across(everything(), as.numeric)) %>%
  drop_na() %>%
  mutate(Class = factor(if_else(Class == 2, 0, 1)))
# Splitting the dataset into training (80%) and testing (20%) sets
set.seed(123) # fixed seed so the partition is reproducible
num_samples <- nrow(cancer_df)
# sample.int() draws row positions from 1..num_samples directly, avoiding
# `1:num_samples` (which is c(1, 0) for an empty data frame); floor() makes
# the truncation of the fractional 80% count explicit.
train_indices <- sample.int(num_samples, size = floor(num_samples * 0.8))
train <- cancer_df[train_indices, ]
# setdiff() instead of negative indexing: `df[-integer(0), ]` selects no rows,
# so an empty training draw would otherwise leave the test set empty too.
test <- cancer_df[setdiff(seq_len(num_samples), train_indices), ]
library(tree)
cat("Building Decision Tree Model...\n")
## Building Decision Tree Model...
# Classification tree for Class, offered every other column of the training
# split as a candidate predictor.
# NOTE(review): that includes SampleCodeNumber, which looks like an
# identifier -- confirm it should be a candidate.
tree_cancer <- tree(formula = Class ~ ., data = train)
cat("Model Summary:\n")
## Model Summary:
summary(tree_cancer)
##
## Classification tree:
## tree(formula = Class ~ ., data = train)
## Variables actually used in tree construction:
## [1] "UniformityOfCellSize" "BareNuclei" "ClumpThickness"
## [4] "MarginalAdhesion"
## Number of terminal nodes: 9
## Residual mean deviance: 0.1257 = 67.52 / 537
## Misclassification error rate: 0.02747 = 15 / 546
# Draw the fitted tree, then label it
plot(tree_cancer)
text(tree_cancer, pretty = 0)
# Classify the held-out rows with the unpruned tree
tree_pred <- predict(tree_cancer, newdata = test, type = "class")
# Cross-tabulate predicted classes (rows) against the true classes (columns)
conf_matrix <- table(tree_pred, test$Class)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(conf_matrix)
##
## tree_pred 0 1
## 0 87 2
## 1 3 45
# Accuracy = correctly classified rows (the diagonal) / all test rows
n_correct <- sum(diag(conf_matrix))
accuracy <- n_correct / sum(conf_matrix)
cat("Accuracy on Test Set:", accuracy, "\n")
## Accuracy on Test Set: 0.9635036
## Prune the Tree
# Seed set immediately before cross-validation so the result is repeatable
set.seed(123)
# Cross-validate the tree, using misclassification-based pruning
cv_tree_cancer <- cv.tree(tree_cancer, FUN = prune.misclass)
cat("Optimal Number of Terminal Nodes:\n")
## Optimal Number of Terminal Nodes:
plot(cv_tree_cancer$size, cv_tree_cancer$dev, type = "b")
# which.min() returns a position within `dev`, not a tree size, so translate
# it through the parallel `size` vector before using it as `best =`.
optimal_size <- cv_tree_cancer$size[which.min(cv_tree_cancer$dev)]
cat("Optimal size based on CV:", optimal_size, "\n")
# NOTE: the outputs previously recorded for this section ("Optimal size based
# on CV: 3", the pruned-tree confusion matrix, and accuracy 0.9416058) were
# produced with the position mistaken for the size; re-run to regenerate them.
pruned_tree_cancer <- prune.misclass(tree_cancer, best = optimal_size)
plot(pruned_tree_cancer)
text(pruned_tree_cancer, pretty = 0)
# Evaluate the pruned tree on the held-out rows
tree_pred_pruned <- predict(pruned_tree_cancer, newdata = test, type = "class")
# Predicted classes in rows, true classes in columns
conf_matrix_pruned <- table(tree_pred_pruned, test$Class)
cat("Confusion Matrix for Pruned Tree:\n")
## Confusion Matrix for Pruned Tree:
print(conf_matrix_pruned)
# Accuracy = correctly classified rows (the diagonal) / all test rows
accuracy_pruned <- sum(diag(conf_matrix_pruned)) / sum(conf_matrix_pruned)
cat("Accuracy on Test Set for Pruned Tree:", accuracy_pruned, "\n")