# Load the abalone data from the current working directory. The file has no
# header row, hence header = FALSE. A machine-specific absolute path is
# deliberately not used, so the script runs on any computer where
# "abalone.data" sits in the working directory.
abalone <- read.csv("abalone.data", header = FALSE)
# Add column names to make the data easier to read
colnames(abalone) <- c(
  "Sex", "Length", "Diameter", "Height",
  "WholeWeight", "ShuckedWeight",
  "VisceraWeight", "ShellWeight", "Rings"
)
# Convert the 'Sex' column to a factor so it is treated as a category
abalone$Sex <- as.factor(abalone$Sex)
# Show the first few rows of the dataset
head(abalone)
## Sex Length Diameter Height WholeWeight ShuckedWeight VisceraWeight
## 1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010
## 2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485
## 3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415
## 4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140
## 5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395
## 6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775
## ShellWeight Rings
## 1 0.150 15
## 2 0.070 7
## 3 0.210 9
## 4 0.155 10
## 5 0.055 7
## 6 0.120 8
# install.packages("rpart")       # for building decision trees
# install.packages("rpart.plot")  # for plotting trees visually
# install.packages("caret")       # for splitting data into training and testing sets
# Load the libraries (do this every time you run the code)
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
# Fix the random seed so the results are the same on every run
set.seed(123)
# Row indices for the training set (p = 0.8, i.e. about 80% of the rows)
train_index <- createDataPartition(
  y = abalone[["Rings"]],
  p = 0.8,
  list = FALSE
)
# Indexed rows become the training set; all remaining rows are the test set
abalone_train <- abalone[train_index, ]
abalone_test <- abalone[-train_index, ]
# Fit a regression tree that predicts 'Rings' (age) from all other columns.
# method = "anova" is used because the value being predicted is a number.
tree_model <- rpart(
  formula = Rings ~ .,
  data = abalone_train,
  method = "anova"
)
# Draw the fitted tree
rpart.plot(tree_model)
# Optional: print the complexity parameter (cp) table for inspection
printcp(tree_model)
##
## Regression tree:
## rpart(formula = Rings ~ ., data = abalone_train, method = "anova")
##
## Variables actually used in tree construction:
## [1] Sex ShellWeight ShuckedWeight
##
## Root node error: 35049/3343 = 10.484
##
## n= 3343
##
## CP nsplit rel error xerror xstd
## 1 0.276782 0 1.00000 1.00060 0.036269
## 2 0.055647 1 0.72322 0.73384 0.028965
## 3 0.038804 2 0.66757 0.68490 0.027130
## 4 0.018691 3 0.62877 0.64991 0.026348
## 5 0.018241 5 0.59139 0.63462 0.026041
## 6 0.014425 7 0.55490 0.60863 0.025280
## 7 0.013035 8 0.54048 0.58276 0.024224
## 8 0.011184 9 0.52744 0.57371 0.024221
## 9 0.010000 10 0.51626 0.56039 0.023626
# Pick the cp value from the row of the cp table with the smallest xerror
cp_table <- tree_model$cptable
lowest_xerror_row <- which.min(cp_table[, "xerror"])
best_cp <- cp_table[lowest_xerror_row, "CP"]
# Prune the tree at that cp to simplify it and avoid overfitting
pruned_tree <- prune(tree_model, cp = best_cp)
# Draw the pruned tree
rpart.plot(pruned_tree)
# Predict 'Rings' for the test set using the pruned tree.
# (To evaluate the unpruned model instead, pass tree_model to predict().)
predictions <- predict(pruned_tree, newdata = abalone_test)
# Measure performance with RMSE (Root Mean Squared Error)
squared_errors <- (predictions - abalone_test$Rings)^2
rmse <- sqrt(mean(squared_errors))
print(paste("RMSE:", round(rmse, 2)))
## [1] "RMSE: 2.32"
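# NOTE: the text below appears to be leftover R Markdown template prose; it is
# kept as a comment so that this file parses as R.
# You can also embed plots, for example:
# Note that the echo = FALSE parameter was added to the
# code chunk to prevent printing of the R code that generated the
# plot.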