# Load the dataset from CSV; the path is relative to this notebook's location.
dataset_path <- "../../../datasets/results-1511/dataset.csv"
dataset <- read_csv(file = dataset_path)
Rows: 532 Columns: 648── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (647): psd_1, psd_2, psd_3, psd_4, psd_5, psd_6, psd_7, psd_8, psd_9, psd_10, psd_11, psd_12, psd_13, psd_14, psd_15, psd_16, psd_17, psd_18, psd_19, psd_20, psd_21, psd_22, psd_23, psd...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
We will split the dataset into a training set (70%) and a testing set (30%).
# Reproducible 70/30 train/test split.
set.seed(123) # fixed seed so the same rows are drawn on every run

n_rows <- nrow(dataset)
# 0.7 * n_rows is generally fractional; floor() makes explicit the truncation
# that sample() would otherwise apply silently to its `size` argument.
train_size <- floor(0.7 * n_rows)

# seq_len() is safe for an empty dataset, where 1:nrow(dataset) would be c(1, 0).
sample_index <- sample(seq_len(n_rows), size = train_size)

train_data <- dataset[sample_index, ]
# Negative indexing keeps every row that was not drawn for training.
test_data <- dataset[-sample_index, ]
Using the ranger package, we’ll fit a random forest to predict the tmg feature, first computing variable importance by permutation.
library(ranger)
# Random-forest regression of `tmg` with permutation variable importance.
# NOTE(review): `tmg ~ .` uses every other column of train_data as a predictor.
# The CSV column spec lists a character `name` column, so it appears to be
# included as a predictor here — confirm that this is intended.
model_per <- ranger(
  tmg ~ .,
  data = train_data,
  importance = "permutation"
)
print(model_per)
Ranger result
Call:
ranger(tmg ~ ., data = train_data, importance = "permutation")
Type: Regression
Number of trees: 500
Sample size: 372
Number of independent variables: 647
Mtry: 25
Target node size: 5
Variable importance mode: permutation
Splitrule: variance
OOB prediction error (MSE): 0.001062913
R squared (OOB): 0.8279352
library(dplyr)
library(ggplot2)
# Bar chart of the 20 predictors with the highest permutation importance.
# The data frame is built directly from the named importance vector instead of
# going through as.data.frame() + the deprecated dplyr::add_rownames(), which
# also forced a reference to the deparsed column name `importance(model_per)`.
perm_importance <- importance(model_per)

plot_perm <- data.frame(
  predictor = names(perm_importance),
  importance = unname(perm_importance)
) |>
  arrange(desc(importance)) |>
  head(20) |>
  # Reverse the factor levels so the most important predictor is drawn on top.
  mutate(predictor = factor(predictor, levels = rev(unique(predictor)))) |>
  ggplot() +
  geom_col(
    aes(y = predictor, x = importance),
    fill = "darkblue",
    color = "gray"
  ) +
  ggtitle("Top 20 predictor importance using permutation") +
  theme_minimal()
library(ranger)
# Random-forest regression of `tmg` with impurity-based variable importance.
# NOTE(review): `tmg ~ .` uses every other column of train_data as a predictor.
# The CSV column spec lists a character `name` column, so it appears to be
# included as a predictor here — confirm that this is intended.
model_imp <- ranger(
  tmg ~ .,
  data = train_data,
  importance = "impurity"
)
print(model_imp)
Ranger result
Call:
ranger(tmg ~ ., data = train_data, importance = "impurity")
Type: Regression
Number of trees: 500
Sample size: 372
Number of independent variables: 647
Mtry: 25
Target node size: 5
Variable importance mode: impurity
Splitrule: variance
OOB prediction error (MSE): 0.001023603
R squared (OOB): 0.8342988
# Bar chart of the 20 predictors with the highest impurity importance.
# The data frame is built directly from the named importance vector instead of
# going through as.data.frame() + the deprecated dplyr::add_rownames(), which
# also forced a reference to the deparsed column name `importance(model_imp)`.
impurity_importance <- importance(model_imp)

plot_imp <- data.frame(
  predictor = names(impurity_importance),
  importance = unname(impurity_importance)
) |>
  arrange(desc(importance)) |>
  head(20) |>
  # Reverse the factor levels so the most important predictor is drawn on top.
  mutate(predictor = factor(predictor, levels = rev(unique(predictor)))) |>
  ggplot() +
  geom_col(
    aes(y = predictor, x = importance),
    fill = "darkblue",
    color = "gray"
  ) +
  ggtitle("Top 20 predictor importance using impurity") +
  theme_minimal()
# Score the permutation-importance model on the held-out test rows.
test_fit <- predict(model_per, data = test_data)
predictions <- test_fit$predictions

# Root Mean Square Error between predicted and observed `tmg`.
squared_errors <- (predictions - test_data$tmg)^2
RMSE <- sqrt(mean(squared_errors))
print(RMSE)
[1] 0.03521241
library(ggplot2)
# Pair each observed test-set `tmg` value with its prediction.
results <- data.frame(
  Reference = test_data$tmg,
  Predicted = predictions
)

# Scatter of predicted vs. reference values; the red y = x line marks where
# a perfect prediction would fall.
ggplot(data = results, mapping = aes(x = Reference, y = Predicted)) +
  geom_point(color = "blue") +
  # geom_smooth(method = 'lm', color = 'red') +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  # ylim(0,1)+
  labs(
    title = "Predicted vs Reference values",
    x = "Reference Values",
    y = "Predicted Values"
  ) +
  theme_bw()