1. Reading the dataset

dataset <- read_csv("../../../datasets/results-1511/dataset.csv")
Rows: 532 Columns: 648── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): name
dbl (647): psd_1, psd_2, psd_3, psd_4, psd_5, psd_6, psd_7, psd_8, psd_9, psd_10, psd_11, psd_12, psd_13, psd_14, psd_15, psd_16, psd_17, psd_18, psd_19, psd_20, psd_21, psd_22, psd_23, psd...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. Splitting the dataset

We will split the dataset into a training set (70%) and a testing set (30%).

set.seed(123) # for reproducibility
sample_index <- sample(1:nrow(dataset), 0.7*nrow(dataset))
train_data <- dataset[sample_index, ]
test_data <- dataset[-sample_index, ]

3. Creating a regression model

Using the ranger package, we’ll predict the tmg feature and using permutation

library(ranger)
model_per <- ranger(tmg ~ ., data = train_data, importance = 'permutation')
model_per
Ranger result

Call:
 ranger(tmg ~ ., data = train_data, importance = "permutation") 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      372 
Number of independent variables:  647 
Mtry:                             25 
Target node size:                 5 
Variable importance mode:         permutation 
Splitrule:                        variance 
OOB prediction error (MSE):       0.001062913 
R squared (OOB):                  0.8279352 

4. Calculate importance with permutation

library(dplyr)
library(ggplot2)
plot_perm <-importance(model_per) |> as.data.frame() %>% add_rownames("predictor") %>% mutate(importance=`importance(model_per)`) %>% select(predictor,importance)|> arrange(desc(importance)) %>% head(20) %>% mutate(predictor = factor(predictor, levels = rev(unique(predictor))))  %>%
  ggplot()+
  geom_col(aes(y=predictor,x=importance),fill='darkblue', color='gray')+
  ggtitle("Top 20 predictor importance using permutation")+
  theme_minimal()

5. calculate importante with impurity

library(ranger)
model_imp <- ranger(tmg ~ ., data = train_data, importance = 'impurity')
model_imp
Ranger result

Call:
 ranger(tmg ~ ., data = train_data, importance = "impurity") 

Type:                             Regression 
Number of trees:                  500 
Sample size:                      372 
Number of independent variables:  647 
Mtry:                             25 
Target node size:                 5 
Variable importance mode:         impurity 
Splitrule:                        variance 
OOB prediction error (MSE):       0.001023603 
R squared (OOB):                  0.8342988 
plot_imp<-importance(model_imp) |> as.data.frame() %>% add_rownames("predictor") %>% mutate(importance=`importance(model_imp)`) %>% select(predictor,importance)|> arrange(desc(importance)) %>% head(20) %>% mutate(predictor = factor(predictor, levels = rev(unique(predictor))))  %>%
  ggplot()+
  geom_col(aes(y=predictor,x=importance),fill='darkblue', color='gray')+
  ggtitle("Top 20 predictor importance using impurity")+
  theme_minimal()

Permutation vs. Impurity

5. Evaluate results on test dataset

predictions <- predict(model_per, data = test_data)$predictions
# Compute the RMSE (Root Mean Square Error)
RMSE <- sqrt(mean((predictions - test_data$tmg)^2))
RMSE
[1] 0.03521241

6. Plot: Predicted vs Reference values

library(ggplot2)

results <- data.frame(Reference = test_data$tmg, Predicted = predictions)
ggplot(results, aes(x = Reference, y = Predicted)) +
  geom_point(color='blue') +
  #geom_smooth(method = 'lm', color = 'red') +
  geom_abline(intercept = 0,slope =1,color='red')+
  ggtitle("Predicted vs Reference values") +
  #ylim(0,1)+
  xlab("Reference Values") +
  ylab("Predicted Values")+
  theme_bw()

LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBNb2RlbCB1c2luZyBSYW5nZXIgZm9yIEZlTmkiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQprbml0cjo6b3B0c19jaHVuayRzZXQoZWNobyA9IFRSVUUpCmBgYAoKIyMgMS4gUmVhZGluZyB0aGUgZGF0YXNldApgYGB7cn0KbGlicmFyeShyZWFkcikKZGF0YXNldCA8LSByZWFkX2NzdigiLi4vLi4vLi4vZGF0YXNldHMvcmVzdWx0cy0xNTExL2RhdGFzZXQuY3N2IikKaGVhZChkYXRhc2V0KQoKYGBgCiMjIDIuIFNwbGl0dGluZyB0aGUgZGF0YXNldApXZSB3aWxsIHNwbGl0IHRoZSBkYXRhc2V0IGludG8gYSB0cmFpbmluZyBzZXQgKDcwJSkgYW5kIGEgdGVzdGluZyBzZXQgKDMwJSkuCmBgYHtyfQpzZXQuc2VlZCgxMjMpICMgZm9yIHJlcHJvZHVjaWJpbGl0eQpzYW1wbGVfaW5kZXggPC0gc2FtcGxlKDE6bnJvdyhkYXRhc2V0KSwgMC43Km5yb3coZGF0YXNldCkpCnRyYWluX2RhdGEgPC0gZGF0YXNldFtzYW1wbGVfaW5kZXgsIF0KdGVzdF9kYXRhIDwtIGRhdGFzZXRbLXNhbXBsZV9pbmRleCwgXQoKYGBgCgojIyAzLiBDcmVhdGluZyBhIHJlZ3Jlc3Npb24gbW9kZWwKVXNpbmcgdGhlIHJhbmdlciBwYWNrYWdlLCB3ZSdsbCBwcmVkaWN0IHRoZSBgdG1nYCBmZWF0dXJlIGFuZCB1c2luZyBwZXJtdXRhdGlvbgoKYGBge3J9CmxpYnJhcnkocmFuZ2VyKQptb2RlbF9wZXIgPC0gcmFuZ2VyKHRtZyB+IC4sIGRhdGEgPSB0cmFpbl9kYXRhLCBpbXBvcnRhbmNlID0gJ3Blcm11dGF0aW9uJykKbW9kZWxfcGVyCmBgYAoKCiMjIDQuIENhbGN1bGF0ZSBpbXBvcnRhbmNlIHdpdGggcGVybXV0YXRpb24KCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCnBsb3RfcGVybSA8LWltcG9ydGFuY2UobW9kZWxfcGVyKSB8PiBhcy5kYXRhLmZyYW1lKCkgJT4lIGFkZF9yb3duYW1lcygicHJlZGljdG9yIikgJT4lIG11dGF0ZShpbXBvcnRhbmNlPWBpbXBvcnRhbmNlKG1vZGVsX3BlcilgKSAlPiUgc2VsZWN0KHByZWRpY3RvcixpbXBvcnRhbmNlKXw+IGFycmFuZ2UoZGVzYyhpbXBvcnRhbmNlKSkgJT4lIGhlYWQoMjApICU+JSBtdXRhdGUocHJlZGljdG9yID0gZmFjdG9yKHByZWRpY3RvciwgbGV2ZWxzID0gcmV2KHVuaXF1ZShwcmVkaWN0b3IpKSkpICAlPiUKICBnZ3Bsb3QoKSsKICBnZW9tX2NvbChhZXMoeT1wcmVkaWN0b3IseD1pbXBvcnRhbmNlKSxmaWxsPSdkYXJrYmx1ZScsIGNvbG9yPSdncmF5JykrCiAgZ2d0aXRsZSgiVG9wIDIwIHByZWRpY3RvciBpbXBvcnRhbmNlIHVzaW5nIHBlcm11dGF0aW9uIikrCiAgdGhlbWVfbWluaW1hbCgpCgpgYGAKCgojIyA1LiBjYWxjdWxhdGUgaW1wb3J0YW50ZSB3aXRoIGltcHVyaXR5CmBgYHtyfQpsaWJyYXJ5KHJhbmdlcikKbW9kZWxfaW1wIDwtIHJhbmdlcih0bWcgfiAuLCBkYXRhID0gdHJhaW5fZGF0YSwgaW1wb3J0YW5jZSA9ICdpbXB1cml0eScpCm1vZGVsX2ltcApgYGAKCmBgYHtyfQpwbG90X2ltcDwtaW1wb3J0YW5jZShtb2RlbF9pbXApIHw+IGFzLmRhdGEuZnJhbWUoKSAlPiUgYWRkX3Jvd25hbWVzKCJwcmVkaWN0b3IiKSAlPiUgbXV0YXRlKGltcG9ydGFuY2U9YGltcG9ydGFuY2UobW9kZWxfaW1wKWApICU+JSBzZWxlY3QocHJlZGljdG9yLGltcG9ydGFuY2UpfD4gYXJyYW5nZShkZXNjKGltcG9ydGFuY2UpKSAlPiUgaGVhZCgyMCkgJT4lIG11dGF0ZShwcmVkaWN0b3IgPSBmYWN0b3IocHJlZGljdG9yLCBsZXZlbHMgPSByZXYodW5pcXVlKHByZWRpY3RvcikpKSkgICU+JQogIGdncGxvdCgpKwogIGdlb21fY29sKGFlcyh5PXByZWRpY3Rvcix4PWltcG9ydGFuY2UpLGZpbGw9J2RhcmtibHVlJywgY29sb3I9J2dyYXknKSsKICBnZ3RpdGxlKCJUb3AgMjAgcHJlZGljdG9yIGltcG9ydGFuY2UgdXNpbmcgaW1wdXJpdHkiKSsKICB0aGVtZV9taW5pbWFsKCkKYGBgCgojIyBQZXJtdXRhdGlvbiB2cy4gSW1wdXJpdHkKYGBge3J9CmxpYnJhcnkoZ3JpZEV4dHJhKQpncmlkRXh0cmE6OmdyaWQuYXJyYW5nZShwbG90X2ltcCxwbG90X3Blcm0sIG5jb2w9MikKCmBgYAojIyA1LiBFdmFsdWF0ZSByZXN1bHRzIG9uIHRlc3QgZGF0YXNldApgYGB7cn0KcHJlZGljdGlvbnMgPC0gcHJlZGljdChtb2RlbF9wZXIsIGRhdGEgPSB0ZXN0X2RhdGEpJHByZWRpY3Rpb25zCiMgQ29tcHV0ZSB0aGUgUk1TRSAoUm9vdCBNZWFuIFNxdWFyZSBFcnJvcikKUk1TRSA8LSBzcXJ0KG1lYW4oKHByZWRpY3Rpb25zIC0gdGVzdF9kYXRhJHRtZyleMikpClJNU0UKCmBgYAojIyA2LiBQbG90OiBQcmVkaWN0ZWQgdnMgUmVmZXJlbmNlIHZhbHVlcwpgYGB7cn0KbGlicmFyeShnZ3Bsb3QyKQoKcmVzdWx0cyA8LSBkYXRhLmZyYW1lKFJlZmVyZW5jZSA9IHRlc3RfZGF0YSR0bWcsIFByZWRpY3RlZCA9IHByZWRpY3Rpb25zKQpnZ3Bsb3QocmVzdWx0cywgYWVzKHggPSBSZWZlcmVuY2UsIHkgPSBQcmVkaWN0ZWQpKSArCiAgZ2VvbV9wb2ludChjb2xvcj0nYmx1ZScpICsKICAjZ2VvbV9zbW9vdGgobWV0aG9kID0gJ2xtJywgY29sb3IgPSAncmVkJykgKwogIGdlb21fYWJsaW5lKGludGVyY2VwdCA9IDAsc2xvcGUgPTEsY29sb3I9J3JlZCcpKwogIGdndGl0bGUoIlByZWRpY3RlZCB2cyBSZWZlcmVuY2UgdmFsdWVzIikgKwogICN5bGltKDAsMSkrCiAgeGxhYigiUmVmZXJlbmNlIFZhbHVlcyIpICsKICB5bGFiKCJQcmVkaWN0ZWQgVmFsdWVzIikrCiAgdGhlbWVfYncoKQoKYGBgCgo=