We will use a supervised machine learning method, linear regression, to predict metabolite concentration.

Load Libraries

library(caret)
library(ggplot2)
library(car)
library(lmtest)
library(corrplot)
library(GGally)
library(reshape2)
library(glmnet)
library(DALEX)

Step 1: Load the Data

met_data <- read.csv("metabolite_data.csv")
head(met_data)
##       mass     logP num_rings num_H_bonds enzyme_activity pathway_count
## 1 165.9881 4.259048         2           4        23.98300             1
## 2 174.2456 3.549930         0           1        45.46802             5
## 3 218.9677 2.287884         3           2        43.24518             4
## 4 181.7627 2.934555         3           5        37.77074             3
## 5 183.2322 2.168528         2           6        65.46609             4
## 6 222.8766 2.119002         0           0        35.84718             4
##   metabolite_concentration
## 1                 122.6491
## 2                 128.3079
## 3                 155.9756
## 4                 145.3023
## 5                 161.8741
## 6                 149.0430
dim(met_data)
## [1] 200   7
str(met_data)
## 'data.frame':    200 obs. of  7 variables:
##  $ mass                    : num  166 174 219 182 183 ...
##  $ logP                    : num  4.26 3.55 2.29 2.93 2.17 ...
##  $ num_rings               : int  2 0 3 3 2 0 2 2 0 2 ...
##  $ num_H_bonds             : int  4 1 2 5 6 0 5 6 4 5 ...
##  $ enzyme_activity         : num  24 45.5 43.2 37.8 65.5 ...
##  $ pathway_count           : int  1 5 4 3 4 4 4 3 3 3 ...
##  $ metabolite_concentration: num  123 128 156 145 162 ...

Comment here: The metabolite data loaded correctly, with 200 observations of 7 variables (six predictors plus the response, metabolite_concentration).
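
Before moving on to the EDA, it is also worth confirming there are no missing values; a minimal check, assuming the CSV loaded cleanly as shown above:

# Count missing values per column; all zeros means no imputation is needed
colSums(is.na(met_data))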

Step 2: Exploratory Data Analysis (EDA)

# Step 2a: Check summary statistics (mean, median, etc.) for each variable

summary(met_data)
##       mass            logP          num_rings    num_H_bonds   enzyme_activity
##  Min.   :122.3   Min.   :0.5273   Min.   :0.0   Min.   :0.00   Min.   :23.98  
##  1st Qu.:164.4   1st Qu.:2.0274   1st Qu.:1.0   1st Qu.:1.00   1st Qu.:43.24  
##  Median :178.5   Median :2.5183   Median :2.0   Median :3.00   Median :51.20  
##  Mean   :179.8   Mean   :2.5337   Mean   :1.6   Mean   :3.08   Mean   :49.91  
##  3rd Qu.:194.2   3rd Qu.:3.0719   3rd Qu.:3.0   3rd Qu.:5.00   3rd Qu.:56.38  
##  Max.   :261.0   Max.   :4.5572   Max.   :3.0   Max.   :6.00   Max.   :76.92  
##  pathway_count   metabolite_concentration
##  Min.   :1.000   Min.   :102.8           
##  1st Qu.:2.000   1st Qu.:134.4           
##  Median :3.000   Median :144.0           
##  Mean   :2.955   Mean   :145.3           
##  3rd Qu.:4.000   3rd Qu.:155.8           
##  Max.   :5.000   Max.   :191.0

Comment here: Summary statistics (minimum, quartiles, median, mean, maximum) computed for mass, logP, num_rings, num_H_bonds, enzyme_activity, pathway_count, and metabolite_concentration.

# Step 2b: Index plots of each variable (x-axis = order in the data set, y-axis = measurement)
par(mfrow=c(3,3))
plot(met_data$mass)
plot(met_data$logP)
plot(met_data$num_rings)
plot(met_data$num_H_bonds)
plot(met_data$enzyme_activity)
plot(met_data$pathway_count)
plot(met_data$metabolite_concentration)
par(mfrow = c(1, 1))

Comment here:

# Step 2c: Boxplots with names and 45° labels

boxplot(
  met_data$mass,
  met_data$enzyme_activity,
  met_data$metabolite_concentration,
  names = FALSE,
  main = "Boxplots of Key Metabolite Variables",
  col = c("skyblue", "lightgreen", "salmon"),
  xaxt = "n"
)
labels1 <- c("mass", "enzyme_activity", "metabolite_concentration")
text(x = 1:length(labels1), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels1, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

boxplot(
  met_data$logP,
  met_data$num_rings,
  met_data$num_H_bonds,
  met_data$pathway_count,
  names = FALSE,
  main = "Boxplots of Other Predictors",
  col = c("orange", "purple", "lightblue", "lightpink"),
  xaxt = "n"
)
labels2 <- c("logP", "num_rings", "num_H_bonds", "pathway_count")
text(x = 1:length(labels2), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels2, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

Comment here: The large-scale variables (mass, enzyme_activity, metabolite_concentration) are plotted separately from the small-scale predictors (logP and the count variables, which range only from 0 to 6) so the boxes remain readable.

# Step 2d: Correlation Heatmap ----
corrplot(cor(met_data), method = "color", type = "upper", tl.col = "black")

Comment here: The predictors show only weak correlations with one another (consistent with the VIF values near 1 in Step 5b), so multicollinearity is not a concern.

# Step 2e: Pairwise relationships
ggpairs(met_data)

Comment here:

# Step 2f: Distribution Check ----
met_data_long <- melt(met_data)
ggplot(met_data_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 20) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Metabolite Features")

Comment here:

Step 3: Train/Test Split

set.seed(42)
train_index <- createDataPartition(met_data$metabolite_concentration, p = 0.7, list = FALSE)
train_data <- met_data[train_index, ]
test_data  <- met_data[-train_index, ]

Comment here: createDataPartition() splits the data 70/30 into training and test sets, stratifying on the response so both sets cover its full range.
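
A quick dimension check confirms the split sizes (70% of the 200 observations, i.e. 140 rows, for training and the remaining 60 for testing):

# Expect 140 rows in train_data and 60 in test_data
dim(train_data)
dim(test_data)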

Step 4: Linear Regression Model

lm_model <- lm(metabolite_concentration ~ ., data = train_data)
summary(lm_model)
## 
## Call:
## lm(formula = metabolite_concentration ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1504  -3.7074  -0.1107   2.7126  15.4738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4.24431    4.66993  -0.909    0.365    
## mass             0.50097    0.02003  25.015  < 2e-16 ***
## logP             2.66187    0.56185   4.738 5.47e-06 ***
## num_rings        3.17428    0.39278   8.082 3.44e-13 ***
## num_H_bonds      1.57205    0.23086   6.809 3.06e-10 ***
## enzyme_activity  0.74271    0.04244  17.499  < 2e-16 ***
## pathway_count    1.82975    0.31593   5.792 4.78e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.205 on 133 degrees of freedom
## Multiple R-squared:  0.8904, Adjusted R-squared:  0.8855 
## F-statistic: 180.1 on 6 and 133 DF,  p-value: < 2.2e-16

Comment here: The adjusted R-squared indicates that about 88.6% of the variability in metabolite concentration is explained by the predictors; together with the statistical significance of all six coefficients, this indicates a good fit.

Step 5: Diagnostics

# Step 5a: Check residual normality
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.98715, p-value = 0.2191
par(mfrow = c(2, 3))
hist(lm_model$residuals, main = "Histogram of Residuals")
plot(lm_model)
par(mfrow = c(1, 1))

Comment here: The histogram of residuals shows a roughly normal distribution centered around 0, and the Shapiro-Wilk p-value of 0.22 (> 0.05) gives no evidence against normality.

# Step 5b: Check for multicollinearity

vif(lm_model)
##            mass            logP       num_rings     num_H_bonds enzyme_activity 
##        1.080017        1.061047        1.031570        1.095657        1.031244 
##   pathway_count 
##        1.084573

Comment here: The VIF values for all predictors are close to 1 (well below the common cutoff of 5), indicating no meaningful multicollinearity.

# Step 5c: Check for homoscedasticity
bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 14.878, df = 6, p-value = 0.02122

Comment here: Since the p-value (0.021) is less than 0.05, the null hypothesis of constant variance is rejected: the Breusch-Pagan test indicates heteroscedasticity in the residuals.
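
Heteroscedasticity does not bias the coefficient estimates, but it makes the usual standard errors unreliable. One common remedy is to recompute them with a heteroscedasticity-consistent (HC) covariance estimator; a minimal sketch using coeftest() from lmtest (already loaded) together with the sandwich package, an assumed extra dependency not loaded above:

library(sandwich)  # assumed additional dependency for robust covariance estimators
# Same coefficient estimates, but with HC3 robust standard errors and p-values
coeftest(lm_model, vcov. = vcovHC(lm_model, type = "HC3"))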

Step 6: Model Performance

lm_pred <- predict(lm_model, newdata = test_data)

# Step 6a: Calculate performance metrics and print
rmse <- sqrt(mean((test_data$metabolite_concentration - lm_pred)^2))
mae  <- mean(abs(test_data$metabolite_concentration - lm_pred))
r2   <- cor(test_data$metabolite_concentration, lm_pred)^2

# Print results
cat("RMSE:", rmse, "\n")
## RMSE: 4.970806
cat("MAE:", mae, "\n")
## MAE: 3.885294
cat("R²:", r2, "\n")
## R²: 0.9003138

Comment here: An R² of 0.90 on the test set indicates that 90% of the variability in the response is explained by the model: an excellent fit on held-out data.
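
As a cross-check on the hand-computed metrics, caret (already loaded) provides postResample(), which returns the same three quantities in one call:

# postResample() computes RMSE, Rsquared, and MAE from predicted vs observed values
postResample(pred = lm_pred, obs = test_data$metabolite_concentration)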

# Step 6b: plot predicted vs observed

ggplot(data.frame(Observed = test_data$metabolite_concentration, Predicted = lm_pred),
       aes(x = Observed, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed Metabolite Concentration")

Comment here: In the plot, the points are tightly clustered along the red dashed line, indicating that the model performs well with minimal prediction error

Step 7: Variable Importance

# Which predictors are most important in the model?
explainer <- explain(lm_model, data = train_data[, -7], 
                     y = train_data$metabolite_concentration)
## Preparation of a new explainer is initiated
##   -> model label       :  lm  (  default  )
##   -> data              :  140  rows  6  cols 
##   -> target variable   :  140  values 
##   -> predict function  :  yhat.lm  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package stats , ver. 4.5.1 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  107.7701 , mean =  145.5337 , max =  189.6574  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -12.1504 , mean =  1.031303e-13 , max =  15.47382  
##   A new explainer has been created!
vip <- model_parts(explainer)
plot(vip) + ggtitle("Variable Importance (DALEX)")

Comment here: The plot shows that mass and enzyme_activity are the most important predictors, while logP contributes the least.
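
As a quick sanity check on the DALEX permutation-based ranking, caret's varImp() scores lm predictors by the absolute value of their t-statistics:

# varImp() for an lm object ranks predictors by |t value| from the model summary
varImp(lm_model)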

Step 8: Predict a New Observation

# Testing the model on a new observation

new_met <- data.frame(
  mass=200,
  logP=3,
  num_rings=2,
  num_H_bonds=3,
  enzyme_activity=60,
  pathway_count=2
)

predict(lm_model, newdata=new_met)
##        1 
## 163.2212

Comment here: The model predicts a metabolite concentration of about 163.2 for this new observation.
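
A point prediction alone carries no sense of uncertainty; for an lm object, predict() can also return a 95% prediction interval around the estimate:

# interval = "prediction" adds lower and upper bounds for the new observation
predict(lm_model, newdata = new_met, interval = "prediction", level = 0.95)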

Step 9: Cross-Validation

# Step 9a: Cross-validation: another way to train and predict ----

set.seed(42)

# Define 10-fold CV
train_control <- trainControl(method = "cv", number = 10)

# Train linear regression model with CV
cv_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = train_control,
  metric = "RMSE"
)

# Show CV results
print(cv_model)
## Linear Regression 
## 
## 200 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   5.12678  0.8853752  4.151905
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Extract performance metrics
cv_results <- cv_model$resample
summary(cv_results)
##       RMSE          Rsquared           MAE          Resample        
##  Min.   :3.108   Min.   :0.8112   Min.   :2.494   Length:10         
##  1st Qu.:4.262   1st Qu.:0.8359   1st Qu.:3.543   Class :character  
##  Median :5.319   Median :0.8906   Median :4.329   Mode  :character  
##  Mean   :5.127   Mean   :0.8854   Mean   :4.152                     
##  3rd Qu.:5.940   3rd Qu.:0.9355   3rd Qu.:4.704                     
##  Max.   :6.677   Max.   :0.9599   Max.   :5.506
# Mean CV RMSE, MAE and R²
cat("Mean CV RMSE:", mean(cv_results$RMSE), "\n")
## Mean CV RMSE: 5.12678
cat("Mean CV MAE:", mean(cv_results$MAE), "\n")
## Mean CV MAE: 4.151905
cat("Mean CV R²:", mean(cv_results$Rsquared), "\n")
## Mean CV R²: 0.8853752

Comment here: The mean 10-fold CV RMSE (5.13) and R² (0.885) are very close to the test-set results from Step 6 (RMSE 4.97, R² 0.900), confirming the model generalizes consistently.
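
If the fold-to-fold variability seen above is a concern, repeated cross-validation averages over several random fold assignments for a more stable estimate; a minimal sketch (the choice of 5 repeats is arbitrary):

set.seed(42)
# 10-fold CV repeated 5 times yields 50 resamples instead of 10
rep_control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
rep_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = rep_control,
  metric = "RMSE"
)
print(rep_model)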

# Step 9b: Visualize cross-validation performance: RMSE, MAE, Rsquared
# Note: par(mfrow) does not affect ggplot2 output, so each histogram prints separately

# RMSE
ggplot(cv_results, aes(x = RMSE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation RMSE Distribution (10-Fold)",
       x = "RMSE", y = "Count")

# MAE
ggplot(cv_results, aes(x = MAE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation MAE Distribution (10-Fold)",
       x = "MAE", y = "Count")

# RSquared
ggplot(cv_results, aes(x = Rsquared)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation Rsquared Distribution (10-Fold)",
       x = "Rsquared", y = "Count")


Comment here: With only 10 resample values per metric the histograms are sparse, but they visualize the fold-to-fold spread summarized above (e.g., RMSE ranging from about 3.1 to 6.7).

# Step 9c: Predict the new metabolite with the cross-validated model ----
new_met <- data.frame(
  mass = 200,
  logP = 3,
  num_rings = 2,
  num_H_bonds = 3,
  enzyme_activity = 60,
  pathway_count = 2
)
predict(cv_model, newdata = new_met)
##        1 
## 163.1173

Comment here: The cross-validated model predicts a concentration of about 163.1 for the same new observation.

# Step 9d: Compare predictions from the lm and cross-validated models

predict(lm_model, newdata = new_met)
##        1 
## 163.2212
predict(cv_model, newdata = new_met)
##        1 
## 163.1173
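
Comment here: The two predictions are nearly identical (163.2212 vs 163.1173). The small difference is expected: lm_model was fit on the 70% training split, while caret refits cv_model's final model on all 200 observations after cross-validation.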