We will use use a supervised machine learning linear regression program to predict metabolite concentration.

Load Libraries

library(caret)
library(ggplot2)
library(car)
library(lmtest)
library(corrplot)
library(GGally)
library(reshape2)
library(glmnet)
library(DALEX)

Step 1: Load the data

met_data <- read.csv("metabolite_data.csv")
head(met_data)
##       mass     logP num_rings num_H_bonds enzyme_activity pathway_count
## 1 165.9881 4.259048         2           4        23.98300             1
## 2 174.2456 3.549930         0           1        45.46802             5
## 3 218.9677 2.287884         3           2        43.24518             4
## 4 181.7627 2.934555         3           5        37.77074             3
## 5 183.2322 2.168528         2           6        65.46609             4
## 6 222.8766 2.119002         0           0        35.84718             4
##   metabolite_concentration
## 1                 122.6491
## 2                 128.3079
## 3                 155.9756
## 4                 145.3023
## 5                 161.8741
## 6                 149.0430
dim(met_data)
## [1] 200   7
str(met_data)
## 'data.frame':    200 obs. of  7 variables:
##  $ mass                    : num  166 174 219 182 183 ...
##  $ logP                    : num  4.26 3.55 2.29 2.93 2.17 ...
##  $ num_rings               : int  2 0 3 3 2 0 2 2 0 2 ...
##  $ num_H_bonds             : int  4 1 2 5 6 0 5 6 4 5 ...
##  $ enzyme_activity         : num  24 45.5 43.2 37.8 65.5 ...
##  $ pathway_count           : int  1 5 4 3 4 4 4 3 3 3 ...
##  $ metabolite_concentration: num  123 128 156 145 162 ...

The metabolite data set is read. The table shows the mass, the number of Hydrogen bonds, enzyme activity and more. The structure and dimension are also shown. There are a total of 7 variables in the metabolite data set.

Step 2: Basic EDA: Exploratory Data Analysis

#Step 2a: check the metrics (mean, media etc) for each variable

summary(met_data)
##       mass            logP          num_rings    num_H_bonds   enzyme_activity
##  Min.   :122.3   Min.   :0.5273   Min.   :0.0   Min.   :0.00   Min.   :23.98  
##  1st Qu.:164.4   1st Qu.:2.0274   1st Qu.:1.0   1st Qu.:1.00   1st Qu.:43.24  
##  Median :178.5   Median :2.5183   Median :2.0   Median :3.00   Median :51.20  
##  Mean   :179.8   Mean   :2.5337   Mean   :1.6   Mean   :3.08   Mean   :49.91  
##  3rd Qu.:194.2   3rd Qu.:3.0719   3rd Qu.:3.0   3rd Qu.:5.00   3rd Qu.:56.38  
##  Max.   :261.0   Max.   :4.5572   Max.   :3.0   Max.   :6.00   Max.   :76.92  
##  pathway_count   metabolite_concentration
##  Min.   :1.000   Min.   :102.8           
##  1st Qu.:2.000   1st Qu.:134.4           
##  Median :3.000   Median :144.0           
##  Mean   :2.955   Mean   :145.3           
##  3rd Qu.:4.000   3rd Qu.:155.8           
##  Max.   :5.000   Max.   :191.0

The summary table shows various metrics for each of the seven variables (i.e., minimum, maximum, mean).

# Step 2b: plots for each variable: x-axis = order in the data set, y-axis: measurement
par(mfrow=c(3,3))
plot(met_data$mass)
plot(met_data$logP)
plot(met_data$num_rings)
plot(met_data$num_H_bonds)
plot(met_data$enzyme_activity)
plot(met_data$pathway_count)
plot(met_data$metabolite_concentration)
par(mfrow = c(1, 1))

For each variable, the scatterplots show all the measurement of each observation.

# Step 2c: Boxplots with names and 45° labels

boxplot(
  met_data$mass,
  met_data$enzyme_activity,
  met_data$metabolite_concentration,
  names = FALSE,
  main = "Boxplots of Key Metabolite Variables",
  col = c("skyblue", "lightgreen", "salmon"),
  xaxt = "n"
)
labels1 <- c("mass", "enzyme_activity", "metabolite_concentration")
text(x = 1:length(labels1), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels1, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

boxplot(
  met_data$logP,
  met_data$num_rings,
  met_data$num_H_bonds,
  met_data$pathway_count,
  names = FALSE,
  main = "Boxplots of Other Predictors",
  col = c("orange", "purple", "lightblue", "lightpink"),
  xaxt = "n"
)
labels2 <- c("logP", "num_rings", "num_H_bonds", "pathway_count")
text(x = 1:length(labels2), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels2, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

The Boxplots shows the distribution of key metabolite variables (mass, enzyme activity and metabolite concentration) and other predictors (number of H-bonds, pathways counts, number of rings).

# Step 2d: Correlation Heatmap ----
corrplot(cor(met_data), method = "color", type = "upper", tl.col = "black")

The heat map shows which variables are good predictors for the target. Mass and enzyme activity, pathway counts are relatively good predictors. Other predictors may provide some insights.

# Step 2e: Pairwise relationships
ggpairs(met_data)

The figure above shows the correlations between the variables. Metabolite concentration and mass, metabolite concentration and enzyme activity have a positive and relatively high correlation.

# Step 2f: Distribution Check ----
met_data_long <- melt(met_data)
ggplot(met_data_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 20) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Metabolite Features")

The histograms show the distribution of each variable. Mass, logP, enzyme activity and metabolite concentration have close to a normal distribution.

Step 3: Train and Split

set.seed(42)
train_index <- createDataPartition(met_data$metabolite_concentration, p = 0.7, list = FALSE)
train_data <- met_data[train_index, ]
test_data  <- met_data[-train_index, ]

The data is being split to train and test the model. 70% of the data will be used to train the model and the 30% will be used to test the model.

Step 4: Linear regression Model

lm_model <- lm(metabolite_concentration ~ ., data = train_data)
summary(lm_model)
## 
## Call:
## lm(formula = metabolite_concentration ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1504  -3.7074  -0.1107   2.7126  15.4738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4.24431    4.66993  -0.909    0.365    
## mass             0.50097    0.02003  25.015  < 2e-16 ***
## logP             2.66187    0.56185   4.738 5.47e-06 ***
## num_rings        3.17428    0.39278   8.082 3.44e-13 ***
## num_H_bonds      1.57205    0.23086   6.809 3.06e-10 ***
## enzyme_activity  0.74271    0.04244  17.499  < 2e-16 ***
## pathway_count    1.82975    0.31593   5.792 4.78e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.205 on 133 degrees of freedom
## Multiple R-squared:  0.8904, Adjusted R-squared:  0.8855 
## F-statistic: 180.1 on 6 and 133 DF,  p-value: < 2.2e-16

The train data set will be used to predict the target metabolite concentration alongside the other 6 variables. The summary indicates which variables will be a good predictor of the target.

Step 5: Diagnostics

# Step 5a Check residual normality
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.98715, p-value = 0.2191
par(mfrow = c(2, 3))
hist(lm_model$residuals, main = "Histogram of Residuals")
plot(lm_model)
par(mfrow = c(1, 1))

The Shapiro test indicates that the residuals are normal (p-value>0.05) and the Histogram shows a normal distribution centered around 0, as expected. A series of other plots are used to check for linear distribution, whether the model is a good fit, if there is equal variance, if there’s any bias, if the model is stable and if there are any influential outliers. All plots indicate normality.

# Step 5b Statistical significance: Multicollinearity

vif(lm_model)
##            mass            logP       num_rings     num_H_bonds enzyme_activity 
##        1.080017        1.061047        1.031570        1.095657        1.031244 
##   pathway_count 
##        1.084573

The VIF values show that there is no significant multicollinearity as each variable has a value close to 1, indicating that it is independent in the linear model.

# Step 5c Statistical significance: Homoscadasticity
bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 14.878, df = 6, p-value = 0.02122

The Breusch-Pagan test has a value of 14.878 and has a p-value of 0.021 (<0.05), indicating that there is heteroscedasticity present. The variance in residuals is not constant.

Step 6: Model performance

lm_pred <- predict(lm_model, newdata = test_data)

# Step 6a: Calculate performance metrics and print
rmse <- sqrt(mean((test_data$metabolite_concentration - lm_pred)^2))
mae  <- mean(abs(test_data$metabolite_concentration - lm_pred))
r2   <- cor(test_data$metabolite_concentration, lm_pred)^2

# Print results
cat("RMSE:", rmse, "\n")
## RMSE: 4.970806
cat("MAE:", mae, "\n")
## MAE: 3.885294
cat("R²:", r2, "\n")
## R²: 0.9003138

The model is being tested to see how accurate it is in predicting and it shows if there are any issues in the model as it processes new data. RMSE score indicates moderate prediction accuracy with a score of 4.9, MAE indicates that model tends to deviate in either direction by 3.8 units and the R^2 indicates that 90% of the data is explained by the model, indicated an excellent fit overall.

# Step 6b: plot predicted vs observed

ggplot(data.frame(Observed = test_data$metabolite_concentration, Predicted = lm_pred),
       aes(x = Observed, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed Metabolite Concentration")

The Predicted vs Observed plot shows that the model performs well and has minimal errors as indicated by the clustering of the data points near the line. The model captures the overall relationship between the predictors and metabolite concentrations and provides a reliable prediction.

Step 7

#Which Variable/Predictor is important in the model
explainer <- explain(lm_model, data = train_data[, -7], 
                     y = train_data$metabolite_concentration)
## Preparation of a new explainer is initiated
##   -> model label       :  lm  (  default  )
##   -> data              :  140  rows  6  cols 
##   -> target variable   :  140  values 
##   -> predict function  :  yhat.lm  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package stats , ver. 4.5.1 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  107.7701 , mean =  145.5337 , max =  189.6574  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -12.1504 , mean =  1.031303e-13 , max =  15.47382  
##   A new explainer has been created!
vip <- model_parts(explainer)
plot(vip) + ggtitle("Variable Importance (DALEX)")

The plot above shows which variable tends to be main predictor in metabolite concentration verifying earlier statements. Mass is the main predictor followed by enzyme activity.

Step 8

#Testing the model on an observation

new_met <- data.frame(
  mass=200,
  logP=3,
  num_rings=2,
  num_H_bonds=3,
  enzyme_activity=60,
  pathway_count=2
)

predict(lm_model, newdata=new_met)
##        1 
## 163.2212

The metabolite concentration predicted for the parameters listed above is 163.22.

Step 9: Cross-Validation

#Step 9a: Cross-Validation: another way to train and predict ----

set.seed(42)

# Define 10-fold CV
train_control <- trainControl(method = "cv", number = 10)

# Train linear regression model with CV
cv_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = train_control,
  metric = "RMSE"
)

# Show CV results
print(cv_model)
## Linear Regression 
## 
## 200 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   5.12678  0.8853752  4.151905
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Extract performance metrics
cv_results <- cv_model$resample
summary(cv_results)
##       RMSE          Rsquared           MAE          Resample        
##  Min.   :3.108   Min.   :0.8112   Min.   :2.494   Length:10         
##  1st Qu.:4.262   1st Qu.:0.8359   1st Qu.:3.543   Class :character  
##  Median :5.319   Median :0.8906   Median :4.329   Mode  :character  
##  Mean   :5.127   Mean   :0.8854   Mean   :4.152                     
##  3rd Qu.:5.940   3rd Qu.:0.9355   3rd Qu.:4.704                     
##  Max.   :6.677   Max.   :0.9599   Max.   :5.506
# Mean CV RMSE, MAE and R²
cat("Mean CV RMSE:", mean(cv_results$RMSE), "\n")
## Mean CV RMSE: 5.12678
cat("Mean CV MAE:", mean(cv_results$MAE), "\n")
## Mean CV MAE: 4.151905
cat("Mean CV R²:", mean(cv_results$Rsquared), "\n")
## Mean CV R²: 0.8853752

A 10-fold cross validation test is being used to assess the model. During the cross validation, one-fold will be used to test the model while the remaining 9 folds will be used to train the model. This will generate RMSA, MAE, and R-squared which are an average of 10 runs.

# Step 9b Visualize cross-validation performance: RMSE, MAE, Rsquared
par(mfrow = c(2, 2))

# RMSE
ggplot(cv_results, aes(x = RMSE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation RMSE Distribution (10-Fold)",
       x = "RMSE", y = "Count")

# MAE
ggplot(cv_results, aes(x = MAE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation MAE Distribution (10-Fold)",
       x = "MAE", y = "Count")

# RSquared
ggplot(cv_results, aes(x = Rsquared)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation Rsquared Distribution (10-Fold)",
       x = "Rsquared", y = "Count")

par(mfrow = c(1, 1))

The plot show the distribution of RMSE, MAE and R-squared during the 10 runs.

# Step 9c: Predict New Metabolite with Cross Validation ----
new_met <- data.frame(
  mass = 200,
  logP = 3,
  num_rings = 2,
  num_H_bonds = 3,
  enzyme_activity = 60,
  pathway_count = 2
)
predict(cv_model, newdata = new_met)
##        1 
## 163.1173

The cross validation model is then used to predict the metabolic concentration which results in a value of 163.1173, which is slightly less than the value predicted by the earlier linear model.

# Step 9d: Compare lm and cv

predict(lm_model, newdata = new_met)
##        1 
## 163.2212
predict(cv_model, newdata = new_met)
##        1 
## 163.1173