Load Libraries

library(caret)
library(ggplot2)
library(car)
library(lmtest)
library(corrplot)
library(GGally)
library(reshape2)
library(glmnet)
library(DALEX)

Step 1: Load the data

met_data <- read.csv("metabolite_data.csv")
head(met_data)
##       mass     logP num_rings num_H_bonds enzyme_activity pathway_count
## 1 165.9881 4.259048         2           4        23.98300             1
## 2 174.2456 3.549930         0           1        45.46802             5
## 3 218.9677 2.287884         3           2        43.24518             4
## 4 181.7627 2.934555         3           5        37.77074             3
## 5 183.2322 2.168528         2           6        65.46609             4
## 6 222.8766 2.119002         0           0        35.84718             4
##   metabolite_concentration
## 1                 122.6491
## 2                 128.3079
## 3                 155.9756
## 4                 145.3023
## 5                 161.8741
## 6                 149.0430
dim(met_data)
## [1] 200   7
str(met_data)
## 'data.frame':    200 obs. of  7 variables:
##  $ mass                    : num  166 174 219 182 183 ...
##  $ logP                    : num  4.26 3.55 2.29 2.93 2.17 ...
##  $ num_rings               : int  2 0 3 3 2 0 2 2 0 2 ...
##  $ num_H_bonds             : int  4 1 2 5 6 0 5 6 4 5 ...
##  $ enzyme_activity         : num  24 45.5 43.2 37.8 65.5 ...
##  $ pathway_count           : int  1 5 4 3 4 4 4 3 3 3 ...
##  $ metabolite_concentration: num  123 128 156 145 162 ...

The file “metabolite_data.csv” contains 200 observations of seven variables describing a hypothetical chemical metabolite: molecular mass, hydrophobicity (logP), number of rings in the structure, number of H-bond donors and acceptors, the activity of the enzyme producing the metabolite, the number of pathways the metabolite participates in, and the target variable, metabolite concentration (nmol/L). The data were downloaded from BioCyc.

Step 2: Basic Exploratory Data Analysis (EDA)

# Step 2a: Check summary statistics (mean, median, etc.) for each variable

summary(met_data)
##       mass            logP          num_rings    num_H_bonds   enzyme_activity
##  Min.   :122.3   Min.   :0.5273   Min.   :0.0   Min.   :0.00   Min.   :23.98  
##  1st Qu.:164.4   1st Qu.:2.0274   1st Qu.:1.0   1st Qu.:1.00   1st Qu.:43.24  
##  Median :178.5   Median :2.5183   Median :2.0   Median :3.00   Median :51.20  
##  Mean   :179.8   Mean   :2.5337   Mean   :1.6   Mean   :3.08   Mean   :49.91  
##  3rd Qu.:194.2   3rd Qu.:3.0719   3rd Qu.:3.0   3rd Qu.:5.00   3rd Qu.:56.38  
##  Max.   :261.0   Max.   :4.5572   Max.   :3.0   Max.   :6.00   Max.   :76.92  
##  pathway_count   metabolite_concentration
##  Min.   :1.000   Min.   :102.8           
##  1st Qu.:2.000   1st Qu.:134.4           
##  Median :3.000   Median :144.0           
##  Mean   :2.955   Mean   :145.3           
##  3rd Qu.:4.000   3rd Qu.:155.8           
##  Max.   :5.000   Max.   :191.0

This step reports the basic summary statistics of each variable: minimum, maximum, mean, median, and quartiles. It would also report a count of NA values if any were missing.
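
A minimal sketch to confirm the absence of missing values explicitly (summary() hides the NA count when it is zero):

# Count missing values per column; all zeros means the data set is complete
colSums(is.na(met_data))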

# Step 2b: Index plots for each variable: x-axis = order in the data set, y-axis = measurement
par(mfrow=c(3,3))
plot(met_data$mass)
plot(met_data$logP)
plot(met_data$num_rings)
plot(met_data$num_H_bonds)
plot(met_data$enzyme_activity)
plot(met_data$pathway_count)
plot(met_data$metabolite_concentration)
par(mfrow = c(1, 1))

The index plots above show each variable against its position in the data set; this kind of plot helps reveal ordering artifacts or drift in how the data were recorded.

# Step 2c: Boxplots with names and 45° labels

# xaxt = "n" suppresses the default x-axis so the rotated labels can be drawn below
boxplot(
  met_data$mass,
  met_data$enzyme_activity,
  met_data$metabolite_concentration,
  main = "Boxplots of Key Metabolite Variables",
  col = c("skyblue", "lightgreen", "salmon"),
  xaxt = "n"
)
labels1 <- c("mass", "enzyme_activity", "metabolite_concentration")
text(x = 1:length(labels1), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels1, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

boxplot(
  met_data$logP,
  met_data$num_rings,
  met_data$num_H_bonds,
  met_data$pathway_count,
  main = "Boxplots of Other Predictors",
  col = c("orange", "purple", "lightblue", "lightpink"),
  xaxt = "n"
)
labels2 <- c("logP", "num_rings", "num_H_bonds", "pathway_count")
text(x = 1:length(labels2), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels2, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

The first figure shows the distributions of mass, enzyme activity, and metabolite concentration, which span a similar numeric range; the second shows the remaining, smaller-scale predictors.

# Step 2d: Correlation Heatmap ----
corrplot(cor(met_data), method = "color", type = "upper", tl.col = "black")

The correlation heatmap shows the pairwise correlations between variables, with color and intensity encoding direction and strength. Mass and metabolite concentration are strongly positively correlated, while the number of rings and hydrophobicity (logP) are essentially uncorrelated.
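
To read exact values rather than colors, the correlation matrix can also be printed directly (a minimal sketch):

# Numeric correlation matrix, rounded for readability
round(cor(met_data), 2)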

# Step 2e: Pairwise relationships
ggpairs(met_data)

These plots show the pairwise relationships between variables: scatter plots below the diagonal, density plots on the diagonal, and correlation coefficients above it. A coefficient near 1 indicates a strong positive correlation, near -1 a strong negative correlation, and near 0 no linear correlation. Enzyme activity and H-bond count are essentially uncorrelated, while metabolite concentration and mass are strongly correlated.

# Step 2f: Distribution Check ----
met_data_long <- melt(met_data)
ggplot(met_data_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 20) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Metabolite Features")

These histograms show the distribution of each metabolite feature; mass, hydrophobicity, and metabolite concentration appear approximately normal, while the integer-valued predictors (num_rings, num_H_bonds, pathway_count) are discrete counts.
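
A formal test can complement the visual impression; a minimal sketch applying the Shapiro-Wilk test to the continuous features:

# Shapiro-Wilk p-values for the continuous features; p > 0.05 is consistent with normality
sapply(met_data[, c("mass", "logP", "enzyme_activity", "metabolite_concentration")],
       function(x) shapiro.test(x)$p.value)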

Step 3: Train/Test Split

set.seed(42)
train_index <- createDataPartition(met_data$metabolite_concentration, p = 0.7, list = FALSE)
train_data <- met_data[train_index, ]
test_data  <- met_data[-train_index, ]

The train/test split evaluates how well the model generalizes to unseen data: the model is fitted on 70% of the observations and evaluated on the held-out 30%. createDataPartition() samples the split at random (stratifying on the outcome), and set.seed() fixes the random number generator so the split is reproducible, which is useful when everyone needs to work with the same partition.
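
A minimal sketch to confirm the resulting partition sizes:

# Observations in each partition (should be roughly 70/30)
nrow(train_data)
nrow(test_data)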

Step 4: Linear Regression Model

lm_model <- lm(metabolite_concentration ~ ., data = train_data)
summary(lm_model)
## 
## Call:
## lm(formula = metabolite_concentration ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1504  -3.7074  -0.1107   2.7126  15.4738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4.24431    4.66993  -0.909    0.365    
## mass             0.50097    0.02003  25.015  < 2e-16 ***
## logP             2.66187    0.56185   4.738 5.47e-06 ***
## num_rings        3.17428    0.39278   8.082 3.44e-13 ***
## num_H_bonds      1.57205    0.23086   6.809 3.06e-10 ***
## enzyme_activity  0.74271    0.04244  17.499  < 2e-16 ***
## pathway_count    1.82975    0.31593   5.792 4.78e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.205 on 133 degrees of freedom
## Multiple R-squared:  0.8904, Adjusted R-squared:  0.8855 
## F-statistic: 180.1 on 6 and 133 DF,  p-value: < 2.2e-16

This summary describes the model fitted on the training data. Each coefficient’s p-value assesses that variable’s contribution, and all six predictors are significant at the 0.05 level. The adjusted R-squared of 0.886 indicates a strong fit, and the residual median near zero suggests roughly symmetric errors.
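
Beyond p-values, confidence intervals show the plausible range of each effect; a minimal sketch:

# 95% confidence intervals for the regression coefficients
confint(lm_model, level = 0.95)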

Step 5: Diagnostics

# Step 5a Check residual normality
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.98715, p-value = 0.2191
par(mfrow = c(2, 3))
hist(lm_model$residuals, main = "Histogram of Residuals")
plot(lm_model)
par(mfrow = c(1, 1))

These diagnostics check whether the model’s assumptions hold. A Shapiro-Wilk p-value above 0.05 (here 0.219) is consistent with normally distributed residuals. The histogram looks approximately normal, and the standard lm diagnostic plots (residuals vs. fitted, Q-Q, scale-location, residuals vs. leverage) show no serious violations, so the model appears reasonable.

# Step 5b: Multicollinearity check

vif(lm_model)
##            mass            logP       num_rings     num_H_bonds enzyme_activity 
##        1.080017        1.061047        1.031570        1.095657        1.031244 
##   pathway_count 
##        1.084573

The VIF (Variance Inflation Factor) for every predictor is close to 1, indicating that each variable provides largely independent information and that multicollinearity is essentially absent; a common rule of thumb flags VIF values above 5-10 as problematic.

# Step 5c: Homoscedasticity check
bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 14.878, df = 6, p-value = 0.02122

The Breusch-Pagan test gives BP = 14.878 with a p-value of 0.02122. Since this is below 0.05, the null hypothesis of constant residual variance is rejected, indicating some heteroscedasticity; the model’s standard errors and p-values may therefore be somewhat unreliable.
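
One common remedy is to recompute the coefficient tests with heteroscedasticity-consistent standard errors; a minimal sketch using coeftest() from lmtest with hccm() from car (both packages are already loaded):

# Coefficient tests with HC3 robust standard errors,
# which remain valid under non-constant residual variance
coeftest(lm_model, vcov. = hccm(lm_model, type = "hc3"))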

Step 6: Model Performance

lm_pred <- predict(lm_model, newdata = test_data)

# Step 6a: Calculate performance metrics and print
rmse <- sqrt(mean((test_data$metabolite_concentration - lm_pred)^2))
mae  <- mean(abs(test_data$metabolite_concentration - lm_pred))
r2   <- cor(test_data$metabolite_concentration, lm_pred)^2

# Print results
cat("RMSE:", rmse, "\n")
## RMSE: 4.970806
cat("MAE:", mae, "\n")
## MAE: 3.885294
cat("R²:", r2, "\n")
## R²: 0.9003138

This model has an RMSE of 4.97, meaning the predictions deviate from the true values by about 4.97 units on average, with larger errors weighted more heavily. The MAE of 3.89 means the predictions are off by 3.89 units on average. The R-squared of 0.90 indicates a strong, though not perfect, fit.
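
As a cross-check, caret computes the same three metrics in a single call; a minimal sketch:

# RMSE, Rsquared, and MAE from caret's built-in helper
postResample(pred = lm_pred, obs = test_data$metabolite_concentration)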

# Step 6b: plot predicted vs observed

ggplot(data.frame(Observed = test_data$metabolite_concentration, Predicted = lm_pred),
       aes(x = Observed, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed Metabolite Concentration")

The model’s predicted values cluster along the dashed identity line (predicted = observed), showing that the model performs with high accuracy on the test set.

Step 7: Variable Importance

# Which variables/predictors are important in the model
explainer <- explain(lm_model, data = train_data[, -7], 
                     y = train_data$metabolite_concentration)
## Preparation of a new explainer is initiated
##   -> model label       :  lm  (  default  )
##   -> data              :  140  rows  6  cols 
##   -> target variable   :  140  values 
##   -> predict function  :  yhat.lm  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package stats , ver. 4.4.2 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  107.7701 , mean =  145.5337 , max =  189.6574  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -12.1504 , mean =  1.031303e-13 , max =  15.47382  
##   A new explainer has been created!
vip <- model_parts(explainer)
plot(vip) + ggtitle("Variable Importance (DALEX)")

This plot ranks the predictors by how much the model’s performance degrades when each variable is permuted (for regression, model_parts() uses root-mean-square-error loss by default). Mass and enzyme activity are the most important predictors.
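
DALEX can also explain individual predictions; a minimal sketch using predict_parts() to decompose one training observation’s prediction into per-variable contributions (bd is just an illustrative name):

# Break-down plot: how each variable shifts the prediction for a single observation
bd <- predict_parts(explainer, new_observation = train_data[1, -7])
plot(bd)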

Step 8: Predict a New Metabolite

# Testing the model on a single new observation

new_met <- data.frame(
  mass=200,
  logP=3,
  num_rings=2,
  num_H_bonds=3,
  enzyme_activity=60,
  pathway_count=2
)

predict(lm_model, newdata=new_met)
##        1 
## 163.2212

Given these feature values, the model predicts a metabolite concentration of 163.22 nmol/L.
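
A point prediction hides its uncertainty; for lm objects, predict() can also return a prediction interval (a minimal sketch):

# 95% prediction interval for the new metabolite's concentration
predict(lm_model, newdata = new_met, interval = "prediction", level = 0.95)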

Step 9: Cross-Validation

# Step 9a: Cross-validation: another way to train and predict ----

set.seed(42)

# Define 10-fold CV
train_control <- trainControl(method = "cv", number = 10)

# Train linear regression model with CV
cv_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = train_control,
  metric = "RMSE"
)

# Show CV results
print(cv_model)
## Linear Regression 
## 
## 200 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   5.12678  0.8853752  4.151905
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Extract performance metrics
cv_results <- cv_model$resample
summary(cv_results)
##       RMSE          Rsquared           MAE          Resample        
##  Min.   :3.108   Min.   :0.8112   Min.   :2.494   Length:10         
##  1st Qu.:4.262   1st Qu.:0.8359   1st Qu.:3.543   Class :character  
##  Median :5.319   Median :0.8906   Median :4.329   Mode  :character  
##  Mean   :5.127   Mean   :0.8854   Mean   :4.152                     
##  3rd Qu.:5.940   3rd Qu.:0.9355   3rd Qu.:4.704                     
##  Max.   :6.677   Max.   :0.9599   Max.   :5.506
# Mean CV RMSE, MAE and R²
cat("Mean CV RMSE:", mean(cv_results$RMSE), "\n")
## Mean CV RMSE: 5.12678
cat("Mean CV MAE:", mean(cv_results$MAE), "\n")
## Mean CV MAE: 4.151905
cat("Mean CV R²:", mean(cv_results$Rsquared), "\n")
## Mean CV R²: 0.8853752

A 10-fold cross-validation is performed on the full dataset, and the per-fold and mean values of RMSE, MAE, and R-squared are shown. The mean CV RMSE of 5.13 agrees closely with the holdout RMSE of 4.97 from Step 6, suggesting the earlier estimate did not depend on one lucky split.
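
For a more stable estimate, the resampling can be repeated; a minimal sketch using caret's repeated CV (rep_control and rep_model are illustrative names, and 5 repeats is an arbitrary choice):

# 10-fold cross-validation repeated 5 times for a more stable error estimate
set.seed(42)
rep_control <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
rep_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = rep_control,
  metric = "RMSE"
)
rep_model$results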

# Step 9b Visualize cross-validation performance: RMSE, MAE, Rsquared
# Note: par(mfrow = ...) has no effect on ggplot2 output, so each plot prints separately

# RMSE
ggplot(cv_results, aes(x = RMSE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation RMSE Distribution (10-Fold)",
       x = "RMSE", y = "Count")

# MAE
ggplot(cv_results, aes(x = MAE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation MAE Distribution (10-Fold)",
       x = "MAE", y = "Count")

# RSquared
ggplot(cv_results, aes(x = Rsquared)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation Rsquared Distribution (10-Fold)",
       x = "Rsquared", y = "Count")


These three histograms show the distribution of RMSE, MAE, and R-squared across the ten folds.
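
As an alternative to three separate figures, the metrics can be faceted into one plot; a minimal sketch using melt() from reshape2 (already loaded; cv_long is an illustrative name):

# Reshape fold-level metrics to long format and facet by metric
cv_long <- melt(cv_results[, c("RMSE", "Rsquared", "MAE")])
ggplot(cv_long, aes(x = value)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Cross-Validation Metrics Across Folds (10-Fold)")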

# Step 9c: Predict New Metabolite with Cross Validation ----
new_met <- data.frame(
  mass = 200,
  logP = 3,
  num_rings = 2,
  num_H_bonds = 3,
  enzyme_activity = 60,
  pathway_count = 2
)
predict(cv_model, newdata = new_met)
##        1 
## 163.1173

Predicting the same observation with the caret model gives a slightly different value. This is not an average over folds: after resampling, caret refits the final model on all 200 observations, so its coefficients differ slightly from those of lm_model, which was fitted on only the 140-row training split.

# Step 9d: Compare lm and cv

predict(lm_model, newdata = new_met)
##        1 
## 163.2212
predict(cv_model, newdata = new_met)
##        1 
## 163.1173

The two predictions differ slightly because the models were fitted on different data: cv_model’s final model uses all 200 observations (cross-validation in caret estimates performance; it does not average fold predictions), while lm_model saw only the 70% training split. The full-data fit may be slightly more precise simply because it is estimated from more observations.
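
This can be verified by comparing the two coefficient vectors side by side (a minimal sketch; caret stores the refit lm object in cv_model$finalModel, and split_fit/full_fit are just column labels):

# Coefficients differ slightly because the fits used different amounts of data
# (140 training rows vs. all 200 rows)
cbind(split_fit = coef(lm_model), full_fit = coef(cv_model$finalModel))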