Load Libraries

library(caret)
library(ggplot2)
library(car)
library(lmtest)
library(corrplot)
library(GGally)
library(reshape2)
library(glmnet)
library(DALEX)

Step 1: Load the data

met_data <- read.csv("metabolite_data.csv")
head(met_data)
##       mass     logP num_rings num_H_bonds enzyme_activity pathway_count
## 1 165.9881 4.259048         2           4        23.98300             1
## 2 174.2456 3.549930         0           1        45.46802             5
## 3 218.9677 2.287884         3           2        43.24518             4
## 4 181.7627 2.934555         3           5        37.77074             3
## 5 183.2322 2.168528         2           6        65.46609             4
## 6 222.8766 2.119002         0           0        35.84718             4
##   metabolite_concentration
## 1                 122.6491
## 2                 128.3079
## 3                 155.9756
## 4                 145.3023
## 5                 161.8741
## 6                 149.0430
dim(met_data)
## [1] 200   7
str(met_data)
## 'data.frame':    200 obs. of  7 variables:
##  $ mass                    : num  166 174 219 182 183 ...
##  $ logP                    : num  4.26 3.55 2.29 2.93 2.17 ...
##  $ num_rings               : int  2 0 3 3 2 0 2 2 0 2 ...
##  $ num_H_bonds             : int  4 1 2 5 6 0 5 6 4 5 ...
##  $ enzyme_activity         : num  24 45.5 43.2 37.8 65.5 ...
##  $ pathway_count           : int  1 5 4 3 4 4 4 3 3 3 ...
##  $ metabolite_concentration: num  123 128 156 145 162 ...

The data shows the information for the first 6 data entries. We see that there are 200 observations for 7 variables (mass, logP, num_rings, num_H_bonds, enzyme_activity, pathway_count, and metabolite_concentration.

Step 2: Basic EDA: Exploratory Data Analysis

#Step 2a: check the metrics (mean, media etc) for each variable

summary(met_data)
##       mass            logP          num_rings    num_H_bonds   enzyme_activity
##  Min.   :122.3   Min.   :0.5273   Min.   :0.0   Min.   :0.00   Min.   :23.98  
##  1st Qu.:164.4   1st Qu.:2.0274   1st Qu.:1.0   1st Qu.:1.00   1st Qu.:43.24  
##  Median :178.5   Median :2.5183   Median :2.0   Median :3.00   Median :51.20  
##  Mean   :179.8   Mean   :2.5337   Mean   :1.6   Mean   :3.08   Mean   :49.91  
##  3rd Qu.:194.2   3rd Qu.:3.0719   3rd Qu.:3.0   3rd Qu.:5.00   3rd Qu.:56.38  
##  Max.   :261.0   Max.   :4.5572   Max.   :3.0   Max.   :6.00   Max.   :76.92  
##  pathway_count   metabolite_concentration
##  Min.   :1.000   Min.   :102.8           
##  1st Qu.:2.000   1st Qu.:134.4           
##  Median :3.000   Median :144.0           
##  Mean   :2.955   Mean   :145.3           
##  3rd Qu.:4.000   3rd Qu.:155.8           
##  Max.   :5.000   Max.   :191.0

This step allows us to check the metrics for each variable. We can see the minimum, maximum, mean, median, 1st and second quartiles, and maximum. We can tell that no data is missing.

# Step 2b: plots for each variable: x-axis = order in the data set, y-axis: measurement
par(mfrow=c(3,3))
plot(met_data$mass)
plot(met_data$logP)
plot(met_data$num_rings)
plot(met_data$num_H_bonds)
plot(met_data$enzyme_activity)
plot(met_data$pathway_count)
plot(met_data$metabolite_concentration)
par(mfrow = c(1, 1))

We are looking for the trends in data. We can see that there are no outliers in the data.

# Step 2c: Boxplots with names and 45° labels

boxplot(
  met_data$mass,
  met_data$enzyme_activity,
  met_data$metabolite_concentration,
  names = FALSE,
  main = "Boxplots of Key Metabolite Variables",
  col = c("skyblue", "lightgreen", "salmon"),
  xaxt = "n"
)
labels1 <- c("mass", "enzyme_activity", "metabolite_concentration")
text(x = 1:length(labels1), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels1, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

boxplot(
  met_data$logP,
  met_data$num_rings,
  met_data$num_H_bonds,
  met_data$pathway_count,
  names = FALSE,
  main = "Boxplots of Other Predictors",
  col = c("orange", "purple", "lightblue", "lightpink"),
  xaxt = "n"
)
labels2 <- c("logP", "num_rings", "num_H_bonds", "pathway_count")
text(x = 1:length(labels2), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
     labels = labels2, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)

For the first boxplot, key metabolite variables, we can see that mass, enzyme_activity, and metabolite_concentration have acceptable distribution so we can move forward. The second boxplot show the other variables and we see that they also have acceptable distribution.

# Step 2d: Correlation Heatmap ----
corrplot(cor(met_data), method = "color", type = "upper", tl.col = "black")

We are looking for correlation between variables here. The darker the color, the higher the correlation. We can see that mass and metabolite_concentration have the darkest color so we could suggest that they might have interactions. The second darkest square is enzyme_activity so that variable is also of interest to the analysis.

# Step 2e: Pairwise relationships
ggpairs(met_data)

When looking at the correlation shown in the graphs, we are looking for correlations closest to 1 or -1. We can see that mass has the highest correlation with a value of .710 and enzyme_activity is second highest at a value of .447. This confirms what we saw in the previous correlation data. Both of the values are positive numbers which means that there is a positive correlation between metabolite_concentration and both mass and enzyme activity.

# Step 2f: Distribution Check ----
met_data_long <- melt(met_data)
ggplot(met_data_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 20) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distributions of Metabolite Features")

We use histograms to look for normal distribution. Mass, logP, and enzyme activity all show normal distribution.

Step 3: Train and Split

set.seed(42)
train_index <- createDataPartition(met_data$metabolite_concentration, p = 0.7, list = FALSE)
train_data <- met_data[train_index, ]
test_data  <- met_data[-train_index, ]

This step is used to train-test split. We set the seed to the same number so that everyone gets the same results so we can compare. We are using 70% to train and 30% to test.

Step 4: Linear regression Model

lm_model <- lm(metabolite_concentration ~ ., data = train_data)
summary(lm_model)
## 
## Call:
## lm(formula = metabolite_concentration ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1504  -3.7074  -0.1107   2.7126  15.4738 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4.24431    4.66993  -0.909    0.365    
## mass             0.50097    0.02003  25.015  < 2e-16 ***
## logP             2.66187    0.56185   4.738 5.47e-06 ***
## num_rings        3.17428    0.39278   8.082 3.44e-13 ***
## num_H_bonds      1.57205    0.23086   6.809 3.06e-10 ***
## enzyme_activity  0.74271    0.04244  17.499  < 2e-16 ***
## pathway_count    1.82975    0.31593   5.792 4.78e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.205 on 133 degrees of freedom
## Multiple R-squared:  0.8904, Adjusted R-squared:  0.8855 
## F-statistic: 180.1 on 6 and 133 DF,  p-value: < 2.2e-16

We are looking for p values that are lower than 0.05. A value of lower than 0.05 would represent significance. We also see that the residual median is close to 0 which is good. The adjusted R squared is 0.8855 which tells us that 88.55% of the variable in the target variable is explained by the predictors. This indicates a good fit.

Step 5: Diagnostics

# Step 5a Check residual normality
shapiro.test(lm_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_model$residuals
## W = 0.98715, p-value = 0.2191
par(mfrow = c(2, 3))
hist(lm_model$residuals, main = "Histogram of Residuals")
plot(lm_model)
par(mfrow = c(1, 1))

We are using this step to make sure that the model is a good fit and reliable. The histogram shows that the residuals have a normal distribution. The model looks good. The residuals vs. fitted plot shows a red line around zero. The distance above and below the line show the difference between the observed and predicted values. The residuals appear randomly distributed around the horizontal line at 0. This suggest that the linear model is appropriate. The Q-Q plot shows whether the residuals are normally distributed. If the residuals follow a normal distribution, the points should fall along the diagonal dotted line. Our graph shows this normal distribution. The scale-location plot is used to check the homoscedasticity - whether the residuals have constant variance across all levels of fitted values. We are looking to see a horizontal red line with points evenly scattered - this means the variance of the residuals is constant and the assumption of homoscedasticity is met - we indeed see this with our graph. Finally we have the residuals vs. leverage graph. This graph is used to assess the quality and reliability of a linear regression model. The residuals are mostly scattered radomly around zero which suggessts the model’s assumptions are reasonable. The red smooth line is nearly flat which confirms no strong pattern between residuals and leverage.

# Step 5b Statistical significance: Multicollinearity

vif(lm_model)
##            mass            logP       num_rings     num_H_bonds enzyme_activity 
##        1.080017        1.061047        1.031570        1.095657        1.031244 
##   pathway_count 
##        1.084573

VIF checks for multicollinearity. For the VIF, if VIF=1 then there is no correlation between this variable and others. If VIF is between 1 and 5, then there is moderate correlation. If VIF>10 then there is strong multicollinearity which is problematic. All VIF values are very close to 1 which means there is virtually no multicollinearity among the predictors. Each variable provides independent information to the model.

# Step 5c Statistical significance: Homoscadasticity
bptest(lm_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_model
## BP = 14.878, df = 6, p-value = 0.02122

The studentized Breusch-Pagan test is for heteroscedasticity in a linear regression model. This test checks whether the residual variance depends on the predictor variables - whether heteroscedasticy is present. Since the p-value is < 0.05, you reject the null hypothesis. This tells us that there is evidence of heteroscedasticity.

Step 6: Model performance

lm_pred <- predict(lm_model, newdata = test_data)

# Step 6a: Calculate performance metrics and print
rmse <- sqrt(mean((test_data$metabolite_concentration - lm_pred)^2))
mae  <- mean(abs(test_data$metabolite_concentration - lm_pred))
r2   <- cor(test_data$metabolite_concentration, lm_pred)^2

# Print results
cat("RMSE:", rmse, "\n")
## RMSE: 4.970806
cat("MAE:", mae, "\n")
## MAE: 3.885294
cat("R²:", r2, "\n")
## R²: 0.9003138

RMSE is root mean square error and it is used to evaluate h well a regression model predicts continuous outcomes. Our model has a RMSE of 4.9 which means that the model’s predictions deviate from the true values by about 4.9 units. MAE is the mean absolute error and measures the average size of the errors in the predictions. Our MAE is 3.88 which means that on average the model’s predicted values differ from the actual metabolite concentration by about 3.88 units. This could be in either direction. R squared is a statistical measure that indicates how well a regression model explains the variability of the response variable. R squared ranges from 0 to 1 and the closer to 1, the better the model. Our value of .9 means that 90% of the variability in the response variable is explained by the model - this is an excellent fit.

# Step 6b: plot predicted vs observed

ggplot(data.frame(Observed = test_data$metabolite_concentration, Predicted = lm_pred),
       aes(x = Observed, y = Predicted)) +
  geom_point(color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Predicted vs Observed Metabolite Concentration")

The predicted vs. observed plot is used to evaluate how well a regression model’s predictions match the real observed values. The x-axis shows the observed or true values while the points represent one observation. The red dashed line represents the ideal case where predictions perfectly equal observations where y = x. In our ploted graph, we see that most of the dots are close to the red line which indicates that the model performs very well with minimal prediction error and high accuracy.

Step 7

#Which Variable/Predictor is important in the model
explainer <- explain(lm_model, data = train_data[, -7], 
                     y = train_data$metabolite_concentration)
## Preparation of a new explainer is initiated
##   -> model label       :  lm  (  default  )
##   -> data              :  140  rows  6  cols 
##   -> target variable   :  140  values 
##   -> predict function  :  yhat.lm  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package stats , ver. 4.5.1 , task regression (  default  ) 
##   -> predicted values  :  numerical, min =  107.7701 , mean =  145.5337 , max =  189.6574  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -12.1504 , mean =  1.031303e-13 , max =  15.47382  
##   A new explainer has been created!
vip <- model_parts(explainer)
plot(vip) + ggtitle("Variable Importance (DALEX)")

This step will plot a graph that will show which predictor brings more to predict the target. The main predictors are mass and enzyme activity. This confirms the model.

Step 8

#Testing the model on an observation

new_met <- data.frame(
  mass=200,
  logP=3,
  num_rings=2,
  num_H_bonds=3,
  enzyme_activity=60,
  pathway_count=2
)

predict(lm_model, newdata=new_met)
##        1 
## 163.2212

We used the model to make a prediction for a new unseen observation for the metabolite. The predict function indicates a concentration of 163.22

Step 9: Cross-Validation

#Step 9a: Cross-Validation: another way to train and predict ----

set.seed(42)

# Define 10-fold CV
train_control <- trainControl(method = "cv", number = 10)

# Train linear regression model with CV
cv_model <- train(
  metabolite_concentration ~ .,
  data = met_data,
  method = "lm",
  trControl = train_control,
  metric = "RMSE"
)

# Show CV results
print(cv_model)
## Linear Regression 
## 
## 200 samples
##   6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results:
## 
##   RMSE     Rsquared   MAE     
##   5.12678  0.8853752  4.151905
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Extract performance metrics
cv_results <- cv_model$resample
summary(cv_results)
##       RMSE          Rsquared           MAE          Resample        
##  Min.   :3.108   Min.   :0.8112   Min.   :2.494   Length:10         
##  1st Qu.:4.262   1st Qu.:0.8359   1st Qu.:3.543   Class :character  
##  Median :5.319   Median :0.8906   Median :4.329   Mode  :character  
##  Mean   :5.127   Mean   :0.8854   Mean   :4.152                     
##  3rd Qu.:5.940   3rd Qu.:0.9355   3rd Qu.:4.704                     
##  Max.   :6.677   Max.   :0.9599   Max.   :5.506
# Mean CV RMSE, MAE and R²
cat("Mean CV RMSE:", mean(cv_results$RMSE), "\n")
## Mean CV RMSE: 5.12678
cat("Mean CV MAE:", mean(cv_results$MAE), "\n")
## Mean CV MAE: 4.151905
cat("Mean CV R²:", mean(cv_results$Rsquared), "\n")
## Mean CV R²: 0.8853752

Cross validation in linear regression is a technique used to assess how well a model generalizes to unseen data. The RMSE from this model is 5.12 while the earlier model it was 4.9. The MAE from this model is 4.15 while the earlier model was 3.88. The R squared from this value is .885 while it .9 from the earlier model. The values for each these are quite similar.

# Step 9b Visualize cross-validation performance: RMSE, MAE, Rsquared
par(mfrow = c(2, 2))

# RMSE
ggplot(cv_results, aes(x = RMSE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation RMSE Distribution (10-Fold)",
       x = "RMSE", y = "Count")

# MAE
ggplot(cv_results, aes(x = MAE)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation MAE Distribution (10-Fold)",
       x = "MAE", y = "Count")

# RSquared
ggplot(cv_results, aes(x = Rsquared)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Cross-Validation Rsquared Distribution (10-Fold)",
       x = "Rsquared", y = "Count")

par(mfrow = c(1, 1))

We see similar graphs for MAE and R squared.

# Step 9c: Predict New Metabolite with Cross Validation ----
new_met <- data.frame(
  mass = 200,
  logP = 3,
  num_rings = 2,
  num_H_bonds = 3,
  enzyme_activity = 60,
  pathway_count = 2
)
predict(cv_model, newdata = new_met)
##        1 
## 163.1173

With this type of trainin, the prediction is 163.1173 compared to the previous prediction which was 163.22

# Step 9d: Compare lm and cv

predict(lm_model, newdata = new_met)
##        1 
## 163.2212
predict(cv_model, newdata = new_met)
##        1 
## 163.1173