We will use use a supervised machine learning linear regression program to predict metabolite concentration.
Load Libraries
library(caret)
library(ggplot2)
library(car)
library(lmtest)
library(corrplot)
library(GGally)
library(reshape2)
library(glmnet)
library(DALEX)
Step 1: Load the data
met_data <- read.csv("metabolite_data.csv")
head(met_data)
## mass logP num_rings num_H_bonds enzyme_activity pathway_count
## 1 165.9881 4.259048 2 4 23.98300 1
## 2 174.2456 3.549930 0 1 45.46802 5
## 3 218.9677 2.287884 3 2 43.24518 4
## 4 181.7627 2.934555 3 5 37.77074 3
## 5 183.2322 2.168528 2 6 65.46609 4
## 6 222.8766 2.119002 0 0 35.84718 4
## metabolite_concentration
## 1 122.6491
## 2 128.3079
## 3 155.9756
## 4 145.3023
## 5 161.8741
## 6 149.0430
dim(met_data)
## [1] 200 7
str(met_data)
## 'data.frame': 200 obs. of 7 variables:
## $ mass : num 166 174 219 182 183 ...
## $ logP : num 4.26 3.55 2.29 2.93 2.17 ...
## $ num_rings : int 2 0 3 3 2 0 2 2 0 2 ...
## $ num_H_bonds : int 4 1 2 5 6 0 5 6 4 5 ...
## $ enzyme_activity : num 24 45.5 43.2 37.8 65.5 ...
## $ pathway_count : int 1 5 4 3 4 4 4 3 3 3 ...
## $ metabolite_concentration: num 123 128 156 145 162 ...
Comment here: Met data loaded with 200 observations of 7 variables
Step 2: Basic EDA: Exploratory Data Analysis
#Step 2a: check the metrics (mean, media etc) for each variable
summary(met_data)
## mass logP num_rings num_H_bonds enzyme_activity
## Min. :122.3 Min. :0.5273 Min. :0.0 Min. :0.00 Min. :23.98
## 1st Qu.:164.4 1st Qu.:2.0274 1st Qu.:1.0 1st Qu.:1.00 1st Qu.:43.24
## Median :178.5 Median :2.5183 Median :2.0 Median :3.00 Median :51.20
## Mean :179.8 Mean :2.5337 Mean :1.6 Mean :3.08 Mean :49.91
## 3rd Qu.:194.2 3rd Qu.:3.0719 3rd Qu.:3.0 3rd Qu.:5.00 3rd Qu.:56.38
## Max. :261.0 Max. :4.5572 Max. :3.0 Max. :6.00 Max. :76.92
## pathway_count metabolite_concentration
## Min. :1.000 Min. :102.8
## 1st Qu.:2.000 1st Qu.:134.4
## Median :3.000 Median :144.0
## Mean :2.955 Mean :145.3
## 3rd Qu.:4.000 3rd Qu.:155.8
## Max. :5.000 Max. :191.0
Comment here: metric data loaded for mass, logP, num_rings, num_H_bonds, enzyme_activity, pathway_count, and metabolite_concentration
# Step 2b: plots for each variable: x-axis = order in the data set, y-axis: measurement
par(mfrow=c(3,3))
plot(met_data$mass)
plot(met_data$logP)
plot(met_data$num_rings)
plot(met_data$num_H_bonds)
plot(met_data$enzyme_activity)
plot(met_data$pathway_count)
plot(met_data$metabolite_concentration)
par(mfrow = c(1, 1))
Comment here:
# Step 2c: Boxplots with names and 45° labels
boxplot(
met_data$mass,
met_data$enzyme_activity,
met_data$metabolite_concentration,
names = FALSE,
main = "Boxplots of Key Metabolite Variables",
col = c("skyblue", "lightgreen", "salmon"),
xaxt = "n"
)
labels1 <- c("mass", "enzyme_activity", "metabolite_concentration")
text(x = 1:length(labels1), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
labels = labels1, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)
boxplot(
met_data$logP,
met_data$num_rings,
met_data$num_H_bonds,
met_data$pathway_count,
names = FALSE,
main = "Boxplots of Other Predictors",
col = c("orange", "purple", "lightblue", "lightpink"),
xaxt = "n"
)
labels2 <- c("logP", "num_rings", "num_H_bonds", "pathway_count")
text(x = 1:length(labels2), y = par("usr")[3] - 0.05 * diff(par("usr")[3:4]),
labels = labels2, srt = 45, adj = 1, xpd = TRUE, cex = 0.9)
Comment here:
# Step 2d: Correlation Heatmap ----
corrplot(cor(met_data), method = "color", type = "upper", tl.col = "black")
Comment here:
# Step 2e: Pairwise relationships
ggpairs(met_data)
Comment here:
# Step 2f: Distribution Check ----
met_data_long <- melt(met_data)
ggplot(met_data_long, aes(x = value, fill = variable)) +
geom_histogram(bins = 20) +
facet_wrap(~variable, scales = "free") +
theme_minimal() +
labs(title = "Distributions of Metabolite Features")
Comment here:
Step 3: Train and Split
set.seed(42)
train_index <- createDataPartition(met_data$metabolite_concentration, p = 0.7, list = FALSE)
train_data <- met_data[train_index, ]
test_data <- met_data[-train_index, ]
Comment here: Training Data and test data
Step 4: Linear regression Model
lm_model <- lm(metabolite_concentration ~ ., data = train_data)
summary(lm_model)
##
## Call:
## lm(formula = metabolite_concentration ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1504 -3.7074 -0.1107 2.7126 15.4738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.24431 4.66993 -0.909 0.365
## mass 0.50097 0.02003 25.015 < 2e-16 ***
## logP 2.66187 0.56185 4.738 5.47e-06 ***
## num_rings 3.17428 0.39278 8.082 3.44e-13 ***
## num_H_bonds 1.57205 0.23086 6.809 3.06e-10 ***
## enzyme_activity 0.74271 0.04244 17.499 < 2e-16 ***
## pathway_count 1.82975 0.31593 5.792 4.78e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.205 on 133 degrees of freedom
## Multiple R-squared: 0.8904, Adjusted R-squared: 0.8855
## F-statistic: 180.1 on 6 and 133 DF, p-value: < 2.2e-16
Comment here: Adjusted R-squared indicates about 88.55% of the variability in the target variable is explained by the predictors and indicates a good fit.
Step 5: Diagnostics
# Step 5a Check residual normality
shapiro.test(lm_model$residuals)
##
## Shapiro-Wilk normality test
##
## data: lm_model$residuals
## W = 0.98715, p-value = 0.2191
par(mfrow = c(2, 3))
hist(lm_model$residuals, main = "Histogram of Residuals")
plot(lm_model)
par(mfrow = c(1, 1))
Comment here: The histogram of residuals shows that they have a normal
distribution centered around 0
# Step 5b Statistical significance: Multicollinearity
vif(lm_model)
## mass logP num_rings num_H_bonds enzyme_activity
## 1.080017 1.061047 1.031570 1.095657 1.031244
## pathway_count
## 1.084573
Comment here: The VIF values for all predictors are close to 1, indicating no significant multicollinearity
# Step 5c Statistical significance: Homoscadasticity
bptest(lm_model)
##
## studentized Breusch-Pagan test
##
## data: lm_model
## BP = 14.878, df = 6, p-value = 0.02122
Comment here: Since the p-value is less than 0.05, the null hypothesis is rejected. This indicates evidence of heteroscedasticity. The Breusch-Pagan test suggests that the residuals do not have constant variance.
Step 6: Model performance
lm_pred <- predict(lm_model, newdata = test_data)
# Step 6a: Calculate performance metrics and print
rmse <- sqrt(mean((test_data$metabolite_concentration - lm_pred)^2))
mae <- mean(abs(test_data$metabolite_concentration - lm_pred))
r2 <- cor(test_data$metabolite_concentration, lm_pred)^2
# Print results
cat("RMSE:", rmse, "\n")
## RMSE: 4.970806
cat("MAE:", mae, "\n")
## MAE: 3.885294
cat("R²:", r2, "\n")
## R²: 0.9003138
Comment here: R2 value of 0.9 indicates that 90% of the variability in the response variable is explained by the model and indicates an excellent fit
# Step 6b: plot predicted vs observed
ggplot(data.frame(Observed = test_data$metabolite_concentration, Predicted = lm_pred),
aes(x = Observed, y = Predicted)) +
geom_point(color = "blue") +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "Predicted vs Observed Metabolite Concentration")
Comment here: In the plot, the points are tightly clustered along the
red dashed line, indicating that the model performs well with minimal
prediction error
Step 7
#Which Variable/Predictor is important in the model
explainer <- explain(lm_model, data = train_data[, -7],
y = train_data$metabolite_concentration)
## Preparation of a new explainer is initiated
## -> model label : lm ( default )
## -> data : 140 rows 6 cols
## -> target variable : 140 values
## -> predict function : yhat.lm will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package stats , ver. 4.5.1 , task regression ( default )
## -> predicted values : numerical, min = 107.7701 , mean = 145.5337 , max = 189.6574
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -12.1504 , mean = 1.031303e-13 , max = 15.47382
## A new explainer has been created!
vip <- model_parts(explainer)
plot(vip) + ggtitle("Variable Importance (DALEX)")
Comment here: The graph shows that mass and enzyme activity are the main
predictors and logP is the least
Step 8
#Testing the model on an observation
new_met <- data.frame(
mass=200,
logP=3,
num_rings=2,
num_H_bonds=3,
enzyme_activity=60,
pathway_count=2
)
predict(lm_model, newdata=new_met)
## 1
## 163.2212
Comment here: The function indicates that the observation will correspond to a concentration of 163.2212
Step 9: Cross-Validation
#Step 9a: Cross-Validation: another way to train and predict ----
set.seed(42)
# Define 10-fold CV
train_control <- trainControl(method = "cv", number = 10)
# Train linear regression model with CV
cv_model <- train(
metabolite_concentration ~ .,
data = met_data,
method = "lm",
trControl = train_control,
metric = "RMSE"
)
# Show CV results
print(cv_model)
## Linear Regression
##
## 200 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.12678 0.8853752 4.151905
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Extract performance metrics
cv_results <- cv_model$resample
summary(cv_results)
## RMSE Rsquared MAE Resample
## Min. :3.108 Min. :0.8112 Min. :2.494 Length:10
## 1st Qu.:4.262 1st Qu.:0.8359 1st Qu.:3.543 Class :character
## Median :5.319 Median :0.8906 Median :4.329 Mode :character
## Mean :5.127 Mean :0.8854 Mean :4.152
## 3rd Qu.:5.940 3rd Qu.:0.9355 3rd Qu.:4.704
## Max. :6.677 Max. :0.9599 Max. :5.506
# Mean CV RMSE, MAE and R²
cat("Mean CV RMSE:", mean(cv_results$RMSE), "\n")
## Mean CV RMSE: 5.12678
cat("Mean CV MAE:", mean(cv_results$MAE), "\n")
## Mean CV MAE: 4.151905
cat("Mean CV R²:", mean(cv_results$Rsquared), "\n")
## Mean CV R²: 0.8853752
Comment here:
# Step 9b Visualize cross-validation performance: RMSE, MAE, Rsquared
par(mfrow = c(2, 2))
# RMSE
ggplot(cv_results, aes(x = RMSE)) +
geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
theme_minimal() +
labs(title = "Cross-Validation RMSE Distribution (10-Fold)",
x = "RMSE", y = "Count")
# MAE
ggplot(cv_results, aes(x = MAE)) +
geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
theme_minimal() +
labs(title = "Cross-Validation MAE Distribution (10-Fold)",
x = "MAE", y = "Count")
# RSquared
ggplot(cv_results, aes(x = Rsquared)) +
geom_histogram(bins = 10, fill = "steelblue", color = "white", alpha = 0.8) +
theme_minimal() +
labs(title = "Cross-Validation Rsquared Distribution (10-Fold)",
x = "Rsquared", y = "Count")
par(mfrow = c(1, 1))
Comment here:
# Step 9c: Predict New Metabolite with Cross Validation ----
new_met <- data.frame(
mass = 200,
logP = 3,
num_rings = 2,
num_H_bonds = 3,
enzyme_activity = 60,
pathway_count = 2
)
predict(cv_model, newdata = new_met)
## 1
## 163.1173
Comment here:
# Step 9d: Compare lm and cv
predict(lm_model, newdata = new_met)
## 1
## 163.2212
predict(cv_model, newdata = new_met)
## 1
## 163.1173