# =============================================================================

0. Environment Setup

Before we begin, we ensure that all required R packages are available. The function install_if_missing() checks whether a package can be loaded and, if not, installs it from CRAN together with its dependencies. This guarantees that the notebook can be executed on a fresh R installation without manual intervention.

A global seed (set.seed(1234)) is set to make every random operation—from synthetic data generation to cross‑validation folds—perfectly reproducible. This is an essential habit in scientific computing.

The H2O package is deliberately not loaded here: once attached, it masks or interferes with functions used earlier in the notebook (notably dim() and print()). We will load it only when the AutoML section begins.

# =============================================================================
if ("package:h2o" %in% search()) {
  detach("package:h2o", unload = TRUE, character.only = TRUE)
}
## [1] "A shutdown has been triggered. "
install_if_missing <- function(pkg) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
    library(pkg, character.only = TRUE)
  }
}

pkgs <- c("dplyr", "ggplot2", "plotly", "DT", "protr", 
          "caret", "MLmetrics", "pROC", "Rtsne", "Boruta", 
          "randomForest", "rpart", "rpart.plot", "tidyr",
          "keras", "PRROC", "patchwork", "moments", "gridExtra",
          "stringr", "GGally")
invisible(sapply(pkgs, install_if_missing))

set.seed(1234)
select <- dplyr::select
cat("Setup complete.\n")
## Setup complete.
# =============================================================================

1. Synthetic Data – The Foundations of Classification

1.1 What Is a Classifier?

A classifier is a function that maps an input vector \(\mathbf{x} \in \mathbb{R}^p\) to a discrete class label \(y \in \{1,2,\dots,K\}\). In binary classification, \(K=2\). The classifier learns from a training set of \(n\) examples \(\{(\mathbf{x}_i, y_i)\}_{i=1}^n\) and must generalise to unseen data.

The simplest geometric intuition is that each class occupies a region in feature space. The job of a classifier is to find decision boundaries that separate these regions as accurately as possible.
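
To make this concrete, here is a minimal, purely illustrative classifier, nearest centroid, which assigns \(\mathbf{x}\) to the class whose mean is closest, producing a linear decision boundary halfway between the two centroids (a sketch only, not part of the pipeline below):

nearest_centroid <- function(x, mu0, mu1) {
  # assign x to the class with the nearest mean (squared Euclidean distance)
  if (sum((x - mu0)^2) < sum((x - mu1)^2)) "Class 0" else "Class 1"
}
nearest_centroid(c(2.5, 2.1), mu0 = c(2, 2), mu1 = c(5, 5))   # "Class 0"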

1.2 Generating Gaussian Clusters

We create an artificial two‑class dataset where every feature is drawn from a normal distribution. The function rnorm(n, mean, sd) draws \(n\) independent samples from \(\mathcal{N}(\mu, \sigma^2)\). Here we generate \(x_1\) and \(x_2\) independently, so the resulting scatter plots show spherical (isotropic) clusters.

set.seed(42)
n <- 300
x1_0 <- rnorm(n/2, mean = 2, sd = 0.8)   # 150 values, mean 2, sd 0.8
x2_0 <- rnorm(n/2, mean = 2, sd = 0.8)
x1_1 <- rnorm(n/2, mean = 5, sd = 0.8)
x2_1 <- rnorm(n/2, mean = 5, sd = 0.8)

synth <- data.frame(
  x1 = c(x1_0, x1_1),
  x2 = c(x2_0, x2_1),
  Class = factor(rep(c("Class 0","Class 1"), each = n/2))
)

p_synth <- plot_ly(synth, x = ~x1, y = ~x2, color = ~Class,
                   type = 'scatter', mode = 'markers', marker = list(size=8)) %>%
  layout(title = "Synthetic Binary Classification Dataset",
         xaxis = list(title="Feature 1"), yaxis = list(title="Feature 2"))
p_synth
# =============================================================================

1.3 Effect of Class Separation

The Mahalanobis distance between two normal distributions with equal covariance \(\Sigma\) is \(D = \sqrt{(\mu_1-\mu_0)^\top \Sigma^{-1}(\mu_1-\mu_0)}\). Intuitively, the larger this distance, the easier the classification. We shift the class‑1 mean by \(\Delta\) in each coordinate (so the centres are \(\Delta\sqrt{2}\) apart) for \(\Delta\) from 1 to 4, keeping \(\Sigma = \mathbf{I}\) (identity). The patchwork package is used to arrange multiple ggplot2 figures side‑by‑side.
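
As a quick sanity check, we can compute \(D\) for the four settings used below; with \(\Sigma = \mathbf{I}\) it reduces to the Euclidean distance \(\Delta\sqrt{2}\):

mahal_D <- function(mu0, mu1, Sigma) {
  d <- mu1 - mu0
  sqrt(t(d) %*% solve(Sigma) %*% d)   # Mahalanobis distance between the centres
}
sapply(c(1, 2, 3, 4), function(d) mahal_D(c(2, 2), c(2 + d, 2 + d), diag(2)))
# 1.414 2.828 4.243 5.657  (= Δ·√2)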

library(patchwork)
delta_vals <- c(1, 2, 3, 4)
plot_list <- list()
for (d in delta_vals) {
  tmp <- data.frame(
    x1 = c(rnorm(150, 2, 1), rnorm(150, 2 + d, 1)),  # shift class 1 by d
    x2 = c(rnorm(150, 2, 1), rnorm(150, 2 + d, 1)),
    Class = factor(rep(c("Class 0","Class 1"), each=150))
  )
  plot_list[[as.character(d)]] <- ggplot(tmp, aes(x1, x2, color=Class)) +
    geom_point(alpha=0.5) + labs(title=paste("Δ =", d)) + theme_minimal()
}
wrap_plots(plot_list, ncol=2) +
  plot_annotation(title="Effect of Mean Separation on Class Separability")

# =============================================================================

1.4 Effect of Variance (Overlap)

When the class centres are fixed but the variance increases, the Bhattacharyya bound—an upper bound on the Bayes error—rises. Even the optimal classifier will misclassify some points. This illustrates the concept of irreducible error.
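
In this equal‑covariance, equal‑prior setting the Bayes error has a closed form, \(\Phi(-D/2)\) with \(D\) the Mahalanobis distance, so we can tabulate the irreducible error for the four standard deviations used below (centres (2,2) and (5,5), hence \(D = \sqrt{18}/s\)):

bayes_error <- function(s) pnorm(-sqrt(18) / s / 2)   # Phi(-D/2)
round(sapply(c(0.5, 1.0, 1.5, 2.0), bayes_error), 4)
# 0.0000 0.0169 0.0786 0.1444  (overlap grows rapidly with s)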

sd_vals <- c(0.5, 1.0, 1.5, 2.0)
sd_plots <- list()
for (s in sd_vals) {
  tmp <- data.frame(
    x1 = c(rnorm(150, 2, s), rnorm(150, 5, s)),
    x2 = c(rnorm(150, 2, s), rnorm(150, 5, s)),
    Class = factor(rep(c("Class 0","Class 1"), each=150))
  )
  sd_plots[[as.character(s)]] <- ggplot(tmp, aes(x1, x2, color=Class)) +
    geom_point(alpha=0.5) + labs(title=paste("SD =", s)) + theme_minimal()
}
wrap_plots(sd_plots, ncol=2) +
  plot_annotation(title="Effect of Variance on Class Overlap")

# =============================================================================

1.5 The Bayes Decision Boundary

When both classes are normal with equal covariance, the Bayes rule (which minimises the probability of error) is a linear discriminant function. The boundary is given by \[ \mathbf{w}^\top (\mathbf{x} - \mathbf{x}_0) = 0, \qquad \mathbf{w} = \Sigma^{-1}(\mu_1 - \mu_0),\; \mathbf{x}_0 = \frac{\mu_0+\mu_1}{2}. \] We compute the slope and intercept of this line and overlay it on the original data.

mu0 <- c(2,2); mu1 <- c(5,5)
Sigma <- diag(c(0.8^2, 0.8^2))            # covariance matrix
w <- solve(Sigma) %*% (mu1 - mu0)        # solve computes inverse
x0 <- (mu0 + mu1) / 2
slope <- -w[1]/w[2]
intercept <- x0[2] - slope * x0[1]

ggplot(synth, aes(x1, x2, color=Class)) +
  geom_point(alpha=0.5) +
  geom_abline(slope=slope, intercept=intercept,
              linetype="dashed", linewidth=1.2, color="black") +
  labs(title="Bayes Decision Boundary (Equal Covariance)") + theme_minimal()

# =============================================================================

1.6 Non‑linear Boundaries

When covariances differ, the optimal boundary becomes quadratic. We illustrate this by generating one class stretched along the \(x_1\) direction and the other along \(x_2\). The MASS::mvrnorm() function draws from a multivariate normal \(\mathcal{N}(\mu,\Sigma)\).

library(MASS)
Sigma0 <- matrix(c(3,0,0,1), ncol=2)      # variance 3 in x1, 1 in x2
Sigma1 <- matrix(c(1,0,0,3), ncol=2)      # variance 1 in x1, 3 in x2
x0n <- mvrnorm(150, mu=c(2,2), Sigma=Sigma0)
x1n <- mvrnorm(150, mu=c(5,5), Sigma=Sigma1)

df_nl <- data.frame(x1=c(x0n[,1],x1n[,1]), x2=c(x0n[,2],x1n[,2]),
                    Class=factor(rep(c("Class 0","Class 1"), each=150)))
ggplot(df_nl, aes(x1, x2, color=Class)) + geom_point(alpha=0.5) +
  stat_ellipse(level=0.95) + labs(title="Different Covariance – Non‑linear Boundary") +
  theme_minimal()

# =============================================================================

1.7 Outliers and Robustness

Outliers can severely degrade models that assume Gaussianity (e.g., linear discriminant analysis). We add 5% uniform noise to a clean dataset and observe how it pulls decision boundaries.

n_half <- 142                              # clean points per class (floor of 285/2)
x1c0 <- rnorm(n_half,2,0.8); x2c0 <- rnorm(n_half,2,0.8)
x1c1 <- rnorm(n_half,5,0.8); x2c1 <- rnorm(n_half,5,0.8)
n_out <- 15                                # 15 of 299 points, i.e. ~5% outliers
x1_out <- runif(n_out,-2,9); x2_out <- runif(n_out,-2,9)   # uniform noise
class_out <- sample(c("Class 0","Class 1"), n_out, replace=TRUE)

df_out <- data.frame(x1=c(x1c0,x1c1,x1_out), x2=c(x2c0,x2c1,x2_out),
                     Class=factor(c(rep("Class 0",n_half), rep("Class 1",n_half),
                                    class_out)))
ggplot(df_out, aes(x1,x2,color=Class)) + geom_point(alpha=0.5) +
  labs(title="Data with 5% Outliers") + theme_minimal()

# =============================================================================

2. Real Biological Data – Protein Sequences

2.1 Loading and Filtering Sequences

We load FASTA files provided by the protr package. FASTA format is the standard for representing biological sequences: a header line (starting with >) followed by the sequence. The function readFASTA() returns a named list; protcheck() filters out sequences with non‑standard amino acids (e.g., X, U).

# =============================================================================
extracell <- readFASTA(system.file("protseq/extracell.fasta", package="protr"))
mitonchon <- readFASTA(system.file("protseq/mitochondrion.fasta", package="protr"))
extracell <- extracell[sapply(extracell, protcheck)]
mitonchon <- mitonchon[sapply(mitonchon, protcheck)]
infoDF <- data.frame(
  ID = c(names(extracell), names(mitonchon)),
  Length = c(nchar(unlist(extracell)), nchar(unlist(mitonchon))),
  Class = c(rep("Extracellular", length(extracell)),
            rep("Mitochondrial", length(mitonchon)))
)
ggplot(infoDF, aes(Length, fill=Class)) +
  geom_histogram(alpha=0.6, position="identity", bins=30) +
  labs(title="Sequence Length Distribution") + theme_minimal()

ggplot(infoDF, aes(Class, Length, fill=Class)) +
  geom_boxplot(alpha=0.7) + labs(title="Length by Class") + theme_minimal()

# =============================================================================

2.2 Amino Acid Composition

The simplest global descriptor is the frequency of the 20 standard amino acids. Our helper function concatenates all sequences and uses stringr::str_count to tally each letter.

# =============================================================================
comp_aa <- function(seq_list) {
  aa <- c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
  all_seq <- paste(unlist(seq_list), collapse="")
  freqs <- sapply(aa, function(a) stringr::str_count(all_seq, a) / nchar(all_seq))
  data.frame(AminoAcid=aa, Frequency=freqs)
}
freqE <- comp_aa(extracell); freqM <- comp_aa(mitonchon)
freqE$Class <- "Extracellular"; freqM$Class <- "Mitochondrial"
freq_all <- rbind(freqE, freqM)

ggplot(freq_all, aes(AminoAcid, Frequency, fill=Class)) +
  geom_col(position="dodge") + labs(title="Amino Acid Frequencies by Class") + theme_minimal()

plot_ly(freq_all, x=~AminoAcid, y=~Frequency, color=~Class, type="bar") %>%
  layout(title="Interactive AA Frequencies")
# =============================================================================

3. Feature Engineering – From Sequences to Numerical Vectors

We compute three descriptor types:

  • AAC (20 dim): amino acid frequencies.
  • DC (400 dim): dipeptide frequencies.
  • APAAC (80 dim): adds correlation factors that capture sequence‑order effects. For two physicochemical properties (hydrophobicity and hydrophilicity) and a maximum lag \(\lambda=30\), we obtain \(2\times30 = 60\) extra features, plus the 20 AAC values.

The mathematical definition of the \(k\)‑th correlation factor for a property \(P\) is \[ \tau_k^{(P)} = \frac{1}{L-k}\sum_{i=1}^{L-k} \Theta(P_i, P_{i+k}), \] where \(\Theta\) is a coupling function (typically squared difference or product). These \(\tau_k\) are then normalised.
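
A small illustrative sketch of \(\tau_k\) with the squared‑difference coupling (P_demo is random stand‑in data for a property profile, not a real protr descriptor):

tau_k <- function(P, k) {
  L <- length(P)
  mean((P[1:(L - k)] - P[(1 + k):L])^2)   # (1/(L-k)) * sum_i Theta(P_i, P_{i+k})
}
P_demo <- rnorm(50)                        # stand‑in for a 50‑residue property profile
round(sapply(1:5, function(k) tau_k(P_demo, k)), 3)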

# =============================================================================
compute_desc <- function(seq_list, desc_type="AAC") {
  if (desc_type=="AAC")   return(t(sapply(seq_list, extractAAC)))
  if (desc_type=="DC")    return(t(sapply(seq_list, extractDC)))
  if (desc_type=="APAAC") return(t(sapply(seq_list, extractAPAAC)))
  stop("Unknown descriptor type")
}

desc_types <- c("AAC","DC","APAAC")
feature_list <- list()
for (dt in desc_types) {
  x1 <- compute_desc(extracell, dt)
  x2 <- compute_desc(mitonchon, dt)
  Features_temp <- rbind(x1, x2)
  Class_temp <- factor(c(rep("Extracellular", nrow(x1)), rep("Mitochondrial", nrow(x2))))
  df_temp <- data.frame(Class=Class_temp, Features_temp, stringsAsFactors=FALSE)
  df_temp[,-1] <- lapply(df_temp[,-1], as.numeric)
  feature_list[[dt]] <- df_temp
}
pca_plots <- list()
for (dt in names(feature_list)) {
  df <- feature_list[[dt]]
  nzv <- nearZeroVar(df[,-1], saveMetrics=FALSE)
  if (length(nzv)>0) df <- df[,-c(nzv+1)]
  pca_res <- prcomp(df[,-1], scale.=TRUE)
  scores <- data.frame(pca_res$x[,1:2], Class=df$Class)
  var1 <- round(summary(pca_res)$importance[2,1]*100,1)
  var2 <- round(summary(pca_res)$importance[2,2]*100,1)
  p <- ggplot(scores, aes(PC1,PC2,color=Class)) + geom_point(alpha=0.7) +
    labs(title=paste0(dt," (PC1:",var1,"%, PC2:",var2,"%)")) + theme_minimal()
  pca_plots[[dt]] <- p
}
wrap_plots(pca_plots, ncol=2) + plot_annotation(title="Comparison of Feature Spaces via PCA")

set.seed(1234)
tsne_apaac <- Rtsne(as.matrix(scale(feature_list[["APAAC"]][,-1])), perplexity=30, check_duplicates=FALSE)
tsne_df <- data.frame(x=tsne_apaac$Y[,1], y=tsne_apaac$Y[,2], Class=feature_list[["APAAC"]]$Class)
plot_ly(tsne_df, x=~x, y=~y, color=~Class, type='scatter', mode='markers') %>%
  layout(title="t‑SNE of APAAC Features")
df_aac <- feature_list[["AAC"]]; df_apaac <- feature_list[["APAAC"]]
df_comb <- cbind(df_aac[,-1], df_apaac[,-1]); df_comb$Class <- df_aac$Class
pca_comb <- prcomp(df_comb[,-ncol(df_comb)], scale.=TRUE)
scores_comb <- data.frame(pca_comb$x[,1:2], Class=df_comb$Class)
ggplot(scores_comb, aes(PC1,PC2,color=Class)) + geom_point(alpha=0.7) +
  labs(title="PCA of Combined (AAC+APAAC)") + theme_minimal()

aac_only <- df_aac[,-1]
hydrophobic <- c("A","V","L","I","M","F","W","P")
hydrophilic <- c("R","K","D","E","Q","N","H","S","T","Y","C","G")
hb_cols <- intersect(hydrophobic, names(aac_only)); hl_cols <- intersect(hydrophilic, names(aac_only))
aac_only$Hphob_Sum <- rowSums(aac_only[,hb_cols, drop=FALSE])
aac_only$Hphil_Sum <- rowSums(aac_only[,hl_cols, drop=FALSE])
aac_only$Hphob_Hphil_Ratio <- aac_only$Hphob_Sum / (aac_only$Hphil_Sum + 1e-6)
df_gen <- cbind(Class=df_aac$Class, aac_only)

df_gen %>% dplyr::select(Class, Hphob_Sum, Hphil_Sum, Hphob_Hphil_Ratio) %>%
  pivot_longer(-Class) %>% ggplot(aes(Class, value, fill=Class)) + geom_boxplot() +
  facet_wrap(~name, scales="free") + labs(title="Generated Features from AAC") + theme_minimal()

rf_imp <- randomForest(Class~., data=feature_list[["APAAC"]], importance=TRUE, ntree=100)
imp_df <- data.frame(Importance=importance(rf_imp)[,"MeanDecreaseAccuracy"],
                     Feature=rownames(importance(rf_imp)))
imp_df <- imp_df[order(-imp_df$Importance),]
ggplot(imp_df[1:10,], aes(reorder(Feature, Importance), Importance)) +
  geom_col(fill="steelblue") + coord_flip() + labs(title="Top 10 Features (APAAC)") + theme_minimal()

Features_df <- feature_list[["APAAC"]]
cat("Selected feature set: APAAC, dimensions:", dim(Features_df), "\n")
## Selected feature set: APAAC, dimensions: 627 81
# =============================================================================

4. Exploratory Data Analysis

We examine selected APAAC features using density plots, boxplots, violin plots, statistical tests, Q‑Q plots, skewness analysis, and outlier detection. This step is crucial for understanding the data before modelling: it can reveal skewness, outliers, and class overlap that influence model choice and preprocessing.

# =============================================================================
df_eda <- Features_df
sel_feat <- c("Pc1.A","Pc1.C","Pc2.Hydrophobicity.1","Pc2.Hydrophilicity.1")

# Density
df_eda %>% dplyr::select(Class, all_of(sel_feat)) %>% pivot_longer(-Class) %>%
  ggplot(aes(value, fill=Class)) + geom_density(alpha=0.6) + facet_wrap(~name, scales="free") +
  labs(title="Density Plots") + theme_minimal()

# Boxplots
df_eda %>% dplyr::select(Class, all_of(sel_feat)) %>% pivot_longer(-Class) %>%
  ggplot(aes(Class, value, fill=Class)) + geom_boxplot(alpha=0.7) + facet_wrap(~name, scales="free") +
  labs(title="Boxplots") + theme_minimal()

# Violin
df_eda %>% dplyr::select(Class, all_of(sel_feat)) %>% pivot_longer(-Class) %>%
  ggplot(aes(Class, value, fill=Class)) + geom_violin(draw_quantiles=c(0.25,0.5,0.75)) +
  facet_wrap(~name, scales="free") + labs(title="Violin Plots") + theme_minimal()

class0 <- df_eda$Class=="Extracellular"; class1 <- df_eda$Class=="Mitochondrial"
test_feat <- function(f) {
  x0 <- df_eda[class0,f]; x1 <- df_eda[class1,f]
  data.frame(Feature=f, t_pvalue=t.test(x0,x1)$p.value,
             wilcox_pvalue=wilcox.test(x0,x1)$p.value,
             ks_pvalue=ks.test(x0,x1)$p.value, mean_diff=mean(x1)-mean(x0))
}
print(do.call(rbind, lapply(sel_feat, test_feat)))
##                Feature     t_pvalue wilcox_pvalue    ks_pvalue    mean_diff
## 1                Pc1.A 2.425727e-01  1.426917e-05 2.125439e-06  2.716267239
## 2                Pc1.C 1.053787e-05  1.744219e-06 3.155691e-05 -7.398227475
## 3 Pc2.Hydrophobicity.1 1.358591e-04  1.124960e-04 1.946253e-03 -0.001364252
## 4 Pc2.Hydrophilicity.1 5.771042e-04  9.029007e-05 4.710508e-04 -0.001078379
ggplot(df_eda, aes(sample=Pc1.A)) + stat_qq() + stat_qq_line() + facet_wrap(~Class) +
  labs(title="Q‑Q Plot of Pc1.A") + theme_minimal()

skew_df <- data.frame(Feature=names(df_eda)[-1], Skewness=sapply(df_eda[,-1], moments::skewness))
skew_df <- skew_df[order(-abs(skew_df$Skewness)),]
ggplot(skew_df[1:10,], aes(reorder(Feature, abs(Skewness)), Skewness)) +
  geom_col(fill="steelblue") + coord_flip() + labs(title="Top 10 Skewed Features") + theme_minimal()

cnt_outlier <- function(x) {
  iqr <- IQR(x); q1 <- quantile(x,0.25); q3 <- quantile(x,0.75)
  sum(x < (q1-1.5*iqr) | x > (q3+1.5*iqr), na.rm=TRUE)
}
out_df <- data.frame(Feature=names(df_eda)[-1], Outliers=sapply(df_eda[,-1], cnt_outlier))
out_df <- out_df[order(-out_df$Outliers),]
ggplot(out_df, aes(Outliers)) + geom_histogram(binwidth=1, fill="steelblue") +
  labs(title="Outlier Count Distribution") + theme_minimal()

# =============================================================================

5. Data Preprocessing – Standardisation

Many algorithms—especially k‑nearest neighbours, support vector machines, and neural networks—are sensitive to the scale of the features. If one feature spans \([0,1000]\) and another spans \([0,1]\), the first will dominate Euclidean distances and gradient calculations. Tree‑based methods (decision trees, Random Forest) are scale invariant because each split depends only on the ordering of a single feature's values.

We demonstrate three popular scaling strategies:

  • Standardisation (Z‑score): \(x' = \frac{x - \bar{x}}{s}\)
  • Min‑Max scaling: \(x' = \frac{x - \min}{\max - \min}\)
  • Robust scaling: \(x' = \frac{x - \text{median}}{\text{IQR}}\)

Furthermore, we show how scaling affects the convergence speed of gradient descent using a simple linear regression example.

# =============================================================================
library(caret)
ranges <- sapply(Features_df[,-1], function(x) diff(range(x)))
sf <- names(which.min(ranges)); lf <- names(which.max(ranges))
demo <- Features_df[,c(sf,lf,"Class")]

p_orig <- ggplot(demo, aes(x=.data[[sf]], y=.data[[lf]], color=Class)) +
  geom_point(alpha=0.6) + labs(title="Original (different scales)") + theme_minimal()

preProc_std <- preProcess(demo[,1:2], method=c("center","scale"))
demo_std <- predict(preProc_std, demo[,1:2]); demo_std$Class <- demo$Class

preProc_norm <- preProcess(demo[,1:2], method="range", rangeBounds=c(0,1))
demo_norm <- predict(preProc_norm, demo[,1:2]); demo_norm$Class <- demo$Class

robust_scale <- function(x) (x-median(x))/IQR(x)
demo_rob <- as.data.frame(lapply(demo[,1:2], robust_scale)); demo_rob$Class <- demo$Class
names(demo_rob)[1:2] <- paste0("Rob_", names(demo_rob)[1:2])

p_std <- ggplot(demo_std, aes(x=.data[[sf]], y=.data[[lf]], color=Class)) +
  geom_point(alpha=0.6) + ggtitle("Z‑score") + theme_minimal()
p_norm <- ggplot(demo_norm, aes(x=.data[[sf]], y=.data[[lf]], color=Class)) +
  geom_point(alpha=0.6) + ggtitle("Min‑Max") + theme_minimal()
p_rob <- ggplot(demo_rob, aes(x=.data[[names(demo_rob)[1]]], y=.data[[names(demo_rob)[2]]], color=Class)) +
  geom_point(alpha=0.6) + ggtitle("Robust") + theme_minimal()

(p_std | p_norm | p_rob) + plot_annotation(title="Scaling Methods Comparison")

cat("Original distance:", as.numeric(dist(demo[1:2,1:2])),
    "\nStd distance:", as.numeric(dist(demo_std[1:2,1:2])), "\n")
## Original distance: 44.70673 
## Std distance: 1.481295
set.seed(123)
x <- demo[,lf]; y <- 2*x + rnorm(length(x),0,10); xs <- scale(x)
gd <- function(x, y, lr, n=100) {
  w <- 0; b <- 0; loss <- numeric(n)
  for (i in 1:n) { yp <- w*x + b; e <- yp-y; w <- w - lr*mean(e*x); b <- b - lr*mean(e); loss[i] <- mean(e^2) }
  loss
}
loss_unsc <- gd(x, y, 1e-5); loss_sc <- gd(xs, y, 0.1)
loss_df <- data.frame(Epoch=rep(1:100,2), Loss=c(loss_unsc,loss_sc), Data=rep(c("Unscaled","Scaled"), each=100))
ggplot(loss_df, aes(Epoch, Loss, color=Data)) + geom_line(linewidth=1) +
  labs(title="Gradient Descent Convergence") + theme_minimal()

# =============================================================================

6. Dimensionality Reduction – PCA

Principal Component Analysis (PCA) projects the data onto orthogonal directions that maximise variance. The first principal component is the direction of greatest variance; the second is orthogonal to the first and captures the next greatest variance, etc. PCA is widely used for visualisation, noise reduction (keeping only the top \(k\) PCs), and feature extraction (creating uncorrelated features).

# =============================================================================
pca_res <- prcomp(Features_df[,-1], scale.=TRUE)
var_exp <- pca_res$sdev^2 / sum(pca_res$sdev^2)

scree_df <- data.frame(PC=1:length(var_exp), Variance=var_exp, Cumulative=cumsum(var_exp))
ggplot(scree_df[1:30,], aes(PC)) + geom_col(aes(y=Variance), fill="steelblue") +
  geom_line(aes(y=Cumulative, group=1), color="red", linewidth=1.2) +
  scale_y_continuous(labels=scales::percent) + labs(title="Scree Plot") + theme_minimal()

loadings <- pca_res$rotation[,1:2]
top_feat <- names(sort(sqrt(loadings[,1]^2+loadings[,2]^2), decreasing=TRUE))[1:8]
scores <- data.frame(pca_res$x[,1:2], Class=Features_df$Class)
load_scale <- loadings[top_feat,] * 5
arrow_df <- data.frame(xend=load_scale[,1], yend=load_scale[,2])
lab_df <- data.frame(x=load_scale[,1]*1.1, y=load_scale[,2]*1.1, label=top_feat)
ggplot(scores, aes(PC1,PC2,color=Class)) + geom_point(alpha=0.5) +
  geom_segment(data=arrow_df, aes(x=0,y=0,xend=xend,yend=yend), arrow=arrow(), inherit.aes=FALSE) +
  geom_text(data=lab_df, aes(x=x,y=y,label=label), inherit.aes=FALSE) +
  labs(title="PCA Biplot") + theme_minimal()

# =============================================================================

7. t‑SNE – Non‑linear Visualisation

t‑SNE (t‑distributed Stochastic Neighbor Embedding) preserves local neighbourhood structure: points that are close in the original high‑dimensional space will also be close in the 2D map. The key hyperparameter, perplexity, roughly controls the number of effective neighbours. We illustrate its influence by plotting embeddings for perplexity = 5, 15, 30, 50, 80.

# =============================================================================
X_scaled <- scale(Features_df[,-1])
set.seed(1234)
tsne_out <- Rtsne(X_scaled, perplexity=30, check_duplicates=FALSE)
tsne_plot <- data.frame(x=tsne_out$Y[,1], y=tsne_out$Y[,2], Class=Features_df$Class)
ggplot(tsne_plot, aes(x,y,color=Class)) + geom_point(alpha=0.7) + theme_minimal() +
  labs(title="t‑SNE (perplexity=30)")

perp_plots <- lapply(c(5,15,30,50,80), function(p) {
  tmp <- Rtsne(X_scaled, perplexity=p, check_duplicates=FALSE)
  ggplot(data.frame(x=tmp$Y[,1],y=tmp$Y[,2],Class=Features_df$Class), aes(x,y,color=Class)) +
    geom_point(alpha=0.5) + theme_minimal() + labs(title=paste("perplexity=",p))
})
wrap_plots(perp_plots, ncol=3) + plot_annotation(title="Effect of Perplexity on t‑SNE")

# =============================================================================

8. Data Splitting – Stratified Train/Test

To obtain an honest estimate of generalisation performance, we must evaluate models on data that was not used during training. A simple 80/20 stratified split ensures that the class proportions are maintained in both sets. The function caret::createDataPartition performs this split.

# =============================================================================
set.seed(1234)
train_idx <- createDataPartition(Features_df$Class, p=0.8, list=FALSE)
dataTrain <- Features_df[train_idx, ]; dataTest <- Features_df[-train_idx, ]

prop_df <- data.frame(Set=rep(c("Train","Test"), each=2),
                      Class=rep(levels(dataTrain$Class),2),
                      Prop=c(as.numeric(prop.table(table(dataTrain$Class))),
                             as.numeric(prop.table(table(dataTest$Class)))))
ggplot(prop_df, aes(Set, Prop, fill=Class)) + geom_col(position="dodge") +
  labs(title="Class Proportions After Split") + theme_minimal()

# =============================================================================

9. Feature Selection – Correlation and Boruta

9.1 Correlation Filter

Features that are strongly correlated (\(|r| > 0.9\)) provide redundant information. We remove one of each such pair using caret::findCorrelation().

9.2 Boruta Algorithm

Boruta is a wrapper method built on Random Forest. It creates shadow features by randomly permuting the original features, then compares each real feature’s importance against the maximum importance of the shadows. After many iterations, a feature is classified as Confirmed (consistently beats shadows), Tentative, or Rejected. This method captures nonlinear relationships and interactions, unlike simple correlation filters.

# =============================================================================
cor_mat <- cor(dataTrain[,-1])
high_cor <- findCorrelation(cor_mat, cutoff=0.9)   # indices refer to the feature columns only
# shift by +1 to skip the Class column when dropping from the full frame
dataTrain_filt <- if (length(high_cor) > 0) dataTrain[, -(high_cor + 1)] else dataTrain

set.seed(1234)
boruta_res <- Boruta(Class~., data=dataTrain_filt, doTrace=1)
boruta_fixed <- TentativeRoughFix(boruta_res)
plot(boruta_fixed, cex.axis=0.7, las=2, main="Boruta Feature Importance")

selected_features <- getSelectedAttributes(boruta_fixed, withTentative=FALSE)
SelectedTrain <- dataTrain_filt[, c("Class", selected_features)]

# =============================================================================

10. Decision Tree – An Interpretable Model

A decision tree recursively splits the data by asking yes/no questions about the features (e.g., “Is Pc1.A > 5.3?”). At each split, the algorithm chooses the feature and threshold that maximise the information gain (or, equivalently, minimise the impurity). Common impurity measures are Gini impurity \(G = 1 - \sum p_c^2\) and entropy \(H = -\sum p_c \log_2 p_c\).

We limit the tree depth to 3 to keep it interpretable. The visualisation is created with rpart.plot().
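
For concreteness, both impurity measures can be computed directly from a node's class proportions:

gini    <- function(p) 1 - sum(p^2)
entropy <- function(p) -sum(ifelse(p > 0, p * log2(p), 0))   # define 0*log(0) = 0
gini(c(0.5, 0.5)); entropy(c(0.5, 0.5))   # maximal impurity: 0.5 and 1 bit
gini(c(0.9, 0.1)); entropy(c(0.9, 0.1))   # nearly pure node: 0.18 and ~0.47 bits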

# =============================================================================
tree_model <- rpart(Class~., data=SelectedTrain, method="class",
                    control=rpart.control(maxdepth=3, minsplit=10))
rpart.plot(tree_model, extra=104, box.palette="RdBu", shadow.col="gray",
           main="Decision Tree (max depth = 3)")

# =============================================================================

11. Random Forest – Bagging with Feature Randomness

11.1 Why Ensemble?

A single decision tree often suffers from high variance: small changes in the training data can lead to a completely different tree. Bagging (Bootstrap Aggregating) builds many trees on bootstrap samples and averages (or votes) their predictions, dramatically reducing variance.
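
A minimal hand‑rolled bagging sketch makes the idea tangible (illustrative only; it reuses SelectedTrain, dataTest, and rpart from the previous sections, while the real Random Forest follows below):

set.seed(1234)
B <- 25
votes <- sapply(1:B, function(b) {
  idx  <- sample(nrow(SelectedTrain), replace = TRUE)            # bootstrap sample
  tree <- rpart(Class ~ ., data = SelectedTrain[idx, ], method = "class")
  as.character(predict(tree, dataTest, type = "class"))
})
bag_pred <- factor(apply(votes, 1, function(v) names(which.max(table(v)))),
                   levels = levels(dataTest$Class))               # majority vote
mean(bag_pred == dataTest$Class)   # bagged accuracy, typically above a single tree's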

11.2 Random Forest

Random Forest adds a second layer of randomness: at each split, only \(m\) randomly chosen features are considered (where \(m \approx \sqrt{p}\) for classification). This decorrelates the trees and further reduces variance.

The algorithm provides a built‑in estimate of generalisation error via Out‑of‑Bag (OOB) samples. It also produces two importance measures:

  • Mean Decrease in Accuracy (permutation importance)
  • Mean Decrease in Gini (total decrease in node impurity)

# =============================================================================
set.seed(1234)
rf_default <- randomForest(Class~., data=SelectedTrain, importance=TRUE, ntree=500)
varImpPlot(rf_default, main="Random Forest Feature Importance")

imp_df <- data.frame(Importance=importance(rf_default)[,"MeanDecreaseAccuracy"],
                     Feature=rownames(importance(rf_default)))
imp_df <- imp_df[order(-imp_df$Importance),]
ggplot(imp_df[1:15,], aes(reorder(Feature, Importance), Importance)) +
  geom_col(fill="steelblue") + coord_flip() + labs(title="Top 15 Important Features") + theme_minimal()
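
The OOB estimate mentioned above can be read directly off the fitted forest: row i of err.rate is the OOB error after i trees.

oob_df <- data.frame(Trees = 1:nrow(rf_default$err.rate),
                     OOB   = rf_default$err.rate[, "OOB"])
ggplot(oob_df, aes(Trees, OOB)) + geom_line(color = "steelblue") +
  labs(title = "OOB Error vs. Number of Trees") + theme_minimal()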

# =============================================================================

12. Hyperparameter Tuning – The Bias‑Variance Tradeoff

12.1 What Are Hyperparameters?

Parameters are learned from the data (e.g., split thresholds in a tree). Hyperparameters are set before training and control the learning process. For Random Forest, the most critical hyperparameter is mtry—the number of features randomly sampled at each split.

12.2 The Bias‑Variance Tradeoff

\[ \text{Expected Error} = \text{Bias}^2 + \text{Variance} + \text{Irreducible Error} \]

  • Bias is error from overly simplistic assumptions (underfitting).

  • Variance is error from sensitivity to small fluctuations in the training set (overfitting).

  • A small mtry makes trees more diverse (low variance) but may miss important features (higher bias).

  • A large mtry makes trees more similar (higher variance) but each tree is stronger (lower bias).

We use 5‑fold cross‑validation to select the mtry that maximises accuracy. The caret::train() function automates this search.

# =============================================================================
ctrl_cv <- trainControl(method="repeatedcv", number=5, repeats=1,
                        classProbs=TRUE, savePredictions=TRUE)
tune_grid <- expand.grid(mtry=seq(2, min(15, ncol(SelectedTrain)-1)))
set.seed(1234)
rf_tuned <- caret::train(Class~., data=SelectedTrain, method="rf",
                         trControl=ctrl_cv, tuneGrid=tune_grid, metric="Accuracy")
ggplot(rf_tuned) + geom_point() + geom_line() + labs(title="Tuning mtry", x="mtry", y="CV Accuracy") + theme_bw()

best_mtry <- rf_tuned$bestTune$mtry
cat("Optimal mtry:", best_mtry, "\n")
## Optimal mtry: 6
# =============================================================================

13. Final Random Forest and Test Evaluation

We retrain the Random Forest with the optimal mtry and evaluate it on the held‑out test set. The confusion matrix summarises the predictions:

                      Pred. Positive   Pred. Negative
  Actual Positive           TP               FN
  Actual Negative           FP               TN

From this we compute:

  • Accuracy = \((TP+TN)/\text{Total}\)
  • Sensitivity (Recall) = \(TP/(TP+FN)\)
  • Specificity = \(TN/(TN+FP)\)
  • Precision = \(TP/(TP+FP)\)
  • F1 = \(2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision}+\text{Recall}}\)
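
As a worked example, plugging in the counts from the confusion matrix printed further below (TP = 35, FN = 25, FP = 12, TN = 52, positive class "Mitochondrial"):

TP <- 35; FN <- 25; FP <- 12; TN <- 52
accuracy    <- (TP + TN) / (TP + TN + FP + FN)                     # 0.7016
sensitivity <- TP / (TP + FN)                                      # 0.5833
specificity <- TN / (TN + FP)                                      # 0.8125
precision   <- TP / (TP + FP)                                      # 0.7447
f1          <- 2 * precision * sensitivity / (precision + sensitivity)
round(c(accuracy, sensitivity, specificity, precision, f1), 4)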

The ROC curve plots True Positive Rate (Sensitivity) against False Positive Rate (\(1 - \text{Specificity}\)) across all thresholds. The AUC (Area Under the Curve) summarises the overall discriminative ability: 1 = perfect, 0.5 = random. The Precision‑Recall curve is more informative when classes are imbalanced.

# =============================================================================
final_rf <- randomForest(Class~., data=SelectedTrain, mtry=best_mtry, importance=TRUE, ntree=500)
test_prob <- predict(final_rf, newdata=dataTest, type="prob")
test_class <- predict(final_rf, newdata=dataTest)
cm <- confusionMatrix(test_class, dataTest$Class, positive="Mitochondrial")
print(cm)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Extracellular Mitochondrial
##   Extracellular            52            25
##   Mitochondrial            12            35
##                                           
##                Accuracy : 0.7016          
##                  95% CI : (0.6129, 0.7804)
##     No Information Rate : 0.5161          
##     P-Value [Acc > NIR] : 2.037e-05       
##                                           
##                   Kappa : 0.3985          
##                                           
##  Mcnemar's Test P-Value : 0.04852         
##                                           
##             Sensitivity : 0.5833          
##             Specificity : 0.8125          
##          Pos Pred Value : 0.7447          
##          Neg Pred Value : 0.6753          
##              Prevalence : 0.4839          
##          Detection Rate : 0.2823          
##    Detection Prevalence : 0.3790          
##       Balanced Accuracy : 0.6979          
##                                           
##        'Positive' Class : Mitochondrial   
## 
roc_rf <- roc(dataTest$Class, test_prob[,"Mitochondrial"], levels=c("Extracellular","Mitochondrial"))
plot(roc_rf, print.auc=TRUE, main="ROC – Random Forest")

# =============================================================================

14. Deep Learning – Neural Networks

14.1 The Perceptron and Activation Functions

The fundamental building block of a neural network is the perceptron: \[ z = \sum w_i x_i + b,\qquad a = \sigma(z), \] where \(\sigma\) is a non‑linear activation function. Without non‑linearity, stacking layers would collapse to a single linear transformation. Common activations:

  • ReLU: \(\max(0, z)\) — fast, avoids the vanishing gradient for \(z>0\).
  • Sigmoid: \(1/(1+e^{-z})\) — used in the output layer for binary classification.
  • Softmax: \(e^{z_i}/\sum_j e^{z_j}\) — for multi‑class probabilities.
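
Written out directly in R (the max‑shift in softmax is a standard numerical‑stability trick):

relu    <- function(z) pmax(0, z)
sigmoid <- function(z) 1 / (1 + exp(-z))
softmax <- function(z) { e <- exp(z - max(z)); e / sum(e) }   # stable softmax
z <- c(-2, 0, 3)
relu(z)                  # 0 0 3
round(sigmoid(z), 3)     # 0.119 0.500 0.953
round(softmax(z), 3)     # 0.006 0.047 0.946  (sums to 1)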

14.2 Training: Backpropagation and Gradient Descent

Training proceeds by minimising a loss function (here cross‑entropy) using stochastic gradient descent. The gradient of the loss with respect to every weight is computed efficiently by backpropagation, which applies the chain rule from the output layer backwards.
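
A minimal autograd sketch with torch (set up properly in 14.4): for a single weight and one example, backward() applies the chain rule and fills w$grad.

library(torch)
w <- torch_tensor(0.5, requires_grad = TRUE)
x <- torch_tensor(2.0); y <- torch_tensor(2.0)
loss <- (w * x - y)^2    # squared‑error loss of a linear unit
loss$backward()          # backpropagation
w$grad                   # dLoss/dw = 2*(w*x - y)*x = -4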

14.3 Regularisation: Dropout and Early Stopping

  • Dropout: During training, a randomly chosen fraction of neurons is temporarily “dropped out” (set to zero). This forces the network to learn redundant representations and reduces overfitting.
  • Early Stopping: We monitor the validation loss. Training is halted when it stops improving for a specified number of epochs (patience), and the best weights are restored.

14.4 Implementation with torch and luz

The torch package provides low‑level tensor operations and autograd; luz is a high‑level wrapper that handles training loops, callbacks, and metrics.

# =============================================================================
library(torch); library(luz)
preProc_dl <- preProcess(SelectedTrain[,-1], method=c("center","scale"))
x_train <- as.matrix(predict(preProc_dl, SelectedTrain[,-1]))
x_test  <- as.matrix(predict(preProc_dl, dataTest[,selected_features]))
y_train_int <- as.integer(SelectedTrain$Class)   # 1 = Extracellular, 2 = Mitochondrial
y_test_int  <- as.integer(dataTest$Class)

x_train_t <- torch_tensor(x_train, dtype=torch_float32())
y_train_t <- torch_tensor(y_train_int, dtype=torch_long())
x_test_t  <- torch_tensor(x_test, dtype=torch_float32())
y_test_t  <- torch_tensor(y_test_int, dtype=torch_long())

train_ds <- tensor_dataset(x=x_train_t, y=y_train_t)
test_ds  <- tensor_dataset(x=x_test_t,  y=y_test_t)
net <- nn_module("ProteinMLP",
  initialize = function(input_dim, hidden1=64, hidden2=32, dropout_rate=0.3) {
    self$layer1 <- nn_linear(input_dim, hidden1)
    self$layer2 <- nn_linear(hidden1, hidden2)
    self$layer3 <- nn_linear(hidden2, 2)
    self$dropout <- nn_dropout(dropout_rate)
  },
  forward = function(x) {
    x %>% self$layer1() %>% nnf_relu() %>% self$dropout() %>%
      self$layer2() %>% nnf_relu() %>% self$dropout() %>%
      self$layer3()
    # Note: no softmax here; nn_cross_entropy_loss expects raw logits
  }
)
model <- net %>% setup(loss=nn_cross_entropy_loss(), optimizer=optim_adam,
                       metrics=list(luz_metric_accuracy())) %>%
  set_opt_hparams(lr=0.001) %>% set_hparams(input_dim=ncol(x_train))
torch_manual_seed(1234)
fitted <- model %>% fit(data=train_ds, epochs=100, valid_data=0.2,
                        callbacks=list(luz_callback_early_stopping(monitor="valid_loss", patience=10, min_delta=0.001)),
                        verbose=TRUE)
plot(fitted)   # shows loss and accuracy curves

dl_logits <- predict(fitted, test_ds)                      # raw logits: forward() applies no softmax
dl_prob <- as.numeric(nnf_softmax(dl_logits, dim=2)[,2])   # convert logits to P(Mitochondrial)
dl_class <- ifelse(dl_prob>0.5, "Mitochondrial", "Extracellular")
dl_class <- factor(dl_class, levels=levels(dataTest$Class))
dl_cm <- confusionMatrix(dl_class, dataTest$Class, positive="Mitochondrial")
print(dl_cm)
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Extracellular Mitochondrial
##   Extracellular            55            30
##   Mitochondrial             9            30
##                                          
##                Accuracy : 0.6855         
##                  95% CI : (0.596, 0.7659)
##     No Information Rate : 0.5161         
##     P-Value [Acc > NIR] : 9.601e-05      
##                                          
##                   Kappa : 0.3633         
##                                          
##  Mcnemar's Test P-Value : 0.001362       
##                                          
##             Sensitivity : 0.5000         
##             Specificity : 0.8594         
##          Pos Pred Value : 0.7692         
##          Neg Pred Value : 0.6471         
##              Prevalence : 0.4839         
##          Detection Rate : 0.2419         
##    Detection Prevalence : 0.3145         
##       Balanced Accuracy : 0.6797         
##                                          
##        'Positive' Class : Mitochondrial  
## 
dl_roc <- roc(dataTest$Class, dl_prob, levels=c("Extracellular","Mitochondrial"))
plot(dl_roc, print.auc=TRUE, main="ROC – Deep Learning")

# =============================================================================

15. Automated Machine Learning (AutoML) with H2O

15.1 What is AutoML?

AutoML automates the tedious parts of the ML pipeline: algorithm selection, hyperparameter optimisation, and ensembling. H2O’s AutoML trains a large and diverse set of models (GLM, Random Forest, GBM, XGBoost, Deep Learning, etc.) and then builds a stacked ensemble that learns how to optimally combine their predictions.

15.2 Stacked Ensembles

In stacking, the predictions of several base models (called level‑0 models) are used as features for a meta‑learner (level‑1 model). The meta‑learner is typically a regularised GLM that assigns weights to each base model. This often yields performance better than any single model.
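
To demystify the idea, here is a minimal stacking sketch in plain R (an illustrative set of assumptions, not H2O's implementation: a Random Forest and a logistic GLM as level‑0 models on SelectedTrain, out‑of‑fold probabilities as meta‑features, and a logistic meta‑learner):

set.seed(1234)
folds <- createFolds(SelectedTrain$Class, k = 5)
oof_rf <- oof_lr <- numeric(nrow(SelectedTrain))
for (f in folds) {                                   # out‑of‑fold level‑0 predictions
  rf_b <- randomForest(Class ~ ., data = SelectedTrain[-f, ], ntree = 200)
  lr_b <- glm(Class ~ ., data = SelectedTrain[-f, ], family = binomial)
  oof_rf[f] <- predict(rf_b, SelectedTrain[f, ], type = "prob")[, "Mitochondrial"]
  oof_lr[f] <- predict(lr_b, SelectedTrain[f, ], type = "response")
}
meta <- glm(Class ~ rf + lr, family = binomial,      # level‑1 meta‑learner
            data = data.frame(Class = SelectedTrain$Class, rf = oof_rf, lr = oof_lr))
summary(meta)$coefficients                           # weights assigned to the base models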

15.3 Running AutoML

We load h2o only here to avoid earlier namespace conflicts. The function h2o.init() starts a local H2O cluster. After the run, we display the leaderboard and evaluate the top model on the test set.

# =============================================================================
library(h2o)
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /tmp/Rtmpkr0V9z/file43d01a806415/h2o_omeena_started_from_r.out
##     /tmp/Rtmpkr0V9z/file43d036c973f8/h2o_omeena_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 18 milliseconds 
##     H2O cluster timezone:       Asia/Kolkata 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    2 years, 4 months and 5 days 
##     H2O cluster name:           H2O_started_from_R_omeena_rdh922 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   6.75 GB 
##     H2O cluster total cores:    16 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.5.2 (2025-10-31)
h2o_data <- as.h2o(Features_df)
splits <- h2o.splitFrame(h2o_data, ratios=0.8, seed=1234)
train_h2o <- splits[[1]]; test_h2o <- splits[[2]]
y <- "Class"; x <- setdiff(colnames(train_h2o), y)
aml <- h2o.automl(x=x, y=y, training_frame=train_h2o,
                  max_runtime_secs=300, max_models=20, seed=142,
                  sort_metric="AUC", nfolds=5, balance_classes=TRUE)
lb <- as.data.frame(h2o.get_leaderboard(aml))
DT::datatable(lb, options=list(pageLength=10), caption="H2O AutoML Leaderboard")
perf <- h2o.performance(aml@leader, test_h2o)
cat("Test AUC AutoML leader:", h2o.auc(perf), "\n")
## Test AUC AutoML leader: 0.8711944
h2o.confusionMatrix(perf)
## Confusion Matrix (vertical: actual; across: predicted)  for max f1 @ threshold = 0.508126034409681:
##               Extracellular Mitochondrial    Error     Rate
## Extracellular            58             5 0.079365    =5/63
## Mitochondrial            17            44 0.278689   =17/61
## Totals                   75            49 0.177419  =22/124
# The cluster is intentionally left running: Section 16 below still queries perf.

# =============================================================================

16. Model Comparison

We compare the three approaches—tuned Random Forest, the Deep Learning MLP, and the AutoML leader—on Accuracy and AUC. An interactive table (DT::datatable()) displays the results.

# =============================================================================
models <- c("Random Forest","Deep Learning")
accs <- c(cm$overall["Accuracy"], dl_cm$overall["Accuracy"])
aucs <- c(auc(roc_rf), auc(dl_roc))
summary_df <- data.frame(Model=models, Accuracy=round(accs,3), AUC=round(aucs,3))
if (exists("perf")) {
  thr <- h2o.find_threshold_by_max_metric(perf, "f1")
  automl_acc <- as.numeric(h2o.accuracy(perf, thresholds=thr))
  automl_auc <- as.numeric(h2o.auc(perf))
  summary_df <- rbind(summary_df, data.frame(Model="AutoML Leader",
                                             Accuracy=round(automl_acc,3),
                                             AUC=round(automl_auc,3)))
}
DT::datatable(summary_df, options=list(dom='t'), caption="Model Performance Comparison")

h2o.shutdown(prompt=FALSE)   # shut the cluster down now that all H2O metrics are collected