Load Libraries

# Data handling
library(tidyverse)

# Expression analysis
library(dplyr)
library(limma)

# Visualization
library(ggplot2)
library(reshape2)
library(pheatmap)
library(RColorBrewer)

# Enrichment
library(clusterProfiler)
library(org.Hs.eg.db)   # human gene annotation package

Load and Inspect Data

# Replace with your CSV file path
# Read CSV without header
data <- read.csv("C:/Users/anifo/OneDrive/Desktop/pediatric_malaria.csv", header = FALSE, stringsAsFactors = FALSE, check.names = FALSE)

# First row = actual column names
colnames(data) <- data[1, ]
data <- data[-1, ]  # remove the first row
head(data[,1:10])
##      ID_REF    Gene symbol GSM711619 GSM711620 GSM711621 GSM711622 GSM711623
## 2                            Control   Control   Control   Control   Control
## 3 1007_s_at MIR4640///DDR1   165.666   227.997   238.424   223.669   159.866
## 4   1053_at           RFC2   35.3511   38.2707   24.2497   61.4834   43.9876
## 5    117_at          HSPA6     167.5   146.128   273.367    223.42   153.009
## 6    121_at           PAX8   606.594   615.723   850.789   526.605   893.675
## 7 1255_g_at         GUCA1A   18.4024   17.4644   14.7565   24.6866   37.2562
##      GSM711624    GSM711625    GSM711626
## 2 Asymptomatic Asymptomatic Asymptomatic
## 3      138.953        163.6      196.328
## 4      88.8007      85.7829      44.4238
## 5      378.784      329.362      239.944
## 6      517.455        597.2      958.687
## 7      27.4817      28.0726      40.1647

Preprocessing

# Extract groups
groups <- as.character(data[1, 3:ncol(data)])
groups <- trimws(groups)

# Remove group row from expression
expr_df <- data[-1, ]

# Make sure first two columns are ID_REF and Gene.symbol
colnames(expr_df)[1:2] <- c("ID_REF", "Gene.symbol")

# Remove rows with empty or NA gene symbols
expr_df <- expr_df %>% filter(!is.na(Gene.symbol) & Gene.symbol != "")

# Keep first gene symbol if multiple exist
expr_df$Gene.symbol <- sapply(strsplit(expr_df$Gene.symbol, "///"), `[`, 1)

# Now compute numeric columns dynamically
num_cols <- setdiff(names(expr_df), c("ID_REF", "Gene.symbol"))

# Convert all numeric columns safely
expr_df[num_cols] <- lapply(expr_df[num_cols], function(x) as.numeric(as.character(x)))

# Collapse duplicate genes
expr_df <- expr_df %>%
  group_by(Gene.symbol) %>%
  summarise(across(all_of(num_cols), mean, na.rm = TRUE))

# Convert to matrix
expr <- as.matrix(expr_df[, num_cols])
rownames(expr) <- expr_df$Gene.symbol

# Log2 transform
expr_log <- log2(expr + 1)

# Check dimensions
dim(expr_log)  # should be 13100 x 25
## [1] 13100    25
# Top variable genes
vars <- apply(expr_log, 1, var)

# Keep only genes with non-zero variance
nonzero_vars <- which(vars > 0)
expr_log <- expr_log[nonzero_vars, , drop = FALSE]

# Select top variable genes
n_genes <- min(2000, nrow(expr_log))
top_var_idx <- order(vars[nonzero_vars], decreasing = TRUE)[1:n_genes]
expr_filt <- expr_log[top_var_idx, , drop = FALSE]

dim(expr_filt)  # must have >0 rows and correct number of columns (samples)
## [1] 2000   25

Sample Information

groups <- c(
  rep("Control", 5),
  rep("Asymptomatic", 5),
  rep("Uncomplicated", 5),
  rep("Severe", 5),
  rep("Cerebral", 5)
)

sample_info <- data.frame(
  Sample = colnames(expr_filt),
  Group = factor(groups)
)
rownames(sample_info) <- sample_info$Sample

PCA

# Transpose expression matrix for PCA (samples as rows)
expr_t <- t(expr_filt)

# Perform PCA
pca_res <- prcomp(expr_t, center = TRUE, scale. = TRUE)

# Extract percent variance explained
pca_var <- pca_res$sdev^2
pca_var_perc <- round(pca_var / sum(pca_var) * 100, 1)

# Combine PCA scores with metadata
pca_df <- data.frame(
  Sample = rownames(pca_res$x),
  PC1 = pca_res$x[,1],
  PC2 = pca_res$x[,2],
  Group = sample_info$Group
)

# Plot PCA
ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, label = Sample)) +
  geom_point(size = 4) +
  geom_text(vjust = -1, size = 3) +
  xlab(paste0("PC1 (", pca_var_perc[1], "%)")) +
  ylab(paste0("PC2 (", pca_var_perc[2], "%)")) +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14)
  )

Heaptmap

# Create annotation dataframe for columns
annotation_col <- data.frame(
  Group = sample_info$Group
)
rownames(annotation_col) <- sample_info$Sample

# Scale genes (rows) for heatmap visualization
expr_scaled <- t(scale(t(expr_filt)))  # z-score per gene

# Draw heatmap
pheatmap(
  expr_scaled,
  annotation_col = annotation_col,
  show_rownames = FALSE,
  show_colnames = TRUE,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  color = colorRampPalette(c("navy", "white", "firebrick3"))(50),
  fontsize_col = 10,
  main = "Top 2000 Variable Genes Heatmap"
)

DEG

# Make design matrix
design <- model.matrix(~0 + sample_info$Group)
colnames(design) <- levels(sample_info$Group)
design
##    Asymptomatic Cerebral Control Severe Uncomplicated
## 1             0        0       1      0             0
## 2             0        0       1      0             0
## 3             0        0       1      0             0
## 4             0        0       1      0             0
## 5             0        0       1      0             0
## 6             1        0       0      0             0
## 7             1        0       0      0             0
## 8             1        0       0      0             0
## 9             1        0       0      0             0
## 10            1        0       0      0             0
## 11            0        0       0      0             1
## 12            0        0       0      0             1
## 13            0        0       0      0             1
## 14            0        0       0      0             1
## 15            0        0       0      0             1
## 16            0        0       0      1             0
## 17            0        0       0      1             0
## 18            0        0       0      1             0
## 19            0        0       0      1             0
## 20            0        0       0      1             0
## 21            0        1       0      0             0
## 22            0        1       0      0             0
## 23            0        1       0      0             0
## 24            0        1       0      0             0
## 25            0        1       0      0             0
## attr(,"assign")
## [1] 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`sample_info$Group`
## [1] "contr.treatment"
# Fit linear model
fit <- lmFit(expr_filt, design)

# Create contrasts (example: Severe vs Control, Cerebral vs Control)
contrast.matrix <- makeContrasts(
  Severe_vs_Control = Severe - Control,
  Cerebral_vs_Control = Cerebral - Control,
  levels = design
)

# Fit contrasts
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

# Extract DEGs
deg_severe <- topTable(fit2, coef="Severe_vs_Control", number=Inf, adjust.method="BH")
deg_cerebral <- topTable(fit2, coef="Cerebral_vs_Control", number=Inf, adjust.method="BH")

# Optional: filter significant DEGs (adj.p < 0.05 & |logFC| > 1)
sig_deg_severe <- deg_severe[deg_severe$adj.P.Val < 0.05 & abs(deg_severe$logFC) > 1, ]
sig_deg_cerebral <- deg_cerebral[deg_cerebral$adj.P.Val < 0.05 & abs(deg_cerebral$logFC) > 1, ]

# View top 10 DEGs
head(sig_deg_severe, 10)
##               logFC   AveExpr         t      P.Value    adj.P.Val        B
## LTF        5.249312  8.286021  7.917531 1.405779e-14 2.235424e-11 22.34612
## CLIC2      5.215725  5.652043  7.852880 2.235424e-14 2.235424e-11 21.90577
## OLFM4      5.042267  6.236554  7.405070 5.124660e-13 3.416440e-10 18.93447
## CEACAM8    4.644263  6.061702  7.026410 6.471931e-12 3.235966e-09 16.53229
## CCR3      -4.235545  6.602959 -6.437691 2.699244e-10 1.079698e-07 13.00617
## LOC149684  4.229711  6.552610  6.400382 3.388837e-10 1.129612e-07 12.79149
## CD177      4.163611  8.742921  6.315844 5.652095e-10 1.614884e-07 12.30900
## IFI27      4.118355  9.776967  6.170363 1.345419e-09 3.363548e-07 11.49158
## CLC       -3.877552  9.054747 -5.936487 5.238492e-09 1.164109e-06 10.21203
## MMP9       3.860932 10.360133  5.878981 7.269014e-09 1.453803e-06  9.90401
head(sig_deg_cerebral, 10)
##             logFC   AveExpr         t      P.Value    adj.P.Val        B
## OLFM4    6.206302  6.236554  9.114572 1.579544e-18 3.159087e-15 30.98667
## LTF      5.205690  8.286021  7.851736 2.253787e-14 2.253787e-11 21.89495
## CD177    4.608822  8.742921  6.991191 8.149741e-12 5.433161e-09 16.31228
## PGLYRP1  4.422157  6.428725  6.697445 5.375648e-11 2.687824e-08 14.52860
## MMP8     4.283557  7.229921  6.530438 1.526116e-10 6.104464e-08 13.54318
## VNN1     4.084922  8.446148  6.256512 8.066510e-10 2.143523e-07 11.97273
## MMP9     4.108468 10.360133  6.255900 8.096079e-10 2.143523e-07 11.96928
## IL1R2    4.090698  9.245679  6.246287 8.574093e-10 2.143523e-07 11.91522
## CEACAM8  4.103106  6.061702  6.207681 1.078767e-09 2.397261e-07 11.69883
## CD3G    -4.041639  7.469105 -6.081118 2.271674e-09 4.543349e-07 10.99754

Functional Enrichment

# Get gene symbols from DEGs
deg_genes <- rownames(sig_deg_severe)  # Example: Severe vs Control

# Convert gene symbols to Entrez IDs
entrez_ids <- bitr(deg_genes, fromType="SYMBOL", 
                   toType="ENTREZID", 
                   OrgDb=org.Hs.eg.db)

# GO enrichment (Biological Process)
go_bp <- enrichGO(
  gene = entrez_ids$ENTREZID,
  OrgDb = org.Hs.eg.db,
  ont = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2,
  readable = TRUE
)

# KEGG pathway enrichment
kegg <- enrichKEGG(
  gene = entrez_ids$ENTREZID,
  organism = "hsa",
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05
)

# View top enriched GO terms and KEGG pathways
head(go_bp)
##                    ID                                            Description
## GO:0009620 GO:0009620                                     response to fungus
## GO:0050832 GO:0050832                             defense response to fungus
## GO:0006959 GO:0006959                                humoral immune response
## GO:0031640 GO:0031640                   killing of cells of another organism
## GO:0141061 GO:0141061                 disruption of cell in another organism
## GO:0141060 GO:0141060 disruption of anatomical structure in another organism
##            GeneRatio   BgRatio RichFactor FoldEnrichment   zScore       pvalue
## GO:0009620      9/62  78/18805 0.11538462       34.99690 17.30429 3.837795e-12
## GO:0050832      8/62  67/18805 0.11940299       36.21570 16.60785 4.896640e-11
## GO:0006959     12/62 309/18805 0.03883495       11.77889 10.98793 3.247247e-10
## GO:0031640      8/62 119/18805 0.06722689       20.39035 12.20400 5.153810e-09
## GO:0141061      8/62 119/18805 0.06722689       20.39035 12.20400 5.153810e-09
## GO:0141060      8/62 123/18805 0.06504065       19.72725 11.98440 6.699025e-09
##                p.adjust       qvalue
## GO:0009620 4.455680e-09 3.542891e-09
## GO:0050832 2.842500e-08 2.260186e-08
## GO:0006959 1.256684e-07 9.992405e-08
## GO:0031640 1.196715e-06 9.515560e-07
## GO:0141061 1.196715e-06 9.515560e-07
## GO:0141060 1.296261e-06 1.030710e-06
##                                                                            geneID
## GO:0009620                      LTF/DEFA1B/MPO/DEFA4/ELANE/ARG1/S100A12/CTSG/PTX3
## GO:0050832                           LTF/DEFA1B/MPO/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0006959 LTF/PGLYRP1/DEFA1B/C1QB/EBI3/PRTN3/DEFA4/ELANE/S100A12/CTSG/C1QA/CCL23
## GO:0031640                       LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0141061                       LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0141060                       LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
##            Count
## GO:0009620     9
## GO:0050832     8
## GO:0006959    12
## GO:0031640     8
## GO:0141061     8
## GO:0141060     8
head(kegg)
##                category                   subcategory       ID
## hsa05202 Human Diseases              Cancer: overview hsa05202
## hsa05150 Human Diseases Infectious disease: bacterial hsa05150
## hsa05418 Human Diseases        Cardiovascular disease hsa05418
## hsa05322 Human Diseases                Immune disease hsa05322
## hsa00910     Metabolism             Energy metabolism hsa00910
##                                      Description GeneRatio  BgRatio RichFactor
## hsa05202 Transcriptional misregulation in cancer      6/33 201/9446 0.02985075
## hsa05150         Staphylococcus aureus infection      4/33 102/9446 0.03921569
## hsa05418  Fluid shear stress and atherosclerosis      4/33 142/9446 0.02816901
## hsa05322            Systemic lupus erythematosus      4/33 144/9446 0.02777778
## hsa00910                     Nitrogen metabolism      2/33  17/9446 0.11764706
##          FoldEnrichment   zScore       pvalue    p.adjust      qvalue
## hsa05202       8.544550 6.401354 5.907078e-05 0.005907078 0.005098741
## hsa05150      11.225193 6.147508 4.122481e-04 0.020612404 0.017791759
## hsa05418       8.063167 5.021144 1.427624e-03 0.031155242 0.026891893
## hsa05322       7.951178 4.976746 1.503276e-03 0.031155242 0.026891893
## hsa00910      33.675579 7.983786 1.557762e-03 0.031155242 0.026891893
##                                   geneID Count
## hsa05202 4318/728358/7850/4353/1669/1991     6
## hsa05150             728358/713/1669/712     4
## hsa05418             4318/7850/3674/3162     4
## hsa05322               713/1991/1511/712     4
## hsa00910                         760/759     2