Load Libraries
# Data handling
library(tidyverse)
# Expression analysis
library(dplyr)
library(limma)
# Visualization
library(ggplot2)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
# Enrichment
library(clusterProfiler)
library(org.Hs.eg.db) # human gene annotation package
Load and Inspect Data
# Replace with your CSV file path
# Read CSV without header
data <- read.csv("C:/Users/anifo/OneDrive/Desktop/pediatric_malaria.csv", header = FALSE, stringsAsFactors = FALSE, check.names = FALSE)
# First row = actual column names
colnames(data) <- data[1, ]
data <- data[-1, ] # remove the first row
head(data[,1:10])
## ID_REF Gene symbol GSM711619 GSM711620 GSM711621 GSM711622 GSM711623
## 2 Control Control Control Control Control
## 3 1007_s_at MIR4640///DDR1 165.666 227.997 238.424 223.669 159.866
## 4 1053_at RFC2 35.3511 38.2707 24.2497 61.4834 43.9876
## 5 117_at HSPA6 167.5 146.128 273.367 223.42 153.009
## 6 121_at PAX8 606.594 615.723 850.789 526.605 893.675
## 7 1255_g_at GUCA1A 18.4024 17.4644 14.7565 24.6866 37.2562
## GSM711624 GSM711625 GSM711626
## 2 Asymptomatic Asymptomatic Asymptomatic
## 3 138.953 163.6 196.328
## 4 88.8007 85.7829 44.4238
## 5 378.784 329.362 239.944
## 6 517.455 597.2 958.687
## 7 27.4817 28.0726 40.1647
Preprocessing
# Extract groups
groups <- as.character(data[1, 3:ncol(data)])
groups <- trimws(groups)
# Remove group row from expression
expr_df <- data[-1, ]
# Make sure first two columns are ID_REF and Gene.symbol
colnames(expr_df)[1:2] <- c("ID_REF", "Gene.symbol")
# Remove rows with empty or NA gene symbols
expr_df <- expr_df %>% filter(!is.na(Gene.symbol) & Gene.symbol != "")
# Keep first gene symbol if multiple exist
expr_df$Gene.symbol <- sapply(strsplit(expr_df$Gene.symbol, "///"), `[`, 1)
# Now compute numeric columns dynamically
num_cols <- setdiff(names(expr_df), c("ID_REF", "Gene.symbol"))
# Convert all numeric columns safely
expr_df[num_cols] <- lapply(expr_df[num_cols], function(x) as.numeric(as.character(x)))
# Collapse duplicate genes
expr_df <- expr_df %>%
group_by(Gene.symbol) %>%
summarise(across(all_of(num_cols), mean, na.rm = TRUE))
# Convert to matrix
expr <- as.matrix(expr_df[, num_cols])
rownames(expr) <- expr_df$Gene.symbol
# Log2 transform
expr_log <- log2(expr + 1)
# Check dimensions
dim(expr_log) # should be 13100 x 25
## [1] 13100 25
# Top variable genes
vars <- apply(expr_log, 1, var)
# Keep only genes with non-zero variance
nonzero_vars <- which(vars > 0)
expr_log <- expr_log[nonzero_vars, , drop = FALSE]
# Select top variable genes
n_genes <- min(2000, nrow(expr_log))
top_var_idx <- order(vars[nonzero_vars], decreasing = TRUE)[1:n_genes]
expr_filt <- expr_log[top_var_idx, , drop = FALSE]
dim(expr_filt) # must have >0 rows and correct number of columns (samples)
## [1] 2000 25
PCA
# Transpose expression matrix for PCA (samples as rows)
expr_t <- t(expr_filt)
# Perform PCA
pca_res <- prcomp(expr_t, center = TRUE, scale. = TRUE)
# Extract percent variance explained
pca_var <- pca_res$sdev^2
pca_var_perc <- round(pca_var / sum(pca_var) * 100, 1)
# Combine PCA scores with metadata
pca_df <- data.frame(
Sample = rownames(pca_res$x),
PC1 = pca_res$x[,1],
PC2 = pca_res$x[,2],
Group = sample_info$Group
)
# Plot PCA
ggplot(pca_df, aes(x = PC1, y = PC2, color = Group, label = Sample)) +
geom_point(size = 4) +
geom_text(vjust = -1, size = 3) +
xlab(paste0("PC1 (", pca_var_perc[1], "%)")) +
ylab(paste0("PC2 (", pca_var_perc[2], "%)")) +
theme_bw() +
theme(
panel.grid = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)
)

Heaptmap
# Create annotation dataframe for columns
annotation_col <- data.frame(
Group = sample_info$Group
)
rownames(annotation_col) <- sample_info$Sample
# Scale genes (rows) for heatmap visualization
expr_scaled <- t(scale(t(expr_filt))) # z-score per gene
# Draw heatmap
pheatmap(
expr_scaled,
annotation_col = annotation_col,
show_rownames = FALSE,
show_colnames = TRUE,
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("navy", "white", "firebrick3"))(50),
fontsize_col = 10,
main = "Top 2000 Variable Genes Heatmap"
)

DEG
# Make design matrix
design <- model.matrix(~0 + sample_info$Group)
colnames(design) <- levels(sample_info$Group)
design
## Asymptomatic Cerebral Control Severe Uncomplicated
## 1 0 0 1 0 0
## 2 0 0 1 0 0
## 3 0 0 1 0 0
## 4 0 0 1 0 0
## 5 0 0 1 0 0
## 6 1 0 0 0 0
## 7 1 0 0 0 0
## 8 1 0 0 0 0
## 9 1 0 0 0 0
## 10 1 0 0 0 0
## 11 0 0 0 0 1
## 12 0 0 0 0 1
## 13 0 0 0 0 1
## 14 0 0 0 0 1
## 15 0 0 0 0 1
## 16 0 0 0 1 0
## 17 0 0 0 1 0
## 18 0 0 0 1 0
## 19 0 0 0 1 0
## 20 0 0 0 1 0
## 21 0 1 0 0 0
## 22 0 1 0 0 0
## 23 0 1 0 0 0
## 24 0 1 0 0 0
## 25 0 1 0 0 0
## attr(,"assign")
## [1] 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$`sample_info$Group`
## [1] "contr.treatment"
# Fit linear model
fit <- lmFit(expr_filt, design)
# Create contrasts (example: Severe vs Control, Cerebral vs Control)
contrast.matrix <- makeContrasts(
Severe_vs_Control = Severe - Control,
Cerebral_vs_Control = Cerebral - Control,
levels = design
)
# Fit contrasts
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
# Extract DEGs
deg_severe <- topTable(fit2, coef="Severe_vs_Control", number=Inf, adjust.method="BH")
deg_cerebral <- topTable(fit2, coef="Cerebral_vs_Control", number=Inf, adjust.method="BH")
# Optional: filter significant DEGs (adj.p < 0.05 & |logFC| > 1)
sig_deg_severe <- deg_severe[deg_severe$adj.P.Val < 0.05 & abs(deg_severe$logFC) > 1, ]
sig_deg_cerebral <- deg_cerebral[deg_cerebral$adj.P.Val < 0.05 & abs(deg_cerebral$logFC) > 1, ]
# View top 10 DEGs
head(sig_deg_severe, 10)
## logFC AveExpr t P.Value adj.P.Val B
## LTF 5.249312 8.286021 7.917531 1.405779e-14 2.235424e-11 22.34612
## CLIC2 5.215725 5.652043 7.852880 2.235424e-14 2.235424e-11 21.90577
## OLFM4 5.042267 6.236554 7.405070 5.124660e-13 3.416440e-10 18.93447
## CEACAM8 4.644263 6.061702 7.026410 6.471931e-12 3.235966e-09 16.53229
## CCR3 -4.235545 6.602959 -6.437691 2.699244e-10 1.079698e-07 13.00617
## LOC149684 4.229711 6.552610 6.400382 3.388837e-10 1.129612e-07 12.79149
## CD177 4.163611 8.742921 6.315844 5.652095e-10 1.614884e-07 12.30900
## IFI27 4.118355 9.776967 6.170363 1.345419e-09 3.363548e-07 11.49158
## CLC -3.877552 9.054747 -5.936487 5.238492e-09 1.164109e-06 10.21203
## MMP9 3.860932 10.360133 5.878981 7.269014e-09 1.453803e-06 9.90401
head(sig_deg_cerebral, 10)
## logFC AveExpr t P.Value adj.P.Val B
## OLFM4 6.206302 6.236554 9.114572 1.579544e-18 3.159087e-15 30.98667
## LTF 5.205690 8.286021 7.851736 2.253787e-14 2.253787e-11 21.89495
## CD177 4.608822 8.742921 6.991191 8.149741e-12 5.433161e-09 16.31228
## PGLYRP1 4.422157 6.428725 6.697445 5.375648e-11 2.687824e-08 14.52860
## MMP8 4.283557 7.229921 6.530438 1.526116e-10 6.104464e-08 13.54318
## VNN1 4.084922 8.446148 6.256512 8.066510e-10 2.143523e-07 11.97273
## MMP9 4.108468 10.360133 6.255900 8.096079e-10 2.143523e-07 11.96928
## IL1R2 4.090698 9.245679 6.246287 8.574093e-10 2.143523e-07 11.91522
## CEACAM8 4.103106 6.061702 6.207681 1.078767e-09 2.397261e-07 11.69883
## CD3G -4.041639 7.469105 -6.081118 2.271674e-09 4.543349e-07 10.99754
Functional Enrichment
# Get gene symbols from DEGs
deg_genes <- rownames(sig_deg_severe) # Example: Severe vs Control
# Convert gene symbols to Entrez IDs
entrez_ids <- bitr(deg_genes, fromType="SYMBOL",
toType="ENTREZID",
OrgDb=org.Hs.eg.db)
# GO enrichment (Biological Process)
go_bp <- enrichGO(
gene = entrez_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE
)
# KEGG pathway enrichment
kegg <- enrichKEGG(
gene = entrez_ids$ENTREZID,
organism = "hsa",
pAdjustMethod = "BH",
pvalueCutoff = 0.05
)
# View top enriched GO terms and KEGG pathways
head(go_bp)
## ID Description
## GO:0009620 GO:0009620 response to fungus
## GO:0050832 GO:0050832 defense response to fungus
## GO:0006959 GO:0006959 humoral immune response
## GO:0031640 GO:0031640 killing of cells of another organism
## GO:0141061 GO:0141061 disruption of cell in another organism
## GO:0141060 GO:0141060 disruption of anatomical structure in another organism
## GeneRatio BgRatio RichFactor FoldEnrichment zScore pvalue
## GO:0009620 9/62 78/18805 0.11538462 34.99690 17.30429 3.837795e-12
## GO:0050832 8/62 67/18805 0.11940299 36.21570 16.60785 4.896640e-11
## GO:0006959 12/62 309/18805 0.03883495 11.77889 10.98793 3.247247e-10
## GO:0031640 8/62 119/18805 0.06722689 20.39035 12.20400 5.153810e-09
## GO:0141061 8/62 119/18805 0.06722689 20.39035 12.20400 5.153810e-09
## GO:0141060 8/62 123/18805 0.06504065 19.72725 11.98440 6.699025e-09
## p.adjust qvalue
## GO:0009620 4.455680e-09 3.542891e-09
## GO:0050832 2.842500e-08 2.260186e-08
## GO:0006959 1.256684e-07 9.992405e-08
## GO:0031640 1.196715e-06 9.515560e-07
## GO:0141061 1.196715e-06 9.515560e-07
## GO:0141060 1.296261e-06 1.030710e-06
## geneID
## GO:0009620 LTF/DEFA1B/MPO/DEFA4/ELANE/ARG1/S100A12/CTSG/PTX3
## GO:0050832 LTF/DEFA1B/MPO/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0006959 LTF/PGLYRP1/DEFA1B/C1QB/EBI3/PRTN3/DEFA4/ELANE/S100A12/CTSG/C1QA/CCL23
## GO:0031640 LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0141061 LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
## GO:0141060 LTF/PGLYRP1/DEFA1B/DEFA4/ELANE/ARG1/S100A12/CTSG
## Count
## GO:0009620 9
## GO:0050832 8
## GO:0006959 12
## GO:0031640 8
## GO:0141061 8
## GO:0141060 8
head(kegg)
## category subcategory ID
## hsa05202 Human Diseases Cancer: overview hsa05202
## hsa05150 Human Diseases Infectious disease: bacterial hsa05150
## hsa05418 Human Diseases Cardiovascular disease hsa05418
## hsa05322 Human Diseases Immune disease hsa05322
## hsa00910 Metabolism Energy metabolism hsa00910
## Description GeneRatio BgRatio RichFactor
## hsa05202 Transcriptional misregulation in cancer 6/33 201/9446 0.02985075
## hsa05150 Staphylococcus aureus infection 4/33 102/9446 0.03921569
## hsa05418 Fluid shear stress and atherosclerosis 4/33 142/9446 0.02816901
## hsa05322 Systemic lupus erythematosus 4/33 144/9446 0.02777778
## hsa00910 Nitrogen metabolism 2/33 17/9446 0.11764706
## FoldEnrichment zScore pvalue p.adjust qvalue
## hsa05202 8.544550 6.401354 5.907078e-05 0.005907078 0.005098741
## hsa05150 11.225193 6.147508 4.122481e-04 0.020612404 0.017791759
## hsa05418 8.063167 5.021144 1.427624e-03 0.031155242 0.026891893
## hsa05322 7.951178 4.976746 1.503276e-03 0.031155242 0.026891893
## hsa00910 33.675579 7.983786 1.557762e-03 0.031155242 0.026891893
## geneID Count
## hsa05202 4318/728358/7850/4353/1669/1991 6
## hsa05150 728358/713/1669/712 4
## hsa05418 4318/7850/3674/3162 4
## hsa05322 713/1991/1511/712 4
## hsa00910 760/759 2