This notebook evaluates the quality of Harmony batch correction applied to malignant CD4⁺ T cells using the Local Inverse Simpson’s Index (LISI) framework (Korsunsky et al., 2019).
Two complementary LISI metrics are computed:
| Metric | Covariate | Interpretation |
|---|---|---|
| cLISI | seurat_clusters |
Cluster purity — values close to 1 indicate well-separated, pure clusters |
| iLISI (cell line) | orig.ident |
Batch mixing across cell lines — higher values indicate better Harmony correction |
| iLISI (patient) | Patient_origin |
Batch mixing across patients — higher values indicate better Harmony correction |
# ── Install lisi if not already installed ──────────────────────────────────
# install.packages("remotes")
# remotes::install_github("immunogenomics/lisi")
library(lisi)
library(Seurat)
library(ggplot2)
library(dplyr)
library(tidyr)
library(patchwork) # for combining plots
seurat_obj
An object of class Seurat
62900 features across 49305 samples within 6 assays
Active assay: RNA (36601 features, 0 variable features)
2 layers present: data, counts
5 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3, SCT
5 dimensional reductions calculated: integrated_dr, ref.umap, pca, umap, harmony
table(seurat_obj$Patient_origin)
P1 P2 P3 CD4T_lab CD4T_10x
11760 12434 16501 5106 3504
table(seurat_obj$orig.ident)
L1 L2 L3 L4 L5 L6 L7 CD4T_lab CD4T_10x
5825 5935 6428 6006 6022 5148 5331 5106 3504
table(seurat_obj$seurat_clusters)
0 1 2 3 4 5 6 7 8 9 10 11 12 13
6789 5275 4663 4661 4086 3634 3536 3409 3338 3273 3212 1675 1063 691
# Harmony low-dimensional embedding (cells × components)
harmony_emb <- Embeddings(seurat_obj, reduction = "harmony")
# Metadata: cluster identity + two batch covariates
meta <- seurat_obj@meta.data[, c(
"seurat_clusters", # cluster identity → cLISI
"orig.ident", # batch covariate 1 → iLISI
"Patient_origin" # batch covariate 2 → iLISI
)]
cat("Cells:", nrow(harmony_emb), "\n")
Cells: 49305
cat("Harmony dimensions:", ncol(harmony_emb), "\n")
Harmony dimensions: 50
cat("Clusters:", nlevels(factor(meta$seurat_clusters)), "\n")
Clusters: 14
cat("Cell lines:", nlevels(factor(meta$orig.ident)), "\n")
Cell lines: 9
cat("Patients:", nlevels(factor(meta$Patient_origin)), "\n")
Patients: 5
# compute_lisi returns one score per cell per label column
lisi_scores <- compute_lisi(
X = harmony_emb,
meta_data = meta,
label_colnames = c("seurat_clusters", "orig.ident", "Patient_origin")
)
# Rename for clarity
colnames(lisi_scores) <- c("cLISI", "iLISI_orig.ident", "iLISI_Patient")
# Attach cluster label for downstream plotting
lisi_scores$cluster <- factor(seurat_obj@meta.data$seurat_clusters)
head(lisi_scores)
summary(lisi_scores[, c("cLISI", "iLISI_orig.ident", "iLISI_Patient")])
cLISI iLISI_orig.ident iLISI_Patient
Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.002 1st Qu.:1.013 1st Qu.:1.000
Median :1.053 Median :1.140 Median :1.004
Mean :1.284 Mean :1.372 Mean :1.166
3rd Qu.:1.376 3rd Qu.:1.563 3rd Qu.:1.148
Max. :6.075 Max. :6.376 Max. :4.501
# Per-cluster median cLISI
lisi_scores %>%
group_by(cluster) %>%
summarise(
n = n(),
median_cLISI = round(median(cLISI), 3),
mean_cLISI = round(mean(cLISI), 3),
sd_cLISI = round(sd(cLISI), 3)
) %>%
arrange(cluster)
Interpretation: Each box shows the distribution of cLISI scores for cells within that cluster. Values close to 1 indicate that each cell’s neighbourhood is predominantly composed of cells from the same cluster → high purity. Values approaching n_clusters indicate mixed neighbourhoods → low purity.
p_clisi <- ggplot(lisi_scores, aes(x = cluster, y = cLISI, fill = cluster)) +
geom_boxplot(outlier.size = 0.4, outlier.alpha = 0.4,
width = 0.65, show.legend = FALSE) +
geom_hline(yintercept = 1, linetype = "dashed", colour = "firebrick",
linewidth = 0.7) +
annotate("text", x = Inf, y = 1.05, label = "Ideal purity (cLISI = 1)",
hjust = 1.05, colour = "firebrick", size = 3.2) +
scale_fill_viridis_d(option = "turbo", begin = 0.1, end = 0.9) +
labs(
title = "Cluster Purity (cLISI) per Malignant CD4⁺ T Cell Cluster",
subtitle = "Values close to 1 indicate high cluster purity",
x = "Seurat Cluster",
y = "cLISI Score"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 13),
plot.subtitle = element_text(colour = "grey45", size = 10),
axis.text.x = element_text(angle = 45, hjust = 1)
)
p_clisi
Interpretation: Higher iLISI values indicate that each cell’s neighbourhood contains cells from many different batches → Harmony has successfully mixed cells from different origins. The theoretical maximum equals the number of unique batch labels.
# Pivot to long format for faceted histogram
ilisi_long <- lisi_scores %>%
select(iLISI_orig.ident, iLISI_Patient) %>%
pivot_longer(
cols = everything(),
names_to = "covariate",
values_to = "iLISI"
) %>%
mutate(covariate = recode(covariate,
"iLISI_orig.ident" = "Cell Line",
"iLISI_Patient" = "Patient Origin"
))
# Max possible iLISI per covariate (= number of unique labels)
n_orig.ident <- nlevels(factor(meta$orig.ident))
n_patient <- nlevels(factor(meta$Patient_origin))
ref_df <- data.frame(
covariate = c("Cell Line", "Patient Origin"),
max_lisi = c(n_orig.ident, n_patient)
)
p_ilisi <- ggplot(ilisi_long, aes(x = iLISI, fill = covariate)) +
geom_histogram(bins = 30, colour = "white", show.legend = FALSE) +
geom_vline(data = ref_df,
aes(xintercept = max_lisi),
linetype = "dashed", colour = "firebrick", linewidth = 0.7) +
facet_wrap(~ covariate, scales = "free") +
scale_fill_manual(values = c("Cell Line" = "steelblue",
"Patient Origin" = "darkorange")) +
labs(
title = "Batch Mixing Quality (iLISI) after Harmony Correction",
subtitle = "Dashed red line = theoretical maximum (= number of unique batch labels)",
x = "iLISI Score",
y = "Cell Count"
) +
theme_classic(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 13),
plot.subtitle = element_text(colour = "grey45", size = 10),
strip.text = element_text(face = "bold")
)
p_ilisi
p_clisi / p_ilisi +
plot_annotation(
title = "Harmony Integration Quality — Malignant CD4⁺ T Cells",
caption = "LISI computed on Harmony embedding | cLISI = cluster purity | iLISI = batch mixing",
theme = theme(
plot.title = element_text(face = "bold", size = 15),
plot.caption = element_text(colour = "grey50", size = 9)
)
)
trying URL 'https://cloud.r-project.org/src/contrib/kableExtra_1.4.0.tar.gz'
Content type 'application/x-gzip' length 1824636 bytes (1.7 MB)
==================================================
downloaded 1.7 MB
The downloaded source packages are in
‘/tmp/RtmpbsswCp/downloaded_packages’
| Metric | Ideal value | Poor value | Meaning |
|---|---|---|---|
| cLISI | Close to 1 | >> 1 (mixed clusters) | Harmony preserves biologically distinct clusters |
| iLISI (cell line) | Close to n_orig.idents | Close to 1 (no mixing) | Harmony successfully removes cell-line batch effects |
| iLISI (patient) | Close to n_patients | Close to 1 (no mixing) | Harmony successfully removes patient-of-origin batch effects |
trying URL 'https://cloud.r-project.org/src/contrib/DT_0.34.0.tar.gz'
Content type 'application/x-gzip' length 1664306 bytes (1.6 MB)
==================================================
downloaded 1.6 MB
The downloaded source packages are in
‘/tmp/RtmpbsswCp/downloaded_packages’
trying URL 'https://cloud.r-project.org/src/contrib/gt_1.3.0.tar.gz'
Content type 'application/x-gzip' length 3449540 bytes (3.3 MB)
==================================================
downloaded 3.3 MB
The downloaded source packages are in
‘/tmp/RtmpbsswCp/downloaded_packages’
| Metric | Ideal value | Poor value | Meaning |
|---|---|---|---|
| cLISI | Close to 1 | >> 1 (mixed clusters) | Harmony preserves biologically distinct clusters |
| iLISI (cell line) | Close to n_orig.idents | Close to 1 (no mixing) | Harmony successfully removes cell-line batch effects |
| iLISI (patient) | Close to n_patients | Close to 1 (no mixing) | Harmony successfully removes patient-of-origin batch effects |
# Stacked bar: composition of each cluster by source
ggplot(seurat_obj@meta.data,
aes(x = factor(seurat_clusters), fill = orig.ident)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(x = "Cluster", y = "Proportion", fill = "Source",
title = "Source composition per cluster") +
theme_classic()
sessionInfo()
R version 4.5.2 (2025-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=fr_FR.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=en_GB.UTF-8 LC_PAPER=fr_FR.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Paris
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] patchwork_1.3.2 tidyr_1.3.2 dplyr_1.2.0 ggplot2_4.0.2 Seurat_5.4.0 SeuratObject_5.3.0
[7] sp_2.2-1 lisi_1.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 rstudioapi_0.18.0 jsonlite_2.0.0 magrittr_2.0.4 spatstat.utils_3.2-1
[6] farver_2.1.2 rmarkdown_2.30 fs_2.0.1 vctrs_0.7.1 ROCR_1.0-12
[11] memoise_2.0.1 spatstat.explore_3.7-0 htmltools_0.5.9 usethis_3.2.1 curl_7.0.0
[16] sass_0.4.10 sctransform_0.4.3 parallelly_1.46.1 KernSmooth_2.23-26 bslib_0.10.0
[21] htmlwidgets_1.6.4 desc_1.4.3 ica_1.0-3 plyr_1.8.9 plotly_4.12.0
[26] zoo_1.8-15 cachem_1.1.0 igraph_2.2.2 mime_0.13 lifecycle_1.0.5
[31] pkgconfig_2.0.3 Matrix_1.7-4 R6_2.6.1 fastmap_1.2.0 fitdistrplus_1.2-6
[36] future_1.69.0 shiny_1.13.0 digest_0.6.39 ps_1.9.1 tensor_1.5.1
[41] RSpectra_0.16-2 irlba_2.3.7 pkgload_1.5.0 labeling_0.4.3 progressr_0.18.0
[46] spatstat.sparse_3.1-0 httr_1.4.8 polyclip_1.10-7 abind_1.4-8 compiler_4.5.2
[51] remotes_2.5.0 withr_3.0.2 S7_0.2.1 fastDummies_1.7.5 pkgbuild_1.4.8
[56] MASS_7.3-65 sessioninfo_1.2.3 tools_4.5.2 lmtest_0.9-40 otel_0.2.0
[61] httpuv_1.6.16 future.apply_1.20.2 goftest_1.2-3 glue_1.8.0 callr_3.7.6
[66] nlme_3.1-168 promises_1.5.0 grid_4.5.2 rsconnect_1.7.0 Rtsne_0.17
[71] cluster_2.1.8.2 reshape2_1.4.5 generics_0.1.4 gtable_0.3.6 spatstat.data_3.1-9
[76] data.table_1.18.2.1 spatstat.geom_3.7-0 RcppAnnoy_0.0.23 ggrepel_0.9.7 RANN_2.6.2
[81] pillar_1.11.1 stringr_1.6.0 spam_2.11-3 RcppHNSW_0.6.0 later_1.4.7
[86] splines_4.5.2 lattice_0.22-9 survival_3.8-3 deldir_2.0-4 tidyselect_1.2.1
[91] miniUI_0.1.2 pbapply_1.7-4 knitr_1.51 gridExtra_2.3 scattermore_1.2
[96] xfun_0.56 devtools_2.5.0 matrixStats_1.5.0 stringi_1.8.7 lazyeval_0.2.2
[101] yaml_2.3.12 evaluate_1.0.5 codetools_0.2-20 tibble_3.3.1 cli_3.6.5
[106] uwot_0.2.4 xtable_1.8-8 reticulate_1.45.0 processx_3.8.6 jquerylib_0.1.4
[111] dichromat_2.0-0.1 Rcpp_1.1.1 globals_0.19.0 spatstat.random_3.4-4 png_0.1-8
[116] spatstat.univar_3.1-6 parallel_4.5.2 ellipsis_0.3.2 dotCall64_1.2 listenv_0.10.0
[121] viridisLite_0.4.3 scales_1.4.0 ggridges_0.5.7 purrr_1.2.1 rlang_1.1.7
[126] cowplot_1.2.0