# Patient IDs and H5 files (replace with your actual file paths)
# Define paths to your H5 files
h5_files <- list(
"MF1" = "../data/MF/GSM5538777_MF17raw_feature_bc_matrix.h5", #MF-Patient
"MF2" = "../data/MF/GSM5538778_MF18raw_feature_bc_matrix.h5", #MF-Patient
"MF3" = "../data/MF/GSM5687768_MF19raw_feature_bc_matrix.h5", #MF-Patient
"MF4" = "../data/MF/GSM5538779_MF21raw_feature_bc_matrix.h5", #MF-Patient
"MF5" = "../data/MF/GSM5538780_MF24_raw_feature_bc_matrix.h5", #MF-Patient
"HC1_2" = "../data/Disease_state_Healthy_Control/GSM5538785_SC327raw_feature_bc_matrix.h5", # disease state: Healthy control/tissue: Skin tumor
"HC3_4" = "../data/Disease_state_Healthy_Control/GSM5538786_SC374raw_feature_bc_matrix.h5" # disease state: Healthy control/tissue: Skin tumor
)
# Read and create individual Seurat objects
seurat_list <- list()
seurat_list <- list()
for (patient in names(h5_files)) {
raw_data <- Read10X_h5(h5_files[[patient]])
# If raw_data is a list (multi-modal), extract "Gene Expression"
if (is.list(raw_data)) {
if (!"Gene Expression" %in% names(raw_data)) {
stop(paste("Gene Expression modality not found in patient", patient))
}
raw_counts <- raw_data[["Gene Expression"]]
} else {
# Single modality
raw_counts <- raw_data
}
seu <- CreateSeuratObject(
counts = raw_counts,
min.cells = 3,
min.features = 200,
project = patient
)
seu$patient_id <- patient
seurat_list[[patient]] <- seu
}
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
# Merge into combined Seurat object
combined_seu <- merge(
seurat_list[[1]],
y = seurat_list[-1],
add.cell.ids = names(seurat_list),
project = "MF_HC_Merged"
)
# Using LogNormalize for simplicity and compatibility with reference datasets;
# SCTransform could be used for deeper integration.
# Normalization (LogNormalize)
combined_seu <- NormalizeData(combined_seu, normalization.method = "LogNormalize", scale.factor = 10000)
Normalizing layer: counts.MF1
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.MF2
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.MF3
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.MF4
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.MF5
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.HC1_2
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.HC3_4
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Variable features selection
combined_seu <- FindVariableFeatures(combined_seu, selection.method = "vst", nfeatures = 2000)
Finding variable features for layer counts.MF1
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF2
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF3
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF4
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.MF5
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.HC1_2
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.HC3_4
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Scaling
combined_seu <- ScaleData(combined_seu)
Centering and scaling data matrix
|
| | 0%
|
|================================================ | 50%
|
|===============================================================================================| 100%
# PCA
combined_seu <- RunPCA(combined_seu, features = VariableFeatures(combined_seu))
PC_ 1
Positive: LGALS7B, KRT6A, KRT16, KRT14, KRT10, KRT1, KRTDAP, S100A8, DMKN, SFN
S100A9, PERP, CALML5, LGALS7, KRT5, AQP3, LY6D, S100A7, SBSN, SERPINB4
DSP, IFI27, SPRR1B, LYPD3, S100A14, TACSTD2, KRT6B, JUP, KRT2, FABP5
Negative: IFITM2, VIM, SRGN, TSC22D3, SLC2A3, S100A4, S100A6, IL32, LDHB, LAPTM5
CORO1A, BST2, CD37, ITM2A, GYPC, GMFG, LGALS1, RAC2, EMP3, PTPRC
FYB1, CD52, ITGB2, COTL1, ANXA6, CD3E, NR4A2, CXCR4, LIMD2, TGFBR2
PC_ 2
Positive: IGFBP7, CALD1, SERPING1, SPARC, IFITM3, SOD3, TCF4, CAVIN1, PLAC9, SPARCL1
PLPP3, ADIRF, CXCL12, TIMP3, CST3, MGP, PRRX1, GSN, NNMT, CAVIN3
COL1A2, S100A13, MYL9, COL6A1, LGALS7B, KRT14, CCDC80, CAV1, FSTL1, IFI27
Negative: CORO1A, CD52, CD3E, RAC2, PTPRC, CD37, SRGN, FYB1, LAPTM5, CD3D
CYTIP, CXCR4, CD69, CD2, TRBC2, LIMD2, TSC22D3, RHOH, LAT, HCST
LCP1, IL32, SKAP1, ITGB2, UCP2, CD48, DUSP2, LTB, CD27, DOK2
PC_ 3
Positive: SOD3, PLAC9, MYL9, TIMP3, SPARCL1, PRRX1, IGFBP7, PCOLCE, COL1A2, MFAP4
CALD1, TRBV18, MGP, AEBP1, ITM2A, FXYD1, COL3A1, PLPP3, IGFBP4, SLIT3
CPE, PDGFRB, ISLR, COL6A2, LHFPL6, CXCL12, COL1A1, SPARC, IFITM1, CD3E
Negative: LYZ, IFI30, FCER1G, AIF1, SPI1, C1QA, IGSF6, CD68, TYROBP, C1QB
MS4A6A, SERPINA1, HLA-DMB, LGALS2, C1QC, LST1, CYBB, HLA-DQA1, CSF1R, CD14
LILRB4, CTSZ, HLA-DQB1, HLA-DPB1, CALHM6, CLEC10A, HLA-DMA, PILRA, FCGR2A, HLA-DPA1
PC_ 4
Positive: ECSCR, EGFL7, PLVAP, EMCN, ADGRL4, CLEC14A, RAMP2, PECAM1, CLDN5, VWF
SOX18, ESAM, TSPAN7, CD34, RAMP3, CCL14, CDH5, NPDC1, ACKR1, TM4SF1
BCAM, MYCT1, GNG11, CYYR1, SOX17, ERG, CD93, PCAT19, ROBO4, PTPRB
Negative: LUM, MFAP4, MXRA8, COL1A2, DPT, CTSK, COL3A1, TMEM176B, SOD3, TMEM176A
COL1A1, COL6A2, ISLR, CCDC80, DCN, PCOLCE, CD248, FBLN1, SERPINF1, OLFML3
COL6A1, PRRX1, PTGDS, PLAC9, CFD, VCAN, TWIST2, AEBP1, THY1, LRP1
PC_ 5
Positive: NEAT1, AC020916.1, FGFR3, SPINT2, PTPRF, EVPL, DUOX1, NECTIN1, IFFO2, CYP3A5
EPHA2, SLC2A1, TFAP2A, MIR222HG, IMPA2, EPHB6, CD3E, GPC1, LTB4R, IRX2
CA12, PRRG4, CEBPB, CXADR, AHNAK2, PKP1, ERBB3, HMGB2, ITPKC, EGFR
Negative: EGFL7, PLVAP, ECSCR, RAMP2, PECAM1, EMCN, CLEC14A, ADGRL4, CD34, A2M
GNG11, CCL14, ENG, MCTP1, VWF, ESAM, TSPAN7, RAMP3, CLDN5, AQP1
ACKR1, RNASE1, HLA-DPA1, CD93, HLA-DRA, HLA-DPB1, SPARCL1, CDH5, SOX18, JAM2
# Elbow Plot to decide number of PCs
ElbowPlot(combined_seu)
# # Step 8: JackStraw (optional but useful for PC significance)
# combined_seu <- JackStraw(combined_seu, num.replicate = 100)
# combined_seu <- ScoreJackStraw(combined_seu, dims = 1:20)
# JackStrawPlot(combined_seu, dims = 1:20)
# Calculate percent.mt for each cell
combined_seu[["percent.mt"]] <- PercentageFeatureSet(combined_seu, pattern = "^MT-")
# Filter cells with too few or too many genes
combined_seu <- subset(combined_seu,
subset = nFeature_RNA > 200 &
nFeature_RNA < 8000 &
percent.mt < 10)
VlnPlot(combined_seu, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3)
Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.
VlnPlot(combined_seu, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rb"),
ncol = 4, pt.size = 0.1) &
theme(plot.title = element_text(size=10))
Warning: The following requested variables were not found: percent.rb
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')
##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "percent.mt")+
geom_smooth(method = 'lm')
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')
# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object
pct <- combined_seu[["pca"]]@stdev / sum(combined_seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 17
# TEST-2
# get significant PCs
stdv <- combined_seu[["pca"]]@stdev
sum.stdv <- sum(combined_seu[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 17
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
NA
NA
NA
combined_seu <- FindNeighbors(combined_seu, dims = 1:17)
Computing nearest neighbor graph
Computing SNN
combined_seu <- FindClusters(combined_seu, resolution = 0.2)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 60239
Number of edges: 1451156
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9536
Number of communities: 14
Elapsed time: 10 seconds
1 singletons identified. 13 final clusters.
# Run UMAP
combined_seu <- RunUMAP(combined_seu, dims = 1:17)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session14:57:24 UMAP embedding parameters a = 0.9922 b = 1.112
14:57:24 Read 60239 rows and found 17 numeric columns
14:57:24 Using Annoy for neighbor search, n_neighbors = 30
14:57:24 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:57:29 Writing NN index file to temp file /tmp/Rtmp59uyjN/file6c080219bb1b9
14:57:29 Searching Annoy index using 1 thread, search_k = 3000
14:57:47 Annoy recall = 100%
14:57:48 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
14:57:50 Initializing from normalized Laplacian + noise (using RSpectra)
14:57:53 Commencing optimization for 200 epochs, with 2713556 positive edges
14:57:53 Using rng type: pcg
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
14:58:14 Optimization finished
# UMAP plot colored by clusters
DimPlot(combined_seu, reduction = "umap", label = TRUE, pt.size = 0.2) +
ggtitle("UMAP of CD4+ T Cells (Resolution = 0.2)")
DimPlot(combined_seu, reduction = "umap", group.by = "patient_id", label = TRUE, pt.size = 0.6) +
ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")
DimPlot(combined_seu, reduction = "umap", label = TRUE,label.box = T,repel = T, pt.size = 0.6) +
ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")
NA
NA
NA
##. Metadata enhancements
combined_seu$disease_status <- ifelse(grepl("MF", combined_seu$patient_id), "MF", "Healthy")
top_50_up <- read.csv("../top_50_upregulated.csv") # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("../top_50_downregulated.csv")
FeaturePlot(combined_seu,
features = top_50_up$gene[1:10],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[11:20],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[21:30],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[31:40],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[41:50],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[1:10],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[11:20],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[21:30],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[31:40],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)