# Patient IDs and H5 files (replace with your actual file paths)
# Define paths to your H5 files
h5_files <- list(
"SS1" = "../data/SS/GSM5687767_SZ22raw_feature_bc_matrix.h5", #SS-Patient
"SS2" = "../data/SS/GSM5687769_SZ16raw_feature_bc_matrix.h5", #SS-Patient
"SS3" = "../data/SS/GSM5687772_SZ30raw_feature_bc_matrix.h5", #SS-Patient
"HC1_2" = "../data/Disease_state_Healthy_Control/GSM5538785_SC327raw_feature_bc_matrix.h5", # disease state: Healthy control/tissue: Skin tumor
"HC3_4" = "../data/Disease_state_Healthy_Control/GSM5538786_SC374raw_feature_bc_matrix.h5" # disease state: Healthy control/tissue: Skin tumor
)
# Read and create individual Seurat objects
seurat_list <- list()
seurat_list <- list()
for (patient in names(h5_files)) {
raw_data <- Read10X_h5(h5_files[[patient]])
# If raw_data is a list (multi-modal), extract "Gene Expression"
if (is.list(raw_data)) {
if (!"Gene Expression" %in% names(raw_data)) {
stop(paste("Gene Expression modality not found in patient", patient))
}
raw_counts <- raw_data[["Gene Expression"]]
} else {
# Single modality
raw_counts <- raw_data
}
seu <- CreateSeuratObject(
counts = raw_counts,
min.cells = 3,
min.features = 200,
project = patient
)
seu$patient_id <- patient
seurat_list[[patient]] <- seu
}
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
Genome matrix has multiple modalities, returning a list of matrices for this genome
# Step 2: Merge into combined Seurat object
combined_seu <- merge(
seurat_list[[1]],
y = seurat_list[-1],
add.cell.ids = names(seurat_list),
project = "SS_HC_Merged"
)
# Step 3: Normalization (LogNormalize)
combined_seu <- NormalizeData(combined_seu, normalization.method = "LogNormalize", scale.factor = 10000)
Normalizing layer: counts.SS1
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.SS2
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.SS3
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.HC1_2
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Normalizing layer: counts.HC3_4
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Step 4: Variable features selection
combined_seu <- FindVariableFeatures(combined_seu, selection.method = "vst", nfeatures = 2000)
Finding variable features for layer counts.SS1
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS2
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.SS3
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.HC1_2
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Finding variable features for layer counts.HC3_4
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Step 5: Scaling
combined_seu <- ScaleData(combined_seu)
Centering and scaling data matrix
|
| | 0%
|
|================================================ | 50%
|
|===============================================================================================| 100%
# Step 6: PCA
combined_seu <- RunPCA(combined_seu, features = VariableFeatures(combined_seu))
PC_ 1
Positive: LYZ, IFI30, FTH1, S100A8, FCER1G, FTL, CTSB, AIF1, TYROBP, FCN1
S100A9, CYBB, SOD2, C15orf48, MMP9, CTSS, SPI1, CST3, TYMP, HLA-DRA
CD68, PSAP, SMIM25, CD14, SERPINA1, SAT1, GRINA, NPC2, CTSD, CTSZ
Negative: IL32, CD3E, IFITM1, RPSA, RPS18, CD7, CD3D, LTB, RPL3, CD52
RPS5, TRBC2, RPS6, EEF1A1, RPLP0, GZMA, PPIA, CORO1A, CST7, NPM1
RPL4, EEF1B2, TRAC, CTSW, GZMB, NKG7, SH2D2A, STMN1, HSPA8, CRIP1
PC_ 2
Positive: HBB, LTB, MT-CO1, TYROBP, S100A8, CD7, FTH1, FTL, MT-CO2, LYZ
TRBC2, IFI30, CD52, FCER1G, SELL, IL32, CD3E, GNLY, TMSB4X, TRAC
LCP1, NCF1, S100A9, CORO1A, NKG7, CD3D, GBP5, CST7, RPS4Y1, SOCS1
Negative: ADIRF, TIMP3, IGFBP7, CAV1, SPARC, CALD1, MGP, ID1, GSN, CD34
MYL9, CAVIN3, SERPINH1, IFI27, RAMP2, TSC22D1, CRIP2, TM4SF1, GNG11, CD9
ARHGAP29, RHOB, SELENOP, HSPA1B, HSPA1A, S100A13, PLVAP, PLK2, HES1, LMNA
PC_ 3
Positive: TYMS, GZMA, SRGN, GZMB, PCLAF, PFN1, NKG7, RRM2, ACTB, TYROBP
TMSB4X, GNLY, CCL5, LCP1, CTSW, S100A4, GZMH, CST7, STMN1, TPI1
TK1, TUBA1B, PRF1, CTSC, MKI67, ZWINT, CLSPN, CCL4, UHRF1, UBE2C
Negative: IGHM, MS4A1, LINC00926, CD79A, GAS5, IGKC, CCR7, MEF2C, NIBAN3, CD79B
BACH2, SESN3, EEF1A1, CD19, SPIB, BANK1, GNG7, AFF3, ARHGAP24, RPS18
RUBCNL, PKIG, SNHG29, SELL, CALD1, ID3, ADIRF, BCL11A, IGLC2, TIMP3
PC_ 4
Positive: HLA-DQB1, EEF1A1, LTB, CD83, COTL1, RPS6, RPL4, RPLP0, RPS18, CD74
BIRC3, CD52, HLA-DPB1, RPS5, HLA-DRB1, RPL3, RPSA, HLA-DQA1, EEF1B2, RGS1
JUNB, HLA-DPA1, PIM3, GPR183, NR4A2, ZFP36, HSPA8, MS4A1, IGHM, CD79A
Negative: GNLY, NKG7, CCL5, KLRD1, GZMB, HBB, GZMH, PRF1, GZMA, CCL4
IGFBP7, CTSW, SPARC, KLRC1, CALD1, CST7, KLRK1, MYL9, KLRF1, C1orf21
HOPX, TRDC, MGP, XCL1, COL6A2, CMC1, TIMP3, CLIC3, EOMES, XCL2
PC_ 5
Positive: IGHM, MS4A1, LINC00926, CD79A, IGKC, MEF2C, NIBAN3, SPIB, BANK1, CD79B
CD19, RUBCNL, HVCN1, GNG7, ARHGAP24, BCL11A, AFF3, CD72, HLA-DQA1, IGLC2
HLA-DPB1, FAM30A, BACH2, ADAM28, JCHAIN, RALGPS2, CD83, CD74, LHPP, BLNK
Negative: IL32, S100A4, CD3E, CD3D, S100A6, TNFRSF4, AQP3, MAL, S100A10, S100A11
VIM, TRAC, LGALS1, IL7R, IL2RA, COTL1, CD7, CORO1B, LAT, LIME1
LINC01943, TNFRSF25, LTB, GATA3, LGALS3, FOXP3, LINC02195, TRAV21, TRAT1, SIT1
# Step 7: Elbow Plot to decide number of PCs
ElbowPlot(combined_seu)
# # Step 8: JackStraw (optional but useful for PC significance)
# combined_seu <- JackStraw(combined_seu, num.replicate = 100)
# combined_seu <- ScoreJackStraw(combined_seu, dims = 1:20)
# JackStrawPlot(combined_seu, dims = 1:20)
# Calculate percent.mt for each cell
combined_seu[["percent.mt"]] <- PercentageFeatureSet(combined_seu, pattern = "^MT-")
# Filter cells with too few or too many genes
combined_seu <- subset(combined_seu,
subset = nFeature_RNA > 200 &
nFeature_RNA < 8000 &
percent.mt < 10)
VlnPlot(combined_seu, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3)
Warning: The `slot` argument of `FetchData()` is deprecated as of SeuratObject 5.0.0.
Please use the `layer` argument instead.Warning: `PackageCheck()` was deprecated in SeuratObject 5.0.0.
Please use `rlang::check_installed()` instead.
VlnPlot(combined_seu, features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt",
"percent.rb"),
ncol = 4, pt.size = 0.1) &
theme(plot.title = element_text(size=10))
Warning: The following requested variables were not found: percent.rb
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA") +
geom_smooth(method = 'lm')
##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "percent.mt")+
geom_smooth(method = 'lm')
FeatureScatter(combined_seu,
feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")+
geom_smooth(method = 'lm')
# TEST-1
# given that the output of RunPCA is "pca"
# replace "so" by the name of your seurat object
pct <- combined_seu[["pca"]]@stdev / sum(combined_seu[["pca"]]@stdev) * 100
cumu <- cumsum(pct) # Calculate cumulative percents for each PC
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which((pct[-length(pct)] - pct[-1]) > 0.1), decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%. -> co2
co2
[1] 26
# TEST-2
# get significant PCs
stdv <- combined_seu[["pca"]]@stdev
sum.stdv <- sum(combined_seu[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
cumulative <- cumsum(percent.stdv)
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co2 <- sort(which((percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
min.pc <- min(co1, co2)
min.pc
[1] 26
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
NA
NA
NA
combined_seu <- FindNeighbors(combined_seu, dims = 1:26)
Computing nearest neighbor graph
Computing SNN
combined_seu <- FindClusters(combined_seu, resolution = 0.2)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 33494
Number of edges: 1281442
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9684
Number of communities: 25
Elapsed time: 4 seconds
# Run UMAP
combined_seu <- RunUMAP(combined_seu, dims = 1:26)
10:58:08 UMAP embedding parameters a = 0.9922 b = 1.112
10:58:08 Read 33494 rows and found 26 numeric columns
10:58:08 Using Annoy for neighbor search, n_neighbors = 30
10:58:08 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:58:11 Writing NN index file to temp file /tmp/RtmpzOxnpY/file6965b1c7de4f3
10:58:11 Searching Annoy index using 1 thread, search_k = 3000
10:58:19 Annoy recall = 100%
10:58:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
10:58:22 Initializing from normalized Laplacian + noise (using RSpectra)
10:58:23 Commencing optimization for 200 epochs, with 1484690 positive edges
10:58:23 Using rng type: pcg
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:58:35 Optimization finished
# UMAP plot colored by clusters
DimPlot(combined_seu, reduction = "umap", label = TRUE, pt.size = 0.2) +
ggtitle("UMAP of CD4+ T Cells (Resolution = 0.2)")
DimPlot(combined_seu, reduction = "umap", group.by = "patient_id", label = TRUE, pt.size = 0.6) +
ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")
DimPlot(combined_seu, reduction = "umap", label = TRUE,label.box = T,repel = T, pt.size = 0.6) +
ggtitle("UMAP of Sézary Syndrome CD4+ T Cells")
NA
NA
NA
##. Metadata enhancements
combined_seu$disease_status <- ifelse(grepl("SS", combined_seu$patient_id), "SS", "Healthy")
top_50_up <- read.csv("../top_50_upregulated.csv") # or read.delim("top_50_up.tsv")
top_50_down <- read.csv("../top_50_downregulated.csv")
FeaturePlot(combined_seu,
features = top_50_up$gene[1:10],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[11:20],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[21:30],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[31:40],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_up$gene[41:50],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[1:10],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[11:20],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[21:30],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[31:40],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
FeaturePlot(combined_seu,
features = top_50_down$gene[41:50],
reduction = "umap",
cols = c("lightblue", "red"), # Custom color gradient from light blue to red
label = TRUE)
DimPlot(combined_seu, group.by = "seurat_clusters", label = T, label.box = T, repel = T, reduction = "umap")
NA
NA
# Vector of genes to plot
up_genes <- c("CLIC1", "COX5A","GTSF1", "MAD2L1","MYBL2","MYL6B","NME1","PLK1", "PYCR1", "SLC25A5", "SRI", "TUBA1C", "UBE2T", "YWHAH")
# DotPlot with custom firebrick-red gradient
DotPlot(combined_seu, features = up_genes) +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# DotPlot with custom firebrick-red gradient
DotPlot(combined_seu, features = up_genes, group.by = "patient_id") +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# DotPlot with custom firebrick-red gradient
DotPlot(combined_seu, features = up_genes, group.by = "disease_status") +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Upregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Warning: Scaling data with a low number of groups may produce misleading resultsScale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# Downregulated genes
down_genes <- c("TXNIP", "RASA3", "RIPOR2",
"ZFP36", "ZFP36L1", "ZFP36L2",
"PRMT2", "MAX", "PIK3IP1",
"BTG1", "CDKN1B")
# DotPlot with firebrick color for high expression
DotPlot(combined_seu, features = down_genes) +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# DotPlot with firebrick color for high expression
DotPlot(combined_seu, features = down_genes, group.by = "patient_id") +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# DotPlot with firebrick color for high expression
DotPlot(combined_seu, features = down_genes, group.by = "disease_status") +
RotatedAxis() +
scale_color_gradient2(low = "lightblue", mid = "red", high = "firebrick", midpoint = 1) +
ggtitle("Expression of Downregulated Genes in Sézary Syndrome") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
)
Warning: Scaling data with a low number of groups may produce misleading resultsScale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
# Load required libraries
library(Seurat)
library(dplyr)
library(tibble)
# Join the layers of the RNA assay
combined_seu <- JoinLayers(combined_seu, assay = "RNA")
# Ensure your identity class is set to disease status
Idents(combined_seu) <- "disease_status" # e.g., levels: "SS", "Control"
# Run differential expression between SS vs Control
markers_disease <- FindMarkers(
object = combined_seu,
ident.1 = "SS",
ident.2 = "Healthy",
assay = "RNA",
logfc.threshold = 0,
min.pct = 0,
test.use = "wilcox" # or "MAST" if RNA assay supports it
)
# Save results to CSV
write.csv(markers_disease, file = "DE_SS_vs_Healthy.csv", row.names = TRUE)
# Get log-normalized expression matrix (RNA assay)
expression_data_RNA <- GetAssayData(combined_seu, assay = "RNA", slot = "data")
# Get cell names for each group
ss_cells <- WhichCells(combined_seu, idents = "SS")
healthy_cells <- WhichCells(combined_seu, idents = "Healthy")
# Function to add mean expression per group
calculate_mean_expression <- function(markers, group1_cells, group2_cells, expression_data) {
group1_mean <- rowMeans(expression_data[, group1_cells, drop = FALSE], na.rm = TRUE)
group2_mean <- rowMeans(expression_data[, group2_cells, drop = FALSE], na.rm = TRUE)
markers <- markers %>%
rownames_to_column("gene") %>%
mutate(
mean_expr_SS = group1_mean[gene],
mean_expr_Healthy = group2_mean[gene],
log2FC_manual = log2(mean_expr_SS + 1) - log2(mean_expr_Healthy + 1)
)
return(markers)
}
# Apply the function and save final result
markers_disease_with_mean <- calculate_mean_expression(markers_disease, ss_cells, healthy_cells, expression_data_RNA)
write.csv(markers_disease_with_mean, "DE_SS_vs_Healthy_with_MeanExpr.csv", row.names = FALSE)
saveRDS(combined_seu, file = "../data/ss_Gaydosik_SS_Healthy_Combined.rds")