1. load libraries

2. Load Seurat Object


#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("0-R_Objects/SS_CD4_Tcells_Azimuth_Annotated_PBMC10x_final_for_SCT_and_Integration.robj")

All_samples_Merged <- filtered_seurat

All_samples_Merged
An object of class Seurat 
36752 features across 49372 samples within 5 assays 
Active assay: RNA (36601 features, 0 variable features)
 2 layers present: data, counts
 4 other assays present: ADT, prediction.score.celltype.l1, prediction.score.celltype.l2, prediction.score.celltype.l3
 2 dimensional reductions calculated: integrated_dr, ref.umap

Summarizing Seurat Object


# Load necessary libraries
library(Seurat)

# Display basic metadata summary
head(All_samples_Merged@meta.data)

# Check if columns such as `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `nUMI`, `ngene`, and any other necessary columns exist
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- setdiff(required_columns, colnames(All_samples_Merged@meta.data))

if (length(missing_columns) > 0) {
    cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
} else {
    cat("All required columns are present.\n")
}
All required columns are present.
# Check cell counts and features
cat("Number of cells:", ncol(All_samples_Merged), "\n")
Number of cells: 49372 
cat("Number of features:", nrow(All_samples_Merged), "\n")
Number of features: 36601 
# Verify that each `orig.ident` label has the correct number of cells
cat("Cell counts per group:\n")
Cell counts per group:
print(table(All_samples_Merged$orig.ident))

     L1      L2      L3      L4      L5      L6      L7    PBMC PBMC10x 
   5825    5935    6428    6007    6022    5148    5331    5171    3505 
# Check that the cell IDs are unique (which ensures no issues from merging)
if (any(duplicated(colnames(All_samples_Merged)))) {
    cat("Warning: There are duplicated cell IDs.\n")
} else {
    cat("Cell IDs are unique.\n")
}
Cell IDs are unique.
# Check the assay consistency for RNA
DefaultAssay(All_samples_Merged) <- "RNA"

# Check dimensions of the RNA counts layer using the new method
cat("Dimensions of the RNA counts layer:", dim(GetAssayData(All_samples_Merged, layer = "counts")), "\n")
Dimensions of the RNA counts layer: 36601 49372 
cat("Dimensions of the RNA data layer:", dim(GetAssayData(All_samples_Merged, layer = "data")), "\n")
Dimensions of the RNA data layer: 36601 49372 
# Check the ADT assay (optional)
if ("ADT" %in% names(All_samples_Merged@assays)) {
    cat("ADT assay is present.\n")
    cat("Dimensions of the ADT counts layer:", dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")), "\n")
} else {
    cat("ADT assay is not present.\n")
}
ADT assay is present.
Dimensions of the ADT counts layer: 56 49372 

3. QC


# Remove the percent.mito column
All_samples_Merged$percent.mito <- NULL


# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "cell_line"

All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(All_samples_Merged, 
                                                           pattern = "^RP[SL]")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.rb <- replace(as.numeric(All_samples_Merged$percent.rb), is.na(All_samples_Merged$percent.rb), 0)



# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(All_samples_Merged, pattern = "^MT-")

# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.mt <- replace(as.numeric(All_samples_Merged$percent.mt), is.na(All_samples_Merged$percent.mt), 0)





VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))


FeatureScatter(All_samples_Merged, feature1 = "percent.mt", 
                                  feature2 = "percent.rb")


VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)


FeatureScatter(All_samples_Merged, 
               feature1 = "percent.mt", 
               feature2 = "percent.rb") +
        geom_smooth(method = 'lm')


FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
        geom_smooth(method = 'lm')

##FeatureScatter is typically used to visualize feature-feature relationships ##for anything calculated by the object, ##i.e. columns in object metadata, PC scores etc.


FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')


FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

4. Data PREPARATION

options(future.globals.maxSize = 8000 * 1024^2)  # Set to 8000 MiB (about 8 GB)
# Data Preparation for Seurat v5
alldata <- All_samples_Merged

# Split the object by 'orig.ident' for individual dataset processing
alldata.list <- SplitObject(alldata, split.by = "orig.ident")

# Normalize and identify variable features for each dataset in the list
for (i in 1:length(alldata.list)) {
    alldata.list[[i]] <- SCTransform(alldata.list[[i]], vars.to.regress = c("percent.rb","percent.mt", "nCount_RNA"), verbose = FALSE)
    alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst", nfeatures = 3000, verbose = FALSE)
}

# Get the variable genes for each dataset
hvgs_per_dataset <- lapply(alldata.list, VariableFeatures)

# Include the variable genes selected on the whole dataset
hvgs_per_dataset$all <- VariableFeatures(alldata)

# Create a heatmap to show overlap across datasets
temp <- unique(unlist(hvgs_per_dataset))
overlap <- sapply(hvgs_per_dataset, function(x) temp %in% x)
pheatmap::pheatmap(t(overlap * 1), cluster_rows = FALSE,
                   color = c("grey90", "grey20"))


# Select integration features across datasets
hvgs_all <- SelectIntegrationFeatures(alldata.list)
# Exclude specific genes
hvgs_all <- hvgs_all[!grepl("^HLA-|^XIST|^TRBV|^TRAV", hvgs_all)]

hvgs_per_dataset$all_ranks <- hvgs_all

# Generate the overlap heatmap for all selected integration features
temp <- unique(unlist(hvgs_per_dataset))
overlap <- sapply(hvgs_per_dataset, function(x) temp %in% x)
pheatmap::pheatmap(t(overlap * 1), cluster_rows = FALSE,
                   color = c("grey90", "grey20"))


# Scale and PCA on each dataset using selected integration features
alldata.list <- lapply(alldata.list, function(x) {
    x <- RunPCA(x, features = hvgs_all, verbose = FALSE)
})

5. rpca-integration

alldata.int <- FindClusters(alldata.int, resolution = 0.5, verbose = FALSE)



wrap_plots(

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("PCA_integrated"),
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("UMAP_integrated"),
    ncol = 2) + plot_layout(guides = "collect")

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")


    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("PCA integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

clean memory


# remove all objects that will not be used
rm(alldata, filtered_seurat,  alldata.anchors)

gc()
             used    (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells   16126043   861.3   25929034  1384.8   25929034  1384.8
Vcells 3191280457 24347.6 7119864201 54320.3 7102197598 54185.5

Marker Gene Visualization



# Set marker genes specific to requested immune cell types
myfeatures <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

# # Visualize marker genes for RPCA
# FeaturePlot(alldata.int, features = myfeatures, reduction = "pca_rpca", ncol = 4) + 
#   ggtitle("Marker Gene Expression - RPCA Integration") +
#   NoLegend()

# Visualize marker genes for CCA
FeaturePlot(alldata.int, features = myfeatures, reduction = "umap_rpca", ncol = 4) + 
  ggtitle("Marker Gene Expression - rpca Integration") +
  NoLegend()
Warning: Found the following features in more than one assay, excluding the default. We will not include these in the final data frame: CD19, CD79A, MS4A1, CD14, LYZ, FCGR3A, CSF1R, CD68, GNLY, KIR3DL1, KITWarning: Could not find CD34 in the default search locations, found in 'RNA' assay insteadWarning: Could not find CD45RO in the default search locations, found in 'ADT' assay insteadWarning: Could not find CD45RA in the default search locations, found in 'ADT' assay insteadWarning: The following requested variables were not found (10 out of 11 shown): CD19, CD79A, MS4A1, CD14, LYZ, FCGR3A, CSF1R, CD68, GNLY, KIR3DL1

6. CCA-integration


alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:20, reduction = "cca", anchor.features = hvgs_all)

alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:20, new.assay.name = "CCA")

names(alldata.int@assays)

alldata.int@active.assay

DefaultAssay(alldata.int) <- "CCA"

#Run Dimensionality reduction on integrated space
alldata.int <- ScaleData(alldata.int, verbose = TRUE)
alldata.int <- RunPCA(alldata.int, features = hvgs_all, reduction.name = "pca_CCA", do.print = TRUE, pcs.print = 1:5, genes.print = 15, verbose = FALSE)
alldata.int <- RunUMAP(alldata.int, reduction = "pca_CCA", reduction.name = "umap_CCA", dims = 1:20, verbose = FALSE)
alldata.int <- RunTSNE(alldata.int, reduction = "pca_CCA",reduction.name = "tsne_CCA",dims = 1:20, verbose = FALSE)
alldata.int <- FindNeighbors(alldata.int, reduction = "pca_CCA", dims = 1:20, verbose = FALSE)
alldata.int <- FindClusters(alldata.int, resolution = 1.2, verbose = FALSE)



wrap_plots(

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated"),
    ncol = 3) + plot_layout(guides = "collect")

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")
    
    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

clean memory

# remove all objects that will not be used
rm(alldata.anchors)

gc()

7. Harmony-integration

 wrap_plots(

    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated"),
    ncol = 3) + plot_layout(guides = "collect")
 
    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")

    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated")

    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

    
    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated")

    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated")

    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "RNA_snn_res.1.2", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

NA

8. Marker Gene Visualization



# Set marker genes specific to requested immune cell types
myfeatures <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

# # Visualize marker genes for RPCA
# FeaturePlot(alldata.int, features = myfeatures, reduction = "pca_rpca", ncol = 4) + 
#   ggtitle("Marker Gene Expression - RPCA Integration") +
#   NoLegend()

# Visualize marker genes for CCA
FeaturePlot(alldata.int, features = myfeatures, reduction = "umap_rpca", ncol = 4) + 
  ggtitle("Marker Gene Expression - CCA Integration") +
  NoLegend()
Warning: Could not find CD45RO in the default search locations, found in 'ADT' assay insteadWarning: Could not find CD45RA in the default search locations, found in 'ADT' assay instead

9. Save the Seurat object as an Robj file


# save(All_samples_Merged_Harmony_Integrated, file = "All_samples_Merged_Harmony_Integrated.Robj")
---
title: "RPCA-CCA-Harmony Integration of PBMC10x with SCT on samples part1"
author: Nasir Mahmood Abbasi
date: "`r Sys.Date()`"
output:
  #rmdformats::readthedown
  html_notebook:
    toc: true
    toc_float: true
    toc_collapsed: true
---

# 1. load libraries
```{r setup, include=FALSE}
library(Seurat)
library(SeuratObject)
library(SeuratData)
library(patchwork)
library(harmony)
library(ggplot2)
library(reticulate)
library(Azimuth)
library(dplyr)
library(Rtsne)
library(harmony)

```




# 2. Load Seurat Object 
```{r load_seurat}

#Load Seurat Object merged from cell lines and a control(PBMC) after filtration
load("0-R_Objects/SS_CD4_Tcells_Azimuth_Annotated_PBMC10x_final_for_SCT_and_Integration.robj")

All_samples_Merged <- filtered_seurat

All_samples_Merged

```
## Summarizing Seurat Object
```{r summary, fig.height=6, fig.width=10}

# Load necessary libraries
library(Seurat)

# Display basic metadata summary
head(All_samples_Merged@meta.data)

# Check if columns such as `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `nUMI`, `ngene`, and any other necessary columns exist
required_columns <- c("orig.ident", "nCount_RNA", "nFeature_RNA", "nUMI", "ngene")
missing_columns <- setdiff(required_columns, colnames(All_samples_Merged@meta.data))

if (length(missing_columns) > 0) {
    cat("Missing columns:", paste(missing_columns, collapse = ", "), "\n")
} else {
    cat("All required columns are present.\n")
}

# Check cell counts and features
cat("Number of cells:", ncol(All_samples_Merged), "\n")
cat("Number of features:", nrow(All_samples_Merged), "\n")

# Verify that each `orig.ident` label has the correct number of cells
cat("Cell counts per group:\n")
print(table(All_samples_Merged$orig.ident))

# Check that the cell IDs are unique (which ensures no issues from merging)
if (any(duplicated(colnames(All_samples_Merged)))) {
    cat("Warning: There are duplicated cell IDs.\n")
} else {
    cat("Cell IDs are unique.\n")
}

# Check the assay consistency for RNA
DefaultAssay(All_samples_Merged) <- "RNA"

# Check dimensions of the RNA counts layer using the new method
cat("Dimensions of the RNA counts layer:", dim(GetAssayData(All_samples_Merged, layer = "counts")), "\n")
cat("Dimensions of the RNA data layer:", dim(GetAssayData(All_samples_Merged, layer = "data")), "\n")

# Check the ADT assay (optional)
if ("ADT" %in% names(All_samples_Merged@assays)) {
    cat("ADT assay is present.\n")
    cat("Dimensions of the ADT counts layer:", dim(GetAssayData(All_samples_Merged, assay = "ADT", layer = "counts")), "\n")
} else {
    cat("ADT assay is not present.\n")
}


```
# 3. QC
```{r QC, fig.height=6, fig.width=10}

# Remove the percent.mito column
All_samples_Merged$percent.mito <- NULL


# Set identity classes to an existing column in meta data
Idents(object = All_samples_Merged) <- "cell_line"

All_samples_Merged[["percent.rb"]] <- PercentageFeatureSet(All_samples_Merged, 
                                                           pattern = "^RP[SL]")
# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.rb <- replace(as.numeric(All_samples_Merged$percent.rb), is.na(All_samples_Merged$percent.rb), 0)



# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
All_samples_Merged[["percent.mt"]] <- PercentageFeatureSet(All_samples_Merged, pattern = "^MT-")

# Convert 'percent.mt' to numeric, replacing "NaN" with 0
All_samples_Merged$percent.mt <- replace(as.numeric(All_samples_Merged$percent.mt), is.na(All_samples_Merged$percent.mt), 0)





VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                         "nCount_RNA", 
                                         "percent.mt",
                                         "percent.rb"), 
                            ncol = 4, pt.size = 0.1) & 
              theme(plot.title = element_text(size=10))

FeatureScatter(All_samples_Merged, feature1 = "percent.mt", 
                                  feature2 = "percent.rb")

VlnPlot(All_samples_Merged, features = c("nFeature_RNA", 
                                    "nCount_RNA", 
                                    "percent.mt"), 
                                      ncol = 3)

FeatureScatter(All_samples_Merged, 
               feature1 = "percent.mt", 
               feature2 = "percent.rb") +
        geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA") +
        geom_smooth(method = 'lm')

```

##FeatureScatter is typically used to visualize feature-feature relationships
##for anything calculated by the object, 
##i.e. columns in object metadata, PC scores etc.

```{r FC, fig.height=6, fig.width=10}

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "percent.mt")+
  geom_smooth(method = 'lm')

FeatureScatter(All_samples_Merged, 
               feature1 = "nCount_RNA", 
               feature2 = "nFeature_RNA")+
  geom_smooth(method = 'lm')

```


# 4. Data PREPARATION
```{r data, fig.height=8, fig.width=12}
options(future.globals.maxSize = 8000 * 1024^2)  # Set to 8000 MiB (about 8 GB)
# Data Preparation for Seurat v5
alldata <- All_samples_Merged

# Split the object by 'orig.ident' for individual dataset processing
alldata.list <- SplitObject(alldata, split.by = "orig.ident")

# Normalize and identify variable features for each dataset in the list
for (i in 1:length(alldata.list)) {
    alldata.list[[i]] <- SCTransform(alldata.list[[i]], vars.to.regress = c("percent.rb","percent.mt", "nCount_RNA"), verbose = FALSE)
    alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst", nfeatures = 3000, verbose = FALSE)
}

# Get the variable genes for each dataset
hvgs_per_dataset <- lapply(alldata.list, VariableFeatures)

# Include the variable genes selected on the whole dataset
hvgs_per_dataset$all <- VariableFeatures(alldata)

# Create a heatmap to show overlap across datasets
temp <- unique(unlist(hvgs_per_dataset))
overlap <- sapply(hvgs_per_dataset, function(x) temp %in% x)
pheatmap::pheatmap(t(overlap * 1), cluster_rows = FALSE,
                   color = c("grey90", "grey20"))

# Select integration features across datasets
hvgs_all <- SelectIntegrationFeatures(alldata.list)
# Exclude specific genes
hvgs_all <- hvgs_all[!grepl("^HLA-|^XIST|^TRBV|^TRAV", hvgs_all)]

hvgs_per_dataset$all_ranks <- hvgs_all

# Generate the overlap heatmap for all selected integration features
temp <- unique(unlist(hvgs_per_dataset))
overlap <- sapply(hvgs_per_dataset, function(x) temp %in% x)
pheatmap::pheatmap(t(overlap * 1), cluster_rows = FALSE,
                   color = c("grey90", "grey20"))

# Scale and PCA on each dataset using selected integration features
alldata.list <- lapply(alldata.list, function(x) {
    x <- RunPCA(x, features = hvgs_all, verbose = FALSE)
})

```


# 5. rpca-integration
```{r integration-rpca, fig.height=6, fig.width=10}

alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:20, reduction = "rpca", anchor.features = hvgs_all)

alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:20, new.assay.name = "rpca")

names(alldata.int@assays)

alldata.int@active.assay

DefaultAssay(alldata.int) <- "rpca"

#Run Dimensionality reduction on integrated space
alldata.int <- ScaleData(alldata.int, verbose = FALSE)
alldata.int <- RunPCA(alldata.int, features = hvgs_all, reduction.name = "pca_rpca", do.print = TRUE, pcs.print = 1:5, genes.print = 15, verbose = FALSE)
alldata.int <- RunUMAP(alldata.int, reduction = "pca_rpca", reduction.name = "umap_rpca", dims = 1:20, verbose = FALSE)

alldata.int <- FindNeighbors(alldata.int, reduction = "pca_rpca", dims = 1:20, verbose = FALSE)
alldata.int <- FindClusters(alldata.int, resolution = 0.5, verbose = FALSE)



wrap_plots(

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("PCA_integrated"),
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("UMAP_integrated"),
    ncol = 2) + plot_layout(guides = "collect")

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "orig.ident", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

    DimPlot(alldata.int, reduction = "pca_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "rpca_snn_res.0.5", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_rpca", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")

```

# clean memory
```{r cleanMemory1}

# remove all objects that will not be used
rm(alldata, filtered_seurat,  alldata.anchors)

gc()
```

## Marker Gene Visualization
```{r featureplot-rpca, fig.height=8, fig.width=12}


# Set marker genes specific to requested immune cell types
myfeatures <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

# # Visualize marker genes for RPCA
# FeaturePlot(alldata.int, features = myfeatures, reduction = "pca_rpca", ncol = 4) + 
#   ggtitle("Marker Gene Expression - RPCA Integration") +
#   NoLegend()

# Visualize marker genes for CCA
FeaturePlot(alldata.int, features = myfeatures, reduction = "umap_rpca", ncol = 4) + 
  ggtitle("Marker Gene Expression - rpca Integration") +
  NoLegend()



```


# 6. CCA-integration
```{r integration-CCA, fig.height=6, fig.width=10}

alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:20, reduction = "cca", anchor.features = hvgs_all)

alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:20, new.assay.name = "CCA")

names(alldata.int@assays)

alldata.int@active.assay

DefaultAssay(alldata.int) <- "CCA"

#Run Dimensionality reduction on integrated space
alldata.int <- ScaleData(alldata.int, verbose = TRUE)
alldata.int <- RunPCA(alldata.int, features = hvgs_all, reduction.name = "pca_CCA", do.print = TRUE, pcs.print = 1:5, genes.print = 15, verbose = FALSE)
alldata.int <- RunUMAP(alldata.int, reduction = "pca_CCA", reduction.name = "umap_CCA", dims = 1:20, verbose = FALSE)
alldata.int <- RunTSNE(alldata.int, reduction = "pca_CCA",reduction.name = "tsne_CCA",dims = 1:20, verbose = FALSE)
alldata.int <- FindNeighbors(alldata.int, reduction = "pca_CCA", dims = 1:20, verbose = FALSE)
alldata.int <- FindClusters(alldata.int, resolution = 1.2, verbose = FALSE)



wrap_plots(

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated"),
    ncol = 3) + plot_layout(guides = "collect")

    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")
    
    DimPlot(alldata.int, reduction = "pca_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "CCA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_CCA", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")
```


# clean memory
```{r cleanMemory2}
# remove all objects that will not be used
rm(alldata.anchors)

gc()



```


# 7. Harmony-integration
```{r integration-harmony, fig.height=6, fig.width=10}

alldata.int@active.assay = "RNA"
VariableFeatures(alldata.int) = hvgs_all
alldata.int = ScaleData(alldata.int, vars.to.regress = c("percent.mt", "nFeature_RNA"))
alldata.int = RunPCA(alldata.int, reduction.name = "pca_harmony")



alldata.int <- RunHarmony(
  alldata.int,
  group.by.vars = "orig.ident",
  reduction.use = "pca_harmony",
  theta =0.5,
  dims.use = 1:20,
  assay.use = "RNA")


alldata.int <- RunUMAP(alldata.int, dims = 1:20, reduction = "harmony", reduction.name = "umap_harmony")
alldata.int <- RunTSNE(alldata.int, dims = 1:20, reduction = "harmony", reduction.name = "tsne_harmony")
alldata.int <- FindNeighbors(alldata.int, reduction = "pca_harmony", dims = 1:20, verbose = FALSE)
alldata.int <- FindClusters(alldata.int, resolution = 1.2, verbose = FALSE)


 
 
 wrap_plots(

    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated"),

    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated"),
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated"),
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("UMAP integrated"),
    ncol = 3) + plot_layout(guides = "collect")
 
    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "orig.ident", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")
    
    DimPlot(alldata.int, reduction = "pca_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("PCA integrated")
    DimPlot(alldata.int, reduction = "tsne_harmony", group.by = "RNA_snn_res.1.2")+NoAxes()+ggtitle("tSNE integrated")
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "RNA_snn_res.1.2", label = T, label.box = T)+NoAxes()+ggtitle("UMAP integrated")
    DimPlot(alldata.int, reduction = "umap_harmony", group.by = "predicted.celltype.l2")+NoAxes()+ggtitle("UMAP integrated")
    
```








# 8. Marker Gene Visualization
```{r featureplot-harmony, fig.height=10, fig.width=14}


# Set marker genes specific to requested immune cell types
myfeatures <- c("CD19", "CD79A", "MS4A1", # B cells
                "CD14", "LYZ", "FCGR3A", # Monocytes
                "CSF1R", "CD68", # Macrophages
                "NKG7", "GNLY", "KIR3DL1", # NK cells
                "MKI67", # Proliferating NK cells
                "CD34", "KIT", # HSPCs
                "CD3E", "CCR7", # T cells
                "SELL", "CD45RO", # Tnaive, Tcm
                "CD44", "CD45RA") # Tem, Temra

# # Visualize marker genes for RPCA
# FeaturePlot(alldata.int, features = myfeatures, reduction = "pca_rpca", ncol = 4) + 
#   ggtitle("Marker Gene Expression - RPCA Integration") +
#   NoLegend()

# Visualize marker genes for CCA
FeaturePlot(alldata.int, features = myfeatures, reduction = "umap_rpca", ncol = 4) + 
  ggtitle("Marker Gene Expression - CCA Integration") +
  NoLegend()

# Visualize marker genes for Harmony
FeaturePlot(alldata.int, features = myfeatures, reduction = "umap_harmony", ncol = 4) + 
  ggtitle("Marker Gene Expression - Harmony Integration") +
  NoLegend()

```


# 9. Save the Seurat object as an Robj file
```{r saveROBJ}

# save(All_samples_Merged_Harmony_Integrated, file = "All_samples_Merged_Harmony_Integrated.Robj")


```




