R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

library(Seurat)
## Attaching SeuratObject
library(SeuratData)
## Registered S3 method overwritten by 'cli':
##   method     from         
##   print.boxx spatstat.geom
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## -- Installed datasets ------------------------------------- SeuratData v0.2.1 --
## v ifnb     3.1.0                        v pbmc3k   3.1.4
## v panc8    3.0.2                        v stxBrain 0.1.1
## -------------------------------------- Key -------------------------------------
## v Dataset loaded successfully
## > Dataset built with a newer version of Seurat than installed
## (?) Unknown version of Seurat installed
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
library(patchwork)
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots

Load the PBMC dataset

tenx.data <- Read10X(data.dir = "C:/Users/ManXiaohui/Seurat/pbmc3k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19")

seqwell.data <- read.table(paste0("C:/Users/ManXiaohui/Seurat/Reproduce_Publication_Projects/Data_Integration/Mouse_Human/GSM2486333_PBMC.txt"))

Setup Seurat objects since both count matrices have already filtered cells, we do no additional filtering here

seqwell <- CreateSeuratObject(counts = seqwell.data,project = "seqwell", min.cells = 3, min.features = 200)
 
tenx <- CreateSeuratObject(counts = tenx.data,project = "tenx", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

NORMALIZING THE DATA

seqwell <- NormalizeData(seqwell, normalization.method = "LogNormalize", scale.factor = 10000)
tenx <- NormalizeData(tenx, normalization.method = "LogNormalize", scale.factor = 10000)

IDENTIFICATION OF HIGHLY VARIABLE FEATURES (FEATURE SELECTION)

seqwell <- FindVariableFeatures(seqwell, selection.method = "vst", nfeatures = 2000)
tenx <- FindVariableFeatures(tenx, selection.method = "vst", nfeatures = 2000)

Add columns “platform” for metadata

seqwell <- AddMetaData(seqwell, metadata="seqwell", col.name="platform")

tenx <-AddMetaData(tenx, metadata="tenx", col.name="platform")

Create list for data integration

ptlist <- list(seqwell, tenx)

ptlist
## [[1]]
## An object of class Seurat 
## 6713 features across 4296 samples within 1 assay 
## Active assay: RNA (6713 features, 2000 variable features)
## 
## [[2]]
## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 2000 variable features)

select features that are repeatedly variable across datasets for integration

features <- SelectIntegrationFeatures(object.list = ptlist)

Perform integration: We then identify anchors using the FindIntegrationAnchors() function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with IntegrateData().

immune.anchors <- FindIntegrationAnchors(object.list = ptlist, anchor.features = features)
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 10143 anchors
## Filtering anchors
##  Retained 3213 anchors

this command creates an ‘integrated’ data assay

immune.combined <- IntegrateData(anchorset = immune.anchors)
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data

Specify that we will perform downstream analysis on the corrected data note that the original unmodified data still resides in the ‘RNA’ assay

DefaultAssay(immune.combined) <- "integrated"

Run the standard workflow for visualization and clustering

immune.combined <- ScaleData(immune.combined, verbose = FALSE)
immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 00:37:14 UMAP embedding parameters a = 0.9922 b = 1.112
## 00:37:14 Read 6996 rows and found 30 numeric columns
## 00:37:14 Using Annoy for neighbor search, n_neighbors = 30
## 00:37:14 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 00:37:16 Writing NN index file to temp file C:\Users\MANXIA~1\AppData\Local\Temp\RtmpAtBoqr\file299079bb4a61
## 00:37:16 Searching Annoy index using 1 thread, search_k = 3000
## 00:37:20 Annoy recall = 100%
## 00:37:21 Commencing smooth kNN distance calibration using 1 thread
## 00:37:22 Initializing from normalized Laplacian + noise
## 00:37:23 Commencing optimization for 500 epochs, with 326984 positive edges
## 00:37:43 Optimization finished
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
immune.combined <- FindClusters(immune.combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6996
## Number of edges: 362559
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8890
## Number of communities: 11
## Elapsed time: 1 seconds
# Visualization
p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "platform")
p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2

# To visualize the two conditions side-by-side, we can use the split.by argument to show each condition colored by cluster.

DimPlot(immune.combined, reduction = "umap", split.by = "platform")

## Identify conserved cell type markers
# For performing differential expression after integration, we switch back to the original data
# need to run for all clusters to find the feature marker
# DefaultAssay(immune.combined) <- "RNA" 
# nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "platform", verbose = FALSE) 
# head(nk.markers) 


#We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types.

FeaturePlot(immune.combined, features = c("CD3D", "SELL", "S100A4", "CD8A", "GNLY", "MS4A1","FCGR3A", "HSP90AB1", "CCR7"), min.cutoff = "q9")

immune.combined <- RenameIdents(immune.combined,`0`="CD14 Mono",`1`="CD4 Memory T", `2` ="CD4 Naive T", `3` ="CD8 T", `4` ="B", `5` ="NK", `6` ="CD16 Mono",`7`="HS_Stress", `8` ="DC") 

DimPlot(immune.combined, label = TRUE)

# The DotPlot() function with the split.by parameter can be useful for viewing conserved cell type markers across conditions, showing both the expression level and the percentage of cells in a cluster expressing any given gene. Here we plot 2-3 strong marker genes for each of our 14 clusters.
# 
# 
# Idents(immune.combined) <- factor(Idents(immune.combined), levels = c("HSPC", "Mono/Mk Doublets","pDC", "Eryth", "Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated","CD4 Naive T", "CD4 Memory T"))
# markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5","CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1", "GPR183", "PPBP", "GNG11", "TSPAN13", "IL3RA", "IGJ", "PRSS57")
# DotPlot(immune.combined, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "platform") + RotatedAxis()