This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# setwd("C:/Users/ManXiaohui/Seurat/Reproduce_Publication_ProjectsData_Integration")
# getwd()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Seurat)
## Attaching SeuratObject
library(patchwork)
# <!-- install.packages("tidyverse") -->
# <!-- install.packages("Matrix") -->
# <!-- install.packages("RCurl") -->
# <!-- install.packages("scales") -->
# <!-- install.packages("cowplot") -->
library(Seurat)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.1
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(Matrix)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(scales)
## Warning: package 'scales' was built under R version 4.0.5
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
##
## reduce
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
## Prepare the data
# Load the PBMC dataset
tenx.data <- Read10X(data.dir = "C:/Users/ManXiaohui/Seurat/pbmc3k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19")
seqwell.data <- read.table(paste0("C:/Users/ManXiaohui/Seurat/Reproduce_Publication_Projects/Data_Integration/Mouse_Human/GSM2486333_PBMC.txt"))
# Setup Seurat objects
# since both count matrices have already filtered cells, we do no additional filtering here
seqwell <- CreateSeuratObject(counts = seqwell.data,project = "seqwell", min.cells = 3, min.features = 200)
tenx <- CreateSeuratObject(counts = tenx.data,project = "tenx", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# NORMALIZING THE DATA
seqwell <- NormalizeData(seqwell, normalization.method = "LogNormalize", scale.factor = 10000)
tenx <- NormalizeData(tenx, normalization.method = "LogNormalize", scale.factor = 10000)
# IDENTIFICATION OF HIGHLY VARIABLE FEATURES (FEATURE SELECTION)
seqwell <- FindVariableFeatures(seqwell, selection.method = "vst", nfeatures = 2000)
tenx <- FindVariableFeatures(tenx, selection.method = "vst", nfeatures = 2000)
# Add columns "platform" for metadata
seqwell <- AddMetaData(seqwell, metadata="seqwell", col.name="platform")
tenx <-AddMetaData(tenx, metadata="tenx", col.name="platform")
# Create list for data integration
ptlist <- list(seqwell, tenx)
ptlist
## [[1]]
## An object of class Seurat
## 6713 features across 4296 samples within 1 assay
## Active assay: RNA (6713 features, 2000 variable features)
##
## [[2]]
## An object of class Seurat
## 13714 features across 2700 samples within 1 assay
## Active assay: RNA (13714 features, 2000 variable features)
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = ptlist)
## Perform integration: We then identify anchors using the FindIntegrationAnchors() function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with IntegrateData().
immune.anchors <- FindIntegrationAnchors(object.list = ptlist, anchor.features = features)
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 10143 anchors
## Filtering anchors
## Retained 3213 anchors
# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)
## Merging dataset 2 into 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
# specify that we will perform downstream analysis on the corrected data note that the original
# unmodified data still resides in the 'RNA' assay
DefaultAssay(immune.combined) <- "integrated"
# Run the standard workflow for visualization and clustering
immune.combined <- ScaleData(immune.combined, verbose = FALSE)
immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 10:37:52 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:37:52 Read 6996 rows and found 30 numeric columns
## 10:37:52 Using Annoy for neighbor search, n_neighbors = 30
## 10:37:52 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:37:53 Writing NN index file to temp file C:\Users\MANXIA~1\AppData\Local\Temp\Rtmpy43vxg\file3b0c3d17e18
## 10:37:53 Searching Annoy index using 1 thread, search_k = 3000
## 10:37:55 Annoy recall = 100%
## 10:37:55 Commencing smooth kNN distance calibration using 1 thread
## 10:37:57 Initializing from normalized Laplacian + noise
## 10:37:57 Commencing optimization for 500 epochs, with 326984 positive edges
## 10:38:18 Optimization finished
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
immune.combined <- FindClusters(immune.combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6996
## Number of edges: 362559
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8890
## Number of communities: 11
## Elapsed time: 1 seconds
# Visualization
# Visualization
p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "platform")
p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2
# To visualize the two conditions side-by-side, we can use the split.by argument to show each condition colored by cluster.
DimPlot(immune.combined, reduction = "umap", split.by = "platform")
## Identify conserved cell type markers
# For performing differential expression after integration, we switch back to the original data
# need to run for all clusters to find the feature marker
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 0, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 1, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 2, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 3, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 4, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 5, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 7, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 8, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 9, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#
# <!-- DefaultAssay(immune.combined) <- "RNA" -->
# <!-- nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 10, grouping.var = "platform", verbose = FALSE) -->
# <!-- head(nk.markers) -->
#We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types.
FeaturePlot(immune.combined, features = c("CD3D", "SELL", "S100A4", "CD8A", "GNLY", "MS4A1","FCGR3A", "HSP90AB1", "CCR7"), min.cutoff = "q9")
immune.combined <- RenameIdents(immune.combined,`0`="CD14 Mono",`1`="CD4 Memory T", `2` ="CD4 Naive T", `3` ="CD8 T", `4` ="B", `5` ="NK", `6` ="CD16 Mono",`7`="HS_Stress", `8` ="DC")
DimPlot(immune.combined, label = TRUE)
# The DotPlot() function with the split.by parameter can be useful for viewing conserved cell type markers across conditions, showing both the expression level and the percentage of cells in a cluster expressing any given gene. Here we plot 2-3 strong marker genes for each of our 14 clusters.
Idents(immune.combined) <- factor(Idents(immune.combined), levels = c("HSPC", "Mono/Mk Doublets","pDC", "Eryth", "Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated","CD4 Naive T", "CD4 Memory T"))
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5","CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1", "GPR183", "PPBP", "GNG11", "TSPAN13", "IL3RA", "IGJ", "PRSS57")
# DotPlot(immune.combined, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "platform") + RotatedAxis()
```