00. Set-up

# rm(list=ls())
library(dplyr)
library(Seurat)
library(patchwork)

setwd("/Users/Shared/Data/runAnalysis/singleSell/230304_Angio_2")
cellrangers=dir(paste0(getwd(),"/03.outs_cellranger"))

a. import data

data_ori = Read10X(data.dir=paste0(getwd(),"/03.outs_cellranger/",cellrangers))
data <- CreateSeuratObject(counts = data_ori, project = cellrangers, min.cells = 3, min.features = 200)
data

## An object of class Seurat 
## 18475 features across 3535 samples within 1 assay 
## Active assay: RNA (18475 features, 0 variable features)

01. Standard pre-processing workflow

a. QC and selecting cells for further analysis

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^MT-")

# Visualize QC metrics as a violin plot
VlnPlot(data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# Filter
data_sub = subset(data, subset = nFeature_RNA > 200 & percent.mt < 20)
VlnPlot(data_sub, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# data <- subset(data_ori, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

02. Normalizing the data

data <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000)

data <- NormalizeData(data)

03. Identification of highly variable features (feature selection)

data <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(data), 10)
top10

##  [1] "CCL8"    "HOPX"    "ALDH1A3" "DUSP2"   "CXCL8"   "GPC3"    "TAGLN"  
##  [8] "TAC1"    "ADH1B"   "CXCL1"

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(data)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

plot1 + plot2

## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

04. Scaling the data

all.genes <- rownames(data)
data <- ScaleData(data, features = all.genes)

## Centering and scaling data matrix

05. Perform linear dimensional reduction

data <- RunPCA(data, features = VariableFeatures(object = data))

## PC_ 1 
## Positive:  COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM, LOXL2, VASN, COL4A1, PDGFA, COL15A1 
##     FAM20A, CXXC5, AEBP1, CDH23, DUSP6, COL5A3, SPRY2, HTRA1, IFI44L, LIPA 
##     CCDC14, COL11A1, STAT1, ZFP36, NEGR1, UNC5B, BGN, GOLGA8B, CCNL1, DYNC1I1 
## Negative:  MT2A, CCL5, MT-ND6, G0S2, NMB, CCL3L1, CXCL8, CCL20, CXCL5, LINC02154 
##     CDH6, DPPA4, AC108058.1, MMP9, IL13RA2, AC093249.2, HTR2C, CASZ1, AC022113.1, CA8 
##     LINC02126, KRT34, AC243772.2, EDN1, AC009121.2, AC090527.3, AL049820.1, PNLIPRP3, AC005064.1, TFPI2 
## PC_ 2 
## Positive:  GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20, GOLGA8A, USP53, COL8A1, MTSS1, CPNE7 
##     MIR222HG, KCNQ1OT1, NEAT1, CCNL1, ABCA13, AL354733.3, AC090673.1, B3GALNT1, NCAM1, SULT1C2 
##     PGM2L1, TTLL7, TGM2, LDLR, CCDC14, NRP2, SCG2, HDAC9, SEC14L1, MEG8 
## Negative:  FAM43B, ITGA11, ALX4, RAMP1, IGFBP3, LPL, LINGO1, DCLK3, RNF165, RPS4Y1 
##     TRIB3, BGN, SPOCK1, SAMD5, IGF1, LAMP3, UNC5B, COL26A1, ADM2, SORCS1 
##     MAF, CXCL13, LMO2, MFAP5, MMP27, HS6ST2, COL15A1, NDUFA4L2, EDNRA, ACSM5 
## PC_ 3 
## Positive:  UCP2, OLFM2, SRGN, MT2A, MAFB, ARL4C, CYBA, CRABP2, RAMP2, TGM2 
##     PTGES, DKK3, JPT1, ZNF676, TXN, NINJ1, SFRP1, CITED2, CAV1, S100A6 
##     C3orf70, IGFBP6, PRSS23, NUPR1, ENPP2, COTL1, STMN1, ERG28, OMG, FXYD5 
## Negative:  GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3, KCNQ1OT1, AL139147.1, CCNL1, ADAMTS6, CCDC14 
##     MIR503HG, MIR222HG, AC092807.3, NEAT1, AC020916.1, MPV17L, LINC00624, FAM71F2, MARCH1, FSIP2 
##     NABP1, MIR29B2CHG, EID3, TNFRSF25, PURPL, CNTN3, SGK494, AC016831.1, SLC1A2, HMCN1 
## PC_ 4 
## Positive:  NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3, CILP, COL15A1, COL12A1, SCN7A, OLFML3 
##     SOX2, SAMD5, TXNIP, ATP1A2, OMD, CMKLR1, COL8A1, NDUFA4L2, DDIT4, LINC01579 
##     DYNC1I1, DKK3, IGF1, PDZD2, PDGFA, HEY1, B3GALNT1, ELN, PCLO, OGN 
## Negative:  SFRP2, SNTG1, LINC00944, XG, CA12, MMP27, HGF, MX2, TLL2, HRK 
##     POU2F2, WISP2, PAPPA, LEPR, SLAMF8, FILIP1, DPP4, SOD3, EPS8L2, CXCL12 
##     FER1L6, MEGF6, CHI3L1, AKR1C3, TNXB, SOD2, IL32, CCL8, IFIT3, LAMA1 
## PC_ 5 
## Positive:  RNF165, COL26A1, HS6ST2, EXPH5, ALX4, DSC1, ANKS1B, SORCS1, PCSK1, HMX1 
##     RTN4R, DCLK3, MMP27, LINGO1, GUCY1A1, CXCL13, FXYD6, HOPX, LAMP3, RELN 
##     SERPINI1, CH25H, MAF, KIAA1549L, PART1, SGCG, ARL4C, SCN4B, IGF1, B3GALNT1 
## Negative:  EBF2, GPX3, PDGFA, CILP, SCARA5, GPC3, NEGR1, EFHD1, BGN, ADH1B 
##     SMOC2, MATN3, PRG4, AC002546.1, NTRK3, C3orf70, FMO3, C7, SOSTDC1, NDUFA4L2 
##     BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3

print(data[["pca"]], dims = 1:5, nfeatures = 5)

## PC_ 1 
## Positive:  COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM 
## Negative:  MT2A, CCL5, MT-ND6, G0S2, NMB 
## PC_ 2 
## Positive:  GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20 
## Negative:  FAM43B, ITGA11, ALX4, RAMP1, IGFBP3 
## PC_ 3 
## Positive:  UCP2, OLFM2, SRGN, MT2A, MAFB 
## Negative:  GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3 
## PC_ 4 
## Positive:  NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3 
## Negative:  SFRP2, SNTG1, LINC00944, XG, CA12 
## PC_ 5 
## Positive:  RNF165, COL26A1, HS6ST2, EXPH5, ALX4 
## Negative:  EBF2, GPX3, PDGFA, CILP, SCARA5

VizDimLoadings(data, dims = 1:2, reduction = "pca")

DimPlot(data, reduction = "pca")

DimHeatmap(data, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(data, dims = 1:15, cells = 500, balanced = TRUE)

06. Determine the ‘dimensionality’ of the dataset

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time

##data = JackStraw(data, num.replicate=100, dims=100) ; data = ScoreJackStraw(data, dims=1:100)

data <- JackStraw(data, num.replicate = 100)
data <- ScoreJackStraw(data, dims = 1:20)

JackStrawPlot(data, dims = 1:15)

## Warning: Removed 21003 rows containing missing values (`geom_point()`).

ElbowPlot(data)

07. Cluster the cells

#install.packages("Matrix")
#update.packages("Matrix")
data <- FindNeighbors(data, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

data <- FindClusters(data, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3535
## Number of edges: 106315
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7482
## Number of communities: 6
## Elapsed time: 0 seconds

# Look at cluster IDs of the first 5 cells
head(Idents(data), 5)

## AAACCCAAGCAAACAT-1 AAACCCAAGTTACGAA-1 AAACCCACACAAGCCC-1 AAACCCACATGGCCCA-1 
##                  0                  2                  0                  0 
## AAACCCAGTTGGGATG-1 
##                  1 
## Levels: 0 1 2 3 4 5

08. Run non-linear dimensional reduction (UMAP/tSNE)

# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
data <- RunUMAP(data, dims = 1:10)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 22:56:54 UMAP embedding parameters a = 0.9922 b = 1.112

## 22:56:54 Read 3535 rows and found 10 numeric columns

## 22:56:54 Using Annoy for neighbor search, n_neighbors = 30

## 22:56:54 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 22:56:54 Writing NN index file to temp file /var/folders/z3/9m4zqbk57cbbr6n1rjvmh3980000gp/T//RtmpSPIceE/file1309214b6b67
## 22:56:54 Searching Annoy index using 1 thread, search_k = 3000
## 22:56:55 Annoy recall = 100%
## 22:56:55 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:56:57 Initializing from normalized Laplacian + noise (using irlba)
## 22:56:57 Commencing optimization for 500 epochs, with 154610 positive edges
## 22:57:01 Optimization finished

# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(data, reduction = "umap")

saveRDS(data, file = "./04.seurat/data_tutorial.rds")

09. Finding differentially expressed features (cluster biomarkers)

# find all markers of cluster 2
cluster2.markers <- FindMarkers(data, ident.1 = 2, min.pct = 0.25)

## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session

head(cluster2.markers, n = 5)

##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## CCL5   0.000000e+00  4.3207774 0.725 0.080 0.000000e+00
## MT-CO1 3.253174e-41 -1.2195130 0.658 0.872 6.010238e-37
## MT-ND5 3.479486e-36 -1.6644695 0.152 0.452 6.428350e-32
## MT2A   4.926631e-34  0.4606525 0.987 0.953 9.101951e-30
## FTH1   1.408766e-33  0.2870320 1.000 1.000 2.602695e-29

# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(data, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)

##                    p_val avg_log2FC pct.1 pct.2     p_val_adj
## AC090673.1 4.643933e-117  1.3936112  0.64 0.019 8.579667e-113
## AC008735.2 3.893138e-114  0.4528391  0.36 0.001 7.192572e-110
## AL139147.1 3.973684e-113  0.6124955  0.44 0.005 7.341382e-109
## MIR34AHG   1.902858e-110  1.1110842  0.80 0.041 3.515531e-106
## TTN        3.786874e-108  0.5174766  0.40 0.004 6.996251e-104

# find markers for every cluster compared to all remaining cells, report only the positive
# ones
data.markers <- FindAllMarkers(data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

data.markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)

## # A tibble: 12 × 7
## # Groups:   cluster [6]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
##  1 1.32e- 10      0.836 0.442 0.383 2.44e-  6 0       MT-ND5   
##  2 1.62e-  3      0.768 0.515 0.517 1   e+  0 0       MTRNR2L12
##  3 1.55e-  4      0.310 0.202 0.299 1   e+  0 1       CTHRC1   
##  4 3.56e- 12      0.309 0.131 0.255 6.59e-  8 1       ANAPC11  
##  5 0              4.32  0.725 0.08  0         2       CCL5     
##  6 8.90e-  6      1.19  0.273 0.217 1.64e-  1 2       ISG15    
##  7 1.46e-145      1.30  0.994 0.151 2.69e-141 3       CYP1B1   
##  8 1.38e-130      1.29  1     0.162 2.55e-126 3       JUN      
##  9 1.63e-220      1.82  0.902 0.062 3.01e-216 4       IGFBP5   
## 10 1.16e- 57      1.70  0.861 0.194 2.14e- 53 4       POSTN    
## 11 1.26e- 57      5.15  1     0.234 2.32e- 53 5       NEAT1    
## 12 5.28e- 34      4.51  1     0.82  9.76e- 30 5       MALAT1

cluster0.markers <- FindMarkers(data, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)

VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"))

# you can plot raw counts as well
VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"), slot = "counts", log = TRUE)

FeaturePlot(data, features = c("BST2", "CCL13", "NEXN", "TMEM132C", "APCDD1", "NEURL1B", "LINC01579", "TAGLN", "ANGPTL5"))

data.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(data, features = top10$gene) + NoLegend()

10. Assigning cell type identity to clusters

print(data[[“pca”]], dims = 1:5, nfeatures = 5) BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3 PC_ 1 Positive: COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM Negative: MT2A, CCL5, MT-ND6, G0S2, NMB PC_ 2 Positive: GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20 Negative: FAM43B, ITGA11, ALX4, RAMP1, IGFBP3 PC_ 3 Positive: UCP2, OLFM2, SRGN, MT2A, MAFB Negative: GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3 PC_ 4 Positive: NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3 Negative: SFRP2, SNTG1, LINC00944, XG, CA12 PC_ 5 Positive: RNF165, COL26A1, HS6ST2, EXPH5, ALX4 Negative: EBF2, GPX3, PDGFA, CILP, SCARA5

# new.cluster.ids <- c("", "", "", "", "")
# names(new.cluster.ids) <- levels(data)
# data <- RenameIdents(data, new.cluster.ids)
DimPlot(data, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

saveRDS(data, file = "./04.seurat/data_final.rds")

single-cell_R_seurat(satijalab)

Joy, Son

2023-03-16

00. Set-up

a. import data

01. Standard pre-processing workflow

a. QC and selecting cells for further analysis

02. Normalizing the data

03. Identification of highly variable features (feature selection)

04. Scaling the data

05. Perform linear dimensional reduction

06. Determine the ‘dimensionality’ of the dataset

07. Cluster the cells

08. Run non-linear dimensional reduction (UMAP/tSNE)

09. Finding differentially expressed features (cluster biomarkers)

10. Assigning cell type identity to clusters