00. Set-up

# rm(list=ls())
library(dplyr)
library(Seurat)
library(patchwork)

setwd("/Users/Shared/Data/runAnalysis/singleSell/230304_Angio_2")
cellrangers=dir(paste0(getwd(),"/03.outs_cellranger"))

a. import data

data_ori = Read10X(data.dir=paste0(getwd(),"/03.outs_cellranger/",cellrangers))
data <- CreateSeuratObject(counts = data_ori, project = cellrangers, min.cells = 3, min.features = 200)
data
## An object of class Seurat 
## 18475 features across 3535 samples within 1 assay 
## Active assay: RNA (18475 features, 0 variable features)

01. Standard pre-processing workflow

a. QC and selecting cells for further analysis

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# Filter
data_sub = subset(data, subset = nFeature_RNA > 200 & percent.mt < 20)
VlnPlot(data_sub, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# data <- subset(data_ori, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

02. Normalizing the data

data <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000)

data <- NormalizeData(data)

03. Identification of highly variable features (feature selection)

data <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(data), 10)
top10
##  [1] "CCL8"    "HOPX"    "ALDH1A3" "DUSP2"   "CXCL8"   "GPC3"    "TAGLN"  
##  [8] "TAC1"    "ADH1B"   "CXCL1"
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(data)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

04. Scaling the data

all.genes <- rownames(data)
data <- ScaleData(data, features = all.genes)
## Centering and scaling data matrix

05. Perform linear dimensional reduction

data <- RunPCA(data, features = VariableFeatures(object = data))
## PC_ 1 
## Positive:  COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM, LOXL2, VASN, COL4A1, PDGFA, COL15A1 
##     FAM20A, CXXC5, AEBP1, CDH23, DUSP6, COL5A3, SPRY2, HTRA1, IFI44L, LIPA 
##     CCDC14, COL11A1, STAT1, ZFP36, NEGR1, UNC5B, BGN, GOLGA8B, CCNL1, DYNC1I1 
## Negative:  MT2A, CCL5, MT-ND6, G0S2, NMB, CCL3L1, CXCL8, CCL20, CXCL5, LINC02154 
##     CDH6, DPPA4, AC108058.1, MMP9, IL13RA2, AC093249.2, HTR2C, CASZ1, AC022113.1, CA8 
##     LINC02126, KRT34, AC243772.2, EDN1, AC009121.2, AC090527.3, AL049820.1, PNLIPRP3, AC005064.1, TFPI2 
## PC_ 2 
## Positive:  GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20, GOLGA8A, USP53, COL8A1, MTSS1, CPNE7 
##     MIR222HG, KCNQ1OT1, NEAT1, CCNL1, ABCA13, AL354733.3, AC090673.1, B3GALNT1, NCAM1, SULT1C2 
##     PGM2L1, TTLL7, TGM2, LDLR, CCDC14, NRP2, SCG2, HDAC9, SEC14L1, MEG8 
## Negative:  FAM43B, ITGA11, ALX4, RAMP1, IGFBP3, LPL, LINGO1, DCLK3, RNF165, RPS4Y1 
##     TRIB3, BGN, SPOCK1, SAMD5, IGF1, LAMP3, UNC5B, COL26A1, ADM2, SORCS1 
##     MAF, CXCL13, LMO2, MFAP5, MMP27, HS6ST2, COL15A1, NDUFA4L2, EDNRA, ACSM5 
## PC_ 3 
## Positive:  UCP2, OLFM2, SRGN, MT2A, MAFB, ARL4C, CYBA, CRABP2, RAMP2, TGM2 
##     PTGES, DKK3, JPT1, ZNF676, TXN, NINJ1, SFRP1, CITED2, CAV1, S100A6 
##     C3orf70, IGFBP6, PRSS23, NUPR1, ENPP2, COTL1, STMN1, ERG28, OMG, FXYD5 
## Negative:  GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3, KCNQ1OT1, AL139147.1, CCNL1, ADAMTS6, CCDC14 
##     MIR503HG, MIR222HG, AC092807.3, NEAT1, AC020916.1, MPV17L, LINC00624, FAM71F2, MARCH1, FSIP2 
##     NABP1, MIR29B2CHG, EID3, TNFRSF25, PURPL, CNTN3, SGK494, AC016831.1, SLC1A2, HMCN1 
## PC_ 4 
## Positive:  NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3, CILP, COL15A1, COL12A1, SCN7A, OLFML3 
##     SOX2, SAMD5, TXNIP, ATP1A2, OMD, CMKLR1, COL8A1, NDUFA4L2, DDIT4, LINC01579 
##     DYNC1I1, DKK3, IGF1, PDZD2, PDGFA, HEY1, B3GALNT1, ELN, PCLO, OGN 
## Negative:  SFRP2, SNTG1, LINC00944, XG, CA12, MMP27, HGF, MX2, TLL2, HRK 
##     POU2F2, WISP2, PAPPA, LEPR, SLAMF8, FILIP1, DPP4, SOD3, EPS8L2, CXCL12 
##     FER1L6, MEGF6, CHI3L1, AKR1C3, TNXB, SOD2, IL32, CCL8, IFIT3, LAMA1 
## PC_ 5 
## Positive:  RNF165, COL26A1, HS6ST2, EXPH5, ALX4, DSC1, ANKS1B, SORCS1, PCSK1, HMX1 
##     RTN4R, DCLK3, MMP27, LINGO1, GUCY1A1, CXCL13, FXYD6, HOPX, LAMP3, RELN 
##     SERPINI1, CH25H, MAF, KIAA1549L, PART1, SGCG, ARL4C, SCN4B, IGF1, B3GALNT1 
## Negative:  EBF2, GPX3, PDGFA, CILP, SCARA5, GPC3, NEGR1, EFHD1, BGN, ADH1B 
##     SMOC2, MATN3, PRG4, AC002546.1, NTRK3, C3orf70, FMO3, C7, SOSTDC1, NDUFA4L2 
##     BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3
print(data[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM 
## Negative:  MT2A, CCL5, MT-ND6, G0S2, NMB 
## PC_ 2 
## Positive:  GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20 
## Negative:  FAM43B, ITGA11, ALX4, RAMP1, IGFBP3 
## PC_ 3 
## Positive:  UCP2, OLFM2, SRGN, MT2A, MAFB 
## Negative:  GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3 
## PC_ 4 
## Positive:  NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3 
## Negative:  SFRP2, SNTG1, LINC00944, XG, CA12 
## PC_ 5 
## Positive:  RNF165, COL26A1, HS6ST2, EXPH5, ALX4 
## Negative:  EBF2, GPX3, PDGFA, CILP, SCARA5
VizDimLoadings(data, dims = 1:2, reduction = "pca")

DimPlot(data, reduction = "pca")

DimHeatmap(data, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(data, dims = 1:15, cells = 500, balanced = TRUE)

06. Determine the ‘dimensionality’ of the dataset

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time

##data = JackStraw(data, num.replicate=100, dims=100) ; data = ScoreJackStraw(data, dims=1:100)

data <- JackStraw(data, num.replicate = 100)
data <- ScoreJackStraw(data, dims = 1:20)
JackStrawPlot(data, dims = 1:15)
## Warning: Removed 21003 rows containing missing values (`geom_point()`).

ElbowPlot(data)

07. Cluster the cells

#install.packages("Matrix")
#update.packages("Matrix")
data <- FindNeighbors(data, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
data <- FindClusters(data, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3535
## Number of edges: 106315
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7482
## Number of communities: 6
## Elapsed time: 0 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(data), 5)
## AAACCCAAGCAAACAT-1 AAACCCAAGTTACGAA-1 AAACCCACACAAGCCC-1 AAACCCACATGGCCCA-1 
##                  0                  2                  0                  0 
## AAACCCAGTTGGGATG-1 
##                  1 
## Levels: 0 1 2 3 4 5

08. Run non-linear dimensional reduction (UMAP/tSNE)

# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
data <- RunUMAP(data, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 22:56:54 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:56:54 Read 3535 rows and found 10 numeric columns
## 22:56:54 Using Annoy for neighbor search, n_neighbors = 30
## 22:56:54 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:56:54 Writing NN index file to temp file /var/folders/z3/9m4zqbk57cbbr6n1rjvmh3980000gp/T//RtmpSPIceE/file1309214b6b67
## 22:56:54 Searching Annoy index using 1 thread, search_k = 3000
## 22:56:55 Annoy recall = 100%
## 22:56:55 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:56:57 Initializing from normalized Laplacian + noise (using irlba)
## 22:56:57 Commencing optimization for 500 epochs, with 154610 positive edges
## 22:57:01 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(data, reduction = "umap")

saveRDS(data, file = "./04.seurat/data_tutorial.rds")

09. Finding differentially expressed features (cluster biomarkers)

# find all markers of cluster 2
cluster2.markers <- FindMarkers(data, ident.1 = 2, min.pct = 0.25)
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session
head(cluster2.markers, n = 5)
##               p_val avg_log2FC pct.1 pct.2    p_val_adj
## CCL5   0.000000e+00  4.3207774 0.725 0.080 0.000000e+00
## MT-CO1 3.253174e-41 -1.2195130 0.658 0.872 6.010238e-37
## MT-ND5 3.479486e-36 -1.6644695 0.152 0.452 6.428350e-32
## MT2A   4.926631e-34  0.4606525 0.987 0.953 9.101951e-30
## FTH1   1.408766e-33  0.2870320 1.000 1.000 2.602695e-29
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(data, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
##                    p_val avg_log2FC pct.1 pct.2     p_val_adj
## AC090673.1 4.643933e-117  1.3936112  0.64 0.019 8.579667e-113
## AC008735.2 3.893138e-114  0.4528391  0.36 0.001 7.192572e-110
## AL139147.1 3.973684e-113  0.6124955  0.44 0.005 7.341382e-109
## MIR34AHG   1.902858e-110  1.1110842  0.80 0.041 3.515531e-106
## TTN        3.786874e-108  0.5174766  0.40 0.004 6.996251e-104
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
data.markers <- FindAllMarkers(data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
data.markers %>%
    group_by(cluster) %>%
    slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 12 × 7
## # Groups:   cluster [6]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
##  1 1.32e- 10      0.836 0.442 0.383 2.44e-  6 0       MT-ND5   
##  2 1.62e-  3      0.768 0.515 0.517 1   e+  0 0       MTRNR2L12
##  3 1.55e-  4      0.310 0.202 0.299 1   e+  0 1       CTHRC1   
##  4 3.56e- 12      0.309 0.131 0.255 6.59e-  8 1       ANAPC11  
##  5 0              4.32  0.725 0.08  0         2       CCL5     
##  6 8.90e-  6      1.19  0.273 0.217 1.64e-  1 2       ISG15    
##  7 1.46e-145      1.30  0.994 0.151 2.69e-141 3       CYP1B1   
##  8 1.38e-130      1.29  1     0.162 2.55e-126 3       JUN      
##  9 1.63e-220      1.82  0.902 0.062 3.01e-216 4       IGFBP5   
## 10 1.16e- 57      1.70  0.861 0.194 2.14e- 53 4       POSTN    
## 11 1.26e- 57      5.15  1     0.234 2.32e- 53 5       NEAT1    
## 12 5.28e- 34      4.51  1     0.82  9.76e- 30 5       MALAT1
cluster0.markers <- FindMarkers(data, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"))

# you can plot raw counts as well
VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"), slot = "counts", log = TRUE)

FeaturePlot(data, features = c("BST2", "CCL13", "NEXN", "TMEM132C", "APCDD1", "NEURL1B", "LINC01579", "TAGLN", "ANGPTL5"))

data.markers %>%
    group_by(cluster) %>%
    top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(data, features = top10$gene) + NoLegend()

10. Assigning cell type identity to clusters

print(data[[“pca”]], dims = 1:5, nfeatures = 5) BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3 PC_ 1 Positive: COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM Negative: MT2A, CCL5, MT-ND6, G0S2, NMB PC_ 2 Positive: GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20 Negative: FAM43B, ITGA11, ALX4, RAMP1, IGFBP3 PC_ 3 Positive: UCP2, OLFM2, SRGN, MT2A, MAFB Negative: GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3 PC_ 4 Positive: NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3 Negative: SFRP2, SNTG1, LINC00944, XG, CA12 PC_ 5 Positive: RNF165, COL26A1, HS6ST2, EXPH5, ALX4 Negative: EBF2, GPX3, PDGFA, CILP, SCARA5

# new.cluster.ids <- c("", "", "", "", "")
# names(new.cluster.ids) <- levels(data)
# data <- RenameIdents(data, new.cluster.ids)
DimPlot(data, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

saveRDS(data, file = "./04.seurat/data_final.rds")