# rm(list=ls())
library(dplyr)
library(Seurat)
library(patchwork)
setwd("/Users/Shared/Data/runAnalysis/singleSell/230304_Angio_2")
cellrangers=dir(paste0(getwd(),"/03.outs_cellranger"))
data_ori = Read10X(data.dir=paste0(getwd(),"/03.outs_cellranger/",cellrangers))
data <- CreateSeuratObject(counts = data_ori, project = cellrangers, min.cells = 3, min.features = 200)
data
## An object of class Seurat
## 18475 features across 3535 samples within 1 assay
## Active assay: RNA (18475 features, 0 variable features)
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
data[["percent.mt"]] <- PercentageFeatureSet(data, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# Filter
data_sub = subset(data, subset = nFeature_RNA > 200 & percent.mt < 20)
VlnPlot(data_sub, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
# data <- subset(data_ori, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
data <- NormalizeData(data, normalization.method = "LogNormalize", scale.factor = 10000)
data <- NormalizeData(data)
data <- FindVariableFeatures(data, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(data), 10)
top10
## [1] "CCL8" "HOPX" "ALDH1A3" "DUSP2" "CXCL8" "GPC3" "TAGLN"
## [8] "TAC1" "ADH1B" "CXCL1"
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(data)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
all.genes <- rownames(data)
data <- ScaleData(data, features = all.genes)
## Centering and scaling data matrix
data <- RunPCA(data, features = VariableFeatures(object = data))
## PC_ 1
## Positive: COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM, LOXL2, VASN, COL4A1, PDGFA, COL15A1
## FAM20A, CXXC5, AEBP1, CDH23, DUSP6, COL5A3, SPRY2, HTRA1, IFI44L, LIPA
## CCDC14, COL11A1, STAT1, ZFP36, NEGR1, UNC5B, BGN, GOLGA8B, CCNL1, DYNC1I1
## Negative: MT2A, CCL5, MT-ND6, G0S2, NMB, CCL3L1, CXCL8, CCL20, CXCL5, LINC02154
## CDH6, DPPA4, AC108058.1, MMP9, IL13RA2, AC093249.2, HTR2C, CASZ1, AC022113.1, CA8
## LINC02126, KRT34, AC243772.2, EDN1, AC009121.2, AC090527.3, AL049820.1, PNLIPRP3, AC005064.1, TFPI2
## PC_ 2
## Positive: GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20, GOLGA8A, USP53, COL8A1, MTSS1, CPNE7
## MIR222HG, KCNQ1OT1, NEAT1, CCNL1, ABCA13, AL354733.3, AC090673.1, B3GALNT1, NCAM1, SULT1C2
## PGM2L1, TTLL7, TGM2, LDLR, CCDC14, NRP2, SCG2, HDAC9, SEC14L1, MEG8
## Negative: FAM43B, ITGA11, ALX4, RAMP1, IGFBP3, LPL, LINGO1, DCLK3, RNF165, RPS4Y1
## TRIB3, BGN, SPOCK1, SAMD5, IGF1, LAMP3, UNC5B, COL26A1, ADM2, SORCS1
## MAF, CXCL13, LMO2, MFAP5, MMP27, HS6ST2, COL15A1, NDUFA4L2, EDNRA, ACSM5
## PC_ 3
## Positive: UCP2, OLFM2, SRGN, MT2A, MAFB, ARL4C, CYBA, CRABP2, RAMP2, TGM2
## PTGES, DKK3, JPT1, ZNF676, TXN, NINJ1, SFRP1, CITED2, CAV1, S100A6
## C3orf70, IGFBP6, PRSS23, NUPR1, ENPP2, COTL1, STMN1, ERG28, OMG, FXYD5
## Negative: GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3, KCNQ1OT1, AL139147.1, CCNL1, ADAMTS6, CCDC14
## MIR503HG, MIR222HG, AC092807.3, NEAT1, AC020916.1, MPV17L, LINC00624, FAM71F2, MARCH1, FSIP2
## NABP1, MIR29B2CHG, EID3, TNFRSF25, PURPL, CNTN3, SGK494, AC016831.1, SLC1A2, HMCN1
## PC_ 4
## Positive: NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3, CILP, COL15A1, COL12A1, SCN7A, OLFML3
## SOX2, SAMD5, TXNIP, ATP1A2, OMD, CMKLR1, COL8A1, NDUFA4L2, DDIT4, LINC01579
## DYNC1I1, DKK3, IGF1, PDZD2, PDGFA, HEY1, B3GALNT1, ELN, PCLO, OGN
## Negative: SFRP2, SNTG1, LINC00944, XG, CA12, MMP27, HGF, MX2, TLL2, HRK
## POU2F2, WISP2, PAPPA, LEPR, SLAMF8, FILIP1, DPP4, SOD3, EPS8L2, CXCL12
## FER1L6, MEGF6, CHI3L1, AKR1C3, TNXB, SOD2, IL32, CCL8, IFIT3, LAMA1
## PC_ 5
## Positive: RNF165, COL26A1, HS6ST2, EXPH5, ALX4, DSC1, ANKS1B, SORCS1, PCSK1, HMX1
## RTN4R, DCLK3, MMP27, LINGO1, GUCY1A1, CXCL13, FXYD6, HOPX, LAMP3, RELN
## SERPINI1, CH25H, MAF, KIAA1549L, PART1, SGCG, ARL4C, SCN4B, IGF1, B3GALNT1
## Negative: EBF2, GPX3, PDGFA, CILP, SCARA5, GPC3, NEGR1, EFHD1, BGN, ADH1B
## SMOC2, MATN3, PRG4, AC002546.1, NTRK3, C3orf70, FMO3, C7, SOSTDC1, NDUFA4L2
## BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3
print(data[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM
## Negative: MT2A, CCL5, MT-ND6, G0S2, NMB
## PC_ 2
## Positive: GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20
## Negative: FAM43B, ITGA11, ALX4, RAMP1, IGFBP3
## PC_ 3
## Positive: UCP2, OLFM2, SRGN, MT2A, MAFB
## Negative: GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3
## PC_ 4
## Positive: NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3
## Negative: SFRP2, SNTG1, LINC00944, XG, CA12
## PC_ 5
## Positive: RNF165, COL26A1, HS6ST2, EXPH5, ALX4
## Negative: EBF2, GPX3, PDGFA, CILP, SCARA5
VizDimLoadings(data, dims = 1:2, reduction = "pca")
DimPlot(data, reduction = "pca")
DimHeatmap(data, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(data, dims = 1:15, cells = 500, balanced = TRUE)
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
##data = JackStraw(data, num.replicate=100, dims=100) ; data = ScoreJackStraw(data, dims=1:100)
data <- JackStraw(data, num.replicate = 100)
data <- ScoreJackStraw(data, dims = 1:20)
JackStrawPlot(data, dims = 1:15)
## Warning: Removed 21003 rows containing missing values (`geom_point()`).
ElbowPlot(data)
#install.packages("Matrix")
#update.packages("Matrix")
data <- FindNeighbors(data, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
data <- FindClusters(data, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 3535
## Number of edges: 106315
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7482
## Number of communities: 6
## Elapsed time: 0 seconds
# Look at cluster IDs of the first 5 cells
head(Idents(data), 5)
## AAACCCAAGCAAACAT-1 AAACCCAAGTTACGAA-1 AAACCCACACAAGCCC-1 AAACCCACATGGCCCA-1
## 0 2 0 0
## AAACCCAGTTGGGATG-1
## 1
## Levels: 0 1 2 3 4 5
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
data <- RunUMAP(data, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 22:56:54 UMAP embedding parameters a = 0.9922 b = 1.112
## 22:56:54 Read 3535 rows and found 10 numeric columns
## 22:56:54 Using Annoy for neighbor search, n_neighbors = 30
## 22:56:54 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 22:56:54 Writing NN index file to temp file /var/folders/z3/9m4zqbk57cbbr6n1rjvmh3980000gp/T//RtmpSPIceE/file1309214b6b67
## 22:56:54 Searching Annoy index using 1 thread, search_k = 3000
## 22:56:55 Annoy recall = 100%
## 22:56:55 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 22:56:57 Initializing from normalized Laplacian + noise (using irlba)
## 22:56:57 Commencing optimization for 500 epochs, with 154610 positive edges
## 22:57:01 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(data, reduction = "umap")
saveRDS(data, file = "./04.seurat/data_tutorial.rds")
# find all markers of cluster 2
cluster2.markers <- FindMarkers(data, ident.1 = 2, min.pct = 0.25)
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
head(cluster2.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## CCL5 0.000000e+00 4.3207774 0.725 0.080 0.000000e+00
## MT-CO1 3.253174e-41 -1.2195130 0.658 0.872 6.010238e-37
## MT-ND5 3.479486e-36 -1.6644695 0.152 0.452 6.428350e-32
## MT2A 4.926631e-34 0.4606525 0.987 0.953 9.101951e-30
## FTH1 1.408766e-33 0.2870320 1.000 1.000 2.602695e-29
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(data, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## AC090673.1 4.643933e-117 1.3936112 0.64 0.019 8.579667e-113
## AC008735.2 3.893138e-114 0.4528391 0.36 0.001 7.192572e-110
## AL139147.1 3.973684e-113 0.6124955 0.44 0.005 7.341382e-109
## MIR34AHG 1.902858e-110 1.1110842 0.80 0.041 3.515531e-106
## TTN 3.786874e-108 0.5174766 0.40 0.004 6.996251e-104
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
data.markers <- FindAllMarkers(data, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
data.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
## # A tibble: 12 × 7
## # Groups: cluster [6]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 1.32e- 10 0.836 0.442 0.383 2.44e- 6 0 MT-ND5
## 2 1.62e- 3 0.768 0.515 0.517 1 e+ 0 0 MTRNR2L12
## 3 1.55e- 4 0.310 0.202 0.299 1 e+ 0 1 CTHRC1
## 4 3.56e- 12 0.309 0.131 0.255 6.59e- 8 1 ANAPC11
## 5 0 4.32 0.725 0.08 0 2 CCL5
## 6 8.90e- 6 1.19 0.273 0.217 1.64e- 1 2 ISG15
## 7 1.46e-145 1.30 0.994 0.151 2.69e-141 3 CYP1B1
## 8 1.38e-130 1.29 1 0.162 2.55e-126 3 JUN
## 9 1.63e-220 1.82 0.902 0.062 3.01e-216 4 IGFBP5
## 10 1.16e- 57 1.70 0.861 0.194 2.14e- 53 4 POSTN
## 11 1.26e- 57 5.15 1 0.234 2.32e- 53 5 NEAT1
## 12 5.28e- 34 4.51 1 0.82 9.76e- 30 5 MALAT1
cluster0.markers <- FindMarkers(data, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"))
# you can plot raw counts as well
VlnPlot(data, features = c("COL4A2", "KCNQ1OT1"), slot = "counts", log = TRUE)
FeaturePlot(data, features = c("BST2", "CCL13", "NEXN", "TMEM132C", "APCDD1", "NEURL1B", "LINC01579", "TAGLN", "ANGPTL5"))
data.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(data, features = top10$gene) + NoLegend()
print(data[[“pca”]], dims = 1:5, nfeatures = 5) BST2, CCL13, NEXN, TMEM132C, APCDD1, NEURL1B, LINC01579, TAGLN, ANGPTL5, SOD3 PC_ 1 Positive: COL4A2, KCNQ1OT1, NEAT1, FAM43B, MYADM Negative: MT2A, CCL5, MT-ND6, G0S2, NMB PC_ 2 Positive: GOLGA8B, PCLO, MPV17L, ADAMTS6, ARHGAP20 Negative: FAM43B, ITGA11, ALX4, RAMP1, IGFBP3 PC_ 3 Positive: UCP2, OLFM2, SRGN, MT2A, MAFB Negative: GOLGA8B, GOLGA8A, MEG8, AC090673.1, AL354733.3 PC_ 4 Positive: NEGR1, DIO2, COL11A1, SOSTDC1, NTRK3 Negative: SFRP2, SNTG1, LINC00944, XG, CA12 PC_ 5 Positive: RNF165, COL26A1, HS6ST2, EXPH5, ALX4 Negative: EBF2, GPX3, PDGFA, CILP, SCARA5
# new.cluster.ids <- c("", "", "", "", "")
# names(new.cluster.ids) <- levels(data)
# data <- RenameIdents(data, new.cluster.ids)
DimPlot(data, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
saveRDS(data, file = "./04.seurat/data_final.rds")