analyze the data in R

install R packages

now, switch back to R and install the packages we are going to use in this workshop.

#install.packages("tidyverse")
#install.packages("rmarkdown")
#install.packages('Seurat')

load the library

library(tidyverse)
library(Seurat)

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "/Users/hawacoulibaly/Documents/filtered_feature_bc_matrix")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k")
pbmc

## An object of class Seurat 
## 36601 features across 2527 samples within 1 assay 
## Active assay: RNA (36601 features, 0 variable features)
##  1 layer present: counts

## getting help
?CreateSeuratObject

## Help on topic 'CreateSeuratObject' was found in the following packages:
## 
##   Package               Library
##   Seurat                /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
##   SeuratObject          /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
## 
## 
## Using the first match ...

if you want to know more details of the Seurat object, you can learn at https://github.com/satijalab/seurat/wiki

Also check https://satijalab.org/seurat/essential_commands.html for all the commands you can use to interact with Seurat objects.

# Lets examine a few B cell marker genes in the first thirty cells
pbmc.data[c("CD19", "CD34", "CD38", "MS4A1"), 1:30]

## 4 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCTGAGCCCAGCT-1', 'AAACCTGAGTGACTCT-1', 'AAACCTGCAATGGATA-1' ... ]]

##                                                                        
## CD19  .  . 1  .  1 1  . 2 1 1 . 1 . 1 . 1 . . .  1 . 1 . 1 3 2 .  1 . 1
## CD34  .  . .  .  . .  . . . . . . . . . . . . .  . . . . . . . .  . . .
## CD38  .  . .  1  . .  . . . . . . . . . . . . .  . 1 . . . . . .  . . .
## MS4A1 5 10 9 10 16 3 11 4 3 3 3 8 5 4 8 4 3 6 8 25 6 4 2 4 9 3 7 11 6 5

The . values in the matrix represent 0s (no molecules detected). Since most values in an scRNA-seq matrix are 0, Seurat uses a sparse-matrix representation whenever possible. This results in significant memory and speed savings for Drop-seq/inDrop/10x data.

Quality control and filtering cells

## check at metadata
meta <- pbmc@meta.data
dim(meta)

## [1] 2527    3

# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), layer = "counts", ncol = 3)

we set the cutoff based on the visualization above. The cutoff is quite subjective.

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 20)

visualize cell count after filtering

# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), layer = "counts", ncol = 3)

Normalization of the data

By default, we employ a global-scaling normalization method “LogNormalize” that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. Normalized values are stored in pbmc[["RNA"]]@data.

Now, Seurat has a new normalization method called SCTransform. Check out the tutorial here.

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

## Normalizing layer: counts

feature selection

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

## Finding variable features for layer counts

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)

## When using repel, set xnudge and ynudge to 0 for optimal results

CombinePlots(plots = list(plot1, plot2), ncol =1)

## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.

## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

## Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

### Check for G1, G2, M, AND S PHASES

# Cell cycle genes
cc.genes.updated.2019

## $s.genes
##  [1] "MCM5"     "PCNA"     "TYMS"     "FEN1"     "MCM7"     "MCM4"    
##  [7] "RRM1"     "UNG"      "GINS2"    "MCM6"     "CDCA7"    "DTL"     
## [13] "PRIM1"    "UHRF1"    "CENPU"    "HELLS"    "RFC2"     "POLR1B"  
## [19] "NASP"     "RAD51AP1" "GMNN"     "WDR76"    "SLBP"     "CCNE2"   
## [25] "UBR7"     "POLD3"    "MSH2"     "ATAD2"    "RAD51"    "RRM2"    
## [31] "CDC45"    "CDC6"     "EXO1"     "TIPIN"    "DSCC1"    "BLM"     
## [37] "CASP8AP2" "USP1"     "CLSPN"    "POLA1"    "CHAF1B"   "MRPL36"  
## [43] "E2F8"    
## 
## $g2m.genes
##  [1] "HMGB2"   "CDK1"    "NUSAP1"  "UBE2C"   "BIRC5"   "TPX2"    "TOP2A"  
##  [8] "NDC80"   "CKS2"    "NUF2"    "CKS1B"   "MKI67"   "TMPO"    "CENPF"  
## [15] "TACC3"   "PIMREG"  "SMC4"    "CCNB2"   "CKAP2L"  "CKAP2"   "AURKB"  
## [22] "BUB1"    "KIF11"   "ANP32E"  "TUBB4B"  "GTSE1"   "KIF20B"  "HJURP"  
## [29] "CDCA3"   "JPT1"    "CDC20"   "TTK"     "CDC25C"  "KIF2C"   "RANGAP1"
## [36] "NCAPD2"  "DLGAP5"  "CDCA2"   "CDCA8"   "ECT2"    "KIF23"   "HMMR"   
## [43] "AURKA"   "PSRC1"   "ANLN"    "LBR"     "CKAP5"   "CENPE"   "CTCF"   
## [50] "NEK2"    "G2E3"    "GAS2L3"  "CBX5"    "CENPA"

# Cell cycle score
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes)
table(pbmc[[]]$Phase)

## 
##   G1  G2M    S 
##  848  594 1014

Scaling the data

Scaling is an essential step in the Seurat workflow, but only on genes that will be used as input to PCA. Therefore, the default in ScaleData is only to perform scaling on the previously identified variable features (2,000 by default). To do this, omit the features argument in the previous function call

Next, we apply a linear transformation (‘scaling’) that is a standard pre-processing step prior to dimensional reduction techniques like PCA. The ScaleData function:

Shifts the expression of each gene, so that the mean expression across cells is 0
Scales the expression of each gene, so that the variance across cells is 1.

Think it as standardize the data. center the mean to 0 and variance to 1. ?scale

This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate * The results of this are stored in pbmc[["RNA"]]@scale.data

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

## Centering and scaling data matrix

pbmc <- ScaleData(pbmc, vars.to.regress = c("percent.mt", "S.Score","G2M.Score"))

## Regressing out percent.mt, S.Score, G2M.Score
## Centering and scaling data matrix

## Warning: Different features in new layer data than already exists for
## scale.data

This can take long for large dataset.

Let’s check the data matrix before and after scaling.

# raw counts, same as pbmc@assays$RNA@counts[1:6, 1:6]
pbmc[["RNA"]]$counts[1:6, 1:6]

## 6 x 6 sparse Matrix of class "dgCMatrix"
##             AAACCTGAGCCCAGCT-1 AAACCTGAGTGACTCT-1 AAACCTGCAATGGATA-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
##             AAACCTGCACCCTATC-1 AAACCTGCATACCATG-1 AAACCTGGTGACAAAT-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .

# library size normalized and log transformed data
pbmc[["RNA"]]$data[1:6, 1:6]

## 6 x 6 sparse Matrix of class "dgCMatrix"
##             AAACCTGAGCCCAGCT-1 AAACCTGAGTGACTCT-1 AAACCTGCAATGGATA-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .
##             AAACCTGCACCCTATC-1 AAACCTGCATACCATG-1 AAACCTGGTGACAAAT-1
## MIR1302-2HG                  .                  .                  .
## FAM138A                      .                  .                  .
## OR4F5                        .                  .                  .
## AL627309.1                   .                  .                  .
## AL627309.3                   .                  .                  .
## AL627309.2                   .                  .                  .

# scaled data
pbmc[["RNA"]]$scale.data[1:6, 1:6]

##            AAACCTGAGCCCAGCT-1 AAACCTGAGTGACTCT-1 AAACCTGCAATGGATA-1
## HES4              -0.25816685         3.35647141        -0.20880373
## ISG15             -0.45555286        -0.32913494        -0.46213895
## TNFRSF18          -0.11552683        -0.01412996        -0.16257746
## TAS1R3            -0.13485885        -0.08267714        -0.13882504
## MMP23B            -0.06306594        -0.03876645        -0.07187768
## AL139246.5        -0.56589496        -0.41169785        -0.40235187
##            AAACCTGCACCCTATC-1 AAACCTGCATACCATG-1 AAACCTGGTGACAAAT-1
## HES4              -0.23380069        -0.27440858        -0.18585570
## ISG15             -0.43434584        -0.42409022        -0.35669771
## TNFRSF18          -0.11462908        -0.08646289        -0.03664322
## TAS1R3            -0.12855599        -0.11988795        -0.11407057
## MMP23B            -0.05792637        -0.06009920        -0.00377415
## AL139246.5         1.75254689         1.55276555         1.50094085

PCA

Principle component analysis (PCA) is a linear dimension reduction technology.

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), verbose = FALSE)

p1<- DimPlot(pbmc, reduction = "pca")
p1

pbmc@reductions$pca

## A dimensional reduction object with key PC_ 
##  Number of dimensions: 50 
##  Number of cells: 2456 
##  Projected dimensional reduction calculated:  FALSE 
##  Jackstraw run: FALSE 
##  Computed using assay: RNA

Determine How many PCs to include for downstream analysis

To overcome the extensive technical noise in any single feature for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a ‘metafeature’ that combines information across a correlated feature set. The top principal components therefore represent a robust compression of the dataset. However, how many components should we choose to include? 10? 20? 100?

In Macosko et al, we implemented a resampling test inspired by the JackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a ‘null distribution’ of feature scores, and repeat this procedure. We identify ‘significant’ PCs as those who have a strong enrichment of low p-value features.

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time, takes 10 mins for this dataset
pbmc <- JackStraw(pbmc, num.replicate = 100, dims = 30)
pbmc <- ScoreJackStraw(pbmc, dims = 1:30)

JackStrawPlot(pbmc, dims = 1:30)

An alternative heuristic method generates an ‘Elbow plot’: a ranking of principle components based on the percentage of variance explained by each one (ElbowPlot function). In this example, we can observe an ‘elbow’ around PC10-20, suggesting that the majority of true signal is captured in the first 10 PCs.

ElbowPlot(pbmc, ndims = 10)

variance explained by each PC

hint from https://github.com/satijalab/seurat/issues/982

mat <- pbmc[["RNA"]]$scale.data 
pca <- pbmc[["pca"]]

# Get the total variance:
total_variance <- sum(matrixStats::rowVars(mat))

eigValues = (pca@stdev)^2  ## EigenValues
varExplained = eigValues / total_variance

varExplained %>% enframe(name = "PC", value = "varExplained" ) %>%
  ggplot(aes(x = PC, y = varExplained)) + 
  geom_bar(stat = "identity") +
  theme_classic() +
  ggtitle("scree plot")

### this is what Seurat is plotting: standard deviation
pca@stdev %>% enframe(name = "PC", value = "Standard Deviation" ) %>%
  ggplot(aes(x = PC, y = `Standard Deviation`)) + 
  geom_point() +
  theme_classic()

Cluster the cells

Seurat v3 applies a graph-based clustering approach, building upon initial strategies in (Macosko et al) Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’.

As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the FindNeighbors function, and takes as input the previously defined dimensionality of the dataset.

To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function. The FindClusters function implements this procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets

pbmc <- FindNeighbors(pbmc, dims = 1:10)

## Computing nearest neighbor graph

## Computing SNN

pbmc <- FindClusters(pbmc, resolution = 0.5)

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2456
## Number of edges: 82753
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7965
## Number of communities: 6
## Elapsed time: 0 seconds

# Look at cluster IDs
table(pbmc[[]]$seurat_clusters)

## 
##   0   1   2   3   4   5 
## 761 614 525 386  99  71

Run non-linear dimensional reduction (UMAP/tSNE)

Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP (as opposed to PCA which is a linear dimensional reduction technique), to visualize and explore these datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. As input to the UMAP and tSNE, we suggest using the same PCs as input to the clustering analysis.

t-SNE explained by StatQuest https://www.youtube.com/watch?v=NEaUSP4YerM

pbmc <- RunUMAP(pbmc, dims = 1:10)

## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 10:40:50 UMAP embedding parameters a = 0.9922 b = 1.112

## 10:40:50 Read 2456 rows and found 10 numeric columns

## 10:40:50 Using Annoy for neighbor search, n_neighbors = 30

## 10:40:50 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 10:40:51 Writing NN index file to temp file /var/folders/wb/d_vxxlyx4kz8wmbdnxswqdhh0000gn/T//RtmpoAdFUF/file16f66cb162dc
## 10:40:51 Searching Annoy index using 1 thread, search_k = 3000
## 10:40:51 Annoy recall = 100%
## 10:40:51 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:40:52 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:40:52 Commencing optimization for 500 epochs, with 96322 positive edges
## 10:40:54 Optimization finished

pbmc<- RunTSNE(pbmc, dims = 1:10)

## now let's visualize in the UMAP space

p1 <- DimPlot(pbmc, reduction = "umap", label = TRUE, label.size = 6)

## now let's visualize in the TSNE space
p2 <- DimPlot(pbmc, reduction = "tsne", label = TRUE, label.size = 6)

p1+p2

Finding differentially expressed features (cluster biomarkers)

Seurat can help you find markers that define clusters via differential expression. By default, it identifies positive and negative markers of a single cluster (specified in ident.1), compared to all other cells. FindAllMarkers automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells.

Finding all marker genes takes long in this step. let’s watch the PCA video while Seurat is working hard.

# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster 0

## Calculating cluster 1

## Calculating cluster 2

## Calculating cluster 3

## Calculating cluster 4

## Calculating cluster 5

pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)

## # A tibble: 12 × 7
## # Groups:   cluster [6]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene      
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>     
##  1 1.37e-129       2.74 0.677 0.208 5.02e-125 0       EGR1      
##  2 2.56e-117       2.76 0.737 0.326 9.37e-113 0       DUSP2     
##  3 1.30e- 90       2.62 0.43  0.081 4.75e- 86 1       NINJ1     
##  4 4.83e- 57       2.32 0.293 0.057 1.77e- 52 1       SMAD7     
##  5 1.44e- 45       1.40 0.65  0.344 5.28e- 41 2       AC245014.3
##  6 1.39e- 23       1.27 0.385 0.197 5.10e- 19 2       C1orf162  
##  7 4.96e-134       4.88 0.373 0.018 1.82e-129 3       SIGLEC6   
##  8 1.44e-129       6.94 0.293 0.003 5.26e-125 3       DTX1      
##  9 4.64e- 35       4.17 0.414 0.077 1.70e- 30 4       TET3      
## 10 6.80e- 10       3.59 0.313 0.131 2.49e-  5 4       AUTS2     
## 11 1.12e- 32       4.07 0.268 0.023 4.09e- 28 5       IFIT3     
## 12 6.46e- 31       3.44 0.296 0.031 2.36e- 26 5       IFI44L

Save marker genes for each cluster

# write_csv(pbmc.markers, "/Users/hawacoulibaly/Documents/GitHub/scHumanBcellFlu/output/cluster_markers.csv")

Visualize marker genes

VlnPlot (shows expression probability distributions across clusters), and FeaturePlot (visualizes feature expression on a tSNE or PCA plot) are our most commonly used visualizations. We also suggest exploring RidgePlot, CellScatter, and DotPlot as additional methods to view your dataset.

VlnPlot(pbmc, features = c("MS4A1", "CD19"))

## understanding the matrix of data slots
pbmc[["RNA"]]$data[c("MS4A1", "CD19"), 1:30]

## 2 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCTGAGCCCAGCT-1', 'AAACCTGAGTGACTCT-1', 'AAACCTGCAATGGATA-1' ... ]]

##                                                                              
## MS4A1 2.344384 3.667859 3.306620 3.04338 3.394257 1.7891254 2.974083 2.172716
## CD19  .        .        1.366452 .       1.029452 0.9788544 .        1.587407
##                                                                               
## MS4A1 1.959926 2.077141 1.992037 2.758695 3.564367 1.8688677 3.143516 2.314096
## CD19  1.109530 1.202133 .        1.046408 .        0.8629905 .        1.187521
##                                                                             
## MS4A1 2.181034 3.395127 3.14964 3.5448299 2.70777 2.592621 1.706304 2.199361
## CD19  .        .        .       0.8524354 .       1.408836 .        1.100215
##                                                             
## MS4A1 2.856265 1.798177 3.078623 2.9555309 3.909218 2.720533
## CD19  1.866477 1.472262 .        0.9766816 .        1.344869

pbmc[["RNA"]]$scale.data[c("MS4A1", "CD19"), 1:30]

##       AAACCTGAGCCCAGCT-1 AAACCTGAGTGACTCT-1 AAACCTGCAATGGATA-1
## MS4A1         -0.3544605          1.2073640          0.5719494
## CD19          -0.9862125         -0.7780357          0.6642963
##       AAACCTGCACCCTATC-1 AAACCTGCATACCATG-1 AAACCTGGTGACAAAT-1
## MS4A1          0.4101735          0.9105409         -0.8753066
## CD19          -0.9626424          0.3425523          0.3919296
##       AAACCTGTCCAGAGGA-1 AAACCTGTCCTGCAGG-1 AAACCTGTCTCTAAGG-1
## MS4A1          0.3949430         -0.3892985         -0.7111133
## CD19          -0.8893385          1.1191669          0.4846064
##       AAACCTGTCTTCGGTC-1 AAACGGGCACCAGGCT-1 AAAGATGGTCAATGTC-1
## MS4A1         -0.5181901         -0.7427723          0.1553615
## CD19           0.6223438         -0.9015940          0.3859631
##       AAAGATGGTTAAGGGC-1 AAAGATGTCTGTTGAG-1 AAAGCAAAGACCCACC-1
## MS4A1           0.946173         -1.0603851          0.7214771
## CD19           -1.019756          0.1286384         -0.9457552
##       AAAGCAAAGGCTCTTA-1 AAAGCAAAGGGCTCTC-1 AAAGCAAAGTCCGGTC-1
## MS4A1         -0.3684857         -0.7600054          0.8468336
## CD19           0.5487476         -1.0400296         -0.7365731
##       AAAGCAACAGCCTTTC-1 AAAGTAGAGTCATCCA-1 AAAGTAGGTCTTCAAG-1
## MS4A1          0.5097941          1.0718962         0.00990967
## CD19          -0.9838856          0.1451089        -0.95673541
##       AAAGTAGTCGTATCAG-1 AAATGCCAGGCTACGA-1 AAATGCCAGTACATGA-1
## MS4A1         0.01708443         -1.2739751         -0.7539946
## CD19          0.82593193         -0.9948984          0.3911818
##       AAATGCCCACTGAAGG-1 AAATGCCTCAGTCCCT-1 AACACGTAGGCATGGT-1
## MS4A1          0.3873068         -1.1298587          0.5653499
## CD19           1.4168267          0.8256232         -0.9246661
##       AACACGTAGTCCCACG-1 AACACGTAGTTATCGC-1 AACACGTGTCGTTGTA-1
## MS4A1          0.3541308          1.5063709          0.2586322
## CD19           0.2899045         -0.6386353          0.7985597

pbmc[["RNA"]]$counts[c("MS4A1", "CD19"), 1:30]

## 2 x 30 sparse Matrix of class "dgCMatrix"

##   [[ suppressing 30 column names 'AAACCTGAGCCCAGCT-1', 'AAACCTGAGTGACTCT-1', 'AAACCTGCAATGGATA-1' ... ]]

##                                                                        
## MS4A1 5 10 9 10 16 3 11 4 3 3 3 8 5 4 8 4 3 6 8 25 6 4 2 4 9 3 7 11 6 5
## CD19  .  . 1  .  1 1  . 2 1 1 . 1 . 1 . 1 . . .  1 . 1 . 1 3 2 .  1 . 1

# you can plot raw counts as well
VlnPlot(pbmc, features = c("MS4A1", "CD19"), slot = "counts", log = TRUE)

## Warning: The `slot` argument of `VlnPlot()` is deprecated as of Seurat 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

VlnPlot(pbmc, features = c("MS4A1", "CD19"), slot = "scale.data")

FeaturePlot.

plot the expression intensity overlaid on the Tsne/UMAP plot.

FeaturePlot(pbmc, features = c("MS4A1", "CD19", "ITGAX", "CD72", "CD38", "CD37", "IGHM"), label = TRUE , keep.scale = 'all')

There is a package to deal with this type of overplotting problem. https://github.com/SaskiaFreytag/schex

DoHeatmap generates an expression heatmap for given cells and features. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster.

top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
pbmc.markers %>%
    group_by(cluster) %>%
    dplyr::filter(avg_log2FC > 1.5) %>%
    slice_head(n = 10) %>%
    ungroup() -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

## Warning in DoHeatmap(pbmc, features = top10$gene): The following features were
## omitted as they were not found in the scale.data slot for the RNA assay:
## AKAP13, ARIH1

save the Seurat object into a file

saveRDS(pbmc, "/Users/hawacoulibaly/Documents/GitHub/scHumanBcellFlu/data/pbmc_3k.RDS")

Part 1 - Quality Control & Pre-Processing

downloading the data