Analysis on scRNA-seq dataset
- Importing the relevant libraries
library(Seurat)
## Attaching SeuratObject
library(ProjecTILs)
## Loading required package: umap
## Loading required package: TILPRED
## Loading required package: Matrix
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
library(ggplot2)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks gridExtra::combine(), Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand(), Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::slice() masks IRanges::slice()
## x tidyr::unpack() masks Matrix::unpack()
data<-readRDS('testDataset.rds')
data
## An object of class Seurat
## 14525 features across 1501 samples within 1 assay
## Active assay: RNA (14525 features, 1744 variable features)
str(data)
## Formal class 'Seurat' [package "Seurat"] with 13 slots
## Warning: Not a validObject(): no slot of name "images" for this object of class
## "Seurat"
## ..@ assays :List of 1
## .. ..$ RNA:Formal class 'Assay' [package "Seurat"] with 8 slots
## Warning: Not a validObject(): no slot of name "assay.orig" for this object of
## class "Assay"
## .. .. .. ..@ counts : num[0 , 0 ]
## .. .. .. ..@ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. .. .. .. .. ..@ i : int [1:3580739] 0 8 15 16 17 22 28 29 33 39 ...
## .. .. .. .. .. ..@ p : int [1:1502] 0 2648 6611 8214 9877 12381 14932 17048 18517 19745 ...
## .. .. .. .. .. ..@ Dim : int [1:2] 14525 1501
## .. .. .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. .. .. ..$ : chr [1:14525] "Mrpl15" "Lypla1" "Tcea1" "Atp6v1h" ...
## .. .. .. .. .. .. ..$ : chr [1:1501] "MI-AAACCTGCACATCTTT-3" "MI-AAACGGGAGAGGGCTT-3" "MI-AAACGGGAGTGACTCT-3" "MI-AAACGGGCATGAAGTA-3" ...
## .. .. .. .. .. ..@ x : num [1:3580739] 0.999 0.62 0.62 0.62 0.62 ...
## .. .. .. .. .. ..@ factors : list()
## .. .. .. ..@ scale.data : num[0 , 0 ]
## .. .. .. ..@ key : chr "rna_"
## .. .. .. ..@ var.features : chr [1:1744] "Gzmg" "Gzmd" "Cd74" "Lyz2" ...
## .. .. .. ..@ meta.features:'data.frame': 14525 obs. of 5 variables:
## .. .. .. .. ..$ vst.mean : num [1:14525] 1.478 0.494 0.445 0.188 0.12 ...
## .. .. .. .. ..$ vst.variance : num [1:14525] 2.74 0.552 0.517 0.201 0.126 ...
## .. .. .. .. ..$ vst.variance.expected : num [1:14525] 2.63 0.636 0.562 0.216 0.132 ...
## .. .. .. .. ..$ vst.variance.standardized: num [1:14525] 1.042 0.868 0.921 0.929 0.96 ...
## .. .. .. .. ..$ vst.variable : logi [1:14525] TRUE FALSE FALSE FALSE FALSE FALSE ...
## .. .. .. ..@ misc : symbol NULL
## .. .. .. ..@ NA : NULL
## ..@ meta.data :'data.frame': 1501 obs. of 8 variables:
## .. ..$ Sample : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
## .. ..$ Barcode : chr [1:1501] "AAACCTGCACATCTTT-3" "AAACGGGAGAGGGCTT-3" "AAACGGGAGTGACTCT-3" "AAACGGGCATGAAGTA-3" ...
## .. ..$ SampleLab : Factor w/ 4 levels "d10_All","d10_Tet",..: 3 3 3 3 3 3 3 3 3 3 ...
## .. ..$ Study : Factor w/ 4 levels "d10_All","d10_Tet",..: 3 3 3 3 3 3 3 3 3 3 ...
## .. ..$ nCount_RNA : num [1:1501] 11650 22912 4573 5073 9942 ...
## .. ..$ nFeature_RNA: int [1:1501] 2648 3963 1603 1663 2504 2551 2116 1469 1228 2159 ...
## .. ..$ percent.ribo: num [1:1501] 33.6 39.8 33.3 30.2 28.8 ...
## .. ..$ percent.mito: num [1:1501] 2.44 2.99 2.32 2.17 2.04 ...
## ..@ active.assay: chr "RNA"
## ..@ active.ident: Factor w/ 1 level "d20_Tet_1": 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "names")= chr [1:1501] "MI-AAACCTGCACATCTTT-3" "MI-AAACGGGAGAGGGCTT-3" "MI-AAACGGGAGTGACTCT-3" "MI-AAACGGGCATGAAGTA-3" ...
## ..@ graphs : list()
## ..@ neighbors : list()
## ..@ reductions : list()
## ..@ project.name: chr "SingleCellExperiment"
## ..@ misc : list()
## ..@ version :Classes 'package_version', 'numeric_version' hidden list of 1
## .. ..$ : int [1:3] 3 1 1
## ..@ commands :List of 2
## .. ..$ NormalizeData.RNA :Formal class 'SeuratCommand' [package "Seurat"] with 5 slots
## Warning: Not a validObject(): no slot of name "assay.used" for this object of
## class "SeuratCommand"
## .. .. .. ..@ name : chr "NormalizeData.RNA"
## .. .. .. ..@ time.stamp : POSIXct[1:1], format: "2019-12-09 18:42:08"
## .. .. .. ..@ call.string: chr "NormalizeData(miller.seurat, verbose = FALSE)"
## .. .. .. ..@ params :List of 5
## .. .. .. .. ..$ assay : chr "RNA"
## .. .. .. .. ..$ normalization.method: chr "LogNormalize"
## .. .. .. .. ..$ scale.factor : num 10000
## .. .. .. .. ..$ margin : num 1
## .. .. .. .. ..$ verbose : logi FALSE
## .. .. .. ..@ NA : NULL
## .. ..$ FindVariableFeatures.RNA:Formal class 'SeuratCommand' [package "Seurat"] with 5 slots
## Warning: Not a validObject(): no slot of name "assay.used" for this object of
## class "SeuratCommand"
## .. .. .. ..@ name : chr "FindVariableFeatures.RNA"
## .. .. .. ..@ time.stamp : POSIXct[1:1], format: "2019-12-09 18:42:11"
## .. .. .. ..@ call.string: chr [1:2] "FindVariableFeatures(miller.seurat, selection.method = \"vst\", " " nfeatures = 2000, verbose = FALSE)"
## .. .. .. ..@ params :List of 12
## .. .. .. .. ..$ assay : chr "RNA"
## .. .. .. .. ..$ selection.method : chr "vst"
## .. .. .. .. ..$ loess.span : num 0.3
## .. .. .. .. ..$ clip.max : chr "auto"
## .. .. .. .. ..$ mean.function :function (mat, display_progress)
## .. .. .. .. ..$ dispersion.function:function (mat, display_progress)
## .. .. .. .. ..$ num.bin : num 20
## .. .. .. .. ..$ binning.method : chr "equal_width"
## .. .. .. .. ..$ nfeatures : num 2000
## .. .. .. .. ..$ mean.cutoff : num [1:2] 0.1 8
## .. .. .. .. ..$ dispersion.cutoff : num [1:2] 1 Inf
## .. .. .. .. ..$ verbose : logi FALSE
## .. .. .. ..@ NA : NULL
## ..@ tools : list()
## ..@ NA : NULL
length(x = data@assays$RNA@var.features)
## [1] 1744
table(Idents(data)) #number of cells
##
## d20_Tet_1
## 1501
head(data@meta.data) #checking the QC stats
## Sample Barcode SampleLab Study nCount_RNA
## MI-AAACCTGCACATCTTT-3 3 AAACCTGCACATCTTT-3 d20_Tet_1 d20_Tet_1 11650
## MI-AAACGGGAGAGGGCTT-3 3 AAACGGGAGAGGGCTT-3 d20_Tet_1 d20_Tet_1 22912
## MI-AAACGGGAGTGACTCT-3 3 AAACGGGAGTGACTCT-3 d20_Tet_1 d20_Tet_1 4573
## MI-AAACGGGCATGAAGTA-3 3 AAACGGGCATGAAGTA-3 d20_Tet_1 d20_Tet_1 5073
## MI-AAACGGGGTAGGCATG-3 3 AAACGGGGTAGGCATG-3 d20_Tet_1 d20_Tet_1 9942
## MI-AAACGGGGTTTAGCTG-3 3 AAACGGGGTTTAGCTG-3 d20_Tet_1 d20_Tet_1 9817
## nFeature_RNA percent.ribo percent.mito
## MI-AAACCTGCACATCTTT-3 2648 33.63660 2.437559
## MI-AAACGGGAGAGGGCTT-3 3963 39.76871 2.989308
## MI-AAACGGGAGTGACTCT-3 1603 33.30418 2.317953
## MI-AAACGGGCATGAAGTA-3 1663 30.15967 2.168342
## MI-AAACGGGGTAGGCATG-3 2504 28.75390 2.041637
## MI-AAACGGGGTTTAGCTG-3 2551 34.60990 2.250968
- Finding out variable genes across the single cells.
data=FindVariableFeatures(data)
## Warning in FindVariableFeatures.Assay(object = assay.data, selection.method =
## selection.method, : selection.method set to 'vst' but count slot is empty; will
## use data slot instead
head(x = HVFInfo(object = data))
## mean variance variance.standardized
## Mrpl15 0.66420523 0.35258141 1.0431836
## Lypla1 0.27352847 0.17836753 0.9827593
## Tcea1 0.26772365 0.17596125 0.9868403
## Atp6v1h 0.11313238 0.08576263 1.0307847
## Rb1cc1 0.09377432 0.06946782 0.9961895
## 4732440D04Rik 0.01958208 0.01607741 1.0474994
VlnPlot(data, features = c("nFeature_RNA","nCount_RNA","percent.mito","percent.ribo"),
pt.size = 0.1,ncol=4)

#removing the outliers
data <- subset(data, subset = percent.mito < 4 & percent.mito>1)
# check number of cells
ncol(data)
## [1] 1436
qc.metrics<-as_tibble(
data[[c("nCount_RNA","nFeature_RNA",'percent.mito')]],
rownames="Cell.Barcode")
qc.metrics %>%
arrange(percent.mito) %>%
ggplot(aes(nCount_RNA,nFeature_RNA,colour=percent.mito)) +
geom_point() +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
ggtitle("QC metrics")

#calculating features that are highly expressed in some cells
VariableFeaturePlot(object = data)
## Warning: Transformation introduced infinite values in continuous x-axis

- Scaling the data in order to improve dimensionality reduction for the further analysis.
data<-ScaleData(data)
## Centering and scaling data matrix
- Applying dimensionality reduction technique and visualising the dimensionally reduced data
s_obj = RunPCA(data, features = VariableFeatures(data))
## PC_ 1
## Positive: Lgmn, Tyrobp, Wfdc17, Ctsh, Fcgr3, C3ar1, C1qb, Ms4a7, Cxcl2, Lyz2
## Alox5ap, Blvrb, C1qc, Trem2, Spi1, Cd300c2, Cd68, Aif1, Cfp, Ifitm3
## C1qa, Apoe, Ms4a6c, Fcgr1, Fth1, Pf4, Sirpa, Mpeg1, Fcgr2b, Ctsl
## Negative: Dut, Ptma, Hmgb2, Pclaf, Nkg7, Ran, Cd3g, Ranbp1, Hspa8, Eif5a
## Nap1l1, Npm1, Hsp90ab1, Erh, Phgdh, Hnrnpa1, Actg1, Snrpd1, Stmn1, H2afz
## Eif4a1, Thy1, Srsf3, Ube2s, Slc25a5, Srsf2, Cd3d, Sub1, Sh2d2a, Tubb5
## PC_ 2
## Positive: Nkg7, B2m, Tmsb4x, Cd3g, Cd52, Fxyd5, AW112010, S100a11, S100a6, Id2
## Srgn, Malat1, Cxcr6, S100a4, Cd3d, Ccl5, Gmfg, Klrd1, AA467197, Shisa5
## Ctla2a, Ms4a4b, Anxa2, Hcst, S100a10, Btg1, Cd8b1, Il1r2, Lag3, Cd8a
## Negative: Ran, Ptma, Hsp90ab1, Pclaf, Ranbp1, Npm1, Tuba1b, Eif5a, Ybx1, Hmgb2
## H2afz, Erh, Cks1b, Anp32b, Ncl, Dut, Set, Tubb5, Hsp90aa1, Selenoh
## Eef1g, Lyar, Ube2s, Hspd1, Snrpb, Phgdh, Nhp2, Ndufa4, Hmgn1, Rps2
## PC_ 3
## Positive: H2afv, Hmgb2, Stmn1, Xist, H3f3b, Ube2c, H2afx, Cenpf, Malat1, AW112010
## Cd7, Birc5, Ubb, Cdkn3, Cd52, Nusap1, H2-K1, Cdc25b, Cdca8, Btg1
## Pimreg, Calm2, Cenpe, Ctsw, B2m, Ifngr1, Cdc20, Klf2, Tubb5, Itm2b
## Negative: Gzmc, Gzme, Ccl9, Gzmd, Tnfrsf9, Gzmb, Serpinb6b, Gzmf, Serpinb9b, Tnfrsf4
## Nebl, Mt1, Il1r2, Irf8, Il2ra, Sdf4, Ccl3, Hspe1, Ftl1, Mt2
## Prf1, Upp1, AA467197, Calr, Hsp90ab1, Serpinb9, Capg, Atp5g1, Eef1g, Igfbp7
## PC_ 4
## Positive: Xcl1, mt-Co3, mt-Co2, Dapl1, Ndufa4, Pacsin1, Ccr7, Zfas1, Ung, Npm1
## Tcf7, Eef1g, mt-Atp6, mt-Cytb, mt-Nd1, mt-Nd4, mt-Nd2, Nrn1, Bcl2, Enpp2
## Ifi27l2a, Tbc1d4, Tspan32, Nop10, Bmyc, Crtam, Cd7, Hspa8, St6galnac3, Hsp90ab1
## Negative: Tubb4b, Ube2c, Birc5, Cdk1, Tubb5, Gzmb, AA467197, Arl6ip1, Ccnb1, Nusap1
## Cenpa, Ube2s, Cdca8, Cenpf, Tubb6, Tuba1b, Gzmf, Anxa2, Cdca3, Gzmc
## Cdc20, Ccna2, Lgals1, Hmmr, Cks2, Ccnb2, Smc4, Ly6a, H2afx, Cks1b
## PC_ 5
## Positive: Slamf9, Clec12a, Fcgr4, Clec4b1, Pld4, H2-DMb1, H2-Eb1, Lpl, Cadm1, H2-Aa
## Clec4a3, H2-Ab1, Bcl2a1a, Cd300e, Marcks, Gbp2b, Cd74, Clec4a1, Mpeg1, C1qc
## C1qa, Ccnd1, Fcgrt, Cxcl9, Cybb, Pou2f2, Gatm, Isg15, Spi1, Gsn
## Negative: Pf4, Clec4d, Thbs1, Arg1, Ccl6, Hmox1, Ctsl, Apoe, Cd36, Pla2g7
## Lyz2, Mgst1, C5ar1, Ninj1, Fcgr2b, Cxcl3, Pdpn, Clec4n, Igf1, Prdx1
## Gdf15, Gstm1, Mafb, Slc7a11, Trem2, Cd14, Abca1, Gpnmb, Tmem37, Mmp8
s_obj = FindNeighbors(s_obj)
## Computing nearest neighbor graph
## Computing SNN
s_obj = FindClusters(s_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 1436
## Number of edges: 46465
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7479
## Number of communities: 6
## Elapsed time: 0 seconds
table(Idents(data))
##
## d20_Tet_1
## 1436
s_obj = RunTSNE(s_obj)
DimPlot (s_obj, reduction = "tsne", label = TRUE)

#exploration of the primary sources of heterogeneity in a data
DimHeatmap(s_obj, dims = 1:5, cells = 500, balanced = TRUE)
