Analysis on scRNA-seq dataset

library(Seurat)
## Attaching SeuratObject
library(ProjecTILs)
## Loading required package: umap
## Loading required package: TILPRED
## Loading required package: Matrix
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
## 
##     Assays
## The following object is masked from 'package:Seurat':
## 
##     Assays
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
library(ggplot2)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
##   method     from    
##   print.boxx spatstat
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::collapse()   masks IRanges::collapse()
## x dplyr::combine()    masks gridExtra::combine(), Biobase::combine(), BiocGenerics::combine()
## x dplyr::count()      masks matrixStats::count()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand(), Matrix::expand()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x tidyr::pack()       masks Matrix::pack()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x dplyr::slice()      masks IRanges::slice()
## x tidyr::unpack()     masks Matrix::unpack()
data<-readRDS('testDataset.rds')
data
## An object of class Seurat 
## 14525 features across 1501 samples within 1 assay 
## Active assay: RNA (14525 features, 1744 variable features)
str(data)
## Formal class 'Seurat' [package "Seurat"] with 13 slots
## Warning: Not a validObject(): no slot of name "images" for this object of class
## "Seurat"
##   ..@ assays      :List of 1
##   .. ..$ RNA:Formal class 'Assay' [package "Seurat"] with 8 slots
## Warning: Not a validObject(): no slot of name "assay.orig" for this object of
## class "Assay"
##   .. .. .. ..@ counts       : num[0 , 0 ] 
##   .. .. .. ..@ data         :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. .. .. .. ..@ i       : int [1:3580739] 0 8 15 16 17 22 28 29 33 39 ...
##   .. .. .. .. .. ..@ p       : int [1:1502] 0 2648 6611 8214 9877 12381 14932 17048 18517 19745 ...
##   .. .. .. .. .. ..@ Dim     : int [1:2] 14525 1501
##   .. .. .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. .. .. ..$ : chr [1:14525] "Mrpl15" "Lypla1" "Tcea1" "Atp6v1h" ...
##   .. .. .. .. .. .. ..$ : chr [1:1501] "MI-AAACCTGCACATCTTT-3" "MI-AAACGGGAGAGGGCTT-3" "MI-AAACGGGAGTGACTCT-3" "MI-AAACGGGCATGAAGTA-3" ...
##   .. .. .. .. .. ..@ x       : num [1:3580739] 0.999 0.62 0.62 0.62 0.62 ...
##   .. .. .. .. .. ..@ factors : list()
##   .. .. .. ..@ scale.data   : num[0 , 0 ] 
##   .. .. .. ..@ key          : chr "rna_"
##   .. .. .. ..@ var.features : chr [1:1744] "Gzmg" "Gzmd" "Cd74" "Lyz2" ...
##   .. .. .. ..@ meta.features:'data.frame':   14525 obs. of  5 variables:
##   .. .. .. .. ..$ vst.mean                 : num [1:14525] 1.478 0.494 0.445 0.188 0.12 ...
##   .. .. .. .. ..$ vst.variance             : num [1:14525] 2.74 0.552 0.517 0.201 0.126 ...
##   .. .. .. .. ..$ vst.variance.expected    : num [1:14525] 2.63 0.636 0.562 0.216 0.132 ...
##   .. .. .. .. ..$ vst.variance.standardized: num [1:14525] 1.042 0.868 0.921 0.929 0.96 ...
##   .. .. .. .. ..$ vst.variable             : logi [1:14525] TRUE FALSE FALSE FALSE FALSE FALSE ...
##   .. .. .. ..@ misc         : symbol NULL
##   .. .. .. ..@ NA           : NULL
##   ..@ meta.data   :'data.frame': 1501 obs. of  8 variables:
##   .. ..$ Sample      : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##   .. ..$ Barcode     : chr [1:1501] "AAACCTGCACATCTTT-3" "AAACGGGAGAGGGCTT-3" "AAACGGGAGTGACTCT-3" "AAACGGGCATGAAGTA-3" ...
##   .. ..$ SampleLab   : Factor w/ 4 levels "d10_All","d10_Tet",..: 3 3 3 3 3 3 3 3 3 3 ...
##   .. ..$ Study       : Factor w/ 4 levels "d10_All","d10_Tet",..: 3 3 3 3 3 3 3 3 3 3 ...
##   .. ..$ nCount_RNA  : num [1:1501] 11650 22912 4573 5073 9942 ...
##   .. ..$ nFeature_RNA: int [1:1501] 2648 3963 1603 1663 2504 2551 2116 1469 1228 2159 ...
##   .. ..$ percent.ribo: num [1:1501] 33.6 39.8 33.3 30.2 28.8 ...
##   .. ..$ percent.mito: num [1:1501] 2.44 2.99 2.32 2.17 2.04 ...
##   ..@ active.assay: chr "RNA"
##   ..@ active.ident: Factor w/ 1 level "d20_Tet_1": 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "names")= chr [1:1501] "MI-AAACCTGCACATCTTT-3" "MI-AAACGGGAGAGGGCTT-3" "MI-AAACGGGAGTGACTCT-3" "MI-AAACGGGCATGAAGTA-3" ...
##   ..@ graphs      : list()
##   ..@ neighbors   : list()
##   ..@ reductions  : list()
##   ..@ project.name: chr "SingleCellExperiment"
##   ..@ misc        : list()
##   ..@ version     :Classes 'package_version', 'numeric_version'  hidden list of 1
##   .. ..$ : int [1:3] 3 1 1
##   ..@ commands    :List of 2
##   .. ..$ NormalizeData.RNA       :Formal class 'SeuratCommand' [package "Seurat"] with 5 slots
## Warning: Not a validObject(): no slot of name "assay.used" for this object of
## class "SeuratCommand"
##   .. .. .. ..@ name       : chr "NormalizeData.RNA"
##   .. .. .. ..@ time.stamp : POSIXct[1:1], format: "2019-12-09 18:42:08"
##   .. .. .. ..@ call.string: chr "NormalizeData(miller.seurat, verbose = FALSE)"
##   .. .. .. ..@ params     :List of 5
##   .. .. .. .. ..$ assay               : chr "RNA"
##   .. .. .. .. ..$ normalization.method: chr "LogNormalize"
##   .. .. .. .. ..$ scale.factor        : num 10000
##   .. .. .. .. ..$ margin              : num 1
##   .. .. .. .. ..$ verbose             : logi FALSE
##   .. .. .. ..@ NA         : NULL
##   .. ..$ FindVariableFeatures.RNA:Formal class 'SeuratCommand' [package "Seurat"] with 5 slots
## Warning: Not a validObject(): no slot of name "assay.used" for this object of
## class "SeuratCommand"
##   .. .. .. ..@ name       : chr "FindVariableFeatures.RNA"
##   .. .. .. ..@ time.stamp : POSIXct[1:1], format: "2019-12-09 18:42:11"
##   .. .. .. ..@ call.string: chr [1:2] "FindVariableFeatures(miller.seurat, selection.method = \"vst\", " "    nfeatures = 2000, verbose = FALSE)"
##   .. .. .. ..@ params     :List of 12
##   .. .. .. .. ..$ assay              : chr "RNA"
##   .. .. .. .. ..$ selection.method   : chr "vst"
##   .. .. .. .. ..$ loess.span         : num 0.3
##   .. .. .. .. ..$ clip.max           : chr "auto"
##   .. .. .. .. ..$ mean.function      :function (mat, display_progress)  
##   .. .. .. .. ..$ dispersion.function:function (mat, display_progress)  
##   .. .. .. .. ..$ num.bin            : num 20
##   .. .. .. .. ..$ binning.method     : chr "equal_width"
##   .. .. .. .. ..$ nfeatures          : num 2000
##   .. .. .. .. ..$ mean.cutoff        : num [1:2] 0.1 8
##   .. .. .. .. ..$ dispersion.cutoff  : num [1:2] 1 Inf
##   .. .. .. .. ..$ verbose            : logi FALSE
##   .. .. .. ..@ NA         : NULL
##   ..@ tools       : list()
##   ..@ NA          : NULL
length(x = data@assays$RNA@var.features)
## [1] 1744
table(Idents(data)) #number of cells
## 
## d20_Tet_1 
##      1501
head(data@meta.data) #checking the QC stats
##                       Sample            Barcode SampleLab     Study nCount_RNA
## MI-AAACCTGCACATCTTT-3      3 AAACCTGCACATCTTT-3 d20_Tet_1 d20_Tet_1      11650
## MI-AAACGGGAGAGGGCTT-3      3 AAACGGGAGAGGGCTT-3 d20_Tet_1 d20_Tet_1      22912
## MI-AAACGGGAGTGACTCT-3      3 AAACGGGAGTGACTCT-3 d20_Tet_1 d20_Tet_1       4573
## MI-AAACGGGCATGAAGTA-3      3 AAACGGGCATGAAGTA-3 d20_Tet_1 d20_Tet_1       5073
## MI-AAACGGGGTAGGCATG-3      3 AAACGGGGTAGGCATG-3 d20_Tet_1 d20_Tet_1       9942
## MI-AAACGGGGTTTAGCTG-3      3 AAACGGGGTTTAGCTG-3 d20_Tet_1 d20_Tet_1       9817
##                       nFeature_RNA percent.ribo percent.mito
## MI-AAACCTGCACATCTTT-3         2648     33.63660     2.437559
## MI-AAACGGGAGAGGGCTT-3         3963     39.76871     2.989308
## MI-AAACGGGAGTGACTCT-3         1603     33.30418     2.317953
## MI-AAACGGGCATGAAGTA-3         1663     30.15967     2.168342
## MI-AAACGGGGTAGGCATG-3         2504     28.75390     2.041637
## MI-AAACGGGGTTTAGCTG-3         2551     34.60990     2.250968
data=FindVariableFeatures(data)
## Warning in FindVariableFeatures.Assay(object = assay.data, selection.method =
## selection.method, : selection.method set to 'vst' but count slot is empty; will
## use data slot instead
head(x = HVFInfo(object = data))
##                     mean   variance variance.standardized
## Mrpl15        0.66420523 0.35258141             1.0431836
## Lypla1        0.27352847 0.17836753             0.9827593
## Tcea1         0.26772365 0.17596125             0.9868403
## Atp6v1h       0.11313238 0.08576263             1.0307847
## Rb1cc1        0.09377432 0.06946782             0.9961895
## 4732440D04Rik 0.01958208 0.01607741             1.0474994
VlnPlot(data, features = c("nFeature_RNA","nCount_RNA","percent.mito","percent.ribo"),
pt.size = 0.1,ncol=4)

#removing the outliers
data <- subset(data, subset = percent.mito < 4 & percent.mito>1)

# check number of cells
ncol(data)
## [1] 1436
qc.metrics<-as_tibble(
data[[c("nCount_RNA","nFeature_RNA",'percent.mito')]],
rownames="Cell.Barcode")

qc.metrics %>%
arrange(percent.mito) %>%
ggplot(aes(nCount_RNA,nFeature_RNA,colour=percent.mito)) +
geom_point() +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
ggtitle("QC metrics")

#calculating features that are highly expressed in some cells

VariableFeaturePlot(object = data) 
## Warning: Transformation introduced infinite values in continuous x-axis

data<-ScaleData(data)
## Centering and scaling data matrix
s_obj = RunPCA(data, features = VariableFeatures(data))
## PC_ 1 
## Positive:  Lgmn, Tyrobp, Wfdc17, Ctsh, Fcgr3, C3ar1, C1qb, Ms4a7, Cxcl2, Lyz2 
##     Alox5ap, Blvrb, C1qc, Trem2, Spi1, Cd300c2, Cd68, Aif1, Cfp, Ifitm3 
##     C1qa, Apoe, Ms4a6c, Fcgr1, Fth1, Pf4, Sirpa, Mpeg1, Fcgr2b, Ctsl 
## Negative:  Dut, Ptma, Hmgb2, Pclaf, Nkg7, Ran, Cd3g, Ranbp1, Hspa8, Eif5a 
##     Nap1l1, Npm1, Hsp90ab1, Erh, Phgdh, Hnrnpa1, Actg1, Snrpd1, Stmn1, H2afz 
##     Eif4a1, Thy1, Srsf3, Ube2s, Slc25a5, Srsf2, Cd3d, Sub1, Sh2d2a, Tubb5 
## PC_ 2 
## Positive:  Nkg7, B2m, Tmsb4x, Cd3g, Cd52, Fxyd5, AW112010, S100a11, S100a6, Id2 
##     Srgn, Malat1, Cxcr6, S100a4, Cd3d, Ccl5, Gmfg, Klrd1, AA467197, Shisa5 
##     Ctla2a, Ms4a4b, Anxa2, Hcst, S100a10, Btg1, Cd8b1, Il1r2, Lag3, Cd8a 
## Negative:  Ran, Ptma, Hsp90ab1, Pclaf, Ranbp1, Npm1, Tuba1b, Eif5a, Ybx1, Hmgb2 
##     H2afz, Erh, Cks1b, Anp32b, Ncl, Dut, Set, Tubb5, Hsp90aa1, Selenoh 
##     Eef1g, Lyar, Ube2s, Hspd1, Snrpb, Phgdh, Nhp2, Ndufa4, Hmgn1, Rps2 
## PC_ 3 
## Positive:  H2afv, Hmgb2, Stmn1, Xist, H3f3b, Ube2c, H2afx, Cenpf, Malat1, AW112010 
##     Cd7, Birc5, Ubb, Cdkn3, Cd52, Nusap1, H2-K1, Cdc25b, Cdca8, Btg1 
##     Pimreg, Calm2, Cenpe, Ctsw, B2m, Ifngr1, Cdc20, Klf2, Tubb5, Itm2b 
## Negative:  Gzmc, Gzme, Ccl9, Gzmd, Tnfrsf9, Gzmb, Serpinb6b, Gzmf, Serpinb9b, Tnfrsf4 
##     Nebl, Mt1, Il1r2, Irf8, Il2ra, Sdf4, Ccl3, Hspe1, Ftl1, Mt2 
##     Prf1, Upp1, AA467197, Calr, Hsp90ab1, Serpinb9, Capg, Atp5g1, Eef1g, Igfbp7 
## PC_ 4 
## Positive:  Xcl1, mt-Co3, mt-Co2, Dapl1, Ndufa4, Pacsin1, Ccr7, Zfas1, Ung, Npm1 
##     Tcf7, Eef1g, mt-Atp6, mt-Cytb, mt-Nd1, mt-Nd4, mt-Nd2, Nrn1, Bcl2, Enpp2 
##     Ifi27l2a, Tbc1d4, Tspan32, Nop10, Bmyc, Crtam, Cd7, Hspa8, St6galnac3, Hsp90ab1 
## Negative:  Tubb4b, Ube2c, Birc5, Cdk1, Tubb5, Gzmb, AA467197, Arl6ip1, Ccnb1, Nusap1 
##     Cenpa, Ube2s, Cdca8, Cenpf, Tubb6, Tuba1b, Gzmf, Anxa2, Cdca3, Gzmc 
##     Cdc20, Ccna2, Lgals1, Hmmr, Cks2, Ccnb2, Smc4, Ly6a, H2afx, Cks1b 
## PC_ 5 
## Positive:  Slamf9, Clec12a, Fcgr4, Clec4b1, Pld4, H2-DMb1, H2-Eb1, Lpl, Cadm1, H2-Aa 
##     Clec4a3, H2-Ab1, Bcl2a1a, Cd300e, Marcks, Gbp2b, Cd74, Clec4a1, Mpeg1, C1qc 
##     C1qa, Ccnd1, Fcgrt, Cxcl9, Cybb, Pou2f2, Gatm, Isg15, Spi1, Gsn 
## Negative:  Pf4, Clec4d, Thbs1, Arg1, Ccl6, Hmox1, Ctsl, Apoe, Cd36, Pla2g7 
##     Lyz2, Mgst1, C5ar1, Ninj1, Fcgr2b, Cxcl3, Pdpn, Clec4n, Igf1, Prdx1 
##     Gdf15, Gstm1, Mafb, Slc7a11, Trem2, Cd14, Abca1, Gpnmb, Tmem37, Mmp8
s_obj = FindNeighbors(s_obj)
## Computing nearest neighbor graph
## Computing SNN
s_obj = FindClusters(s_obj, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1436
## Number of edges: 46465
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7479
## Number of communities: 6
## Elapsed time: 0 seconds
table(Idents(data)) 
## 
## d20_Tet_1 
##      1436
s_obj = RunTSNE(s_obj)
DimPlot (s_obj, reduction = "tsne", label = TRUE)

#exploration of the primary sources of heterogeneity in a data
DimHeatmap(s_obj, dims = 1:5, cells = 500, balanced = TRUE)