#1.加载R包

library(Seurat)
library(tidyverse)
## ─ Attaching packages ──────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ─ Conflicts ───────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(patchwork)

#2.读取10x数据,并创建Seurat对象

scRNA.counts <- Read10X("./BC20/")

scRNA <-  CreateSeuratObject(scRNA.counts ,min.cells = 3, min.features = 200) #初筛:每个基因至少在3个细胞中表达,每个基因的表达量大于200

table(scRNA@meta.data$orig.ident) #查看样本的细胞数量
## 
## SeuratProject 
##         11096

#3.细胞质控 ##计算线粒体基因比例

scRNA[["percent.mt"]] <- PercentageFeatureSet(scRNA, pattern = "^MT-")

##计算红细胞比例

HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") #命名红细胞基因
HB_m <- match(HB.genes, rownames(scRNA@assays$RNA)) #匹配红细胞基因位置
HB.genes <- rownames(scRNA@assays$RNA)[HB_m] #取出样本中的红细胞基因
HB.genes <- HB.genes[!is.na(HB.genes)] #去除NA值
scRNA[["percent.HB"] ] <- PercentageFeatureSet(scRNA, features = HB.genes)  #计算红细胞比例

##各比例可视化

head(scRNA@meta.data)
##                     orig.ident nCount_RNA nFeature_RNA percent.mt  percent.HB
## AAACCCAAGCTCACTA SeuratProject      38190         6581  2.1654883 0.002618487
## AAACCCAAGGCGCTTC SeuratProject       1026          314 57.7972710 0.000000000
## AAACCCAAGGGCATGT SeuratProject        855          551  1.1695906 0.116959064
## AAACCCAAGGTAAGAG SeuratProject        734          296 42.2343324 0.136239782
## AAACCCAAGTAGAATC SeuratProject      13192         2825  1.2507580 0.007580352
## AAACCCAAGTCTCCTC SeuratProject      22981         3957  0.9007441 0.000000000
col.num <- length(levels(scRNA@active.ident)) #??看有几个样本??

violin <- VlnPlot(scRNA,
                  features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.HB"),
                  cols = rainbow(col.num),
                  pt.size = 0.01, #不需要显示点,值为0
                  ncol = 4) +
  theme(axis.title.x=element_blank(), axis.text.x=element_blank(),axis.ticks.x=element_blank()) 

violin

#保存图片
ggsave("vlnplot_before_qc.pdf", plot = violin, width = 12, height = 6) 
ggsave("vlnplot_before_qc.png", plot = violin, width = 12, height = 6)  

#这几个指标之间的相关性。 
plot1=FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2=FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3=FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.HB")
pearplot <- CombinePlots(plots = list(plot1, plot2, plot3), nrow=1, legend="none") 

plot1

plot2

plot3

pearplot

ggsave("pearplot_before_qc.pdf",width = 12, height = 6)
##我们可以看到,nFeature_RNA的范围在0到8000之内,percent.mt代表线粒体含量
###我们默认线粒体含量至少要小于20%,这是根据生物学知识得出的默认阈值。红细胞的数目要至少小于5%
###至于nFeature_RNA和nCount_RNA的阈值怎么确定,这个要结合pearplot的图来判断。我们质控的目标就是删除离异值。而且注意阈值尽可能取的宽松一下,防止后面分析想要的细胞得不到。

##接下来从pearplot的图片来做质控—剔除离异值

#nFeature_RNA选择大于200 小于7500的,nCount_RNA选择小于100000,percent.mt小于20,percent.HB小于5 ??怎么看图
scRNA1 <- subset(scRNA, subset = nFeature_RNA > 200& nFeature_RNA < 7500 & percent.mt < 20 & percent.HB < 5 & nCount_RNA < 100000)
scRNA
## An object of class Seurat 
## 22211 features across 11096 samples within 1 assay 
## Active assay: RNA (22211 features)
scRNA1
## An object of class Seurat 
## 22211 features across 10004 samples within 1 assay 
## Active assay: RNA (22211 features)

#4.均一化

 #过滤完之后 我们就要对数据进行均一化,使用NormalizeData这个函数。
 ##注意均一化是用NormalizeData,标准化是用ScaleData
scRNA1 <- NormalizeData(scRNA1, normalization.method = "LogNormalize", scale.factor = 10000)
#好了,这一节数据加载、质控的内容就算是做完了
save(scRNA1,file='scRNA1.Rdata')