library(Seurat)
library(ggplot2)
library(Matrix)
library(stringr)
library(readr)
library(here)
library(fitdistrplus)
library(dplyr)
library(monocle)
library(reticulate)
Loading H5 file and prepare gene label
load(file = "/Users/cuihening/Downloads/chicken_heart-master/data/cc.genes.rda")
mito_genes = c("ND1", "ND2", "COX1", "COII", "ATP8", "ATP6", "COX3", "ND3", "ND4L", "ND4", "ND5", "CYTB", "ND6")
samples = c("D4", "D7_LV", "D7_RV", "D10_LV", "D10_RV", "D14_LV", "D14_RV")
cc.genes
## $s.genes
## [1] "MCM5" "PCNA" "TYMS" "FEN1" "MCM2" "MCM4"
## [7] "RRM1" "UNG" "GINS2" "MCM6" "CDCA7" "DTL"
## [13] "PRIM1" "UHRF1" "MLF1IP" "HELLS" "RFC2" "RPA2"
## [19] "NASP" "RAD51AP1" "GMNN" "WDR76" "SLBP" "CCNE2"
## [25] "UBR7" "POLD3" "MSH2" "ATAD2" "RAD51" "RRM2"
## [31] "CDC45" "CDC6" "EXO1" "TIPIN" "DSCC1" "BLM"
## [37] "CASP8AP2" "USP1" "CLSPN" "POLA1" "CHAF1B" "BRIP1"
## [43] "E2F8"
##
## $g2m.genes
## [1] "HMGB2" "CDK1" "NUSAP1" "UBE2C" "BIRC5" "TPX2" "TOP2A"
## [8] "NDC80" "CKS2" "NUF2" "CKS1B" "MKI67" "TMPO" "CENPF"
## [15] "TACC3" "FAM64A" "SMC4" "CCNB2" "CKAP2L" "CKAP2" "AURKB"
## [22] "BUB1" "KIF11" "ANP32E" "TUBB4B" "GTSE1" "KIF20B" "HJURP"
## [29] "CDCA3" "HN1" "CDC20" "TTK" "CDC25C" "KIF2C" "RANGAP1"
## [36] "NCAPD2" "DLGAP5" "CDCA2" "CDCA8" "ECT2" "KIF23" "HMMR"
## [43] "AURKA" "PSRC1" "ANLN" "LBR" "CKAP5" "CENPE" "CTCF"
## [50] "NEK2" "G2E3" "GAS2L3" "CBX5" "CENPA"
Function to prepare data set. Read10X_h5 is used to read
dataset(for h5 file), Read10X for other file.
prepare_datasets <- function(sample_name, cc.genes, mito_genes){
# Read data
data.dir = paste("/Users/cuihening/Downloads/GSE149457_RAW/chicken_heart_scRNAseq_", sample_name, "_filtered_feature_bc_matrix.h5", sep = "")
data <- Read10X_h5(data.dir)
seurat.object <- CreateSeuratObject(counts = data, min.cells = 1, min.features = 1, project = sample_name)
# Mitrocondria
seurat.object$percent.mito <- PercentageFeatureSet(object = seurat.object, features = mito_genes)
return(seurat.object)
}
Prepare dataset
day4 = prepare_datasets(samples[1], cc.genes, mito_genes)
day7_lv = prepare_datasets(samples[2], cc.genes, mito_genes)
day7_rv = prepare_datasets(samples[3], cc.genes, mito_genes)
day10_lv = prepare_datasets(samples[4], cc.genes, mito_genes)
day10_rv = prepare_datasets(samples[5], cc.genes, mito_genes)
day14_lv = prepare_datasets(samples[6], cc.genes, mito_genes)
day14_rv = prepare_datasets(samples[7], cc.genes, mito_genes)
dim(day4)
## [1] 17210 7755
dim(day7_lv)
## [1] 16049 6413
dim(day7_rv)
## [1] 16641 5278
dim(day10_lv)
## [1] 13370 3048
dim(day10_rv)
## [1] 13338 3046
dim(day14_lv)
## [1] 12324 1988
dim(day14_rv)
## [1] 12069 2342
Save object
save.image("/Users/cuihening/Desktop/robjs/all.objs.RData")
load("/Users/cuihening/Desktop/robjs/all.objs.RData")
Merge the dataset
chicken = merge(day4, y = c(day7_lv, day7_rv, day10_lv, day10_rv, day14_lv, day14_rv), add.cell.ids = samples, project = "ChickenEmbryo")
dim(chicken)
## [1] 18797 29870
table(chicken$orig.ident)
##
## D10_LV D10_RV D14_LV D14_RV D4 D7_LV D7_RV
## 3048 3046 1988 2342 7755 6413 5278
Vlnplot: Draws a violin plot of single cell data
FeatureScatter: Creates a scatter plot of two features,
across a set of single cells. Cells are colored by their identity class.
Pearson correlation between the two features is displayed above the
plot.
VlnPlot(object = chicken, features = c("nCount_RNA", "nFeature_RNA", "percent.mito"), pt.size = 0.01, group.by = "orig.ident")
FeatureScatter(object = chicken, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(object = chicken, feature1 = "nFeature_RNA", feature2 = "percent.mito") + geom_hline(yintercept = 20) + geom_vline(xintercept = 200)
PS: delete use.raw=TRUE,
do.return=TRUE.
Data cleaning, filter out cells with few reads and few genes
chicken <- subset(chicken, subset = nFeature_RNA >= 200 & percent.mito <= 20)
dim(chicken)
## [1] 18797 22315
Assign cell cycle score
chicken <- CellCycleScoring(object = chicken, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes)
chicken <- NormalizeData(object = chicken, scale.factor = 1e6)
chicken <- FindVariableFeatures(object = chicken, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
chicken <- ScaleData(object = chicken)
## Centering and scaling data matrix
dim(GetAssayData(chicken, assay = "RNA", slot = "scale.data"))
## [1] 2000 22315
chicken <- RunPCA(object = chicken, assay = "RNA")
## PC_ 1
## Positive: MYH9, PODXL, SPTBN1, HSP90B1, ENSGALG00000012659, SYNE2, MYH10, CCDC80, PDIA6, COL9A2
## TKT, FN1, TXNDC5, PHLDB2, ENSGALG00000043598, CDH11, HYOU1, TM7SF2, MFAP2, AGRN
## AKAP9, LMO4, GPC1, HSPA5, ENSGALG00000013239, ENSGALG00000036956, FSTL1, GATA4, LIMA1, GOLGB1
## Negative: MYL3, FABP3, ENSGALG00000028183, TNNI1, MYL2, TNNT2, MYL10, TNNC1, EEF1A2, HSPB1
## ADCYAP1, ACTC1, ACTA1, TRIM55, TPM4, PLN, SMPX, CSRP3, FHL2, MYH15
## MYBPC3, ACTN2, APOBEC2, ENSGALG00000016826, NEBL.1, CTNNA3, NPPA, NKX2-5, ENSGALG00000016127, IRX4
## PC_ 2
## Positive: CKB, NEXN, ADCYAP1, MYBPC3, TRIM55, MYH15, TNNT2, EEF1A2, FKBP1B, CSRP3
## RBM24, IRX4, LDHB, ACTN2, MYL9, ENSGALG00000016826, APOBEC2, TNNI1, MDH1, MGST3
## HSPB1, NEBL.1, ENSGALG00000028183, RCAN2, CYCS, SMPX, FHL2, TPM4, ENSGALG00000020788, NKX2-5
## Negative: POSTN, DCN, FOS, COL1A1, ACTB, COL3A1, COL5A1, FN1, SPARC, CFD
## ENG, IFITM3, THBS1, HIC1, KLF2, ND2, AQP1, CXCL12, COL6A3, COL14A1
## KLF6, BRT-1, COL6A2, LRRC17, COL1A2, NFKBIA, CCL4.1, RARRES2, NFKBIZ, RAMP2
## PC_ 3
## Positive: SLC4A1, EPB42, TUBB1, SUSD3, ENSGALG00000009552, TSPO2, SLC25A37, ENSGALG00000047208, STEAP3, ABCB10
## ENSGALG00000053241, ALDOB, SLC30A10, GLRX, CA13, ENSGALG00000045740, ITPK1, HBE1, ENSGALG00000014736, RFESD
## FBP1, TFR2, FECH, CCND3, GSTA4L, ENSGALG00000023936, ENSGALG00000029521, SLC43A3, NECAB2, ENSGALG00000036805
## Negative: TMSB4X, ALDOC, SPARC, COL1A1, DKK3, IGFBP7, CRIP1, MYL9, COL5A1, DCN
## MDK, STMN1, GATA5, COL3A1, COL1A2, CDH2, TPM1, NREP, DES, MYL6
## S100A11, FHL2, CNN3, SPTBN1, CSRP1, GPC1, TGFB2, POSTN, HSPB1, MYL10
## PC_ 4
## Positive: COL1A1, COL3A1, TCF21, CDH11, SLIT1, TGFB2, LRRC17, FBLN1, RSPO3, COL9A2
## CSRP2, CNMD, THBS1, DCN, PTN, CHODL, CTHRC1, CXCL12, C1H2ORF40, COL1A2
## PRRX1, MOXD1, COL9A3, COL6A3, METRNL, ACTA2, ENSGALG00000011687, PTX3, CTGF, NREP
## Negative: MAFB, CDH5, RARRES1, SHE, ENSGALG00000046550, STAB1, ENSGALG00000024379, ENSGALG00000002012, RAMP2, CD34
## KDR, PECAM1, FHL1, SH3TC1, AFAP1L1, NOS3, ASS1, HPGD, TEK, ENSGALG00000045684
## JCAD, ACE, CD93, ARHGEF9, NPC2, MAOA, MYCT1, KLHL4, NRK, ENSGALG00000007646
## PC_ 5
## Positive: ENSGALG00000046550, CDH5, ENSGALG00000024379, CD34, FHL1, JCAD, ENSGALG00000002012, KDR, PECAM1, TEK
## RAMP2, SHE, NOS3, HPGD, AFAP1L1, ENSGALG00000045684, SH3TC1, DIPK2B, ACE, CAVIN1
## APOLD1, SYNM, LMCD1, C5H11ORF96, SPON1, PLPP3, RARRES1, KLHL4, NPR3, IRX6
## Negative: LCP1, CSF1R, LY86, GSTA3, LAPTM5, IFI30, CTSS, SPI1, MYO1F, ENSGALG00000047587
## PTPN6, CCL4, TIMD4, CD83, P2RY13, PTPRC, ENSGALG00000014585, ENSGALG00000047864, DCLK3, Ii
## CD48, HAVCR1, RGS18, FYB, HMOX1, CCAH221, RAC2, C1QB, MERTK, C1QC
FeaturePlot(chicken, reduction = "pca", c("nCount_RNA", "nFeature_RNA"))
Run the elbow plot to determine the significant dimensions.
ElbowPlot(chicken, reduction = "pca")
n.pcs = 20
chicken <- FindNeighbors(object = chicken, assay = "RNA", reduction = "pca", dims = 1:n.pcs, force.recalc = TRUE)
## Computing nearest neighbor graph
## Computing SNN
chicken <- FindClusters(object = chicken, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 22315
## Number of edges: 796950
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9318
## Number of communities: 20
## Elapsed time: 3 seconds
table(Idents(chicken))
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3657 2400 2116 1886 1644 1324 1309 1093 960 960 927 791 754 673 557 416
## 16 17 18 19
## 370 286 122 70
Clustered on the RNA assay using the shared near neighbor(SNN) graph; the “0.5” bit indicates that you clustered at a resolution of 0.5
chicken <- RunUMAP(object = chicken, assay = "RNA", reduction = "pca", dims = 1:n.pcs)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 06:26:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 06:26:55 Read 22315 rows and found 20 numeric columns
## 06:26:55 Using Annoy for neighbor search, n_neighbors = 30
## 06:26:55 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 06:26:57 Writing NN index file to temp file /var/folders/l7/frm5yjw16mldyr4r2fldvrk80000gn/T//RtmpMPMVBV/file134041c57ebe5
## 06:26:57 Searching Annoy index using 1 thread, search_k = 3000
## 06:27:03 Annoy recall = 100%
## 06:27:03 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 06:27:05 Initializing from normalized Laplacian + noise (using irlba)
## 06:27:06 Commencing optimization for 200 epochs, with 946378 positive edges
## 06:27:19 Optimization finished
DimPlot(chicken, reduction = "umap", label = TRUE, group.by = "RNA_snn_res.0.5")
DimPlot(chicken, reduction = "umap", group.by = "orig.ident")
save(chicken, file="/Users/cuihening/Desktop/robjs/chicken_raw.Robj")