ELeFHAnt_Tutorial

library(ELeFHAnt)
#> Loading required package: Seurat
#> Attaching SeuratObject
#> Attaching sp
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
#> Loading required package: tidyr
#> Loading required package: ggplot2
#> Loading required package: class
#> Loading required package: splitstackshape
#> Loading required package: fgsea
#> Loading required package: tibble
#> Loading required package: msigdbr
#> Loading required package: scPred
#> Loading required package: magrittr
#> 
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:tidyr':
#> 
#>     extract
#> Loading required package: harmony
#> Loading required package: Rcpp
#> Loading required package: scater
#> Loading required package: SingleCellExperiment
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats
#> 
#> Attaching package: 'matrixStats'
#> The following object is masked from 'package:dplyr':
#> 
#>     count
#> 
#> Attaching package: 'MatrixGenerics'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
#>     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
#>     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
#>     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
#>     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
#>     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
#>     colWeightedMeans, colWeightedMedians, colWeightedSds,
#>     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
#>     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
#>     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
#>     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
#>     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
#>     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
#>     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
#>     rowWeightedSds, rowWeightedVars
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:dplyr':
#> 
#>     combine, intersect, setdiff, union
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#>     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#>     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#>     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#>     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#>     union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:tidyr':
#> 
#>     expand
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, rename
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> Loading required package: IRanges
#> 
#> Attaching package: 'IRanges'
#> The following objects are masked from 'package:dplyr':
#> 
#>     collapse, desc, slice
#> The following object is masked from 'package:sp':
#> 
#>     %over%
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> 
#> Attaching package: 'Biobase'
#> The following object is masked from 'package:MatrixGenerics':
#> 
#>     rowMedians
#> The following objects are masked from 'package:matrixStats':
#> 
#>     anyMissing, rowMedians
#> 
#> Attaching package: 'SummarizedExperiment'
#> The following object is masked from 'package:SeuratObject':
#> 
#>     Assays
#> The following object is masked from 'package:Seurat':
#> 
#>     Assays
#> Loading required package: parsnip
#> Loading required package: ranger
#> Loading required package: LiblineaR
#> Loading required package: caTools
#> 
#> Attaching package: 'caTools'
#> The following object is masked from 'package:IRanges':
#> 
#>     runmean
#> The following object is masked from 'package:S4Vectors':
#> 
#>     runmean
#> Loading required package: reshape2
#> 
#> Attaching package: 'reshape2'
#> The following object is masked from 'package:tidyr':
#> 
#>     smiths
#> Loading required package: biomaRt
#> Warning: replacing previous import 'biomaRt::select' by 'dplyr::select' when
#> loading 'ELeFHAnt'
#> Warning: replacing previous import 'magrittr::extract' by 'tidyr::extract' when
#> loading 'ELeFHAnt'
library(Matrix)
#> 
#> Attaching package: 'Matrix'
#> The following object is masked from 'package:S4Vectors':
#> 
#>     expand
#> The following objects are masked from 'package:tidyr':
#> 
#>     expand, pack, unpack

# Load the example reference and query PBMC objects into the session and
# give them shorter working names.
data("reference_PBMC")
data("query_PBMC")
reference <- reference_PBMC
query <- query_PBMC

# Preprocess the query: normalize, pick variable features, then scale.
query <- NormalizeData(query)
query <- FindVariableFeatures(query)
query <- ScaleData(query)
#> Centering and scaling data matrix
# Run PCA on the query; the top-loading genes per component are printed.
query <- RunPCA(query)
#> PC_ 1 
#> Positive:  CST3, AIF1, LST1, FTL, FTH1, TYMP, TYROBP, CFD, FCER1G, SERPINA1 
#>     FCN1, LYZ, CTSS, IFITM3, S100A9, LGALS1, COTL1, PSAP, IFI30, S100A11 
#>     NPC2, CFP, SAT1, RP11-290F20.3, S100A8, PYCARD, S100A6, PILRA, LGALS2, CEBPB 
#> Negative:  IL32, LTB, CD3E, LDHB, CTSW, GZMM, CD2, IL7R, CCL5, CD247 
#>     ACAP1, CST7, GZMA, STK17A, NKG7, CD27, PRF1, HOPX, GIMAP5, NOSIP 
#>     AQP3, GZMK, NCR3, FGFBP2, LYAR, KLRG1, SAMD3, CD8B, ETS1, GZMB 
#> PC_ 2 
#> Positive:  PF4, SDPR, GNG11, PPBP, SPARC, GP9, TUBB1, HIST1H2AC, CLU, AP001189.4 
#>     PTCRA, ITGA2B, NRGN, RGS18, CD9, TMEM40, MMD, CA2, ACRBP, TREML1 
#>     F13A1, SEPT5, TSC22D1, PTGS1, CMTM5, LY6G6F, GP1BA, RP11-367G6.3, MYL9, RUFY1 
#> Negative:  RPS2, TMSB10, CYBA, NKG7, S100A4, GZMA, CST7, PRF1, CTSW, GNLY 
#>     FGFBP2, CD247, EIF4A1, GZMB, GZMM, ID2, IFITM2, GZMH, SPON2, ANXA1 
#>     CCL4, FCGR3A, PFN1, APOBEC3G, RBM3, S100A10, GIMAP7, IGFBP7, HOPX, CLIC3 
#> PC_ 3 
#> Positive:  NKG7, PRF1, GZMB, CST7, GZMA, FGFBP2, GNLY, CTSW, SPON2, CD247 
#>     GZMH, GZMM, CCL5, CCL4, FCGR3A, SRGN, CLIC3, AKR1C3, XCL2, PFN1 
#>     ACTB, IGFBP7, TTC38, HOPX, APMAP, SH3BGRL3, RHOC, ID2, ARPC5L, ANXA1 
#> Negative:  CD79A, MS4A1, HLA-DRA, HLA-DQB1, TCL1A, HLA-DQA1, RPS2, HLA-DRB1, CD74, CD79B 
#>     LTB, HLA-DPB1, HLA-DMA, HLA-DRB5, HLA-DPA1, HLA-DQA2, FCER2, LY86, HVCN1, SNHG7 
#>     KIAA0125, P2RX5, IRF8, CD19, QRSL1, SWAP70, IGLL5, FCGR2B, C6orf48, POU2AF1 
#> PC_ 4 
#> Positive:  S100A4, S100A8, TMSB4X, S100A6, S100A9, CD14, GIMAP7, FCN1, IL32, RBP7 
#>     LGALS2, S100A11, CD3E, TYROBP, ANXA1, LYZ, S100A12, IL7R, MS4A6A, GZMM 
#>     GIMAP4, FTL, CFD, LGALS1, S100A10, NOSIP, CD2, AIF1, FYB, TIMP1 
#> Negative:  HLA-DQA1, KIAA0101, TYMS, CD79A, HLA-DQB1, RRM2, TK1, CD74, CD79B, GINS2 
#>     MS4A1, HLA-DQA2, MKI67, HLA-DPB1, ZWINT, HLA-DRA, MYBL2, HLA-DRB1, BIRC5, HLA-DPA1 
#>     HLA-DRB5, KIFC1, TCL1A, CLSPN, HLA-DMA, CENPM, MZB1, AURKB, STMN1, NUSAP1 
#> PC_ 5 
#> Positive:  LDHB, VIM, IL7R, CD3E, IL32, AQP3, NOSIP, CD27, RPS2, CD2 
#>     FYB, GIMAP7, CD40LG, RRM2, KIAA0101, S100A10, LTB, TYMS, GIMAP4, TK1 
#>     ZWINT, MKI67, PPA1, LDLRAP1, GIMAP5, BIRC5, GINS2, GAPDH, TRADD, COTL1 
#> Negative:  GZMB, FGFBP2, CD79B, CD79A, GNLY, TCL1A, SPON2, PRF1, MS4A1, CD74 
#>     HLA-DQA1, NKG7, CCL4, HLA-DQB1, HLA-DPB1, CLIC3, HLA-DPA1, HLA-DRA, CST7, HLA-DRB1 
#>     IGFBP7, PLAC8, TTC38, AKR1C3, GZMA, FCGR3A, XCL2, HLA-DRB5, FCER2, APMAP
# Compute the UMAP embedding from dimensions 1-20 of the default reduction
# (presumably the PCA computed above).
query <- RunUMAP(query, dims = 1:20)
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 21:03:14 UMAP embedding parameters a = 0.9922 b = 1.112
#> 21:03:14 Read 1358 rows and found 20 numeric columns
#> 21:03:14 Using Annoy for neighbor search, n_neighbors = 30
#> 21:03:14 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 21:03:14 Writing NN index file to temp file /var/folders/bw/whg3swn15jb08_f7v2y09xw9glk1wg/T//RtmpeqmOHt/file804e5c5b3e7a
#> 21:03:14 Searching Annoy index using 1 thread, search_k = 3000
#> 21:03:15 Annoy recall = 100%
#> 21:03:15 Commencing smooth kNN distance calibration using 1 thread
#> 21:03:16 Initializing from normalized Laplacian + noise
#> 21:03:16 Commencing optimization for 500 epochs, with 54848 positive edges
#> 21:03:19 Optimization finished

# Annotate the query cells using the reference labels stored in the
# "Celltype" column. Per the log output, the per-classifier and ensemble
# predictions are written to the query metadata.
out.CelltypeAnnotation <- CelltypeAnnotation(
  reference = reference,
  query = query,
  downsample = TRUE,
  downsample_to = 1000,
  validatePredictions = FALSE,
  annotationCol = "Celltype"
)
#> Setting Assay of reference and query to RNA
#> Running Diagonistis on reference and query
#> Number of cells in reference:2019
#> Number of cells in query:1358
#> Downsampling reference
#> Number of cells in reference after downsampling per celltype:2019
#> Calculating ratio of number of cells in downsampled reference vs query
#> Ratio of number of cells in query vs downsampled reference:0.672610203070827
#> Centering and scaling data matrix
#> Centering and scaling data matrix
#> Finding common variable features between reference and query
#> Subsetting reference and query for common variable features
#> Preparing train and test datasets from reference and query
#> Scaling reference to obtain training set
#> Scaling query to obtain test set
#> 
#> Setting up three classifiers: randomForest, SVM and LR
#> Initializing randomForest
#> randomForest Complete
#> Initializing SVM
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 2.949909
#> #nonzeros/#features = 51/493
#> ..*.*
#> optimization finished, #iter = 39
#> Objective value = 22.066079
#> #nonzeros/#features = 188/493
#> ..*.*
#> optimization finished, #iter = 35
#> Objective value = 8.864476
#> #nonzeros/#features = 117/493
#> ..*.*
#> optimization finished, #iter = 33
#> Objective value = 22.589678
#> #nonzeros/#features = 186/493
#> ..*.*
#> optimization finished, #iter = 31
#> Objective value = 7.711503
#> #nonzeros/#features = 96/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 13.829589
#> #nonzeros/#features = 198/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 3.922960
#> #nonzeros/#features = 69/493
#> ....**.
#> optimization finished, #iter = 50
#> Objective value = 21.344268
#> #nonzeros/#features = 208/493
#> ..*.*
#> optimization finished, #iter = 32
#> Objective value = 15.986209
#> #nonzeros/#features = 154/493
#> ..*.*.
#> optimization finished, #iter = 40
#> Objective value = 19.200190
#> #nonzeros/#features = 159/493
#> .**
#> optimization finished, #iter = 18
#> Objective value = 4.516412
#> #nonzeros/#features = 50/493
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> SVM Complete
#> Initializing LR
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .
#> optimization finished, #iter = 13
#> Objective value = -3.992689
#> ....
#> optimization finished, #iter = 40
#> Objective value = -18.939503
#> ...
#> optimization finished, #iter = 34
#> Objective value = -10.360630
#> ....
#> optimization finished, #iter = 40
#> Objective value = -19.051939
#> ..
#> optimization finished, #iter = 28
#> Objective value = -9.799596
#> ...
#> optimization finished, #iter = 36
#> Objective value = -12.786152
#> ..
#> optimization finished, #iter = 23
#> Objective value = -7.560311
#> ....
#> optimization finished, #iter = 43
#> Objective value = -18.135476
#> ..
#> optimization finished, #iter = 28
#> Objective value = -15.249190
#> ..
#> optimization finished, #iter = 27
#> Objective value = -17.553502
#> ..
#> optimization finished, #iter = 23
#> Objective value = -10.646397
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> LR Complete
#> 
#> Classifying cells in query using each classifier
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_RF_CD14..Monocyte.ProbabilityELeFHAnt_RF_CD19..B.ProbabilityELeFHAnt_RF_CD34..ProbabilityELeFHAnt_RF_CD4..T.Helper2.ProbabilityELeFHAnt_RF_CD4..CD25.T.Reg.ProbabilityELeFHAnt_RF_CD4..CD45RA..CD25..Naive.T.ProbabilityELeFHAnt_RF_CD4..CD45RO..Memory.ProbabilityELeFHAnt_RF_CD56..NK.ProbabilityELeFHAnt_RF_CD8..Cytotoxic.T.ProbabilityELeFHAnt_RF_CD8..CD45RA..Naive.Cytotoxic.ProbabilityELeFHAnt_RF_Dendritic.Probability;
#> see ?make.names for more details on syntax validity
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_SVM_CD34..Decision.ValuesELeFHAnt_SVM_CD4..CD45RO..Memory.Decision.ValuesELeFHAnt_SVM_CD14..Monocyte.Decision.ValuesELeFHAnt_SVM_CD4..CD25.T.Reg.Decision.ValuesELeFHAnt_SVM_CD56..NK.Decision.ValuesELeFHAnt_SVM_Dendritic.Decision.ValuesELeFHAnt_SVM_CD19..B.Decision.ValuesELeFHAnt_SVM_CD4..CD45RA..CD25..Naive.T.Decision.ValuesELeFHAnt_SVM_CD8..Cytotoxic.T.Decision.ValuesELeFHAnt_SVM_CD8..CD45RA..Naive.Cytotoxic.Decision.ValuesELeFHAnt_SVM_CD4..T.Helper2.Decision.Values;
#> see ?make.names for more details on syntax validity
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_LR_CD34..ProbabilityELeFHAnt_LR_CD4..CD45RO..Memory.ProbabilityELeFHAnt_LR_CD14..Monocyte.ProbabilityELeFHAnt_LR_CD4..CD25.T.Reg.ProbabilityELeFHAnt_LR_CD56..NK.ProbabilityELeFHAnt_LR_Dendritic.ProbabilityELeFHAnt_LR_CD19..B.ProbabilityELeFHAnt_LR_CD4..CD45RA..CD25..Naive.T.ProbabilityELeFHAnt_LR_CD8..Cytotoxic.T.ProbabilityELeFHAnt_LR_CD8..CD45RA..Naive.Cytotoxic.ProbabilityELeFHAnt_LR_CD4..T.Helper2.Probability;
#> see ?make.names for more details on syntax validity
#> 
#> Obtaing Ensemble Predictions using RF, SVM and LR
#> 
#> Celltype predictions are stored in query metadata. Please see: ELeFHAnt_RF_CelltypePrediction, ELeFHAnt_SVM_CelltypePrediction, ELeFHAnt_LR_CelltypePrediction, ELeFHAnt_Ensemble_CelltypePrediction
#> Ensembl celltype annotation completed.

# UMAP plots of the annotated query, one per grouping column: the Seurat
# clusters, then the ensemble, RF, SVM and LR predictions.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
p1 <- DimPlot(
  out.CelltypeAnnotation,
  group.by = "seurat_clusters",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()
p2 <- DimPlot(
  out.CelltypeAnnotation,
  group.by = "ELeFHAnt_Ensemble_CelltypePrediction",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()
p3 <- DimPlot(
  out.CelltypeAnnotation,
  group.by = "ELeFHAnt_RF_CelltypePrediction",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()
p4 <- DimPlot(
  out.CelltypeAnnotation,
  group.by = "ELeFHAnt_SVM_CelltypePrediction",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()
p5 <- DimPlot(
  out.CelltypeAnnotation,
  group.by = "ELeFHAnt_LR_CelltypePrediction",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()

# Only the cluster plot is displayed here; p2-p5 are built but not printed.
p1

# Compare the cell-type labels of the two datasets; both use the "Celltype"
# metadata column as their annotation.
out.DR <- DeduceRelationship(
  reference1 = reference,
  reference2 = query,
  downsample = TRUE,
  downsample_to = 1000,
  selectvarfeatures = 2000,
  ntree = 500,
  annotationCol_ref1 = "Celltype",
  annotationCol_ref2 = "Celltype"
)
#> Setting Assay of reference1 and reference2 to RNA
#> Number of cells in reference1:2019
#> Number of cells in reference2:1358
#> Centering and scaling data matrix
#> Centering and scaling data matrix
#> Number of cells in reference1 after downsampling:2019
#> Number of cells in reference2 after downsampling:1358
#> Finding common variable features between reference and query
#> Subsetting reference1 and reference2 for common variable features
#> Preparing train and test datasets from reference1 and reference2
#> Scaling reference1 to obtain training set
#> Scaling reference2 to obtain test set
#> 
#> Setting up three classifiers: randomForest, SVM and LR
#> Initializing randomForest
#> randomForest Complete
#> Initializing SVM
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 2.949909
#> #nonzeros/#features = 51/493
#> ..*.*
#> optimization finished, #iter = 39
#> Objective value = 22.066079
#> #nonzeros/#features = 188/493
#> ..*.*
#> optimization finished, #iter = 35
#> Objective value = 8.864476
#> #nonzeros/#features = 117/493
#> ..*.*
#> optimization finished, #iter = 33
#> Objective value = 22.589678
#> #nonzeros/#features = 186/493
#> ..*.*
#> optimization finished, #iter = 31
#> Objective value = 7.711503
#> #nonzeros/#features = 96/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 13.829589
#> #nonzeros/#features = 198/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 3.922960
#> #nonzeros/#features = 69/493
#> ....**.
#> optimization finished, #iter = 50
#> Objective value = 21.344268
#> #nonzeros/#features = 208/493
#> ..*.*
#> optimization finished, #iter = 32
#> Objective value = 15.986209
#> #nonzeros/#features = 154/493
#> ..*.*.
#> optimization finished, #iter = 40
#> Objective value = 19.200190
#> #nonzeros/#features = 159/493
#> .**
#> optimization finished, #iter = 18
#> Objective value = 4.516412
#> #nonzeros/#features = 50/493
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> SVM Complete
#> Initializing LR
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .
#> optimization finished, #iter = 13
#> Objective value = -3.992689
#> ....
#> optimization finished, #iter = 40
#> Objective value = -18.939503
#> ...
#> optimization finished, #iter = 34
#> Objective value = -10.360630
#> ....
#> optimization finished, #iter = 40
#> Objective value = -19.051939
#> ..
#> optimization finished, #iter = 28
#> Objective value = -9.799596
#> ...
#> optimization finished, #iter = 36
#> Objective value = -12.786152
#> ..
#> optimization finished, #iter = 23
#> Objective value = -7.560311
#> ....
#> optimization finished, #iter = 43
#> Objective value = -18.135476
#> ..
#> optimization finished, #iter = 28
#> Objective value = -15.249190
#> ..
#> optimization finished, #iter = 27
#> Objective value = -17.553502
#> ..
#> optimization finished, #iter = 23
#> Objective value = -10.646397
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> LR Complete
#> 
#> Classifying cells in query using each classifier and Generating scaled confusion matrix
#> 
#> Using Relative Similarity from Normalized Confusion matrices to generate Reference1 vs Reference2 similarity

# Display the DeduceRelationship() result.
out.DR

# Copy the labels into a "Celltypes" column in both objects; the
# LabelHarmonization() call below is given annotationCol = "Celltypes".
reference$Celltypes <- reference$Celltype
query$Celltypes <- query$Celltype

# Integrate the two objects and harmonize their "Celltypes" labels.
# perform_integration = TRUE with integrated.atlas = NULL: the log output
# shows integration being run here rather than an existing atlas being used.
out.LH <- LabelHarmonization(
  seurat.objects = c(reference, query),
  perform_integration = TRUE,
  integrated.atlas = NULL,
  downsample = TRUE,
  downsample_to = 1000,
  npcs = 30,
  resolution = 0.8,
  validatePredictions = FALSE,
  selectanchorfeatures = 2000,
  ntree = 500,
  k.anchor = 5,
  k.filter = 200,
  k.score = 30,
  dims = 1:30,
  species = NULL,
  tissue = NULL,
  annotationCol = "Celltypes"
)
#> Downsampling seurat objects
#> Starting integration using Seurat Canonical Correlation Algorithm
#> Computing 2000 integration features
#> Scaling features for provided objects
#> Finding all pairwise anchors
#> Running CCA
#> Merging objects
#> Finding neighborhoods
#> Finding anchors
#>  Found 4810 anchors
#> Filtering anchors
#>  Retained 3602 anchors
#> Merging dataset 2 into 1
#> Extracting anchors for merged samples
#> Finding integration vectors
#> Finding integration vector weights
#> Integrating data
#> Integration Completed. Performing Scaling, Dimension reduction and clustering
#> 21:04:34 UMAP embedding parameters a = 0.9922 b = 1.112
#> 21:04:34 Read 3377 rows and found 30 numeric columns
#> 21:04:34 Using Annoy for neighbor search, n_neighbors = 30
#> 21:04:34 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 21:04:34 Writing NN index file to temp file /var/folders/bw/whg3swn15jb08_f7v2y09xw9glk1wg/T//RtmpeqmOHt/file804e6883f220
#> 21:04:34 Searching Annoy index using 1 thread, search_k = 3000
#> 21:04:35 Annoy recall = 100%
#> 21:04:35 Commencing smooth kNN distance calibration using 1 thread
#> 21:04:37 Initializing from normalized Laplacian + noise
#> 21:04:37 Commencing optimization for 500 epochs, with 143848 positive edges
#> 21:04:42 Optimization finished
#> Computing nearest neighbor graph
#> Computing SNN
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 3377
#> Number of edges: 168438
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8663
#> Number of communities: 14
#> Elapsed time: 0 seconds
#> Number of cells in integrated atlas:3377
#> Generating train and test datasets using stratification -- 70% for training & 30% for testing
#> Number of Anchor Features selected:2000
#> 
#> Setting up three classifiers: randomForest, SVM and LR
#> Initializing randomForest
#> randomForest Complete
#> Initializing SVM
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> ..*...**
#> optimization finished, #iter = 59
#> Objective value = 3.463748
#> #nonzeros/#features = 63/2001
#> .**.
#> optimization finished, #iter = 20
#> Objective value = 1.870714
#> #nonzeros/#features = 17/2001
#> .**
#> optimization finished, #iter = 18
#> Objective value = 8.515331
#> #nonzeros/#features = 171/2001
#> .*.*
#> optimization finished, #iter = 24
#> Objective value = 9.205882
#> #nonzeros/#features = 190/2001
#> ..*.**
#> optimization finished, #iter = 37
#> Objective value = 9.426450
#> #nonzeros/#features = 212/2001
#> ..*.**
#> optimization finished, #iter = 36
#> Objective value = 10.029644
#> #nonzeros/#features = 216/2001
#> ..*.*
#> optimization finished, #iter = 36
#> Objective value = 11.192939
#> #nonzeros/#features = 284/2001
#> ..*.*
#> optimization finished, #iter = 32
#> Objective value = 10.068798
#> #nonzeros/#features = 246/2001
#> ...*.*
#> optimization finished, #iter = 49
#> Objective value = 7.784389
#> #nonzeros/#features = 162/2001
#> **.
#> optimization finished, #iter = 10
#> Objective value = 2.089124
#> #nonzeros/#features = 18/2001
#> .*.*
#> optimization finished, #iter = 25
#> Objective value = 11.347513
#> #nonzeros/#features = 270/2001
#> ..*.*.*
#> optimization finished, #iter = 41
#> Objective value = 8.589253
#> #nonzeros/#features = 222/2001
#> .*.*
#> optimization finished, #iter = 21
#> Objective value = 4.318886
#> #nonzeros/#features = 82/2001
#> ..*.*.*
#> optimization finished, #iter = 42
#> Objective value = 8.714387
#> #nonzeros/#features = 199/2001
#> ..*.*.*
#> optimization finished, #iter = 41
#> Objective value = 6.275500
#> #nonzeros/#features = 112/2001
#> ..*..*
#> optimization finished, #iter = 41
#> Objective value = 10.596130
#> #nonzeros/#features = 249/2001
#> ..*.**.
#> optimization finished, #iter = 40
#> Objective value = 5.188268
#> #nonzeros/#features = 103/2001
#> .**
#> optimization finished, #iter = 14
#> Objective value = 2.501563
#> #nonzeros/#features = 28/2001
#> .*.*
#> optimization finished, #iter = 24
#> Objective value = 6.337268
#> #nonzeros/#features = 117/2001
#> ..*.*
#> optimization finished, #iter = 36
#> Objective value = 6.841068
#> #nonzeros/#features = 130/2001
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> SVM Complete
#> Initializing LR
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .
#> optimization finished, #iter = 19
#> Objective value = -5.686856
#> ...
#> optimization finished, #iter = 33
#> Objective value = -6.688618
#> ..
#> optimization finished, #iter = 24
#> Objective value = -7.936455
#> ..
#> optimization finished, #iter = 22
#> Objective value = -8.181496
#> ..
#> optimization finished, #iter = 24
#> Objective value = -8.119265
#> ..
#> optimization finished, #iter = 20
#> Objective value = -8.384488
#> ..
#> optimization finished, #iter = 26
#> Objective value = -8.422691
#> ..
#> optimization finished, #iter = 25
#> Objective value = -8.130720
#> ..
#> optimization finished, #iter = 23
#> Objective value = -7.539691
#> ..
#> optimization finished, #iter = 21
#> Objective value = -6.777600
#> ..
#> optimization finished, #iter = 21
#> Objective value = -8.644200
#> ..
#> optimization finished, #iter = 23
#> Objective value = -7.524811
#> ..
#> optimization finished, #iter = 25
#> Objective value = -6.766292
#> ..
#> optimization finished, #iter = 27
#> Objective value = -7.640640
#> ..
#> optimization finished, #iter = 26
#> Objective value = -7.169847
#> ..
#> optimization finished, #iter = 21
#> Objective value = -8.331353
#> ..
#> optimization finished, #iter = 26
#> Objective value = -6.927301
#> ..
#> optimization finished, #iter = 25
#> Objective value = -6.634453
#> ..
#> optimization finished, #iter = 23
#> Objective value = -6.889984
#> ..
#> optimization finished, #iter = 21
#> Objective value = -7.274180
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> LR Complete
#> 
#> Classifying cells in query using each classifier and Generating scaled confusion matrix
#> 
#> Harmonized Celltype predictions are stored in integrated metadata. Please see: ELeFHAnt_RF_HarmonizedCelltype, ELeFHAnt_SVM_HarmonizedCelltype, ELeFHAnt_LR_HarmonizedCelltype, ELeFHAnt_Ensemble_HarmonizedCelltype
#> Ensembl celltype harmonization completed.

# UMAP plots of the harmonization result: the original "Celltypes" labels
# (p1) and the ensemble harmonized labels (p2).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
p1 <- DimPlot(
  out.LH,
  group.by = "Celltypes",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()
p2 <- DimPlot(
  out.LH,
  group.by = "ELeFHAnt_Ensemble_HarmonizedCelltype",
  label = TRUE,
  reduction = "umap",
  label.size = 6,
  repel = TRUE
) + NoLegend()

# Only the original-label plot is displayed here; p2 is built but not printed.
p1
#> Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
#> increasing max.overlaps


# Benchmark ELeFHAnt on the same reference/query pair. Per the log output,
# Seurat Label Transfer and scPred are also deployed.
out.Benchmark <- BenchmarkELeFHAnt(
  reference = reference,
  query = query,
  downsample = TRUE,
  downsample_to = 1000,
  selectvarfeatures = 2000,
  ntree = 500,
  annotationCol = "Celltype"
)
#> 
#> Downsampling reference cells to enable fast computation
#> Centering and scaling data matrix
#> 
#> Deploying ELeFHAnt
#> Setting Assay of reference and query to RNA
#> Running Diagonistis on reference and query
#> Number of cells in reference:2019
#> Number of cells in query:1358
#> Downsampling reference
#> Number of cells in reference after downsampling per celltype:2019
#> Calculating ratio of number of cells in downsampled reference vs query
#> Ratio of number of cells in query vs downsampled reference:0.672610203070827
#> Centering and scaling data matrix
#> Centering and scaling data matrix
#> Finding common variable features between reference and query
#> Subsetting reference and query for common variable features
#> Preparing train and test datasets from reference and query
#> Scaling reference to obtain training set
#> Scaling query to obtain test set
#> 
#> Setting up three classifiers: randomForest, SVM and LR
#> Initializing randomForest
#> randomForest Complete
#> Initializing SVM
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 2.949909
#> #nonzeros/#features = 51/493
#> ..*.*
#> optimization finished, #iter = 39
#> Objective value = 22.066079
#> #nonzeros/#features = 188/493
#> ..*.*
#> optimization finished, #iter = 35
#> Objective value = 8.864476
#> #nonzeros/#features = 117/493
#> ..*.*
#> optimization finished, #iter = 33
#> Objective value = 22.589678
#> #nonzeros/#features = 186/493
#> ..*.*
#> optimization finished, #iter = 31
#> Objective value = 7.711503
#> #nonzeros/#features = 96/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 13.829589
#> #nonzeros/#features = 198/493
#> .*.*
#> optimization finished, #iter = 29
#> Objective value = 3.922960
#> #nonzeros/#features = 69/493
#> ....**.
#> optimization finished, #iter = 50
#> Objective value = 21.344268
#> #nonzeros/#features = 208/493
#> ..*.*
#> optimization finished, #iter = 32
#> Objective value = 15.986209
#> #nonzeros/#features = 154/493
#> ..*.*.
#> optimization finished, #iter = 40
#> Objective value = 19.200190
#> #nonzeros/#features = 159/493
#> .**
#> optimization finished, #iter = 18
#> Objective value = 4.516412
#> #nonzeros/#features = 50/493
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> SVM Complete
#> Initializing LR
#> ARGUMENTS SETUP
#> PROBLEM SETUP
#> FILL DATA STRUCTURE
#> SETUP CHECK
#> TRAIN
#> .
#> optimization finished, #iter = 13
#> Objective value = -3.992689
#> ....
#> optimization finished, #iter = 40
#> Objective value = -18.939503
#> ...
#> optimization finished, #iter = 34
#> Objective value = -10.360630
#> ....
#> optimization finished, #iter = 40
#> Objective value = -19.051939
#> ..
#> optimization finished, #iter = 28
#> Objective value = -9.799596
#> ...
#> optimization finished, #iter = 36
#> Objective value = -12.786152
#> ..
#> optimization finished, #iter = 23
#> Objective value = -7.560311
#> ....
#> optimization finished, #iter = 43
#> Objective value = -18.135476
#> ..
#> optimization finished, #iter = 28
#> Objective value = -15.249190
#> ..
#> optimization finished, #iter = 27
#> Objective value = -17.553502
#> ..
#> optimization finished, #iter = 23
#> Objective value = -10.646397
#> COPY MODEL TO WEIGHT VECTOR
#> FREE SPACE
#> FREED SPACE
#> LR Complete
#> 
#> Classifying cells in query using each classifier
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_RF_CD14..Monocyte.ProbabilityELeFHAnt_RF_CD19..B.ProbabilityELeFHAnt_RF_CD34..ProbabilityELeFHAnt_RF_CD4..T.Helper2.ProbabilityELeFHAnt_RF_CD4..CD25.T.Reg.ProbabilityELeFHAnt_RF_CD4..CD45RA..CD25..Naive.T.ProbabilityELeFHAnt_RF_CD4..CD45RO..Memory.ProbabilityELeFHAnt_RF_CD56..NK.ProbabilityELeFHAnt_RF_CD8..Cytotoxic.T.ProbabilityELeFHAnt_RF_CD8..CD45RA..Naive.Cytotoxic.ProbabilityELeFHAnt_RF_Dendritic.Probability;
#> see ?make.names for more details on syntax validity
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_SVM_CD34..Decision.ValuesELeFHAnt_SVM_CD4..CD45RO..Memory.Decision.ValuesELeFHAnt_SVM_CD14..Monocyte.Decision.ValuesELeFHAnt_SVM_CD4..CD25.T.Reg.Decision.ValuesELeFHAnt_SVM_CD56..NK.Decision.ValuesELeFHAnt_SVM_Dendritic.Decision.ValuesELeFHAnt_SVM_CD19..B.Decision.ValuesELeFHAnt_SVM_CD4..CD45RA..CD25..Naive.T.Decision.ValuesELeFHAnt_SVM_CD8..Cytotoxic.T.Decision.ValuesELeFHAnt_SVM_CD8..CD45RA..Naive.Cytotoxic.Decision.ValuesELeFHAnt_SVM_CD4..T.Helper2.Decision.Values;
#> see ?make.names for more details on syntax validity
#> Warning: Invalid name supplied, making object
#> name syntactically valid. New object name is
#> ELeFHAnt_LR_CD34..ProbabilityELeFHAnt_LR_CD4..CD45RO..Memory.ProbabilityELeFHAnt_LR_CD14..Monocyte.ProbabilityELeFHAnt_LR_CD4..CD25.T.Reg.ProbabilityELeFHAnt_LR_CD56..NK.ProbabilityELeFHAnt_LR_Dendritic.ProbabilityELeFHAnt_LR_CD19..B.ProbabilityELeFHAnt_LR_CD4..CD45RA..CD25..Naive.T.ProbabilityELeFHAnt_LR_CD8..Cytotoxic.T.ProbabilityELeFHAnt_LR_CD8..CD45RA..Naive.Cytotoxic.ProbabilityELeFHAnt_LR_CD4..T.Helper2.Probability;
#> see ?make.names for more details on syntax validity
#> 
#> Obtaing Ensemble Predictions using RF, SVM and LR
#> 
#> Celltype predictions are stored in query metadata. Please see: ELeFHAnt_RF_CelltypePrediction, ELeFHAnt_SVM_CelltypePrediction, ELeFHAnt_LR_CelltypePrediction, ELeFHAnt_Ensemble_CelltypePrediction
#> Ensembl celltype annotation completed.
#> 
#> Deploying Seurat Label Transfer
#> Performing PCA on the provided reference using 1850 features as input.
#> Projecting cell embeddings
#> Finding neighborhoods
#> Finding anchors
#>  Found 2258 anchors
#> Filtering anchors
#>  Retained 1914 anchors
#> Finding integration vectors
#> Finding integration vector weights
#> Predicting cell labels
#> 
#> Deploying scPred
#> PC_ 1 
#> Positive:  RPS2, RPS4X, RPLP0, LTB, NPM1, S100A4, S100A6, CD74, VIM, IFITM2 
#>     DUSP1, HLA-DRB1, IL32, HLA-DRA, HLA-DRB5, ZFP36, HLA-DPB1, PRELID1, FOS, CD7 
#>     TYROBP, LGALS1, PPIB, HLA-DPA1, HLA-DQA1, S100A11, GSTP1, HLA-DQB1, PFN1, HLA-DQA2 
#> Negative:  PF4, SDPR, PPBP, GNG11, TUBB1, GP9, ACRBP, CMTM5, SPARC, CLU 
#>     HIST1H2AC, NRGN, TREML1, RUFY1, NCOA4, ITGA2B, CLDN5, AP001189.4, PTCRA, AC147651.3 
#>     RGS18, TMEM40, MYL9, MAP3K7CL, CLEC1B, SNCA, MPP1, CD9, CTSA, FERMT3 
#> PC_ 2 
#> Positive:  LST1, AIF1, SPI1, CST3, SERPINA1, LYZ, IFI30, CFD, FCN1, CFP 
#>     RP11-290F20.3, HCK, MS4A7, TYMP, PILRA, TMEM176B, FCER1G, TYROBP, LRRC25, HLA-DRB1 
#>     CTSS, HLA-DPA1, PSAP, HLA-DRB5, CD68, HLA-DRA, S100A11, FTL, HMOX1, SAT1 
#> Negative:  IL32, CD7, LTB, CCL5, CTSW, GZMA, NPM1, CST7, RPS4X, GNLY 
#>     NKG7, RPLP0, HOPX, AQP3, GZMH, GZMK, PRF1, CCR7, ITM2A, FGFBP2 
#>     CD8B, GZMB, CD8A, CCL4, SH3YL1, SPON2, CLIC3, RGCC, NCR3, KLRG1 
#> PC_ 3 
#> Positive:  NKG7, GNLY, GZMA, CST7, FGFBP2, FCGR3A, CTSW, PRF1, GZMH, CCL5 
#>     GZMB, CD7, SPON2, IFITM2, HOPX, CCL4, S100A4, TMSB4X, ID2, IL32 
#>     TYROBP, SRGN, PFN1, RHOC, GPR56, CLIC3, ABI3, PRSS23, KLRC1, IL2RB 
#> Negative:  CD79A, HLA-DRA, HLA-DQA1, MS4A1, TCL1A, CD79B, HLA-DQA2, CD74, HLA-DRB1, HLA-DMB 
#>     HLA-DMA, HLA-DPB1, HLA-DRB5, LINC00926, HLA-DPA1, HLA-DQB1, FCER2, SPIB, VPREB3, IRF8 
#>     LTB, HVCN1, CYB561A3, HLA-DOB, BANK1, EAF2, FCGR2B, KIAA0125, BLK, CD19 
#> PC_ 4 
#> Positive:  GZMB, NKG7, GNLY, CLIC3, CST7, FGFBP2, PRF1, GZMA, CTSW, GZMH 
#>     HOPX, PRSS57, IGFBP7, SPON2, CCL4, FCER1A, AKR1C3, CYTL1, PTGDS, GPR56 
#>     GSTP1, MZB1, C19orf77, CCL5, LRRC26, ITM2C, IL2RB, PFN1, LILRA4, PRSS23 
#> Negative:  LTB, CFD, RP11-290F20.3, SERPINA1, TMSB4X, TMEM176B, CCR7, S100A8, COTL1, MS4A7 
#>     AQP3, S100A9, SAT1, CDKN1C, FCN1, PILRA, HES4, GPBAR1, VMO1, LYPD2 
#>     C5AR1, HCK, HMOX1, C1QA, FTH1, IFI30, TYMP, RBP7, CDA, CD14 
#> PC_ 5 
#> Positive:  CYTL1, PRSS57, C19orf77, SPINK2, GATA2, CRHBP, MYB, NFE2, RP11-620J15.3, SOX4 
#>     NPM1, FAM212A, IL18, EGFL7, IGLL1, RP11-354E11.2, CD34, GNA15, GATA1, ARMCX1 
#>     SERPINB1, HOXA9, H2AFY, HMGA1, IL1B, ID1, EREG, PTRF, RPLP0, ERG 
#> Negative:  GZMB, TMSB4X, CD79A, CLIC3, TCL1A, GNLY, HLA-DRB1, MS4A1, NKG7, FGFBP2 
#>     HLA-DRB5, HLA-DPB1, CD79B, CST7, PRF1, IRF8, CD74, CCL5, SPIB, HLA-DQA2 
#>     GZMH, HLA-DQA1, TYROBP, PTGDS, LINC00926, GZMA, FCER1G, HLA-DRA, HLA-DMB, HLA-DPA1
#> 21:05:30 UMAP embedding parameters a = 0.9922 b = 1.112
#> 21:05:30 Read 2019 rows and found 30 numeric columns
#> 21:05:30 Using Annoy for neighbor search, n_neighbors = 30
#> 21:05:30 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 21:05:30 Writing NN index file to temp file /var/folders/bw/whg3swn15jb08_f7v2y09xw9glk1wg/T//RtmpeqmOHt/file804e3d919b1d
#> 21:05:30 Searching Annoy index using 1 thread, search_k = 3000
#> 21:05:30 Annoy recall = 100%
#> 21:05:31 Commencing smooth kNN distance calibration using 1 thread
#> 21:05:32 Initializing from normalized Laplacian + noise
#> 21:05:32 Commencing optimization for 500 epochs, with 88246 positive edges
#> 21:05:36 Optimization finished
#> ●  Extracting feature space for each cell type...
#> DONE!
#> ●  Training models for each cell type...
#> Loading required package: lattice
#> DONE!
#> ●  Matching reference with new dataset...
#>   ─ 2000 features present in reference loadings
#>   ─ 1850 features shared between reference and new dataset
#>   ─ 92.5% of features in the reference are present in new dataset
#> ●  Aligning new data to reference...
#> Harmony 1/20
#> Harmony 2/20
#> Harmony 3/20
#> Harmony 4/20
#> Harmony 5/20
#> Harmony 6/20
#> Harmony converged after 6 iterations
#> ●  Classifying cells...
#> DONE!

# Build one UMAP panel per annotation method so the predictions can be
# compared. Each panel groups the cells of out.Benchmark by a different
# metadata column (group.by) and gets a title naming the method. The legend
# is removed because the group labels are drawn directly on the plot.
# TRUE is written out in full: T is an ordinary variable that can be
# reassigned, which would silently change label/repel.

# ELeFHAnt ensemble predictions.
p1 <- DimPlot(
  out.Benchmark,
  group.by = "ELeFHAnt_Ensemble_CelltypePrediction",
  label = TRUE,
  repel = TRUE,
  label.size = 6,
  reduction = "umap"
) +
  NoLegend() +
  ggtitle("ELeFHAnt Predictions")

# Seurat label transfer predictions.
p2 <- DimPlot(
  out.Benchmark,
  group.by = "predicted.id",
  label = TRUE,
  repel = TRUE,
  label.size = 6,
  reduction = "umap"
) +
  NoLegend() +
  ggtitle("LabelTransfer Predictions")

# scPred predictions.
p3 <- DimPlot(
  out.Benchmark,
  group.by = "scpred_prediction",
  label = TRUE,
  repel = TRUE,
  label.size = 6,
  reduction = "umap"
) +
  NoLegend() +
  ggtitle("scPred Predictions")

# Display the ELeFHAnt panel.
p1