One bone marrow sample treated with erythropoeitin sample E16

Author: Shenfeng Qiu

# PYTHON 3.10 has compatability issues because collections module was changed
# use Python 3.9 or run the following:

import collections.abc
#cellrank needs the four following aliases to be done manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

!pip install numpy==1.21.5

!pip install scvelo

!pip install cellrank

!pip install python-igraph louvain

import scvelo as scv
import scanpy as sc
import cellrank as cr
import numpy as np
import pandas as pd
import anndata as ad
import pickle
import loompy as lmp
import igraph
import anndata
from scipy import io
from scipy.sparse import coo_matrix, csr_matrix
import os
scv.logging.print_version()
Running scvelo 0.2.4 (python 3.9.13) on 2023-07-28 21:47.
print(lmp.__version__)
3.0.7
# file path names
fpath = r'D:/Bioinformatics Projects/shalini_bm/E15'
fdata = fpath + r'/E15_sample.h5ad'
fl = fpath + r'/velocyto/E15.loom'

Load the Data

scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', facecolor='white', dpi=100, frameon=False)
cr.settings.verbosity = 2
scv.settings.presenter_view = True  # set max width size for presenter view
ldata = scv.read(fl, cache=True)
ldata
AnnData object with n_obs × n_vars = 6757 × 36601
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
# inspect the ldata structure
# accessing a specific cell
ldata.obs_names[:5]
Index(['E15:AAAGTCCAGATTGATGx', 'E15:AAAGAACAGTCATGCTx',
       'E15:AACAAGAGTGCTTATGx', 'E15:AAATGGACACTCAAGTx',
       'E15:AAAGAACCATAGCACTx'],
      dtype='object', name='CellID')

ldata.obs.loc[‘AAAGTCCAGATTGATG’]

# accessing a specific gene
ldata.var.loc['CD44']
Accession     ENSG00000026508
Chromosome                 11
End                  35232402
Start                35138870
Strand                      +
Name: CD44, dtype: object
# access data from specific layer
ldata.layers['spliced']
<6757x36601 sparse matrix of type '<class 'numpy.uint16'>'
    with 15512493 stored elements in Compressed Sparse Row format>
#  access the data for a specific cell and gene in a specific layer

ldata.layers['matrix'][ldata.obs_names == 'AAAGAACCATAGCACT', ldata.var_names == 'CD44']
<1x0 sparse matrix of type '<class 'numpy.float32'>'
    with 0 stored elements in Compressed Sparse Row format>
scv.pl.proportions(ldata)

# scv.pl.proportions(ldata_m)
png
png

Preprocessing

def pp(path):
    adata = sc.read_10x_mtx(path)
    sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
    sc.pp.filter_genes(adata, min_cells=3) #get rid of genes that are found in fewer than 3 cells
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    lower_lim = np.quantile(adata.obs.n_genes_by_counts.values, .02)
    adata = adata[(adata.obs.n_genes_by_counts < upper_lim) & (adata.obs.n_genes_by_counts > lower_lim)]
    adata = adata[adata.obs.pct_counts_mt < 20]
    sc.pp.normalize_total(adata, target_sum=1e4) #normalize every cell to 10,000 UMI
    sc.pp.log1p(adata) #change to log counts
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) #these are default values
    adata.raw = adata #save raw data before processing values and further filtering
    adata = adata[:, adata.var.highly_variable] #filter highly variable
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) #Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed
    sc.pp.scale(adata, max_value=10) #scale each gene to unit variance
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
    sc.tl.leiden(adata, resolution = 0.25)
    sc.tl.umap(adata)
    return adata
#
import os
os.getcwd()
'D:\\Bioinformatics Projects\\shalini_bm'
#!pip3 install leidenalg
### ONLY RUN THIS ONCE. THE SECOND LINE OF CODE saves the output to whatever directory you specify
# adata = pp('/xdisk/sqiu/jessicakzhang/sc_exp4_mTBI/outs/filtered_feature_bc_matrix/')
# pickle.dump(adata, open(r'/xdisk/sqiu/jessicakzhang/adata.h5ad', 'wb'))

adata = pp('./E15/outs/filtered_feature_bc_matrix/')
pickle.dump(adata, open(r'./adata.h5ad', 'wb'))  ### write the .h5ad binary file to the working directory
adata
AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
# this loads the previously saved output. RUN THIS and change your directory to wherever you saved your output
adata = pickle.load(open(r'./adata.h5ad', 'rb'))
print(adata)
AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
    # Find the cells that express CD44.
    # This creates a boolean mask where True indicates cells that express CD44.
# mask = adata[:, 'CD44'].X > 0

    # Use the mask to subset the AnnData object.
# adata_cd44 = adata[mask]
# adata_cd44
adata.obs_names[:5]
Index(['AAACCCACATCGCTCT-1', 'AAACCCAGTAGGGTAC-1', 'AAACCCAGTGCATCTA-1',
       'AAACCCATCATAGAGA-1', 'AAACCCATCGGAAACG-1'],
      dtype='object')
adata.obs.loc['AAACCCACATCGCTCT']
adata.var.loc['CD44']
gene_ids                 ENSG00000026508
feature_types            Gene Expression
n_cells                             3339
mt                                 False
n_cells_by_counts                   3339
mean_counts                       1.5769
pct_dropout_by_counts           50.23845
total_counts                     10581.0
highly_variable                     True
means                           0.691778
dispersions                     1.210978
dispersions_norm                1.348397
mean                                 0.0
std                             0.592069
Name: CD44, dtype: object
# to access a specific leiden cluster:
adata_cluster0 = adata[adata.obs['leiden'] == '0']
adata_cluster0
View of AnnData object with n_obs × n_vars = 1290 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
    # select all cells within clusters 0 and 1, that has CD44 gene expression
    # Create boolean masks for the Leiden clusters and CD44 expression.
# mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')
# mask_cd44 = adata[:, 'CD44'].X.toarray().flatten() > 0

    # Combine the masks using the logical AND operator.
# mask_combined = mask_cluster & mask_cd44

    # Subset the AnnData object using the combined mask.
# adata_subset = adata[mask_combined]
# adata_subset
Merge the gene expression and loom data
adata = scv.utils.merge(adata, ldata)
# adata_m = scv.utils.merge(adata, ldata_m)
adata
AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
Filtered out 1320 genes that are detected 20 counts (shared).
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
Normalized count data: spliced, unspliced.
Extracted 2000 highly variable genes.
WARNING: Did not modify X as it looks preprocessed already.
computing neighbors
    finished (0:00:01) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
# scv.pp.filter_and_normalize(adata_m)
# scv.pp.moments(adata_m)
adata
AnnData object with n_obs × n_vars = 6335 × 2000
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu'
    obsp: 'distances', 'connectivities'
# adata.obs
# adata.var
# adata.X
# adata.uns
sc.pl.pca_variance_ratio(adata, log=True)
png
png
sc.tl.tsne(adata)
sc.pl.tsne(adata)
png
png

!pip install python-louvain # not working

sc.pl.tsne(adata, color = ['leiden'])
png
png
sc.pl.umap(adata, color="leiden")
png
png
sc.pl.umap(adata, color="CD44")
png
png
# Now, create and save the plot
# SQ NOTE this creates a 'figures' folder in pwd

sc.pl.umap(adata, color=“leiden”, save=“umap_leiden_clusters.pdf”) sc.pl.umap(adata, color=“CD44”, save=“umap_CD44_clusters.pdf”)

Estimate RNA velocity

scv.tl.velocity(adata)
computing velocities
    finished (0:00:08) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
scv.tl.velocity_graph(adata)
computing velocity graph (using 1/80 cores)



  0%|          | 0/6335 [00:00<?, ?cells/s]


    finished (0:00:50) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
# scv.tl.velocity(adata_m)
# scv.tl.velocity_graph(adata_m)
scv.set_figure_params()
# scv.set_figure_params(dpi=200, color_map='viridis')
adata
AnnData object with n_obs × n_vars = 6335 × 2000
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors', 'velocity_params', 'velocity_graph', 'velocity_graph_neg'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu', 'velocity', 'variance_velocity'
    obsp: 'distances', 'connectivities'
adata.obsm
AxisArrays with keys: X_pca, X_umap
adata.layers['velocity']
# adata.uns['velocity_graph']
array([[ 0.16311729,  0.00325262,  0.02549965, ...,  0.2947519 ,
         0.24068618,  0.00166135],
       [ 0.05183516,  0.        , -0.00231888, ..., -0.03039856,
         0.        ,  0.62309027],
       [ 0.14193237,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.12683076],
       ...,
       [ 0.07127077, -0.00128302,  0.        , ...,  0.17017308,
         0.        ,  0.49752218],
       [ 0.06123393, -0.0016032 , -0.00131531, ...,  0.63884544,
         0.05677878,  0.        ],
       [ 0.139233  ,  0.03396539,  0.00363499, ...,  0.5291901 ,
         0.2595918 , -0.04816317]], dtype=float32)
adata.obs
# adata.obs['n_genes_by_counts']
n_genes n_genes_by_counts total_counts total_counts_mt pct_counts_mt leiden initial_size_spliced initial_size_unspliced initial_size n_counts velocity_self_transition
AAACCCACATCGCTCT 6772 6770 32469.0 1621.0 4.992455 2 14938 11611 14938.0 728.407471 0.353792
AAACCCAGTAGGGTAC 1508 1507 6036.0 96.0 1.590457 5 3632 1872 3632.0 -61.993500 0.237123
AAACCCAGTGCATCTA 1735 1735 6346.0 165.0 2.600063 1 4681 876 4681.0 -484.200470 0.272838
AAACCCATCATAGAGA 4500 4500 20373.0 1340.0 6.577333 0 13036 3108 13036.0 -580.061584 0.223742
AAACCCATCGGAAACG 2396 2395 4683.0 325.0 6.939996 7 1787 1950 1787.0 893.204102 0.365823
TTTGTTGAGACCAACG 5686 5685 24267.0 2669.0 10.998476 1 12288 5878 12288.0 -473.220093 0.226389
TTTGTTGAGCTGGTGA 3300 3300 13835.0 872.0 6.302855 0 9332 1777 9332.0 -536.107422 0.264881
TTTGTTGAGGGACACT 1187 1187 3105.0 36.0 1.159420 0 1763 1064 1763.0 -268.330414 0.173882
TTTGTTGGTCTCCCTA 3784 3783 8852.0 240.0 2.711252 7 4014 3374 4014.0 1190.949463 0.403490
TTTGTTGTCTACACAG 4755 4755 13093.0 391.0 2.986329 2 6323 4428 6323.0 812.005249 0.402153

6335 rows × 11 columns

adata.var
gene_ids feature_types n_cells mt n_cells_by_counts mean_counts pct_dropout_by_counts total_counts highly_variable means std Accession Chromosome End Start Strand velocity_gamma velocity_qreg_ratio velocity_r2 velocity_genes
C1orf159 ENSG00000131591 Gene Expression 1568 False 1568 0.349180 76.631893 2343.0 True -7.512550e-04 0.256349 ENSG00000131591 1 1116361 1081818 - 0.960242 0.960242 -1.056696 False
ACAP3 ENSG00000131584 Gene Expression 745 False 745 0.124590 88.897168 836.0 True -2.124832e-03 0.184853 ENSG00000131584 1 1309609 1292390 - 0.013770 0.152219 0.288276 True
MIB2 ENSG00000197530 Gene Expression 906 False 906 0.155589 86.497765 1044.0 True -1.027507e-03 0.198915 ENSG00000197530 1 1630610 1615415 + 0.044242 0.044242 -0.414496 False
KLHL21 ENSG00000162413 Gene Expression 1283 False 1283 0.230253 80.879285 1545.0 True -1.395323e-08 0.284522 ENSG00000162413 1 6614607 6590724 - 0.809669 0.809669 -0.450083 False
ENO1 ENSG00000074800 Gene Expression 3961 False 3961 6.071833 40.968703 40742.0 True -6.604966e-09 0.890816 ENSG00000074800 1 8879190 8861000 - 0.028938 0.038693 0.764427 True
AFF2 ENSG00000155966 Gene Expression 528 False 528 0.164531 92.131148 1104.0 True -2.977980e-04 0.294817 ENSG00000155966 X 149000663 148500617 + 0.285532 3.813417 0.628403 True
IDS ENSG00000010404 Gene Expression 802 False 802 0.169896 88.047690 1140.0 True -1.446279e-04 0.247148 ENSG00000010404 X 149521096 149476988 - 0.032241 0.070538 0.403367 True
MTM1 ENSG00000171100 Gene Expression 1499 False 1499 0.387183 77.660209 2598.0 True -1.403791e-08 0.318311 ENSG00000171100 X 150673322 150568619 + 0.632687 2.800952 0.395020 True
ARHGAP4 ENSG00000089820 Gene Expression 1585 False 1585 0.421162 76.378539 2826.0 True -4.290405e-09 0.314576 ENSG00000089820 X 153934999 153907367 - 0.057723 0.286018 0.456565 True
CLIC2 ENSG00000155962 Gene Expression 2154 False 2154 0.570045 67.898659 3825.0 True -7.113040e-09 0.440094 ENSG00000155962 X 155334657 155276211 - 1.007027 2.217264 0.801841 True

2000 rows × 23 columns

# adata_m.obs
# fetch data from adata cluster 0 and 1, number of genes and make a histogram plot
import matplotlib.pyplot as plt

# Create boolean mask for clusters
mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')

# Subset the data
adata_subset = adata[mask_cluster]

# Get number of genes for each cell
num_genes = adata_subset.obs['n_genes']

# Create histogram
plt.hist(num_genes, bins=30, alpha=0.75)

# Set the title and labels
plt.title('Histogram of Number of Genes')
plt.xlabel('Number of Genes')
plt.ylabel('Number of Cells')

# Show the plot
plt.show()
png
png
# find gene markers for each of the detected leiden clusters
# Perform the ranking
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# The result is stored in adata.uns['rank_genes_groups']
result = adata.uns['rank_genes_groups']

# Initialize a list to hold dataframes for each cluster
df_list = []

# Iterate over clusters
for group in result['names'].dtype.names:
    # Create a DataFrame for the current cluster
    df = pd.DataFrame({    
        'Genes': result['names'][group],
        'Scores': result['scores'][group],
        'Logfoldchanges': result['logfoldchanges'][group],
        'pvals': result['pvals'][group],
        'pvals_adj': result['pvals_adj'][group],
    })
    
    # Add a column for the cluster
    df['Cluster'] = group

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all dataframes
df_all = pd.concat(df_list, ignore_index=True)

# Write DataFrame to a csv file
df_all.to_csv('scanpy_marker_genes.csv', index=False)
df_all
Genes Scores Logfoldchanges pvals pvals_adj Cluster
0 NUSAP1 48.397820 2.678382 0.000000e+00 0.000000 0
1 TOP2A 47.867912 2.843289 0.000000e+00 0.000000 0
2 CENPE 45.637989 2.997186 0.000000e+00 0.000000 0
3 DLGAP5 44.949627 2.646727 0.000000e+00 0.000000 0
4 HMGB2 44.792137 2.093099 0.000000e+00 0.000000 0
289483 MT-CO3 -5.545708 -3.068403 2.927671e-08 0.000014 11
289484 RPL6 -5.575259 -3.883544 2.471614e-08 0.000014 11
289485 RPS15A -5.584476 -3.632532 2.344061e-08 0.000014 11
289486 RPS29 -5.653746 -3.751123 1.569881e-08 0.000012 11
289487 MT-CO1 -5.927827 -4.364321 3.069686e-09 0.000004 11

289488 rows × 6 columns

df_all_f = df_all[(df_all.pvals_adj < 0.01)&(abs(df_all.Logfoldchanges)>2)]
df_all_f
Genes Scores Logfoldchanges pvals pvals_adj Cluster
0 NUSAP1 48.397820 2.678382 0.000000e+00 0.000000 0
1 TOP2A 47.867912 2.843289 0.000000e+00 0.000000 0
2 CENPE 45.637989 2.997186 0.000000e+00 0.000000 0
3 DLGAP5 44.949627 2.646727 0.000000e+00 0.000000 0
4 HMGB2 44.792137 2.093099 0.000000e+00 0.000000 0
289483 MT-CO3 -5.545708 -3.068403 2.927671e-08 0.000014 11
289484 RPL6 -5.575259 -3.883544 2.471614e-08 0.000014 11
289485 RPS15A -5.584476 -3.632532 2.344061e-08 0.000014 11
289486 RPS29 -5.653746 -3.751123 1.569881e-08 0.000012 11
289487 MT-CO1 -5.927827 -4.364321 3.069686e-09 0.000004 11

24668 rows × 6 columns

df_all_f.to_csv('scvelo_leiden_markers.csv')
cluster_list = adata.obs.leiden.unique()
cluster_list
['2', '5', '1', '0', '7', ..., '3', '4', '9', '10', '11']
Length: 12
Categories (12, object): ['0', '1', '2', '3', ..., '8', '9', '10', '11']

Manual annotation will be done by SS.

Alternatively, map to the Azimuth human bone marrow reference dataset

# Make cluster anottation dictionary
annotation = {"Ctype0":[0],
              "Ctype1":[1],
              "Ctype2":[2],
              "Ctype3":[3,11],
              "Ctype4":[4],
              "Ctype5":[5],
              "Ctype6":[6],
              "Ctype7":[7],
              "Ctype8":[8],
              "Ctype9":[9],
              "Ctype10":[10]}

# Change dictionary format
annotation_rev = {}
for i in cluster_list:
    for k in annotation:
        if int(i) in annotation[k]:
            annotation_rev[i] = k

# Check dictionary
annotation_rev
{'2': 'Ctype2',
 '5': 'Ctype5',
 '1': 'Ctype1',
 '0': 'Ctype0',
 '7': 'Ctype7',
 '6': 'Ctype6',
 '8': 'Ctype8',
 '3': 'Ctype3',
 '4': 'Ctype4',
 '9': 'Ctype9',
 '10': 'Ctype10',
 '11': 'Ctype3'}
adata.obs["cell_type"] = [annotation_rev[i] for i in adata.obs.leiden]
sc.pl.rank_genes_groups(adata, n_genes=10, groupby='leiden', sharey=False)
png
png
results = adata.uns['rank_genes_groups']
results['names']['0']
array(['NUSAP1', 'TOP2A', 'SLC4A1', 'HEMGN', 'HMGB2', 'MKI67', 'DLGAP5',
       'UBE2S', 'TPX2', 'H1F0'], dtype=object)
results['names'].dtype.names
('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11')
scv.pl.velocity_embedding(adata, basis = 'umap', color = 'leiden')
# scv.pl.velocity_embedding(adata_m, basis = 'umap', color = 'leiden')
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
png
png
scv.pl.velocity_embedding_grid(adata, basis = 'umap', color = 'leiden')
#scv.pl.velocity_embedding_grid(adata_m, basis = 'umap', color = 'leiden')
png
png
scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_umap.pdf')
#scv.pl.velocity_embedding_stream(adata_m, basis = 'umap', color = 'leiden')
# figure cannot be saved as pdf, using png instead.
# saving figure to file ./figures/scvelo_velocity_embedding_stream.png
figure cannot be saved as pdf, using png instead.
saving figure to file ./figures/scvelo_velocity_embedding_stream_umap.png
png
png
scv.pl.velocity_embedding_stream(adata, basis = 'tsne', color = 'leiden', save='velocity_embedding_stream_tsne.pdf')
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_tsne', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_tsne.pdf
png
png
scv.pl.velocity_embedding_stream(adata, basis='tsne', color = 'cell_type', legend_loc='on data')
png
png
adata.obs.cell_type.unique()
['Ctype2', 'Ctype5', 'Ctype1', 'Ctype0', 'Ctype7', ..., 'Ctype8', 'Ctype3', 'Ctype4', 'Ctype9', 'Ctype10']
Length: 11
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']
cell_of_interest = adata.obs.index[~adata.obs.cell_type.isin(["Ctype0", "Ctype5"])]
adata_ss1 = adata[cell_of_interest, :] # subset 1, NOT include Ctype0 and Ctype5 cell types
scv.pl.velocity_embedding_stream(adata_ss1, basis='tsne', color = 'cell_type', legend_loc='right margin')
png
png
sc.pl.tsne(adata, color=['cell_type'])
png
png
scv.pl.scatter(adata, 'CD44', color=['cell_type', 'velocity'])
png
png
scv.pl.velocity_embedding(adata, arrow_length=3, arrow_size=2, dpi=120)
png
png
scv.tl.rank_velocity_genes(adata, groupby = 'leiden', min_corr=0.3)
ranking velocity genes
    finished (0:00:08) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)
df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()
0 1 2 3 4 5 6 7 8 9 10 11
0 KIF14 NETO2 PLCB1 NOL4L SLC16A9 PPME1 MYLIP IKZF2 HDAC9 UBR2 ELK3 ZNF462
1 CD36 NFIA ATP8B4 MAST4 ACSBG1 GYPE TLN1 SLC39A11 CBL DAPK1 GPR183 KCNQ1
2 ASPM E2F7 ETV6 LCP2 KIAA1211 FOXO3 RBM38 PALM2-AKAP2 WDFY2 CLIC2 CAMK1 FOSB
3 CENPE SLC25A21 PLAGL1 STON2 ZDHHC14 OSBP2 HIST1H2BJ PLD1 DOCK10 KIAA0232 FMO5 FAM30A
4 CCNB1 BMPR2 KAT6B P2RX1 PVT1 SLC14A1 HIST1H3J ST3GAL4 DISC1 TMCC2 LGALS3 ZDHHC19
#### CHANGE THE GENES TO SEE DIFFERENT RNA VELOCITY ANALYSES Look at output 88 (cell above) to find some interesting genes
scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'],basis='tsne')
png
png
scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'])
png
png
adata.obs.cell_type
AAACCCACATCGCTCT    Ctype2
AAACCCAGTAGGGTAC    Ctype5
AAACCCAGTGCATCTA    Ctype1
AAACCCATCATAGAGA    Ctype0
AAACCCATCGGAAACG    Ctype7
                     ...  
TTTGTTGAGACCAACG    Ctype1
TTTGTTGAGCTGGTGA    Ctype0
TTTGTTGAGGGACACT    Ctype0
TTTGTTGGTCTCCCTA    Ctype7
TTTGTTGTCTACACAG    Ctype2
Name: cell_type, Length: 6335, dtype: category
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']
scv.tl.rank_velocity_genes(adata, groupby='cell_type', min_corr=.3)

df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()
ranking velocity genes
    finished (0:00:04) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)
Ctype0 Ctype1 Ctype10 Ctype2 Ctype3 Ctype4 Ctype5 Ctype6 Ctype7 Ctype8 Ctype9
0 RHD OSBPL6 TCF4 PLAGL1 TTC7B LDLRAD4 TANK HIST1H2BJ IKZF2 DOCK10 WDR26
1 EPB42 DAAM1 IMMP2L ANTXR2 CLIP2 ACACA EPB42 MAP1LC3B PLD1 DISC1 CREBRF
2 HIST1H2BJ FAM111B SKAP1 DOCK5 WWC2 NTN1 GPCPD1 SLC7A5 GAB2 MYO1F RNF19A
3 KDM7A MDM1 DNAH14 GNG7 TNIK KIF2A KDM7A TENT5C SLC24A3 AKAP13 GPCPD1
4 TPM1 HBM ARPC1B ERG SUSD1 CCSER1 RNF19A REL PAPSS1 ARHGAP30 ULK1
df.to_csv('scvelo_df_leiden_rank_velocity_genes.csv')
kwargs = dict(frameon=False, size=10, linewidth=1.5,
              add_outline='Ctype0')

scv.pl.scatter(adata, df['Ctype0'][:10], ylabel='Ctype0', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype1'][:10], ylabel='Ctype1', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype3'][:10], ylabel='Ctype3',color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype9'][:10], ylabel='Ctype9', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype5'][:10], ylabel='Ctype5', color=['cell_type'], **kwargs)
png
png
png
png
png
png
png
png
png
png
scv.tl.velocity_confidence(adata)
keys = 'velocity_length', 'velocity_confidence'
scv.pl.scatter(adata, c = keys, cmap='coolwarm', perc = [5,95])
--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
--> added 'velocity_confidence_transition' (adata.obs)
png
png
adata.obs
n_genes n_genes_by_counts total_counts total_counts_mt pct_counts_mt leiden initial_size_spliced initial_size_unspliced initial_size n_counts velocity_self_transition velocity_length velocity_confidence velocity_confidence_transition root_cells end_points velocity_pseudotime
AAACCCACATCGCTCT 6772 6770 32469.0 1621.0 4.992455 2 14938 11611 14938.0 863.637268 0.383647 30.610001 0.948891 -0.027396 0.031412 0.118727 0.317202
AAACCCAGTAGGGTAC 1508 1507 6036.0 96.0 1.590457 5 3632 1872 3632.0 -106.495049 0.216999 44.320000 0.972637 0.156213 0.000064 0.048307 0.228780
AAACCCAGTGCATCTA 1735 1735 6346.0 165.0 2.600063 1 4681 876 4681.0 -547.307556 0.168688 27.280001 0.939039 0.121379 0.022585 0.000116 0.059130
AAACCCATCATAGAGA 4500 4500 20373.0 1340.0 6.577333 0 13036 3108 13036.0 -707.811035 0.253844 24.570000 0.977568 0.072128 0.001643 0.006620 0.031518
AAACCCATCGGAAACG 2396 2395 4683.0 325.0 6.939996 7 1787 1950 1787.0 920.113159 0.342233 41.799999 0.941867 0.073386 0.001660 0.100762 0.239823
TTTGTTGAGACCAACG 5686 5685 24267.0 2669.0 10.998476 1 12288 5878 12288.0 -572.973083 0.286407 29.740000 0.963285 0.033437 0.001004 0.031958 0.040667
TTTGTTGAGCTGGTGA 3300 3300 13835.0 872.0 6.302855 0 9332 1777 9332.0 -614.256470 0.139380 29.740000 0.983455 0.158749 0.000351 0.004965 0.026805
TTTGTTGAGGGACACT 1187 1187 3105.0 36.0 1.159420 0 1763 1064 1763.0 -232.348145 0.186577 28.490000 0.954237 0.160944 0.000534 0.000141 0.038064
TTTGTTGGTCTCCCTA 3784 3783 8852.0 240.0 2.711252 7 4014 3374 4014.0 1288.671997 0.398848 39.500000 0.957163 -0.043992 0.001771 0.109230 0.241569
TTTGTTGTCTACACAG 4755 4755 13093.0 391.0 2.986329 2 6323 4428 6323.0 806.780151 0.416695 29.190001 0.961888 0.014598 0.010415 0.276882 0.350445

6335 rows × 17 columns

df = adata.obs.groupby('cell_type')[keys].mean().T
df.style.background_gradient(cmap='coolwarm', axis=1)
cell_type Ctype0 Ctype1 Ctype10 Ctype2 Ctype3 Ctype4 Ctype5 Ctype6 Ctype7 Ctype8 Ctype9
velocity_length 28.376411 25.740473 24.188236 25.939058 25.545404 22.209726 32.450985 18.562965 32.190620 26.416279 38.004265
velocity_confidence 0.982803 0.977395 0.932590 0.951401 0.932277 0.976059 0.972348 0.911105 0.935803 0.951495 0.966852
scv.pl.paga(adata, basis='tsne', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)
WARNING: Invalid color key. Using grey instead.
png
png
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color = 'velocity_pseudotime')
png
png
## THIS IS VERY TIME AND COMPUTATIONALLY CONSUMING
## WILL TAKE 30min TO RUN; it will take 20 of the 80 cores on WFH-7920 computer to run, with CPU fans on a lot!
scv.tl.recover_dynamics(adata, n_jobs = 20)
scv.tl.velocity(adata, mode = 'dynamical')
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_2.png')
recovering dynamics (using 20/80 cores)



  0%|          | 0/1415 [00:00<?, ?gene/s]


    finished (0:02:46) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)
computing velocities
    finished (0:00:10) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/80 cores)



  0%|          | 0/6335 [00:00<?, ?cells/s]


    finished (0:00:17) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_2.png
png
png
scv.tl.latent_time(adata)
scv.pl.scatter(adata, color = 'latent_time', color_map = 'gnuplot', size = 80, save='latent_time.pdf')
computing latent time using root_cells as prior
    finished (0:00:02) --> added 
    'latent_time', shared time (adata.obs)
saving figure to file ./figures/scvelo_latent_time.pdf
png
png
top_genes = adata.var['velocity_qreg_ratio'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='cell_type', n_convolve=100)
png
png
var_names = ['CD44', 'STOX2', 'PID1', 'PLXDC2']
scv.pl.scatter(adata, var_names, color = ['cell_type'], frameon=False)
scv.pl.scatter(adata, x='latent_time', color = ['cell_type'],y=var_names, frameon=False)
png
png
png
png
#Cluster-specific top-likelihood genes
#Moreover, partial gene likelihoods can be computed for a each cluster of cells to enable cluster-specific identification of potential drivers.

scv.tl.rank_dynamical_genes(adata, groupby='cell_type')
df = scv.get_df(adata, 'rank_dynamical_genes/names')
df.head(5)
ranking genes by cluster-specific likelihoods
    finished (0:00:07) --> added 
    'rank_dynamical_genes', sorted scores by group ids (adata.uns)
Ctype0 Ctype1 Ctype10 Ctype2 Ctype3 Ctype4 Ctype5 Ctype6 Ctype7 Ctype8 Ctype9
0 SLC4A1 LGALS3 CDK6 MPO CDK6 CDK6 HBM HBM MS4A2 CD1C EPB42
1 NFIA GYPB FAM117A ATP8B4 KALRN DLGAP5 SLC4A1 EPB42 CDK6 SLC8A1 ISG20
2 HBM ISG20 CENPE PTPRC PALM2-AKAP2 MYB TCP11L2 ISG20 ENO1 LY86 SLC4A1
3 NETO2 SLC25A21 AURKA MACF1 SELP NFIA CR1L GYPB FABP5 CLEC10A HBM
4 GYPB NFIA IGSF6 ANGPT1 TC2N SLC25A21 SEC62 NFIA PRKACB PKIB OSBP2
adata.var
gene_ids feature_types n_cells mt n_cells_by_counts mean_counts pct_dropout_by_counts total_counts highly_variable means Chromosome End Start Strand velocity_gamma velocity_qreg_ratio velocity_r2 velocity_genes spearmans_score velocity_score
C1orf159 ENSG00000131591 Gene Expression 1568 False 1568 0.349180 76.631893 2343.0 True -7.512550e-04 1 1116361 1081818 - 0.960242 0.960242 -1.056696 False 0.454777 0
ACAP3 ENSG00000131584 Gene Expression 745 False 745 0.124590 88.897168 836.0 True -2.124832e-03 1 1309609 1292390 - 0.013770 0.152219 0.288276 True 0.275960 0
MIB2 ENSG00000197530 Gene Expression 906 False 906 0.155589 86.497765 1044.0 True -1.027507e-03 1 1630610 1615415 + 0.044242 0.044242 -0.414496 False 0.127040 0
KLHL21 ENSG00000162413 Gene Expression 1283 False 1283 0.230253 80.879285 1545.0 True -1.395323e-08 1 6614607 6590724 - 0.809669 0.809669 -0.450083 False 0.000000 0
ENO1 ENSG00000074800 Gene Expression 3961 False 3961 6.071833 40.968703 40742.0 True -6.604966e-09 1 8879190 8861000 - 0.028938 0.038693 0.764427 True 0.851996 0
AFF2 ENSG00000155966 Gene Expression 528 False 528 0.164531 92.131148 1104.0 True -2.977980e-04 X 149000663 148500617 + 0.285532 3.813417 0.628403 True 0.657834 0
IDS ENSG00000010404 Gene Expression 802 False 802 0.169896 88.047690 1140.0 True -1.446279e-04 X 149521096 149476988 - 0.032241 0.070538 0.403367 True 0.708676 0
MTM1 ENSG00000171100 Gene Expression 1499 False 1499 0.387183 77.660209 2598.0 True -1.403791e-08 X 150673322 150568619 + 0.632687 2.800952 0.395020 True 0.686165 0
ARHGAP4 ENSG00000089820 Gene Expression 1585 False 1585 0.421162 76.378539 2826.0 True -4.290405e-09 X 153934999 153907367 - 0.057723 0.286018 0.456565 True 0.809695 0
CLIC2 ENSG00000155962 Gene Expression 2154 False 2154 0.570045 67.898659 3825.0 True -7.113040e-09 X 155334657 155276211 - 1.007027 2.217264 0.801841 True 0.743202 0

2000 rows × 25 columns

# saving this adata file
adata.write('./scanpy_analysis/E15.h5ad')
# next time you read this E15.h5ad into adata:
# adata = sc.read('./scanpy_analysis/E15.h5ad')