scVelo--3-

One bone marrow sample treated with erythropoeitin sample E16

Author: Shenfeng Qiu

# PYTHON 3.10 has compatability issues because collections module was changed
# use Python 3.9 or run the following:

import collections.abc
#cellrank needs the four following aliases to be done manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

!pip install numpy==1.21.5

!pip install scvelo

!pip install cellrank

!pip install python-igraph louvain

import scvelo as scv
import scanpy as sc
import cellrank as cr
import numpy as np
import pandas as pd
import anndata as ad
import pickle
import loompy as lmp

import igraph
import anndata
from scipy import io
from scipy.sparse import coo_matrix, csr_matrix
import os

scv.logging.print_version()

Running scvelo 0.2.4 (python 3.9.13) on 2023-07-28 21:47.

print(lmp.__version__)

3.0.7

# file path names
fpath = r'D:/Bioinformatics Projects/shalini_bm/E15'
fdata = fpath + r'/E15_sample.h5ad'
fl = fpath + r'/velocyto/E15.loom'

Load the Data

scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', facecolor='white', dpi=100, frameon=False)
cr.settings.verbosity = 2
scv.settings.presenter_view = True  # set max width size for presenter view

ldata = scv.read(fl, cache=True)

ldata

AnnData object with n_obs × n_vars = 6757 × 36601
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'

# inspect the ldata structure
# accessing a specific cell
ldata.obs_names[:5]

Index(['E15:AAAGTCCAGATTGATGx', 'E15:AAAGAACAGTCATGCTx',
       'E15:AACAAGAGTGCTTATGx', 'E15:AAATGGACACTCAAGTx',
       'E15:AAAGAACCATAGCACTx'],
      dtype='object', name='CellID')

ldata.obs.loc[‘AAAGTCCAGATTGATG’]

# accessing a specific gene
ldata.var.loc['CD44']

Accession     ENSG00000026508
Chromosome                 11
End                  35232402
Start                35138870
Strand                      +
Name: CD44, dtype: object

# access data from specific layer
ldata.layers['spliced']

<6757x36601 sparse matrix of type '<class 'numpy.uint16'>'
    with 15512493 stored elements in Compressed Sparse Row format>

#  access the data for a specific cell and gene in a specific layer

ldata.layers['matrix'][ldata.obs_names == 'AAAGAACCATAGCACT', ldata.var_names == 'CD44']

<1x0 sparse matrix of type '<class 'numpy.float32'>'
    with 0 stored elements in Compressed Sparse Row format>

scv.pl.proportions(ldata)

# scv.pl.proportions(ldata_m)

png

Preprocessing

def pp(path):
    adata = sc.read_10x_mtx(path)
    sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
    sc.pp.filter_genes(adata, min_cells=3) #get rid of genes that are found in fewer than 3 cells
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    lower_lim = np.quantile(adata.obs.n_genes_by_counts.values, .02)
    adata = adata[(adata.obs.n_genes_by_counts < upper_lim) & (adata.obs.n_genes_by_counts > lower_lim)]
    adata = adata[adata.obs.pct_counts_mt < 20]
    sc.pp.normalize_total(adata, target_sum=1e4) #normalize every cell to 10,000 UMI
    sc.pp.log1p(adata) #change to log counts
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) #these are default values
    adata.raw = adata #save raw data before processing values and further filtering
    adata = adata[:, adata.var.highly_variable] #filter highly variable
    sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) #Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed
    sc.pp.scale(adata, max_value=10) #scale each gene to unit variance
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
    sc.tl.leiden(adata, resolution = 0.25)
    sc.tl.umap(adata)
    return adata

#
import os
os.getcwd()

'D:\\Bioinformatics Projects\\shalini_bm'

#!pip3 install leidenalg

### ONLY RUN THIS ONCE. THE SECOND LINE OF CODE saves the output to whatever directory you specify
# adata = pp('/xdisk/sqiu/jessicakzhang/sc_exp4_mTBI/outs/filtered_feature_bc_matrix/')
# pickle.dump(adata, open(r'/xdisk/sqiu/jessicakzhang/adata.h5ad', 'wb'))

adata = pp('./E15/outs/filtered_feature_bc_matrix/')
pickle.dump(adata, open(r'./adata.h5ad', 'wb'))  ### write the .h5ad binary file to the working directory

adata

AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

# this loads the previously saved output. RUN THIS and change your directory to wherever you saved your output
adata = pickle.load(open(r'./adata.h5ad', 'rb'))

print(adata)

AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

    # Find the cells that express CD44.
    # This creates a boolean mask where True indicates cells that express CD44.
# mask = adata[:, 'CD44'].X > 0

    # Use the mask to subset the AnnData object.
# adata_cd44 = adata[mask]
# adata_cd44

adata.obs_names[:5]

Index(['AAACCCACATCGCTCT-1', 'AAACCCAGTAGGGTAC-1', 'AAACCCAGTGCATCTA-1',
       'AAACCCATCATAGAGA-1', 'AAACCCATCGGAAACG-1'],
      dtype='object')

adata.obs.loc['AAACCCACATCGCTCT']

adata.var.loc['CD44']

gene_ids                 ENSG00000026508
feature_types            Gene Expression
n_cells                             3339
mt                                 False
n_cells_by_counts                   3339
mean_counts                       1.5769
pct_dropout_by_counts           50.23845
total_counts                     10581.0
highly_variable                     True
means                           0.691778
dispersions                     1.210978
dispersions_norm                1.348397
mean                                 0.0
std                             0.592069
Name: CD44, dtype: object

# to access a specific leiden cluster:
adata_cluster0 = adata[adata.obs['leiden'] == '0']
adata_cluster0

View of AnnData object with n_obs × n_vars = 1290 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

    # select all cells within clusters 0 and 1, that has CD44 gene expression
    # Create boolean masks for the Leiden clusters and CD44 expression.
# mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')
# mask_cd44 = adata[:, 'CD44'].X.toarray().flatten() > 0

    # Combine the masks using the logical AND operator.
# mask_combined = mask_cluster & mask_cd44

    # Subset the AnnData object using the combined mask.
# adata_subset = adata[mask_combined]
# adata_subset

Merge the gene expression and loom data

adata = scv.utils.merge(adata, ldata)
# adata_m = scv.utils.merge(adata, ldata_m)

adata

AnnData object with n_obs × n_vars = 6335 × 3708
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'

scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

Filtered out 1320 genes that are detected 20 counts (shared).
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
Normalized count data: spliced, unspliced.
Extracted 2000 highly variable genes.
WARNING: Did not modify X as it looks preprocessed already.
computing neighbors
    finished (0:00:01) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)

# scv.pp.filter_and_normalize(adata_m)
# scv.pp.moments(adata_m)
adata

AnnData object with n_obs × n_vars = 6335 × 2000
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu'
    obsp: 'distances', 'connectivities'

# adata.obs

# adata.var

# adata.X

# adata.uns

sc.pl.pca_variance_ratio(adata, log=True)

png

sc.tl.tsne(adata)

sc.pl.tsne(adata)

png

!pip install python-louvain # not working

sc.pl.tsne(adata, color = ['leiden'])

png

sc.pl.umap(adata, color="leiden")

png

sc.pl.umap(adata, color="CD44")

png

# Now, create and save the plot
# SQ NOTE this creates a 'figures' folder in pwd

sc.pl.umap(adata, color=“leiden”, save=“umap_leiden_clusters.pdf”) sc.pl.umap(adata, color=“CD44”, save=“umap_CD44_clusters.pdf”)

Estimate RNA velocity

scv.tl.velocity(adata)

computing velocities
    finished (0:00:08) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)

scv.tl.velocity_graph(adata)

computing velocity graph (using 1/80 cores)



  0%|          | 0/6335 [00:00<?, ?cells/s]


    finished (0:00:50) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)

# scv.tl.velocity(adata_m)
# scv.tl.velocity_graph(adata_m)

scv.set_figure_params()
# scv.set_figure_params(dpi=200, color_map='viridis')

adata

AnnData object with n_obs × n_vars = 6335 × 2000
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors', 'velocity_params', 'velocity_graph', 'velocity_graph_neg'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu', 'velocity', 'variance_velocity'
    obsp: 'distances', 'connectivities'

adata.obsm

AxisArrays with keys: X_pca, X_umap

adata.layers['velocity']
# adata.uns['velocity_graph']

array([[ 0.16311729,  0.00325262,  0.02549965, ...,  0.2947519 ,
         0.24068618,  0.00166135],
       [ 0.05183516,  0.        , -0.00231888, ..., -0.03039856,
         0.        ,  0.62309027],
       [ 0.14193237,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.12683076],
       ...,
       [ 0.07127077, -0.00128302,  0.        , ...,  0.17017308,
         0.        ,  0.49752218],
       [ 0.06123393, -0.0016032 , -0.00131531, ...,  0.63884544,
         0.05677878,  0.        ],
       [ 0.139233  ,  0.03396539,  0.00363499, ...,  0.5291901 ,
         0.2595918 , -0.04816317]], dtype=float32)

adata.obs
# adata.obs['n_genes_by_counts']

	n_genes	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt	leiden	initial_size_spliced	initial_size_unspliced	initial_size	n_counts	velocity_self_transition
AAACCCACATCGCTCT	6772	6770	32469.0	1621.0	4.992455	2	14938	11611	14938.0	728.407471	0.353792
AAACCCAGTAGGGTAC	1508	1507	6036.0	96.0	1.590457	5	3632	1872	3632.0	-61.993500	0.237123
AAACCCAGTGCATCTA	1735	1735	6346.0	165.0	2.600063	1	4681	876	4681.0	-484.200470	0.272838
AAACCCATCATAGAGA	4500	4500	20373.0	1340.0	6.577333	0	13036	3108	13036.0	-580.061584	0.223742
AAACCCATCGGAAACG	2396	2395	4683.0	325.0	6.939996	7	1787	1950	1787.0	893.204102	0.365823
…	…	…	…	…	…	…	…	…	…	…	…
TTTGTTGAGACCAACG	5686	5685	24267.0	2669.0	10.998476	1	12288	5878	12288.0	-473.220093	0.226389
TTTGTTGAGCTGGTGA	3300	3300	13835.0	872.0	6.302855	0	9332	1777	9332.0	-536.107422	0.264881
TTTGTTGAGGGACACT	1187	1187	3105.0	36.0	1.159420	0	1763	1064	1763.0	-268.330414	0.173882
TTTGTTGGTCTCCCTA	3784	3783	8852.0	240.0	2.711252	7	4014	3374	4014.0	1190.949463	0.403490
TTTGTTGTCTACACAG	4755	4755	13093.0	391.0	2.986329	2	6323	4428	6323.0	812.005249	0.402153

6335 rows × 11 columns

adata.var

	gene_ids	feature_types	n_cells	mt	n_cells_by_counts	mean_counts	pct_dropout_by_counts	total_counts	highly_variable	means	…	std	Accession	Chromosome	End	Start	Strand	velocity_gamma	velocity_qreg_ratio	velocity_r2	velocity_genes
C1orf159	ENSG00000131591	Gene Expression	1568	False	1568	0.349180	76.631893	2343.0	True	-7.512550e-04	…	0.256349	ENSG00000131591	1	1116361	1081818	-	0.960242	0.960242	-1.056696	False
ACAP3	ENSG00000131584	Gene Expression	745	False	745	0.124590	88.897168	836.0	True	-2.124832e-03	…	0.184853	ENSG00000131584	1	1309609	1292390	-	0.013770	0.152219	0.288276	True
MIB2	ENSG00000197530	Gene Expression	906	False	906	0.155589	86.497765	1044.0	True	-1.027507e-03	…	0.198915	ENSG00000197530	1	1630610	1615415	+	0.044242	0.044242	-0.414496	False
KLHL21	ENSG00000162413	Gene Expression	1283	False	1283	0.230253	80.879285	1545.0	True	-1.395323e-08	…	0.284522	ENSG00000162413	1	6614607	6590724	-	0.809669	0.809669	-0.450083	False
ENO1	ENSG00000074800	Gene Expression	3961	False	3961	6.071833	40.968703	40742.0	True	-6.604966e-09	…	0.890816	ENSG00000074800	1	8879190	8861000	-	0.028938	0.038693	0.764427	True
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
AFF2	ENSG00000155966	Gene Expression	528	False	528	0.164531	92.131148	1104.0	True	-2.977980e-04	…	0.294817	ENSG00000155966	X	149000663	148500617	+	0.285532	3.813417	0.628403	True
IDS	ENSG00000010404	Gene Expression	802	False	802	0.169896	88.047690	1140.0	True	-1.446279e-04	…	0.247148	ENSG00000010404	X	149521096	149476988	-	0.032241	0.070538	0.403367	True
MTM1	ENSG00000171100	Gene Expression	1499	False	1499	0.387183	77.660209	2598.0	True	-1.403791e-08	…	0.318311	ENSG00000171100	X	150673322	150568619	+	0.632687	2.800952	0.395020	True
ARHGAP4	ENSG00000089820	Gene Expression	1585	False	1585	0.421162	76.378539	2826.0	True	-4.290405e-09	…	0.314576	ENSG00000089820	X	153934999	153907367	-	0.057723	0.286018	0.456565	True
CLIC2	ENSG00000155962	Gene Expression	2154	False	2154	0.570045	67.898659	3825.0	True	-7.113040e-09	…	0.440094	ENSG00000155962	X	155334657	155276211	-	1.007027	2.217264	0.801841	True

2000 rows × 23 columns

# adata_m.obs

# fetch data from adata cluster 0 and 1, number of genes and make a histogram plot
import matplotlib.pyplot as plt

# Create boolean mask for clusters
mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')

# Subset the data
adata_subset = adata[mask_cluster]

# Get number of genes for each cell
num_genes = adata_subset.obs['n_genes']

# Create histogram
plt.hist(num_genes, bins=30, alpha=0.75)

# Set the title and labels
plt.title('Histogram of Number of Genes')
plt.xlabel('Number of Genes')
plt.ylabel('Number of Cells')

# Show the plot
plt.show()

png

# find gene markers for each of the detected leiden clusters
# Perform the ranking
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# The result is stored in adata.uns['rank_genes_groups']
result = adata.uns['rank_genes_groups']

# Initialize a list to hold dataframes for each cluster
df_list = []

# Iterate over clusters
for group in result['names'].dtype.names:
    # Create a DataFrame for the current cluster
    df = pd.DataFrame({    
        'Genes': result['names'][group],
        'Scores': result['scores'][group],
        'Logfoldchanges': result['logfoldchanges'][group],
        'pvals': result['pvals'][group],
        'pvals_adj': result['pvals_adj'][group],
    })
    
    # Add a column for the cluster
    df['Cluster'] = group

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all dataframes
df_all = pd.concat(df_list, ignore_index=True)

# Write DataFrame to a csv file
df_all.to_csv('scanpy_marker_genes.csv', index=False)
df_all

	Genes	Scores	Logfoldchanges	pvals	pvals_adj	Cluster
0	NUSAP1	48.397820	2.678382	0.000000e+00	0.000000	0
1	TOP2A	47.867912	2.843289	0.000000e+00	0.000000	0
2	CENPE	45.637989	2.997186	0.000000e+00	0.000000	0
3	DLGAP5	44.949627	2.646727	0.000000e+00	0.000000	0
4	HMGB2	44.792137	2.093099	0.000000e+00	0.000000	0
…	…	…	…	…	…	…
289483	MT-CO3	-5.545708	-3.068403	2.927671e-08	0.000014	11
289484	RPL6	-5.575259	-3.883544	2.471614e-08	0.000014	11
289485	RPS15A	-5.584476	-3.632532	2.344061e-08	0.000014	11
289486	RPS29	-5.653746	-3.751123	1.569881e-08	0.000012	11
289487	MT-CO1	-5.927827	-4.364321	3.069686e-09	0.000004	11

289488 rows × 6 columns

df_all_f = df_all[(df_all.pvals_adj < 0.01)&(abs(df_all.Logfoldchanges)>2)]
df_all_f

	Genes	Scores	Logfoldchanges	pvals	pvals_adj	Cluster
0	NUSAP1	48.397820	2.678382	0.000000e+00	0.000000	0
1	TOP2A	47.867912	2.843289	0.000000e+00	0.000000	0
2	CENPE	45.637989	2.997186	0.000000e+00	0.000000	0
3	DLGAP5	44.949627	2.646727	0.000000e+00	0.000000	0
4	HMGB2	44.792137	2.093099	0.000000e+00	0.000000	0
…	…	…	…	…	…	…
289483	MT-CO3	-5.545708	-3.068403	2.927671e-08	0.000014	11
289484	RPL6	-5.575259	-3.883544	2.471614e-08	0.000014	11
289485	RPS15A	-5.584476	-3.632532	2.344061e-08	0.000014	11
289486	RPS29	-5.653746	-3.751123	1.569881e-08	0.000012	11
289487	MT-CO1	-5.927827	-4.364321	3.069686e-09	0.000004	11

24668 rows × 6 columns

df_all_f.to_csv('scvelo_leiden_markers.csv')

cluster_list = adata.obs.leiden.unique()
cluster_list

['2', '5', '1', '0', '7', ..., '3', '4', '9', '10', '11']
Length: 12
Categories (12, object): ['0', '1', '2', '3', ..., '8', '9', '10', '11']

Manual annotation will be done by SS.

Alternatively, map to the Azimuth human bone marrow reference dataset

# Make cluster anottation dictionary
annotation = {"Ctype0":[0],
              "Ctype1":[1],
              "Ctype2":[2],
              "Ctype3":[3,11],
              "Ctype4":[4],
              "Ctype5":[5],
              "Ctype6":[6],
              "Ctype7":[7],
              "Ctype8":[8],
              "Ctype9":[9],
              "Ctype10":[10]}

# Change dictionary format
annotation_rev = {}
for i in cluster_list:
    for k in annotation:
        if int(i) in annotation[k]:
            annotation_rev[i] = k

# Check dictionary
annotation_rev

{'2': 'Ctype2',
 '5': 'Ctype5',
 '1': 'Ctype1',
 '0': 'Ctype0',
 '7': 'Ctype7',
 '6': 'Ctype6',
 '8': 'Ctype8',
 '3': 'Ctype3',
 '4': 'Ctype4',
 '9': 'Ctype9',
 '10': 'Ctype10',
 '11': 'Ctype3'}

adata.obs["cell_type"] = [annotation_rev[i] for i in adata.obs.leiden]

sc.pl.rank_genes_groups(adata, n_genes=10, groupby='leiden', sharey=False)

png

results = adata.uns['rank_genes_groups']
results['names']['0']

array(['NUSAP1', 'TOP2A', 'SLC4A1', 'HEMGN', 'HMGB2', 'MKI67', 'DLGAP5',
       'UBE2S', 'TPX2', 'H1F0'], dtype=object)

results['names'].dtype.names

('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11')

scv.pl.velocity_embedding(adata, basis = 'umap', color = 'leiden')
# scv.pl.velocity_embedding(adata_m, basis = 'umap', color = 'leiden')

computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)

png

scv.pl.velocity_embedding_grid(adata, basis = 'umap', color = 'leiden')
#scv.pl.velocity_embedding_grid(adata_m, basis = 'umap', color = 'leiden')

png

scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_umap.pdf')
#scv.pl.velocity_embedding_stream(adata_m, basis = 'umap', color = 'leiden')
# figure cannot be saved as pdf, using png instead.
# saving figure to file ./figures/scvelo_velocity_embedding_stream.png

figure cannot be saved as pdf, using png instead.
saving figure to file ./figures/scvelo_velocity_embedding_stream_umap.png

png

scv.pl.velocity_embedding_stream(adata, basis = 'tsne', color = 'leiden', save='velocity_embedding_stream_tsne.pdf')

computing velocity embedding
    finished (0:00:01) --> added
    'velocity_tsne', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_tsne.pdf

png

scv.pl.velocity_embedding_stream(adata, basis='tsne', color = 'cell_type', legend_loc='on data')

png

adata.obs.cell_type.unique()

['Ctype2', 'Ctype5', 'Ctype1', 'Ctype0', 'Ctype7', ..., 'Ctype8', 'Ctype3', 'Ctype4', 'Ctype9', 'Ctype10']
Length: 11
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']

cell_of_interest = adata.obs.index[~adata.obs.cell_type.isin(["Ctype0", "Ctype5"])]
adata_ss1 = adata[cell_of_interest, :] # subset 1, NOT include Ctype0 and Ctype5 cell types

scv.pl.velocity_embedding_stream(adata_ss1, basis='tsne', color = 'cell_type', legend_loc='right margin')

png

sc.pl.tsne(adata, color=['cell_type'])

png

scv.pl.scatter(adata, 'CD44', color=['cell_type', 'velocity'])

png

scv.pl.velocity_embedding(adata, arrow_length=3, arrow_size=2, dpi=120)

png

scv.tl.rank_velocity_genes(adata, groupby = 'leiden', min_corr=0.3)

ranking velocity genes
    finished (0:00:08) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)

df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()

	0	1	2	3	4	5	6	7	8	9	10	11
0	KIF14	NETO2	PLCB1	NOL4L	SLC16A9	PPME1	MYLIP	IKZF2	HDAC9	UBR2	ELK3	ZNF462
1	CD36	NFIA	ATP8B4	MAST4	ACSBG1	GYPE	TLN1	SLC39A11	CBL	DAPK1	GPR183	KCNQ1
2	ASPM	E2F7	ETV6	LCP2	KIAA1211	FOXO3	RBM38	PALM2-AKAP2	WDFY2	CLIC2	CAMK1	FOSB
3	CENPE	SLC25A21	PLAGL1	STON2	ZDHHC14	OSBP2	HIST1H2BJ	PLD1	DOCK10	KIAA0232	FMO5	FAM30A
4	CCNB1	BMPR2	KAT6B	P2RX1	PVT1	SLC14A1	HIST1H3J	ST3GAL4	DISC1	TMCC2	LGALS3	ZDHHC19

#### CHANGE THE GENES TO SEE DIFFERENT RNA VELOCITY ANALYSES Look at output 88 (cell above) to find some interesting genes
scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'],basis='tsne')

png

scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'])

png

adata.obs.cell_type

AAACCCACATCGCTCT    Ctype2
AAACCCAGTAGGGTAC    Ctype5
AAACCCAGTGCATCTA    Ctype1
AAACCCATCATAGAGA    Ctype0
AAACCCATCGGAAACG    Ctype7
                     ...  
TTTGTTGAGACCAACG    Ctype1
TTTGTTGAGCTGGTGA    Ctype0
TTTGTTGAGGGACACT    Ctype0
TTTGTTGGTCTCCCTA    Ctype7
TTTGTTGTCTACACAG    Ctype2
Name: cell_type, Length: 6335, dtype: category
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']

scv.tl.rank_velocity_genes(adata, groupby='cell_type', min_corr=.3)

df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()

ranking velocity genes
    finished (0:00:04) --> added 
    'rank_velocity_genes', sorted scores by group ids (adata.uns) 
    'spearmans_score', spearmans correlation scores (adata.var)

	Ctype0	Ctype1	Ctype10	Ctype2	Ctype3	Ctype4	Ctype5	Ctype6	Ctype7	Ctype8	Ctype9
0	RHD	OSBPL6	TCF4	PLAGL1	TTC7B	LDLRAD4	TANK	HIST1H2BJ	IKZF2	DOCK10	WDR26
1	EPB42	DAAM1	IMMP2L	ANTXR2	CLIP2	ACACA	EPB42	MAP1LC3B	PLD1	DISC1	CREBRF
2	HIST1H2BJ	FAM111B	SKAP1	DOCK5	WWC2	NTN1	GPCPD1	SLC7A5	GAB2	MYO1F	RNF19A
3	KDM7A	MDM1	DNAH14	GNG7	TNIK	KIF2A	KDM7A	TENT5C	SLC24A3	AKAP13	GPCPD1
4	TPM1	HBM	ARPC1B	ERG	SUSD1	CCSER1	RNF19A	REL	PAPSS1	ARHGAP30	ULK1

df.to_csv('scvelo_df_leiden_rank_velocity_genes.csv')

kwargs = dict(frameon=False, size=10, linewidth=1.5,
              add_outline='Ctype0')

scv.pl.scatter(adata, df['Ctype0'][:10], ylabel='Ctype0', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype1'][:10], ylabel='Ctype1', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype3'][:10], ylabel='Ctype3',color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype9'][:10], ylabel='Ctype9', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype5'][:10], ylabel='Ctype5', color=['cell_type'], **kwargs)

png

png

png

png

png

scv.tl.velocity_confidence(adata)
keys = 'velocity_length', 'velocity_confidence'
scv.pl.scatter(adata, c = keys, cmap='coolwarm', perc = [5,95])

--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
--> added 'velocity_confidence_transition' (adata.obs)

png

adata.obs

	n_genes	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt	leiden	initial_size_spliced	initial_size_unspliced	initial_size	n_counts	velocity_self_transition	velocity_length	velocity_confidence	velocity_confidence_transition	root_cells	end_points	velocity_pseudotime
AAACCCACATCGCTCT	6772	6770	32469.0	1621.0	4.992455	2	14938	11611	14938.0	863.637268	0.383647	30.610001	0.948891	-0.027396	0.031412	0.118727	0.317202
AAACCCAGTAGGGTAC	1508	1507	6036.0	96.0	1.590457	5	3632	1872	3632.0	-106.495049	0.216999	44.320000	0.972637	0.156213	0.000064	0.048307	0.228780
AAACCCAGTGCATCTA	1735	1735	6346.0	165.0	2.600063	1	4681	876	4681.0	-547.307556	0.168688	27.280001	0.939039	0.121379	0.022585	0.000116	0.059130
AAACCCATCATAGAGA	4500	4500	20373.0	1340.0	6.577333	0	13036	3108	13036.0	-707.811035	0.253844	24.570000	0.977568	0.072128	0.001643	0.006620	0.031518
AAACCCATCGGAAACG	2396	2395	4683.0	325.0	6.939996	7	1787	1950	1787.0	920.113159	0.342233	41.799999	0.941867	0.073386	0.001660	0.100762	0.239823
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
TTTGTTGAGACCAACG	5686	5685	24267.0	2669.0	10.998476	1	12288	5878	12288.0	-572.973083	0.286407	29.740000	0.963285	0.033437	0.001004	0.031958	0.040667
TTTGTTGAGCTGGTGA	3300	3300	13835.0	872.0	6.302855	0	9332	1777	9332.0	-614.256470	0.139380	29.740000	0.983455	0.158749	0.000351	0.004965	0.026805
TTTGTTGAGGGACACT	1187	1187	3105.0	36.0	1.159420	0	1763	1064	1763.0	-232.348145	0.186577	28.490000	0.954237	0.160944	0.000534	0.000141	0.038064
TTTGTTGGTCTCCCTA	3784	3783	8852.0	240.0	2.711252	7	4014	3374	4014.0	1288.671997	0.398848	39.500000	0.957163	-0.043992	0.001771	0.109230	0.241569
TTTGTTGTCTACACAG	4755	4755	13093.0	391.0	2.986329	2	6323	4428	6323.0	806.780151	0.416695	29.190001	0.961888	0.014598	0.010415	0.276882	0.350445

6335 rows × 17 columns

df = adata.obs.groupby('cell_type')[keys].mean().T
df.style.background_gradient(cmap='coolwarm', axis=1)

cell_type	Ctype0	Ctype1	Ctype10	Ctype2	Ctype3	Ctype4	Ctype5	Ctype6	Ctype7	Ctype8	Ctype9
velocity_length	28.376411	25.740473	24.188236	25.939058	25.545404	22.209726	32.450985	18.562965	32.190620	26.416279	38.004265
velocity_confidence	0.982803	0.977395	0.932590	0.951401	0.932277	0.976059	0.972348	0.911105	0.935803	0.951495	0.966852

scv.pl.paga(adata, basis='tsne', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)

WARNING: Invalid color key. Using grey instead.

png

scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color = 'velocity_pseudotime')

png

## THIS IS VERY TIME AND COMPUTATIONALLY CONSUMING
## WILL TAKE 30min TO RUN; it will take 20 of the 80 cores on WFH-7920 computer to run, with CPU fans on a lot!
scv.tl.recover_dynamics(adata, n_jobs = 20)
scv.tl.velocity(adata, mode = 'dynamical')
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_2.png')

recovering dynamics (using 20/80 cores)



  0%|          | 0/1415 [00:00<?, ?gene/s]


    finished (0:02:46) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)
computing velocities
    finished (0:00:10) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/80 cores)



  0%|          | 0/6335 [00:00<?, ?cells/s]


    finished (0:00:17) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
    finished (0:00:01) --> added
    'velocity_umap', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_2.png

png

scv.tl.latent_time(adata)
scv.pl.scatter(adata, color = 'latent_time', color_map = 'gnuplot', size = 80, save='latent_time.pdf')

computing latent time using root_cells as prior
    finished (0:00:02) --> added 
    'latent_time', shared time (adata.obs)
saving figure to file ./figures/scvelo_latent_time.pdf

png

top_genes = adata.var['velocity_qreg_ratio'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='cell_type', n_convolve=100)

png

var_names = ['CD44', 'STOX2', 'PID1', 'PLXDC2']
scv.pl.scatter(adata, var_names, color = ['cell_type'], frameon=False)
scv.pl.scatter(adata, x='latent_time', color = ['cell_type'],y=var_names, frameon=False)

png

#Cluster-specific top-likelihood genes
#Moreover, partial gene likelihoods can be computed for a each cluster of cells to enable cluster-specific identification of potential drivers.

scv.tl.rank_dynamical_genes(adata, groupby='cell_type')
df = scv.get_df(adata, 'rank_dynamical_genes/names')
df.head(5)

ranking genes by cluster-specific likelihoods
    finished (0:00:07) --> added 
    'rank_dynamical_genes', sorted scores by group ids (adata.uns)

	Ctype0	Ctype1	Ctype10	Ctype2	Ctype3	Ctype4	Ctype5	Ctype6	Ctype7	Ctype8	Ctype9
0	SLC4A1	LGALS3	CDK6	MPO	CDK6	CDK6	HBM	HBM	MS4A2	CD1C	EPB42
1	NFIA	GYPB	FAM117A	ATP8B4	KALRN	DLGAP5	SLC4A1	EPB42	CDK6	SLC8A1	ISG20
2	HBM	ISG20	CENPE	PTPRC	PALM2-AKAP2	MYB	TCP11L2	ISG20	ENO1	LY86	SLC4A1
3	NETO2	SLC25A21	AURKA	MACF1	SELP	NFIA	CR1L	GYPB	FABP5	CLEC10A	HBM
4	GYPB	NFIA	IGSF6	ANGPT1	TC2N	SLC25A21	SEC62	NFIA	PRKACB	PKIB	OSBP2

adata.var

	gene_ids	feature_types	n_cells	mt	n_cells_by_counts	mean_counts	pct_dropout_by_counts	total_counts	highly_variable	means	…	Chromosome	End	Start	Strand	velocity_gamma	velocity_qreg_ratio	velocity_r2	velocity_genes	spearmans_score	velocity_score
C1orf159	ENSG00000131591	Gene Expression	1568	False	1568	0.349180	76.631893	2343.0	True	-7.512550e-04	…	1	1116361	1081818	-	0.960242	0.960242	-1.056696	False	0.454777	0
ACAP3	ENSG00000131584	Gene Expression	745	False	745	0.124590	88.897168	836.0	True	-2.124832e-03	…	1	1309609	1292390	-	0.013770	0.152219	0.288276	True	0.275960	0
MIB2	ENSG00000197530	Gene Expression	906	False	906	0.155589	86.497765	1044.0	True	-1.027507e-03	…	1	1630610	1615415	+	0.044242	0.044242	-0.414496	False	0.127040	0
KLHL21	ENSG00000162413	Gene Expression	1283	False	1283	0.230253	80.879285	1545.0	True	-1.395323e-08	…	1	6614607	6590724	-	0.809669	0.809669	-0.450083	False	0.000000	0
ENO1	ENSG00000074800	Gene Expression	3961	False	3961	6.071833	40.968703	40742.0	True	-6.604966e-09	…	1	8879190	8861000	-	0.028938	0.038693	0.764427	True	0.851996	0
…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…	…
AFF2	ENSG00000155966	Gene Expression	528	False	528	0.164531	92.131148	1104.0	True	-2.977980e-04	…	X	149000663	148500617	+	0.285532	3.813417	0.628403	True	0.657834	0
IDS	ENSG00000010404	Gene Expression	802	False	802	0.169896	88.047690	1140.0	True	-1.446279e-04	…	X	149521096	149476988	-	0.032241	0.070538	0.403367	True	0.708676	0
MTM1	ENSG00000171100	Gene Expression	1499	False	1499	0.387183	77.660209	2598.0	True	-1.403791e-08	…	X	150673322	150568619	+	0.632687	2.800952	0.395020	True	0.686165	0
ARHGAP4	ENSG00000089820	Gene Expression	1585	False	1585	0.421162	76.378539	2826.0	True	-4.290405e-09	…	X	153934999	153907367	-	0.057723	0.286018	0.456565	True	0.809695	0
CLIC2	ENSG00000155962	Gene Expression	2154	False	2154	0.570045	67.898659	3825.0	True	-7.113040e-09	…	X	155334657	155276211	-	1.007027	2.217264	0.801841	True	0.743202	0

2000 rows × 25 columns

# saving this adata file
adata.write('./scanpy_analysis/E15.h5ad')
# next time you read this E15.h5ad into adata:
# adata = sc.read('./scanpy_analysis/E15.h5ad')