scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', facecolor='white', dpi=100, frameon=False)
cr.settings.verbosity = 2
scv.settings.presenter_view = True # set max width size for presenter view
ldata = scv.read(fl, cache=True)
ldata
AnnData object with n_obs × n_vars = 6757 × 36601
var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
# inspect the ldata structure
# accessing a specific cell
ldata.obs_names[:5]
Index(['E15:AAAGTCCAGATTGATGx', 'E15:AAAGAACAGTCATGCTx',
'E15:AACAAGAGTGCTTATGx', 'E15:AAATGGACACTCAAGTx',
'E15:AAAGAACCATAGCACTx'],
dtype='object', name='CellID')
ldata.obs.loc[‘AAAGTCCAGATTGATG’]
# accessing a specific gene
ldata.var.loc['CD44']
Accession ENSG00000026508
Chromosome 11
End 35232402
Start 35138870
Strand +
Name: CD44, dtype: object
# access data from specific layer
ldata.layers['spliced']
<6757x36601 sparse matrix of type '<class 'numpy.uint16'>'
with 15512493 stored elements in Compressed Sparse Row format>
# access the data for a specific cell and gene in a specific layer
ldata.layers['matrix'][ldata.obs_names == 'AAAGAACCATAGCACT', ldata.var_names == 'CD44']
<1x0 sparse matrix of type '<class 'numpy.float32'>'
with 0 stored elements in Compressed Sparse Row format>
scv.pl.proportions(ldata)
# scv.pl.proportions(ldata_m)
def pp(path):
adata = sc.read_10x_mtx(path)
sc.pp.filter_cells(adata, min_genes=200) #get rid of cells with fewer than 200 genes
sc.pp.filter_genes(adata, min_cells=3) #get rid of genes that are found in fewer than 3 cells
adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
lower_lim = np.quantile(adata.obs.n_genes_by_counts.values, .02)
adata = adata[(adata.obs.n_genes_by_counts < upper_lim) & (adata.obs.n_genes_by_counts > lower_lim)]
adata = adata[adata.obs.pct_counts_mt < 20]
sc.pp.normalize_total(adata, target_sum=1e4) #normalize every cell to 10,000 UMI
sc.pp.log1p(adata) #change to log counts
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) #these are default values
adata.raw = adata #save raw data before processing values and further filtering
adata = adata[:, adata.var.highly_variable] #filter highly variable
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) #Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed
sc.pp.scale(adata, max_value=10) #scale each gene to unit variance
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
sc.tl.leiden(adata, resolution = 0.25)
sc.tl.umap(adata)
return adata
#
import os
os.getcwd()
'D:\\Bioinformatics Projects\\shalini_bm'
#!pip3 install leidenalg
### ONLY RUN THIS ONCE. THE SECOND LINE OF CODE saves the output to whatever directory you specify
# adata = pp('/xdisk/sqiu/jessicakzhang/sc_exp4_mTBI/outs/filtered_feature_bc_matrix/')
# pickle.dump(adata, open(r'/xdisk/sqiu/jessicakzhang/adata.h5ad', 'wb'))
adata = pp('./E15/outs/filtered_feature_bc_matrix/')
pickle.dump(adata, open(r'./adata.h5ad', 'wb')) ### write the .h5ad binary file to the working directory
adata
AnnData object with n_obs × n_vars = 6335 × 3708
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
# this loads the previously saved output. RUN THIS and change your directory to wherever you saved your output
adata = pickle.load(open(r'./adata.h5ad', 'rb'))
print(adata)
AnnData object with n_obs × n_vars = 6335 × 3708
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
# Find the cells that express CD44.
# This creates a boolean mask where True indicates cells that express CD44.
# mask = adata[:, 'CD44'].X > 0
# Use the mask to subset the AnnData object.
# adata_cd44 = adata[mask]
# adata_cd44
adata.obs_names[:5]
Index(['AAACCCACATCGCTCT-1', 'AAACCCAGTAGGGTAC-1', 'AAACCCAGTGCATCTA-1',
'AAACCCATCATAGAGA-1', 'AAACCCATCGGAAACG-1'],
dtype='object')
adata.obs.loc['AAACCCACATCGCTCT']
adata.var.loc['CD44']
gene_ids ENSG00000026508
feature_types Gene Expression
n_cells 3339
mt False
n_cells_by_counts 3339
mean_counts 1.5769
pct_dropout_by_counts 50.23845
total_counts 10581.0
highly_variable True
means 0.691778
dispersions 1.210978
dispersions_norm 1.348397
mean 0.0
std 0.592069
Name: CD44, dtype: object
# to access a specific leiden cluster:
adata_cluster0 = adata[adata.obs['leiden'] == '0']
adata_cluster0
View of AnnData object with n_obs × n_vars = 1290 × 3708
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
obsp: 'distances', 'connectivities'
# select all cells within clusters 0 and 1, that has CD44 gene expression
# Create boolean masks for the Leiden clusters and CD44 expression.
# mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')
# mask_cd44 = adata[:, 'CD44'].X.toarray().flatten() > 0
# Combine the masks using the logical AND operator.
# mask_combined = mask_cluster & mask_cd44
# Subset the AnnData object using the combined mask.
# adata_subset = adata[mask_combined]
# adata_subset
adata = scv.utils.merge(adata, ldata)
# adata_m = scv.utils.merge(adata, ldata_m)
adata
AnnData object with n_obs × n_vars = 6335 × 3708
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
layers: 'ambiguous', 'matrix', 'spliced', 'unspliced'
obsp: 'distances', 'connectivities'
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
Filtered out 1320 genes that are detected 20 counts (shared).
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
Normalized count data: spliced, unspliced.
Extracted 2000 highly variable genes.
WARNING: Did not modify X as it looks preprocessed already.
computing neighbors
finished (0:00:01) --> added
'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
finished (0:00:00) --> added
'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
# scv.pp.filter_and_normalize(adata_m)
# scv.pp.moments(adata_m)
adata
AnnData object with n_obs × n_vars = 6335 × 2000
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu'
obsp: 'distances', 'connectivities'
# adata.obs
# adata.var
# adata.X
# adata.uns
sc.pl.pca_variance_ratio(adata, log=True)
sc.tl.tsne(adata)
sc.pl.tsne(adata)
!pip install python-louvain # not working
sc.pl.tsne(adata, color = ['leiden'])
sc.pl.umap(adata, color="leiden")
sc.pl.umap(adata, color="CD44")
# Now, create and save the plot
# SQ NOTE this creates a 'figures' folder in pwd
sc.pl.umap(adata, color=“leiden”, save=“umap_leiden_clusters.pdf”) sc.pl.umap(adata, color=“CD44”, save=“umap_CD44_clusters.pdf”)
scv.tl.velocity(adata)
computing velocities
finished (0:00:08) --> added
'velocity', velocity vectors for each individual cell (adata.layers)
scv.tl.velocity_graph(adata)
computing velocity graph (using 1/80 cores)
0%| | 0/6335 [00:00<?, ?cells/s]
finished (0:00:50) --> added
'velocity_graph', sparse matrix with cosine correlations (adata.uns)
# scv.tl.velocity(adata_m)
# scv.tl.velocity_graph(adata_m)
scv.set_figure_params()
# scv.set_figure_params(dpi=200, color_map='viridis')
adata
AnnData object with n_obs × n_vars = 6335 × 2000
obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'initial_size_spliced', 'initial_size_unspliced', 'initial_size', 'n_counts', 'velocity_self_transition'
var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors', 'velocity_params', 'velocity_graph', 'velocity_graph_neg'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu', 'velocity', 'variance_velocity'
obsp: 'distances', 'connectivities'
adata.obsm
AxisArrays with keys: X_pca, X_umap
adata.layers['velocity']
# adata.uns['velocity_graph']
array([[ 0.16311729, 0.00325262, 0.02549965, ..., 0.2947519 ,
0.24068618, 0.00166135],
[ 0.05183516, 0. , -0.00231888, ..., -0.03039856,
0. , 0.62309027],
[ 0.14193237, 0. , 0. , ..., 0. ,
0. , 0.12683076],
...,
[ 0.07127077, -0.00128302, 0. , ..., 0.17017308,
0. , 0.49752218],
[ 0.06123393, -0.0016032 , -0.00131531, ..., 0.63884544,
0.05677878, 0. ],
[ 0.139233 , 0.03396539, 0.00363499, ..., 0.5291901 ,
0.2595918 , -0.04816317]], dtype=float32)
adata.obs
# adata.obs['n_genes_by_counts']
| n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | leiden | initial_size_spliced | initial_size_unspliced | initial_size | n_counts | velocity_self_transition | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCACATCGCTCT | 6772 | 6770 | 32469.0 | 1621.0 | 4.992455 | 2 | 14938 | 11611 | 14938.0 | 728.407471 | 0.353792 |
| AAACCCAGTAGGGTAC | 1508 | 1507 | 6036.0 | 96.0 | 1.590457 | 5 | 3632 | 1872 | 3632.0 | -61.993500 | 0.237123 |
| AAACCCAGTGCATCTA | 1735 | 1735 | 6346.0 | 165.0 | 2.600063 | 1 | 4681 | 876 | 4681.0 | -484.200470 | 0.272838 |
| AAACCCATCATAGAGA | 4500 | 4500 | 20373.0 | 1340.0 | 6.577333 | 0 | 13036 | 3108 | 13036.0 | -580.061584 | 0.223742 |
| AAACCCATCGGAAACG | 2396 | 2395 | 4683.0 | 325.0 | 6.939996 | 7 | 1787 | 1950 | 1787.0 | 893.204102 | 0.365823 |
| … | … | … | … | … | … | … | … | … | … | … | … |
| TTTGTTGAGACCAACG | 5686 | 5685 | 24267.0 | 2669.0 | 10.998476 | 1 | 12288 | 5878 | 12288.0 | -473.220093 | 0.226389 |
| TTTGTTGAGCTGGTGA | 3300 | 3300 | 13835.0 | 872.0 | 6.302855 | 0 | 9332 | 1777 | 9332.0 | -536.107422 | 0.264881 |
| TTTGTTGAGGGACACT | 1187 | 1187 | 3105.0 | 36.0 | 1.159420 | 0 | 1763 | 1064 | 1763.0 | -268.330414 | 0.173882 |
| TTTGTTGGTCTCCCTA | 3784 | 3783 | 8852.0 | 240.0 | 2.711252 | 7 | 4014 | 3374 | 4014.0 | 1190.949463 | 0.403490 |
| TTTGTTGTCTACACAG | 4755 | 4755 | 13093.0 | 391.0 | 2.986329 | 2 | 6323 | 4428 | 6323.0 | 812.005249 | 0.402153 |
6335 rows × 11 columns
adata.var
| gene_ids | feature_types | n_cells | mt | n_cells_by_counts | mean_counts | pct_dropout_by_counts | total_counts | highly_variable | means | … | std | Accession | Chromosome | End | Start | Strand | velocity_gamma | velocity_qreg_ratio | velocity_r2 | velocity_genes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| C1orf159 | ENSG00000131591 | Gene Expression | 1568 | False | 1568 | 0.349180 | 76.631893 | 2343.0 | True | -7.512550e-04 | … | 0.256349 | ENSG00000131591 | 1 | 1116361 | 1081818 | - | 0.960242 | 0.960242 | -1.056696 | False |
| ACAP3 | ENSG00000131584 | Gene Expression | 745 | False | 745 | 0.124590 | 88.897168 | 836.0 | True | -2.124832e-03 | … | 0.184853 | ENSG00000131584 | 1 | 1309609 | 1292390 | - | 0.013770 | 0.152219 | 0.288276 | True |
| MIB2 | ENSG00000197530 | Gene Expression | 906 | False | 906 | 0.155589 | 86.497765 | 1044.0 | True | -1.027507e-03 | … | 0.198915 | ENSG00000197530 | 1 | 1630610 | 1615415 | + | 0.044242 | 0.044242 | -0.414496 | False |
| KLHL21 | ENSG00000162413 | Gene Expression | 1283 | False | 1283 | 0.230253 | 80.879285 | 1545.0 | True | -1.395323e-08 | … | 0.284522 | ENSG00000162413 | 1 | 6614607 | 6590724 | - | 0.809669 | 0.809669 | -0.450083 | False |
| ENO1 | ENSG00000074800 | Gene Expression | 3961 | False | 3961 | 6.071833 | 40.968703 | 40742.0 | True | -6.604966e-09 | … | 0.890816 | ENSG00000074800 | 1 | 8879190 | 8861000 | - | 0.028938 | 0.038693 | 0.764427 | True |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| AFF2 | ENSG00000155966 | Gene Expression | 528 | False | 528 | 0.164531 | 92.131148 | 1104.0 | True | -2.977980e-04 | … | 0.294817 | ENSG00000155966 | X | 149000663 | 148500617 | + | 0.285532 | 3.813417 | 0.628403 | True |
| IDS | ENSG00000010404 | Gene Expression | 802 | False | 802 | 0.169896 | 88.047690 | 1140.0 | True | -1.446279e-04 | … | 0.247148 | ENSG00000010404 | X | 149521096 | 149476988 | - | 0.032241 | 0.070538 | 0.403367 | True |
| MTM1 | ENSG00000171100 | Gene Expression | 1499 | False | 1499 | 0.387183 | 77.660209 | 2598.0 | True | -1.403791e-08 | … | 0.318311 | ENSG00000171100 | X | 150673322 | 150568619 | + | 0.632687 | 2.800952 | 0.395020 | True |
| ARHGAP4 | ENSG00000089820 | Gene Expression | 1585 | False | 1585 | 0.421162 | 76.378539 | 2826.0 | True | -4.290405e-09 | … | 0.314576 | ENSG00000089820 | X | 153934999 | 153907367 | - | 0.057723 | 0.286018 | 0.456565 | True |
| CLIC2 | ENSG00000155962 | Gene Expression | 2154 | False | 2154 | 0.570045 | 67.898659 | 3825.0 | True | -7.113040e-09 | … | 0.440094 | ENSG00000155962 | X | 155334657 | 155276211 | - | 1.007027 | 2.217264 | 0.801841 | True |
2000 rows × 23 columns
# adata_m.obs
# fetch data from adata cluster 0 and 1, number of genes and make a histogram plot
import matplotlib.pyplot as plt
# Create boolean mask for clusters
mask_cluster = (adata.obs['leiden'] == '0') | (adata.obs['leiden'] == '1')
# Subset the data
adata_subset = adata[mask_cluster]
# Get number of genes for each cell
num_genes = adata_subset.obs['n_genes']
# Create histogram
plt.hist(num_genes, bins=30, alpha=0.75)
# Set the title and labels
plt.title('Histogram of Number of Genes')
plt.xlabel('Number of Genes')
plt.ylabel('Number of Cells')
# Show the plot
plt.show()
# find gene markers for each of the detected leiden clusters
# Perform the ranking
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# The result is stored in adata.uns['rank_genes_groups']
result = adata.uns['rank_genes_groups']
# Initialize a list to hold dataframes for each cluster
df_list = []
# Iterate over clusters
for group in result['names'].dtype.names:
# Create a DataFrame for the current cluster
df = pd.DataFrame({
'Genes': result['names'][group],
'Scores': result['scores'][group],
'Logfoldchanges': result['logfoldchanges'][group],
'pvals': result['pvals'][group],
'pvals_adj': result['pvals_adj'][group],
})
# Add a column for the cluster
df['Cluster'] = group
# Append the DataFrame to the list
df_list.append(df)
# Concatenate all dataframes
df_all = pd.concat(df_list, ignore_index=True)
# Write DataFrame to a csv file
df_all.to_csv('scanpy_marker_genes.csv', index=False)
df_all
| Genes | Scores | Logfoldchanges | pvals | pvals_adj | Cluster | |
|---|---|---|---|---|---|---|
| 0 | NUSAP1 | 48.397820 | 2.678382 | 0.000000e+00 | 0.000000 | 0 |
| 1 | TOP2A | 47.867912 | 2.843289 | 0.000000e+00 | 0.000000 | 0 |
| 2 | CENPE | 45.637989 | 2.997186 | 0.000000e+00 | 0.000000 | 0 |
| 3 | DLGAP5 | 44.949627 | 2.646727 | 0.000000e+00 | 0.000000 | 0 |
| 4 | HMGB2 | 44.792137 | 2.093099 | 0.000000e+00 | 0.000000 | 0 |
| … | … | … | … | … | … | … |
| 289483 | MT-CO3 | -5.545708 | -3.068403 | 2.927671e-08 | 0.000014 | 11 |
| 289484 | RPL6 | -5.575259 | -3.883544 | 2.471614e-08 | 0.000014 | 11 |
| 289485 | RPS15A | -5.584476 | -3.632532 | 2.344061e-08 | 0.000014 | 11 |
| 289486 | RPS29 | -5.653746 | -3.751123 | 1.569881e-08 | 0.000012 | 11 |
| 289487 | MT-CO1 | -5.927827 | -4.364321 | 3.069686e-09 | 0.000004 | 11 |
289488 rows × 6 columns
df_all_f = df_all[(df_all.pvals_adj < 0.01)&(abs(df_all.Logfoldchanges)>2)]
df_all_f
| Genes | Scores | Logfoldchanges | pvals | pvals_adj | Cluster | |
|---|---|---|---|---|---|---|
| 0 | NUSAP1 | 48.397820 | 2.678382 | 0.000000e+00 | 0.000000 | 0 |
| 1 | TOP2A | 47.867912 | 2.843289 | 0.000000e+00 | 0.000000 | 0 |
| 2 | CENPE | 45.637989 | 2.997186 | 0.000000e+00 | 0.000000 | 0 |
| 3 | DLGAP5 | 44.949627 | 2.646727 | 0.000000e+00 | 0.000000 | 0 |
| 4 | HMGB2 | 44.792137 | 2.093099 | 0.000000e+00 | 0.000000 | 0 |
| … | … | … | … | … | … | … |
| 289483 | MT-CO3 | -5.545708 | -3.068403 | 2.927671e-08 | 0.000014 | 11 |
| 289484 | RPL6 | -5.575259 | -3.883544 | 2.471614e-08 | 0.000014 | 11 |
| 289485 | RPS15A | -5.584476 | -3.632532 | 2.344061e-08 | 0.000014 | 11 |
| 289486 | RPS29 | -5.653746 | -3.751123 | 1.569881e-08 | 0.000012 | 11 |
| 289487 | MT-CO1 | -5.927827 | -4.364321 | 3.069686e-09 | 0.000004 | 11 |
24668 rows × 6 columns
df_all_f.to_csv('scvelo_leiden_markers.csv')
cluster_list = adata.obs.leiden.unique()
cluster_list
['2', '5', '1', '0', '7', ..., '3', '4', '9', '10', '11']
Length: 12
Categories (12, object): ['0', '1', '2', '3', ..., '8', '9', '10', '11']
# Make cluster anottation dictionary
annotation = {"Ctype0":[0],
"Ctype1":[1],
"Ctype2":[2],
"Ctype3":[3,11],
"Ctype4":[4],
"Ctype5":[5],
"Ctype6":[6],
"Ctype7":[7],
"Ctype8":[8],
"Ctype9":[9],
"Ctype10":[10]}
# Change dictionary format
annotation_rev = {}
for i in cluster_list:
for k in annotation:
if int(i) in annotation[k]:
annotation_rev[i] = k
# Check dictionary
annotation_rev
{'2': 'Ctype2',
'5': 'Ctype5',
'1': 'Ctype1',
'0': 'Ctype0',
'7': 'Ctype7',
'6': 'Ctype6',
'8': 'Ctype8',
'3': 'Ctype3',
'4': 'Ctype4',
'9': 'Ctype9',
'10': 'Ctype10',
'11': 'Ctype3'}
adata.obs["cell_type"] = [annotation_rev[i] for i in adata.obs.leiden]
sc.pl.rank_genes_groups(adata, n_genes=10, groupby='leiden', sharey=False)
results = adata.uns['rank_genes_groups']
results['names']['0']
array(['NUSAP1', 'TOP2A', 'SLC4A1', 'HEMGN', 'HMGB2', 'MKI67', 'DLGAP5',
'UBE2S', 'TPX2', 'H1F0'], dtype=object)
results['names'].dtype.names
('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11')
scv.pl.velocity_embedding(adata, basis = 'umap', color = 'leiden')
# scv.pl.velocity_embedding(adata_m, basis = 'umap', color = 'leiden')
computing velocity embedding
finished (0:00:01) --> added
'velocity_umap', embedded velocity vectors (adata.obsm)
scv.pl.velocity_embedding_grid(adata, basis = 'umap', color = 'leiden')
#scv.pl.velocity_embedding_grid(adata_m, basis = 'umap', color = 'leiden')
scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_umap.pdf')
#scv.pl.velocity_embedding_stream(adata_m, basis = 'umap', color = 'leiden')
# figure cannot be saved as pdf, using png instead.
# saving figure to file ./figures/scvelo_velocity_embedding_stream.png
figure cannot be saved as pdf, using png instead.
saving figure to file ./figures/scvelo_velocity_embedding_stream_umap.png
scv.pl.velocity_embedding_stream(adata, basis = 'tsne', color = 'leiden', save='velocity_embedding_stream_tsne.pdf')
computing velocity embedding
finished (0:00:01) --> added
'velocity_tsne', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_tsne.pdf
scv.pl.velocity_embedding_stream(adata, basis='tsne', color = 'cell_type', legend_loc='on data')
adata.obs.cell_type.unique()
['Ctype2', 'Ctype5', 'Ctype1', 'Ctype0', 'Ctype7', ..., 'Ctype8', 'Ctype3', 'Ctype4', 'Ctype9', 'Ctype10']
Length: 11
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']
cell_of_interest = adata.obs.index[~adata.obs.cell_type.isin(["Ctype0", "Ctype5"])]
adata_ss1 = adata[cell_of_interest, :] # subset 1, NOT include Ctype0 and Ctype5 cell types
scv.pl.velocity_embedding_stream(adata_ss1, basis='tsne', color = 'cell_type', legend_loc='right margin')
sc.pl.tsne(adata, color=['cell_type'])
scv.pl.scatter(adata, 'CD44', color=['cell_type', 'velocity'])
scv.pl.velocity_embedding(adata, arrow_length=3, arrow_size=2, dpi=120)
scv.tl.rank_velocity_genes(adata, groupby = 'leiden', min_corr=0.3)
ranking velocity genes
finished (0:00:08) --> added
'rank_velocity_genes', sorted scores by group ids (adata.uns)
'spearmans_score', spearmans correlation scores (adata.var)
df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KIF14 | NETO2 | PLCB1 | NOL4L | SLC16A9 | PPME1 | MYLIP | IKZF2 | HDAC9 | UBR2 | ELK3 | ZNF462 |
| 1 | CD36 | NFIA | ATP8B4 | MAST4 | ACSBG1 | GYPE | TLN1 | SLC39A11 | CBL | DAPK1 | GPR183 | KCNQ1 |
| 2 | ASPM | E2F7 | ETV6 | LCP2 | KIAA1211 | FOXO3 | RBM38 | PALM2-AKAP2 | WDFY2 | CLIC2 | CAMK1 | FOSB |
| 3 | CENPE | SLC25A21 | PLAGL1 | STON2 | ZDHHC14 | OSBP2 | HIST1H2BJ | PLD1 | DOCK10 | KIAA0232 | FMO5 | FAM30A |
| 4 | CCNB1 | BMPR2 | KAT6B | P2RX1 | PVT1 | SLC14A1 | HIST1H3J | ST3GAL4 | DISC1 | TMCC2 | LGALS3 | ZDHHC19 |
#### CHANGE THE GENES TO SEE DIFFERENT RNA VELOCITY ANALYSES Look at output 88 (cell above) to find some interesting genes
scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'],basis='tsne')
scv.pl.velocity(adata, ['CD44', 'CD36','NFIA','KIF14','HDAC9','IKZF2'], ncols=2, color=['cell_type'])
adata.obs.cell_type
AAACCCACATCGCTCT Ctype2
AAACCCAGTAGGGTAC Ctype5
AAACCCAGTGCATCTA Ctype1
AAACCCATCATAGAGA Ctype0
AAACCCATCGGAAACG Ctype7
...
TTTGTTGAGACCAACG Ctype1
TTTGTTGAGCTGGTGA Ctype0
TTTGTTGAGGGACACT Ctype0
TTTGTTGGTCTCCCTA Ctype7
TTTGTTGTCTACACAG Ctype2
Name: cell_type, Length: 6335, dtype: category
Categories (11, object): ['Ctype0', 'Ctype1', 'Ctype10', 'Ctype2', ..., 'Ctype6', 'Ctype7', 'Ctype8', 'Ctype9']
scv.tl.rank_velocity_genes(adata, groupby='cell_type', min_corr=.3)
df = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
df.head()
ranking velocity genes
finished (0:00:04) --> added
'rank_velocity_genes', sorted scores by group ids (adata.uns)
'spearmans_score', spearmans correlation scores (adata.var)
| Ctype0 | Ctype1 | Ctype10 | Ctype2 | Ctype3 | Ctype4 | Ctype5 | Ctype6 | Ctype7 | Ctype8 | Ctype9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RHD | OSBPL6 | TCF4 | PLAGL1 | TTC7B | LDLRAD4 | TANK | HIST1H2BJ | IKZF2 | DOCK10 | WDR26 |
| 1 | EPB42 | DAAM1 | IMMP2L | ANTXR2 | CLIP2 | ACACA | EPB42 | MAP1LC3B | PLD1 | DISC1 | CREBRF |
| 2 | HIST1H2BJ | FAM111B | SKAP1 | DOCK5 | WWC2 | NTN1 | GPCPD1 | SLC7A5 | GAB2 | MYO1F | RNF19A |
| 3 | KDM7A | MDM1 | DNAH14 | GNG7 | TNIK | KIF2A | KDM7A | TENT5C | SLC24A3 | AKAP13 | GPCPD1 |
| 4 | TPM1 | HBM | ARPC1B | ERG | SUSD1 | CCSER1 | RNF19A | REL | PAPSS1 | ARHGAP30 | ULK1 |
df.to_csv('scvelo_df_leiden_rank_velocity_genes.csv')
kwargs = dict(frameon=False, size=10, linewidth=1.5,
add_outline='Ctype0')
scv.pl.scatter(adata, df['Ctype0'][:10], ylabel='Ctype0', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype1'][:10], ylabel='Ctype1', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype3'][:10], ylabel='Ctype3',color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype9'][:10], ylabel='Ctype9', color=['cell_type'], **kwargs)
scv.pl.scatter(adata, df['Ctype5'][:10], ylabel='Ctype5', color=['cell_type'], **kwargs)
scv.tl.velocity_confidence(adata)
keys = 'velocity_length', 'velocity_confidence'
scv.pl.scatter(adata, c = keys, cmap='coolwarm', perc = [5,95])
--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
--> added 'velocity_confidence_transition' (adata.obs)
adata.obs
| n_genes | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | leiden | initial_size_spliced | initial_size_unspliced | initial_size | n_counts | velocity_self_transition | velocity_length | velocity_confidence | velocity_confidence_transition | root_cells | end_points | velocity_pseudotime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCACATCGCTCT | 6772 | 6770 | 32469.0 | 1621.0 | 4.992455 | 2 | 14938 | 11611 | 14938.0 | 863.637268 | 0.383647 | 30.610001 | 0.948891 | -0.027396 | 0.031412 | 0.118727 | 0.317202 |
| AAACCCAGTAGGGTAC | 1508 | 1507 | 6036.0 | 96.0 | 1.590457 | 5 | 3632 | 1872 | 3632.0 | -106.495049 | 0.216999 | 44.320000 | 0.972637 | 0.156213 | 0.000064 | 0.048307 | 0.228780 |
| AAACCCAGTGCATCTA | 1735 | 1735 | 6346.0 | 165.0 | 2.600063 | 1 | 4681 | 876 | 4681.0 | -547.307556 | 0.168688 | 27.280001 | 0.939039 | 0.121379 | 0.022585 | 0.000116 | 0.059130 |
| AAACCCATCATAGAGA | 4500 | 4500 | 20373.0 | 1340.0 | 6.577333 | 0 | 13036 | 3108 | 13036.0 | -707.811035 | 0.253844 | 24.570000 | 0.977568 | 0.072128 | 0.001643 | 0.006620 | 0.031518 |
| AAACCCATCGGAAACG | 2396 | 2395 | 4683.0 | 325.0 | 6.939996 | 7 | 1787 | 1950 | 1787.0 | 920.113159 | 0.342233 | 41.799999 | 0.941867 | 0.073386 | 0.001660 | 0.100762 | 0.239823 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| TTTGTTGAGACCAACG | 5686 | 5685 | 24267.0 | 2669.0 | 10.998476 | 1 | 12288 | 5878 | 12288.0 | -572.973083 | 0.286407 | 29.740000 | 0.963285 | 0.033437 | 0.001004 | 0.031958 | 0.040667 |
| TTTGTTGAGCTGGTGA | 3300 | 3300 | 13835.0 | 872.0 | 6.302855 | 0 | 9332 | 1777 | 9332.0 | -614.256470 | 0.139380 | 29.740000 | 0.983455 | 0.158749 | 0.000351 | 0.004965 | 0.026805 |
| TTTGTTGAGGGACACT | 1187 | 1187 | 3105.0 | 36.0 | 1.159420 | 0 | 1763 | 1064 | 1763.0 | -232.348145 | 0.186577 | 28.490000 | 0.954237 | 0.160944 | 0.000534 | 0.000141 | 0.038064 |
| TTTGTTGGTCTCCCTA | 3784 | 3783 | 8852.0 | 240.0 | 2.711252 | 7 | 4014 | 3374 | 4014.0 | 1288.671997 | 0.398848 | 39.500000 | 0.957163 | -0.043992 | 0.001771 | 0.109230 | 0.241569 |
| TTTGTTGTCTACACAG | 4755 | 4755 | 13093.0 | 391.0 | 2.986329 | 2 | 6323 | 4428 | 6323.0 | 806.780151 | 0.416695 | 29.190001 | 0.961888 | 0.014598 | 0.010415 | 0.276882 | 0.350445 |
6335 rows × 17 columns
df = adata.obs.groupby('cell_type')[keys].mean().T
df.style.background_gradient(cmap='coolwarm', axis=1)
| cell_type | Ctype0 | Ctype1 | Ctype10 | Ctype2 | Ctype3 | Ctype4 | Ctype5 | Ctype6 | Ctype7 | Ctype8 | Ctype9 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| velocity_length | 28.376411 | 25.740473 | 24.188236 | 25.939058 | 25.545404 | 22.209726 | 32.450985 | 18.562965 | 32.190620 | 26.416279 | 38.004265 |
| velocity_confidence | 0.982803 | 0.977395 | 0.932590 | 0.951401 | 0.932277 | 0.976059 | 0.972348 | 0.911105 | 0.935803 | 0.951495 | 0.966852 |
scv.pl.paga(adata, basis='tsne', size=50, alpha=.1,
min_edge_width=2, node_size_scale=1.5)
WARNING: Invalid color key. Using grey instead.
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(adata, color = 'velocity_pseudotime')
## THIS IS VERY TIME AND COMPUTATIONALLY CONSUMING
## WILL TAKE 30min TO RUN; it will take 20 of the 80 cores on WFH-7920 computer to run, with CPU fans on a lot!
scv.tl.recover_dynamics(adata, n_jobs = 20)
scv.tl.velocity(adata, mode = 'dynamical')
scv.tl.velocity_graph(adata)
scv.pl.velocity_embedding_stream(adata, basis = 'umap', color = 'leiden', save='velocity_embedding_stream_2.png')
recovering dynamics (using 20/80 cores)
0%| | 0/1415 [00:00<?, ?gene/s]
finished (0:02:46) --> added
'fit_pars', fitted parameters for splicing dynamics (adata.var)
computing velocities
finished (0:00:10) --> added
'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/80 cores)
0%| | 0/6335 [00:00<?, ?cells/s]
finished (0:00:17) --> added
'velocity_graph', sparse matrix with cosine correlations (adata.uns)
computing velocity embedding
finished (0:00:01) --> added
'velocity_umap', embedded velocity vectors (adata.obsm)
saving figure to file ./figures/scvelo_velocity_embedding_stream_2.png
scv.tl.latent_time(adata)
scv.pl.scatter(adata, color = 'latent_time', color_map = 'gnuplot', size = 80, save='latent_time.pdf')
computing latent time using root_cells as prior
finished (0:00:02) --> added
'latent_time', shared time (adata.obs)
saving figure to file ./figures/scvelo_latent_time.pdf
top_genes = adata.var['velocity_qreg_ratio'].sort_values(ascending=False).index[:300]
scv.pl.heatmap(adata, var_names=top_genes, sortby='latent_time', col_color='cell_type', n_convolve=100)
var_names = ['CD44', 'STOX2', 'PID1', 'PLXDC2']
scv.pl.scatter(adata, var_names, color = ['cell_type'], frameon=False)
scv.pl.scatter(adata, x='latent_time', color = ['cell_type'],y=var_names, frameon=False)
#Cluster-specific top-likelihood genes
#Moreover, partial gene likelihoods can be computed for a each cluster of cells to enable cluster-specific identification of potential drivers.
scv.tl.rank_dynamical_genes(adata, groupby='cell_type')
df = scv.get_df(adata, 'rank_dynamical_genes/names')
df.head(5)
ranking genes by cluster-specific likelihoods
finished (0:00:07) --> added
'rank_dynamical_genes', sorted scores by group ids (adata.uns)
| Ctype0 | Ctype1 | Ctype10 | Ctype2 | Ctype3 | Ctype4 | Ctype5 | Ctype6 | Ctype7 | Ctype8 | Ctype9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | SLC4A1 | LGALS3 | CDK6 | MPO | CDK6 | CDK6 | HBM | HBM | MS4A2 | CD1C | EPB42 |
| 1 | NFIA | GYPB | FAM117A | ATP8B4 | KALRN | DLGAP5 | SLC4A1 | EPB42 | CDK6 | SLC8A1 | ISG20 |
| 2 | HBM | ISG20 | CENPE | PTPRC | PALM2-AKAP2 | MYB | TCP11L2 | ISG20 | ENO1 | LY86 | SLC4A1 |
| 3 | NETO2 | SLC25A21 | AURKA | MACF1 | SELP | NFIA | CR1L | GYPB | FABP5 | CLEC10A | HBM |
| 4 | GYPB | NFIA | IGSF6 | ANGPT1 | TC2N | SLC25A21 | SEC62 | NFIA | PRKACB | PKIB | OSBP2 |
adata.var
| gene_ids | feature_types | n_cells | mt | n_cells_by_counts | mean_counts | pct_dropout_by_counts | total_counts | highly_variable | means | … | Chromosome | End | Start | Strand | velocity_gamma | velocity_qreg_ratio | velocity_r2 | velocity_genes | spearmans_score | velocity_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| C1orf159 | ENSG00000131591 | Gene Expression | 1568 | False | 1568 | 0.349180 | 76.631893 | 2343.0 | True | -7.512550e-04 | … | 1 | 1116361 | 1081818 | - | 0.960242 | 0.960242 | -1.056696 | False | 0.454777 | 0 |
| ACAP3 | ENSG00000131584 | Gene Expression | 745 | False | 745 | 0.124590 | 88.897168 | 836.0 | True | -2.124832e-03 | … | 1 | 1309609 | 1292390 | - | 0.013770 | 0.152219 | 0.288276 | True | 0.275960 | 0 |
| MIB2 | ENSG00000197530 | Gene Expression | 906 | False | 906 | 0.155589 | 86.497765 | 1044.0 | True | -1.027507e-03 | … | 1 | 1630610 | 1615415 | + | 0.044242 | 0.044242 | -0.414496 | False | 0.127040 | 0 |
| KLHL21 | ENSG00000162413 | Gene Expression | 1283 | False | 1283 | 0.230253 | 80.879285 | 1545.0 | True | -1.395323e-08 | … | 1 | 6614607 | 6590724 | - | 0.809669 | 0.809669 | -0.450083 | False | 0.000000 | 0 |
| ENO1 | ENSG00000074800 | Gene Expression | 3961 | False | 3961 | 6.071833 | 40.968703 | 40742.0 | True | -6.604966e-09 | … | 1 | 8879190 | 8861000 | - | 0.028938 | 0.038693 | 0.764427 | True | 0.851996 | 0 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| AFF2 | ENSG00000155966 | Gene Expression | 528 | False | 528 | 0.164531 | 92.131148 | 1104.0 | True | -2.977980e-04 | … | X | 149000663 | 148500617 | + | 0.285532 | 3.813417 | 0.628403 | True | 0.657834 | 0 |
| IDS | ENSG00000010404 | Gene Expression | 802 | False | 802 | 0.169896 | 88.047690 | 1140.0 | True | -1.446279e-04 | … | X | 149521096 | 149476988 | - | 0.032241 | 0.070538 | 0.403367 | True | 0.708676 | 0 |
| MTM1 | ENSG00000171100 | Gene Expression | 1499 | False | 1499 | 0.387183 | 77.660209 | 2598.0 | True | -1.403791e-08 | … | X | 150673322 | 150568619 | + | 0.632687 | 2.800952 | 0.395020 | True | 0.686165 | 0 |
| ARHGAP4 | ENSG00000089820 | Gene Expression | 1585 | False | 1585 | 0.421162 | 76.378539 | 2826.0 | True | -4.290405e-09 | … | X | 153934999 | 153907367 | - | 0.057723 | 0.286018 | 0.456565 | True | 0.809695 | 0 |
| CLIC2 | ENSG00000155962 | Gene Expression | 2154 | False | 2154 | 0.570045 | 67.898659 | 3825.0 | True | -7.113040e-09 | … | X | 155334657 | 155276211 | - | 1.007027 | 2.217264 | 0.801841 | True | 0.743202 | 0 |
2000 rows × 25 columns
# saving this adata file
adata.write('./scanpy_analysis/E15.h5ad')
# next time you read this E15.h5ad into adata:
# adata = sc.read('./scanpy_analysis/E15.h5ad')