Genes in the mitochondria
ncRNA_genes in the mitochondria
library(reticulate)
import sys
print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]
import sys

# Silence warnings unless the user explicitly asked for them on the command
# line (e.g. via -W), in which case sys.warnoptions is non-empty.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
import pandas as pd
import numpy as np
# import regular expression
import re
import matplotlib as mpl
# Select the backend before pyplot is imported (seaborn imports pyplot as
# well): matplotlib recommends choosing the backend first, and older releases
# may ignore or warn about a call made after pyplot is already loaded.
mpl.use('ps')  # generate postscript output by default
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style('whitegrid')
# Load the user's shell profile, then download the Ensembl release-98 human
# GFF3 annotation to Homo_sapiens.GRCh38.98.gff3.gz.
# NOTE: -b sends wget to the background (-q: quiet, -c: resume a partial
# download), so the transfer may still be running when the next command
# starts; make sure it has finished before reading the file.
source ~/.profile
wget -bqc ftp://ftp.ensembl.org/pub/release-98/gff3/homo_sapiens/Homo_sapiens.GRCh38.98.gff3.gz -O Homo_sapiens.GRCh38.98.gff3.gz
## Continuing in background, pid 849.
# Show the first 24 lines of the compressed annotation file.
source ~/.profile
zcat Homo_sapiens.GRCh38.98.gff3.gz | head -n 24
## ##gff-version 3
## ##sequence-region 1 1 248956422
## ##sequence-region 10 1 133797422
## ##sequence-region 11 1 135086622
## ##sequence-region 12 1 133275309
## ##sequence-region 13 1 114364328
## ##sequence-region 14 1 107043718
## ##sequence-region 15 1 101991189
## ##sequence-region 16 1 90338345
## ##sequence-region 17 1 83257441
## ##sequence-region 18 1 80373285
## ##sequence-region 19 1 58617616
## ##sequence-region 2 1 242193529
## ##sequence-region 20 1 64444167
## ##sequence-region 21 1 46709983
## ##sequence-region 22 1 50818468
## ##sequence-region 3 1 198295559
## ##sequence-region 4 1 190214555
## ##sequence-region 5 1 181538259
## ##sequence-region 6 1 170805979
## ##sequence-region 7 1 159345973
## ##sequence-region 8 1 145138636
## ##sequence-region 9 1 138394717
## ##sequence-region GL000008.2 1 209709
# Show the last lines (tail's default count) of the compressed annotation file.
source ~/.profile
zcat Homo_sapiens.GRCh38.98.gff3.gz | tail
## Y havana pseudogene 26626520 26627159 . - . ID=gene:ENSG00000231514;Name=CCNQP2;biotype=processed_pseudogene;description=CCNQ pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:38436];gene_id=ENSG00000231514;logic_name=havana_homo_sapiens;version=1
## Y havana pseudogenic_transcript 26626520 26627159 . - . ID=transcript:ENST00000435741;Parent=gene:ENSG00000231514;Name=CCNQP2-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000435741;transcript_support_level=NA;version=1
## Y havana exon 26626520 26627159 . - . Parent=transcript:ENST00000435741;Name=ENSE00001616687;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001616687;rank=1;version=1
## ###
## Y . biological_region 26626966 26627137 0.994 - . external_name=rank %3D 1;logic_name=firstef
## Y . biological_region 26627457 26628186 0.997 + . external_name=rank %3D 1;logic_name=firstef
## Y havana pseudogene 56855244 56855488 . + . ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=processed_pseudogene;description=C-terminal binding protein 2 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:23940];gene_id=ENSG00000235857;logic_name=havana_homo_sapiens;version=1
## Y havana pseudogenic_transcript 56855244 56855488 . + . ID=transcript:ENST00000431853;Parent=gene:ENSG00000235857;Name=CTBP2P1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000431853;transcript_support_level=NA;version=1
## Y havana exon 56855244 56855488 . + . Parent=transcript:ENST00000431853;Name=ENSE00001794473;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001794473;rank=1;version=1
## ###
# Read the gzipped, tab-separated GFF3 file into a data frame. The file has
# no header row, and anything from a '#' onwards is treated as a comment.
df = pd.read_csv(
    'Homo_sapiens.GRCh38.98.gff3.gz',
    sep='\t',
    header=None,
    comment='#',
    compression='gzip',
    low_memory=False,
)
df.head()
## 0 1 ... 7 8
## 0 1 Ensembl ... . ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1 1 . ... . external_name=oe %3D 0.79;logic_name=cpg
## 2 1 . ... . logic_name=eponine
## 3 1 . ... . logic_name=eponine
## 4 1 . ... . logic_name=eponine
##
## [5 rows x 9 columns]
# Inspect the last rows of the data frame.
df.tail()
## 0 1 ... 7 8
## 2911081 Y . ... . external_name=rank %3D 1;logic_name=firstef
## 2911082 Y . ... . external_name=rank %3D 1;logic_name=firstef
## 2911083 Y havana ... . ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=p...
## 2911084 Y havana ... . ID=transcript:ENST00000431853;Parent=gene:ENSG...
## 2911085 Y havana ... . Parent=transcript:ENST00000431853;Name=ENSE000...
##
## [5 rows x 9 columns]
# Give the nine positional columns descriptive names.
col_names = [
    'seqid', 'source', 'type',
    'start', 'end', 'score',
    'strand', 'phase', 'attribute',
]
df.columns = col_names
print(df.head())
## seqid source ... phase attribute
## 0 1 Ensembl ... . ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1 1 . ... . external_name=oe %3D 0.79;logic_name=cpg
## 2 1 . ... . logic_name=eponine
## 3 1 . ... . logic_name=eponine
## 4 1 . ... . logic_name=eponine
##
## [5 rows x 9 columns]
# These are the unique sequence ids
df['seqid'].unique()
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
## '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9',
## 'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
## 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
## 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
## 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
## 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
## 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
## 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
## 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
## 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
## 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
## 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
## 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
## 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
## 'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
## 'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
## 'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
## 'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
## 'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
## 'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
## 'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
## 'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
## 'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
## 'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
## 'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
## 'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
## 'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
## 'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
## 'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
## 'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
## 'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
## 'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
## 'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
## 'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
## 'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
## 'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
## 'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
## 'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
## 'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
## 'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
## 'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
## 'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
## 'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
## 'KI270757.1', 'MT', 'X', 'Y'], dtype=object)
# Keep only the rows whose seqid is 'MT' and check how many there are.
df_mt = df.loc[df['seqid'] == 'MT']
df_mt['seqid'].value_counts()
## MT 126
## Name: seqid, dtype: int64
df_mt.head()
## seqid source ... phase attribute
## 2810411 MT Ensembl ... . ID=chromosome:MT;Alias=chrM,J01415.2,NC_012920.1
## 2810412 MT insdc ... . ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_...
## 2810413 MT ensembl ... . ID=transcript:ENST00000387314;Parent=gene:ENSG...
## 2810414 MT ensembl ... . Parent=transcript:ENST00000387314;Name=ENSE000...
## 2810415 MT insdc ... . ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=M...
##
## [5 rows x 9 columns]
# feature types present in the mt subset
df_mt['type'].unique()
## array(['chromosome', 'ncRNA_gene', 'tRNA', 'exon', 'rRNA',
## 'biological_region', 'gene', 'mRNA', 'CDS'], dtype=object)
# number of features of type 'gene' in mt
df_mt[df_mt['type']=='gene']['type'].value_counts()
## gene 13
## Name: type, dtype: int64
# number of mRNAs in mt
df_mt[df_mt['type']=='mRNA']['type'].value_counts()
## mRNA 13
## Name: type, dtype: int64
# number of ncRNA_genes in mt
df_mt[df_mt['type']=='ncRNA_gene']['type'].value_counts()
## ncRNA_gene 24
## Name: type, dtype: int64
# number of tRNAs in mt
df_mt[df_mt['type']=='tRNA']['type'].value_counts()
## tRNA 22
## Name: type, dtype: int64
# number of rRNAs in mt
df_mt[df_mt['type']=='rRNA']['type'].value_counts()
## rRNA 2
## Name: type, dtype: int64
Genes in the mitochondria
# Restrict the mitochondrial features to the rows of type 'gene'.
df_mtgn = df_mt.loc[df_mt['type'] == 'gene']
df_mtgn.head()
## seqid source ... phase attribute
## 2810428 MT insdc ... . ID=gene:ENSG00000198888;Name=MT-ND1;biotype=pr...
## 2810441 MT insdc ... . ID=gene:ENSG00000198763;Name=MT-ND2;biotype=pr...
## 2810460 MT insdc ... . ID=gene:ENSG00000198804;Name=MT-CO1;biotype=pr...
## 2810470 MT insdc ... . ID=gene:ENSG00000198712;Name=MT-CO2;biotype=pr...
## 2810477 MT insdc ... . ID=gene:ENSG00000228253;Name=MT-ATP8;biotype=p...
##
## [5 rows x 9 columns]
# make a copy of the data set
df_mtgn= df_mtgn.copy()
# extract gene name from the attribute column
# The pattern matches the value of the "Name" key. The key must sit at the
# start of the string or right after a ';' separator, and the value runs up
# to the next ';' or the end of the string (so a trailing "Name=..." works).
re_gene_name = re.compile(r'(?:^|;)Name=(?P<gene_name>[^;]+)')


def extract_gene_name(attribute_col):
    """Return the value of the ``Name`` key in an attribute string.

    Args:
        attribute_col: Semicolon-separated ``key=value`` attribute string.

    Returns:
        The text that follows ``Name=``, or ``None`` when the string
        contains no ``Name`` key.
    """
    rslt = re_gene_name.search(attribute_col)
    if rslt is None:
        # No Name key: report a missing value instead of raising on None.
        return None
    return rslt.group('gene_name')
# Derive a 'gene_name' column from the attribute strings, then discard the
# raw attribute column and show up to 15 rows.
df_mtgn['gene_name'] = df_mtgn['attribute'].map(extract_gene_name)
df_mtgn.drop(columns='attribute', inplace=True)
df_mtgn.head(15)
## seqid source type start end score strand phase gene_name
## 2810428 MT insdc gene 3307 4262 . + . MT-ND1
## 2810441 MT insdc gene 4470 5511 . + . MT-ND2
## 2810460 MT insdc gene 5904 7445 . + . MT-CO1
## 2810470 MT insdc gene 7586 8269 . + . MT-CO2
## 2810477 MT insdc gene 8366 8572 . + . MT-ATP8
## 2810481 MT insdc gene 8527 9207 . + . MT-ATP6
## 2810485 MT insdc gene 9207 9990 . + . MT-CO3
## 2810492 MT insdc gene 10059 10404 . + . MT-ND3
## 2810499 MT insdc gene 10470 10766 . + . MT-ND4L
## 2810503 MT insdc gene 10760 12137 . + . MT-ND4
## 2810516 MT insdc gene 12337 14148 . + . MT-ND5
## 2810520 MT insdc gene 14149 14673 . - . MT-ND6
## 2810527 MT insdc gene 14747 15887 . + . MT-CYB
# Show the gene names as an alphabetically sorted list.
print(sorted(df_mtgn['gene_name'].tolist()))
## ['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6']
# Length in bases; the +1 counts both the start and the end position.
df_mtgn['length'] = df_mtgn['end'].sub(df_mtgn['start']).add(1)
df_mtgn[['gene_name', 'length']].sort_values(by='length', ascending=False)
## gene_name length
## 2810516 MT-ND5 1812
## 2810460 MT-CO1 1542
## 2810503 MT-ND4 1378
## 2810527 MT-CYB 1141
## 2810441 MT-ND2 1042
## 2810428 MT-ND1 956
## 2810485 MT-CO3 784
## 2810470 MT-CO2 684
## 2810481 MT-ATP6 681
## 2810520 MT-ND6 525
## 2810492 MT-ND3 346
## 2810499 MT-ND4L 297
## 2810477 MT-ATP8 207
# Keep only name and length, ordered from the longest gene to the shortest.
df_mtgn = df_mtgn.loc[:, ['gene_name', 'length']].sort_values(by='length', ascending=False)
df_mtgn
## gene_name length
## 2810516 MT-ND5 1812
## 2810460 MT-CO1 1542
## 2810503 MT-ND4 1378
## 2810527 MT-CYB 1141
## 2810441 MT-ND2 1042
## 2810428 MT-ND1 956
## 2810485 MT-CO3 784
## 2810470 MT-CO2 684
## 2810481 MT-ATP6 681
## 2810520 MT-ND6 525
## 2810492 MT-ND3 346
## 2810499 MT-ND4L 297
## 2810477 MT-ATP8 207
# Horizontal bar chart: one bar per gene, bar length = gene length.
f, ax = plt.subplots(figsize=(12, 6))
sb.barplot(x='length', y='gene_name', data=df_mtgn, ax=ax)
plt.xticks(rotation=25)
## (array([ 0., 250., 500., 750., 1000., 1250., 1500., 1750., 2000.]), <a list of 9 Text xticklabel objects>)
ax.set_title('Mitochondrial genes by their base length')
plt.show()
# Histogram of the gene lengths (rug marks disabled).
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot -- confirm against the seaborn version in use.
f, ax = plt.subplots(figsize=(8, 8))
sb.distplot(df_mtgn['length'], hist=True, rug=False)
plt.xlabel('gene length')
plt.title('Distribution of lengths of mitochondria genes')
plt.show()
# Box plot summarising the spread of the gene lengths.
f, ax = plt.subplots(figsize=(8, 4))
sb.boxplot(x='length', data=df_mtgn, ax=ax)
ax.set_xlabel('gene length')
ax.set_title('Distribution of lengths of mitochondria genes')
plt.show()
Use the attribute column to derive various features of the genes in the mitochondria
# Subset the genes after copying the data frame.
# Widen the displayed column width so long attribute strings are not cut short.
pd.set_option('display.max_colwidth', 200)
df_mtattr = df_mt.copy()
# Take an explicit copy of the filtered rows: new columns are assigned to this
# frame further down, and writing into a filtered selection is ambiguous in
# pandas (it triggers SettingWithCopyWarning).
df_mtattrg = df_mtattr[df_mtattr['type']=='gene'].copy()
df_mtattrg.head()
## seqid ... attribute
## 2810428 MT ... ID=gene:ENSG00000198888;Name=MT-ND1;biotype=protein_coding;description=mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol%3BAcc:HGNC:7455];gene_id=ENSG000001...
## 2810441 MT ... ID=gene:ENSG00000198763;Name=MT-ND2;biotype=protein_coding;description=mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:7456];gene_id=ENSG000001...
## 2810460 MT ... ID=gene:ENSG00000198804;Name=MT-CO1;biotype=protein_coding;description=mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol%3BAcc:HGNC:7419];gene_id=ENSG00000198804;logic_name=mt_gen...
## 2810470 MT ... ID=gene:ENSG00000198712;Name=MT-CO2;biotype=protein_coding;description=mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol%3BAcc:HGNC:7421];gene_id=ENSG00000198712;logic_name=mt_ge...
## 2810477 MT ... ID=gene:ENSG00000228253;Name=MT-ATP8;biotype=protein_coding;description=mitochondrially encoded ATP synthase membrane subunit 8 [Source:HGNC Symbol%3BAcc:HGNC:7415];gene_id=ENSG00000228253;logic_n...
##
## [5 rows x 9 columns]
# Alternative to a regular expression: split the 'attribute' string on ';'
# and spread the pieces over one new column per field.
attribute_parts = df_mtattrg['attribute'].str.split(';', expand=True)
df_mtattrg[['ID', 'Name', 'Biotype', 'description', 'gene_id', 'logic_name', 'version']] = attribute_parts
df_mtattrg.head()
## seqid source ... logic_name version
## 2810428 MT insdc ... logic_name=mt_genbank_import_homo_sapiens version=2
## 2810441 MT insdc ... logic_name=mt_genbank_import_homo_sapiens version=3
## 2810460 MT insdc ... logic_name=mt_genbank_import_homo_sapiens version=2
## 2810470 MT insdc ... logic_name=mt_genbank_import_homo_sapiens version=1
## 2810477 MT insdc ... logic_name=mt_genbank_import_homo_sapiens version=1
##
## [5 rows x 16 columns]
# Strip the "key=" text from each expanded column so only the value remains,
# then drop the original attribute column.
key_text_by_column = {
    'ID': r'ID=gene:',
    'Name': r'Name=',
    'Biotype': r'biotype=',
    'description': r'description=',
    'gene_id': r'gene_id=',
    'logic_name': r'logic_name=',
    'version': r'version=',
}
for column, key_text in key_text_by_column.items():
    df_mtattrg[column] = df_mtattrg[column].str.replace(key_text, '')
df_mtattrg.drop(columns='attribute', inplace=True)
df_mtattrg
## seqid source ... logic_name version
## 2810428 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810441 MT insdc ... mt_genbank_import_homo_sapiens 3
## 2810460 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810470 MT insdc ... mt_genbank_import_homo_sapiens 1
## 2810477 MT insdc ... mt_genbank_import_homo_sapiens 1
## 2810481 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810485 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810492 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810499 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810503 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810516 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810520 MT insdc ... mt_genbank_import_homo_sapiens 2
## 2810527 MT insdc ... mt_genbank_import_homo_sapiens 2
##
## [13 rows x 15 columns]
ncRNA_genes in the mitochondria
# Work on a copy of the mitochondrial features and keep the 'ncRNA_gene' rows.
df_mt_ncrnagene = df_mt.copy()
df_mt_ncrnagene = df_mt_ncrnagene.loc[df_mt_ncrnagene['type'] == 'ncRNA_gene']
df_mt_ncrnagene.head()
## seqid ... attribute
## 2810412 MT ... ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Phe (UUU/C) [Source:HGNC Symbol%3BAcc:HGNC:7481];gene_id=ENSG00000210049;logic_name=mt_genbank_import_ho...
## 2810415 MT ... ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=Mt_rRNA;description=mitochondrially encoded 12S rRNA [Source:HGNC Symbol%3BAcc:HGNC:7470];gene_id=ENSG00000211459;logic_name=mt_genbank_import_homo_sap...
## 2810418 MT ... ID=gene:ENSG00000210077;Name=MT-TV;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Val (GUN) [Source:HGNC Symbol%3BAcc:HGNC:7500];gene_id=ENSG00000210077;logic_name=mt_genbank_import_homo...
## 2810421 MT ... ID=gene:ENSG00000210082;Name=MT-RNR2;biotype=Mt_rRNA;description=mitochondrially encoded 16S rRNA [Source:HGNC Symbol%3BAcc:HGNC:7471];gene_id=ENSG00000210082;logic_name=mt_genbank_import_homo_sap...
## 2810424 MT ... ID=gene:ENSG00000209082;Name=MT-TL1;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Leu (UUA/G) 1 [Source:HGNC Symbol%3BAcc:HGNC:7490];gene_id=ENSG00000209082;logic_name=mt_genbank_import...
##
## [5 rows x 9 columns]
# Copy the subset, then swap the raw attribute column for the parsed gene name.
df_mt_ncrnagene = df_mt_ncrnagene.copy()
df_mt_ncrnagene['gene_name'] = df_mt_ncrnagene['attribute'].map(extract_gene_name)
df_mt_ncrnagene.drop(columns='attribute', inplace=True)
# Show the ncRNA gene names as an alphabetically sorted list.
print(sorted(df_mt_ncrnagene['gene_name'].tolist()))
## ['MT-RNR1', 'MT-RNR2', 'MT-TA', 'MT-TC', 'MT-TD', 'MT-TE', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TK', 'MT-TL1', 'MT-TL2', 'MT-TM', 'MT-TN', 'MT-TP', 'MT-TQ', 'MT-TR', 'MT-TS1', 'MT-TS2', 'MT-TT', 'MT-TV', 'MT-TW', 'MT-TY']
# Display up to 30 rows and 15 columns when printing data frames.
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 15)
# Length in bases; the +1 counts both the start and the end position.
df_mt_ncrnagene['length'] = df_mt_ncrnagene['end'].sub(df_mt_ncrnagene['start']).add(1)
df_mt_ncrnagene
## seqid source type start end score strand phase gene_name \
## 2810412 MT insdc ncRNA_gene 577 647 . + . MT-TF
## 2810415 MT insdc ncRNA_gene 648 1601 . + . MT-RNR1
## 2810418 MT insdc ncRNA_gene 1602 1670 . + . MT-TV
## 2810421 MT insdc ncRNA_gene 1671 3229 . + . MT-RNR2
## 2810424 MT insdc ncRNA_gene 3230 3304 . + . MT-TL1
## 2810432 MT insdc ncRNA_gene 4263 4331 . + . MT-TI
## 2810435 MT insdc ncRNA_gene 4329 4400 . - . MT-TQ
## 2810438 MT insdc ncRNA_gene 4402 4469 . + . MT-TM
## 2810445 MT insdc ncRNA_gene 5512 5579 . + . MT-TW
## 2810448 MT insdc ncRNA_gene 5587 5655 . - . MT-TA
## 2810451 MT insdc ncRNA_gene 5657 5729 . - . MT-TN
## 2810454 MT insdc ncRNA_gene 5761 5826 . - . MT-TC
## 2810457 MT insdc ncRNA_gene 5826 5891 . - . MT-TY
## 2810464 MT insdc ncRNA_gene 7446 7514 . - . MT-TS1
## 2810467 MT insdc ncRNA_gene 7518 7585 . + . MT-TD
## 2810474 MT insdc ncRNA_gene 8295 8364 . + . MT-TK
## 2810489 MT insdc ncRNA_gene 9991 10058 . + . MT-TG
## 2810496 MT insdc ncRNA_gene 10405 10469 . + . MT-TR
## 2810507 MT insdc ncRNA_gene 12138 12206 . + . MT-TH
## 2810510 MT insdc ncRNA_gene 12207 12265 . + . MT-TS2
## 2810513 MT insdc ncRNA_gene 12266 12336 . + . MT-TL2
## 2810524 MT insdc ncRNA_gene 14674 14742 . - . MT-TE
## 2810531 MT insdc ncRNA_gene 15888 15953 . + . MT-TT
## 2810534 MT insdc ncRNA_gene 15956 16023 . - . MT-TP
##
## length
## 2810412 71
## 2810415 954
## 2810418 69
## 2810421 1559
## 2810424 75
## 2810432 69
## 2810435 72
## 2810438 68
## 2810445 68
## 2810448 69
## 2810451 73
## 2810454 66
## 2810457 66
## 2810464 69
## 2810467 68
## 2810474 70
## 2810489 68
## 2810496 65
## 2810507 69
## 2810510 59
## 2810513 71
## 2810524 69
## 2810531 66
## 2810534 68
# Keep only name and length, ordered from the longest gene to the shortest.
df_mt_ncrnagene = df_mt_ncrnagene.loc[:, ['gene_name', 'length']].sort_values(by='length', ascending=False)
df_mt_ncrnagene
## gene_name length
## 2810421 MT-RNR2 1559
## 2810415 MT-RNR1 954
## 2810424 MT-TL1 75
## 2810451 MT-TN 73
## 2810435 MT-TQ 72
## 2810412 MT-TF 71
## 2810513 MT-TL2 71
## 2810474 MT-TK 70
## 2810418 MT-TV 69
## 2810432 MT-TI 69
## 2810524 MT-TE 69
## 2810448 MT-TA 69
## 2810507 MT-TH 69
## 2810464 MT-TS1 69
## 2810489 MT-TG 68
## 2810534 MT-TP 68
## 2810467 MT-TD 68
## 2810445 MT-TW 68
## 2810438 MT-TM 68
## 2810454 MT-TC 66
## 2810531 MT-TT 66
## 2810457 MT-TY 66
## 2810496 MT-TR 65
## 2810510 MT-TS2 59
# Horizontal bar chart: one bar per ncRNA gene, bar length = gene length.
f, ax = plt.subplots(figsize=(12, 6))
sb.barplot(x='length', y='gene_name', data=df_mt_ncrnagene, ax=ax)
plt.xticks(rotation=25)
## (array([ 0., 200., 400., 600., 800., 1000., 1200., 1400., 1600.,
## 1800.]), <a list of 10 Text xticklabel objects>)
ax.set_title('Mitochondrial nc_rna genes by their base length')
plt.show()
# Histogram of the ncRNA gene lengths (rug marks disabled).
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot -- confirm against the seaborn version in use.
f, ax = plt.subplots(figsize=(8, 8))
sb.distplot(df_mt_ncrnagene['length'], hist=True, rug=False)
plt.xlabel('nc_rna gene length')
plt.title('Distribution of lengths of mitochondria nc_rna genes')
plt.show()
# Box plot summarising the spread of the ncRNA gene lengths.
f, ax = plt.subplots(figsize=(8, 4))
# Plot the ncRNA gene frame; the axis label and title below describe ncRNA
# genes, so the data must come from df_mt_ncrnagene rather than df_mtgn.
sb.boxplot(x='length', data=df_mt_ncrnagene)
plt.xlabel('nc_rna gene length')
plt.title('Distribution of lengths of mitochondria nc_rna genes')
plt.show()