Resources and libraries

library(reticulate)

import sys

print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]

import sys

# Silence all warnings unless the interpreter was started with explicit -W
# options (in which case sys.warnoptions is non-empty and is respected).
# NOTE(review): this blanket filter also hides any deprecation or
# chained-assignment warnings raised by the libraries used later on.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import pandas as pd

import numpy as np

# import regular expression

import re

# Choose the backend BEFORE anything imports matplotlib.pyplot (seaborn does
# so on import). On older matplotlib releases a later mpl.use() call is
# ignored, and the warning about it would be hidden by the filter above.
import matplotlib as mpl

mpl.use('ps')  # generate postscript output by default

import matplotlib.pyplot as plt

import seaborn as sb

sb.set_style('whitegrid')

Obtain and glimpse the data


source ~/.profile

# Download the Ensembl release-98 human GFF3 annotation into the working directory.
# NOTE(review): -b detaches wget into the background, so the chunks that follow
# may start reading the file before the download has finished; confirm it is
# complete (or drop -b) before re-running the notebook from scratch.
wget -bqc ftp://ftp.ensembl.org/pub/release-98/gff3/homo_sapiens/Homo_sapiens.GRCh38.98.gff3.gz -O Homo_sapiens.GRCh38.98.gff3.gz
## Continuing in background, pid 849.

First few rows of the data


source ~/.profile

zcat Homo_sapiens.GRCh38.98.gff3.gz | head -n 24
## ##gff-version 3
## ##sequence-region   1 1 248956422
## ##sequence-region   10 1 133797422
## ##sequence-region   11 1 135086622
## ##sequence-region   12 1 133275309
## ##sequence-region   13 1 114364328
## ##sequence-region   14 1 107043718
## ##sequence-region   15 1 101991189
## ##sequence-region   16 1 90338345
## ##sequence-region   17 1 83257441
## ##sequence-region   18 1 80373285
## ##sequence-region   19 1 58617616
## ##sequence-region   2 1 242193529
## ##sequence-region   20 1 64444167
## ##sequence-region   21 1 46709983
## ##sequence-region   22 1 50818468
## ##sequence-region   3 1 198295559
## ##sequence-region   4 1 190214555
## ##sequence-region   5 1 181538259
## ##sequence-region   6 1 170805979
## ##sequence-region   7 1 159345973
## ##sequence-region   8 1 145138636
## ##sequence-region   9 1 138394717
## ##sequence-region   GL000008.2 1 209709

Last few rows of the data


source ~/.profile

zcat Homo_sapiens.GRCh38.98.gff3.gz | tail
## Y    havana  pseudogene  26626520    26627159    .   -   .   ID=gene:ENSG00000231514;Name=CCNQP2;biotype=processed_pseudogene;description=CCNQ pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:38436];gene_id=ENSG00000231514;logic_name=havana_homo_sapiens;version=1
## Y    havana  pseudogenic_transcript  26626520    26627159    .   -   .   ID=transcript:ENST00000435741;Parent=gene:ENSG00000231514;Name=CCNQP2-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000435741;transcript_support_level=NA;version=1
## Y    havana  exon    26626520    26627159    .   -   .   Parent=transcript:ENST00000435741;Name=ENSE00001616687;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001616687;rank=1;version=1
## ###
## Y    .   biological_region   26626966    26627137    0.994   -   .   external_name=rank %3D 1;logic_name=firstef
## Y    .   biological_region   26627457    26628186    0.997   +   .   external_name=rank %3D 1;logic_name=firstef
## Y    havana  pseudogene  56855244    56855488    .   +   .   ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=processed_pseudogene;description=C-terminal binding protein 2 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:23940];gene_id=ENSG00000235857;logic_name=havana_homo_sapiens;version=1
## Y    havana  pseudogenic_transcript  56855244    56855488    .   +   .   ID=transcript:ENST00000431853;Parent=gene:ENSG00000235857;Name=CTBP2P1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000431853;transcript_support_level=NA;version=1
## Y    havana  exon    56855244    56855488    .   +   .   Parent=transcript:ENST00000431853;Name=ENSE00001794473;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001794473;rank=1;version=1
## ###

Read the data as a csv file


# Load the gzipped GFF3 as a headerless, tab-separated table. Lines that start
# with '#' (the '##' directives and '###' separators) are skipped.
# NOTE(review): comment='#' also truncates a data line at any later '#'
# character; confirm that no attribute value contains one.
df = pd.read_csv(
    'Homo_sapiens.GRCh38.98.gff3.gz',
    sep='\t',
    header=None,
    comment='#',
    compression='gzip',
    low_memory=False,
)

df.head()
##    0        1  ...  7                                                  8
## 0  1  Ensembl  ...  .  ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1  1        .  ...  .           external_name=oe %3D 0.79;logic_name=cpg
## 2  1        .  ...  .                                 logic_name=eponine
## 3  1        .  ...  .                                 logic_name=eponine
## 4  1        .  ...  .                                 logic_name=eponine
## 
## [5 rows x 9 columns]

df.tail()
##          0       1  ...  7                                                  8
## 2911081  Y       .  ...  .        external_name=rank %3D 1;logic_name=firstef
## 2911082  Y       .  ...  .        external_name=rank %3D 1;logic_name=firstef
## 2911083  Y  havana  ...  .  ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=p...
## 2911084  Y  havana  ...  .  ID=transcript:ENST00000431853;Parent=gene:ENSG...
## 2911085  Y  havana  ...  .  Parent=transcript:ENST00000431853;Name=ENSE000...
## 
## [5 rows x 9 columns]

Assign column names

# Label the nine positional columns of the table read above.
col_names = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attribute',
]
df.columns = col_names
print(df.head())
##   seqid   source  ... phase                                          attribute
## 0     1  Ensembl  ...     .  ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1     1        .  ...     .           external_name=oe %3D 0.79;logic_name=cpg
## 2     1        .  ...     .                                 logic_name=eponine
## 3     1        .  ...     .                                 logic_name=eponine
## 4     1        .  ...     .                                 logic_name=eponine
## 
## [5 rows x 9 columns]

Subset the mitochondria genome


# These are the unique sequence ids

df['seqid'].unique()
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
##        '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9',
##        'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
##        'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
##        'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
##        'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
##        'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
##        'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
##        'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
##        'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
##        'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
##        'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
##        'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
##        'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
##        'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
##        'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
##        'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
##        'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
##        'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
##        'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
##        'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
##        'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
##        'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
##        'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
##        'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
##        'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
##        'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
##        'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
##        'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
##        'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
##        'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
##        'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
##        'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
##        'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
##        'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
##        'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
##        'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
##        'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
##        'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
##        'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
##        'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
##        'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
##        'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
##        'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
##        'KI270757.1', 'MT', 'X', 'Y'], dtype=object)

# Keep only the rows whose sequence id is the mitochondrial genome ('MT').
is_mt = df['seqid'].eq('MT')
df_mt = df.loc[is_mt]

df_mt['seqid'].value_counts()
## MT    126
## Name: seqid, dtype: int64

df_mt.head()
##         seqid   source  ... phase                                          attribute
## 2810411    MT  Ensembl  ...     .   ID=chromosome:MT;Alias=chrM,J01415.2,NC_012920.1
## 2810412    MT    insdc  ...     .  ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_...
## 2810413    MT  ensembl  ...     .  ID=transcript:ENST00000387314;Parent=gene:ENSG...
## 2810414    MT  ensembl  ...     .  Parent=transcript:ENST00000387314;Name=ENSE000...
## 2810415    MT    insdc  ...     .  ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=M...
## 
## [5 rows x 9 columns]

What are the various mitochondria sequence types


df_mt['type'].unique()
## array(['chromosome', 'ncRNA_gene', 'tRNA', 'exon', 'rRNA',
##        'biological_region', 'gene', 'mRNA', 'CDS'], dtype=object)

Number of genes in the mitochondria


df_mt[df_mt['type']=='gene']['type'].value_counts()
## gene    13
## Name: type, dtype: int64

Number of mRNAs in the mitochondria


# number of mRNAs in mt

df_mt[df_mt['type']=='mRNA']['type'].value_counts()
## mRNA    13
## Name: type, dtype: int64

Number of ncRNA_genes in the mitochondria



df_mt[df_mt['type']=='ncRNA_gene']['type'].value_counts()
## ncRNA_gene    24
## Name: type, dtype: int64

Number of tRNAs in the mitochondria


df_mt[df_mt['type']=='tRNA']['type'].value_counts()
## tRNA    22
## Name: type, dtype: int64

Number of rRNAs in mt


df_mt[df_mt['type']=='rRNA']['type'].value_counts()
## rRNA    2
## Name: type, dtype: int64

Genes in the mitochondria

Subset mitochondria genes


# subset gene

df_mtgn = df_mt[df_mt['type']=='gene']

df_mtgn.head()
##         seqid source  ... phase                                          attribute
## 2810428    MT  insdc  ...     .  ID=gene:ENSG00000198888;Name=MT-ND1;biotype=pr...
## 2810441    MT  insdc  ...     .  ID=gene:ENSG00000198763;Name=MT-ND2;biotype=pr...
## 2810460    MT  insdc  ...     .  ID=gene:ENSG00000198804;Name=MT-CO1;biotype=pr...
## 2810470    MT  insdc  ...     .  ID=gene:ENSG00000198712;Name=MT-CO2;biotype=pr...
## 2810477    MT  insdc  ...     .  ID=gene:ENSG00000228253;Name=MT-ATP8;biotype=p...
## 
## [5 rows x 9 columns]

# make a copy of the data set

df_mtgn= df_mtgn.copy()

# extract gene name from the attribute column

# Matches the value of the "Name" tag in a ';'-separated attribute string.
# The tag must start the string or follow a ';' so that keys merely ending in
# "Name" are not picked up, and the value runs to the next ';' OR the end of
# the string (the tag may be the last attribute).
re_gene_name = re.compile(r'(?:^|;)Name=(?P<gene_name>[^;]+)')


def extract_gene_name(attribute_col):
    """Return the value of the ``Name`` tag from an attribute string.

    Parameters
    ----------
    attribute_col : str
        One cell of the 'attribute' column, e.g.
        ``'ID=gene:ENSG00000198888;Name=MT-ND1;biotype=protein_coding'``.

    Returns
    -------
    str or None
        The tag value (``'MT-ND1'`` above), or ``None`` when the cell is not a
        string (e.g. a missing value) or carries no ``Name`` tag.
    """
    # Missing cells arrive from pandas as float NaN; there is nothing to parse.
    if not isinstance(attribute_col, str):
        return None

    rslt = re_gene_name.search(attribute_col)
    # search() returns None when the tag is absent; do not call .group() on it.
    return rslt.group('gene_name') if rslt else None

df_mtgn['gene_name']=df_mtgn['attribute'].apply(extract_gene_name)

df_mtgn.drop('attribute', axis=1, inplace=True)

df_mtgn.head(n=15)
##         seqid source  type  start    end score strand phase gene_name
## 2810428    MT  insdc  gene   3307   4262     .      +     .    MT-ND1
## 2810441    MT  insdc  gene   4470   5511     .      +     .    MT-ND2
## 2810460    MT  insdc  gene   5904   7445     .      +     .    MT-CO1
## 2810470    MT  insdc  gene   7586   8269     .      +     .    MT-CO2
## 2810477    MT  insdc  gene   8366   8572     .      +     .   MT-ATP8
## 2810481    MT  insdc  gene   8527   9207     .      +     .   MT-ATP6
## 2810485    MT  insdc  gene   9207   9990     .      +     .    MT-CO3
## 2810492    MT  insdc  gene  10059  10404     .      +     .    MT-ND3
## 2810499    MT  insdc  gene  10470  10766     .      +     .   MT-ND4L
## 2810503    MT  insdc  gene  10760  12137     .      +     .    MT-ND4
## 2810516    MT  insdc  gene  12337  14148     .      +     .    MT-ND5
## 2810520    MT  insdc  gene  14149  14673     .      -     .    MT-ND6
## 2810527    MT  insdc  gene  14747  15887     .      +     .    MT-CYB

List the names of the mitochondria genes


# print the name of the genes as a list

print(sorted(df_mtgn['gene_name'].to_list()))
## ['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6']

Lengths of the mitochondrial genes and their distribution


df_mtgn['length'] = df_mtgn['end']- df_mtgn['start'] + 1

df_mtgn[['gene_name', 'length']].sort_values('length', ascending=False)
##         gene_name  length
## 2810516    MT-ND5    1812
## 2810460    MT-CO1    1542
## 2810503    MT-ND4    1378
## 2810527    MT-CYB    1141
## 2810441    MT-ND2    1042
## 2810428    MT-ND1     956
## 2810485    MT-CO3     784
## 2810470    MT-CO2     684
## 2810481   MT-ATP6     681
## 2810520    MT-ND6     525
## 2810492    MT-ND3     346
## 2810499   MT-ND4L     297
## 2810477   MT-ATP8     207

# sort the length of the genes (descending)

df_mtgn = df_mtgn[['gene_name', 'length']].sort_values('length', ascending=False)

df_mtgn
##         gene_name  length
## 2810516    MT-ND5    1812
## 2810460    MT-CO1    1542
## 2810503    MT-ND4    1378
## 2810527    MT-CYB    1141
## 2810441    MT-ND2    1042
## 2810428    MT-ND1     956
## 2810485    MT-CO3     784
## 2810470    MT-CO2     684
## 2810481   MT-ATP6     681
## 2810520    MT-ND6     525
## 2810492    MT-ND3     346
## 2810499   MT-ND4L     297
## 2810477   MT-ATP8     207

Barplot for lengths of mitochondrial genes


f, ax = plt.subplots(figsize=(12, 6))

sb.barplot(y='gene_name', x='length', data=df_mtgn)

plt.xticks(rotation=25)
## (array([   0.,  250.,  500.,  750., 1000., 1250., 1500., 1750., 2000.]), <a list of 9 Text xticklabel objects>)
plt.title('Mitochondrial genes by their base length')

plt.show()

Histogram and kernel density plot for the lengths of the mitochondrial genes


# Histogram with a kernel density estimate of the gene lengths, drawn on an
# explicitly created square figure.
# NOTE(review): distplot is deprecated in newer seaborn releases; check the
# installed version before upgrading.
f, ax = plt.subplots(figsize=(8, 8))

sb.distplot(df_mtgn['length'], hist=True, rug=False, ax=ax)

ax.set_xlabel('gene length')

ax.set_title('Distribution of lengths of mitochondria genes')

plt.show()

Box plot for the lengths of the mitochondrial genes


# Horizontal box plot of the gene lengths on its own figure.
f, ax = plt.subplots(figsize=(8, 4))

sb.boxplot(x='length', data=df_mtgn, ax=ax)

ax.set_xlabel('gene length')

ax.set_title('Distribution of lengths of mitochondria genes')

plt.show()

Split the attribute column to derive various features of the genes in the mitochondria


# Subset the gene rows after copying the mitochondrial data frame.

pd.set_option('max_colwidth', 200)

df_mtattr = df_mt.copy()

# Take an explicit copy of the filtered rows: columns are added to and dropped
# from df_mtattrg afterwards, and doing that on a filtered slice is chained
# assignment (pandas' SettingWithCopyWarning, silenced by the warning filter).
df_mtattrg = df_mtattr[df_mtattr['type'] == 'gene'].copy()

df_mtattrg.head()
##         seqid  ...                                                                                                                                                                                                attribute
## 2810428    MT  ...  ID=gene:ENSG00000198888;Name=MT-ND1;biotype=protein_coding;description=mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 1 [Source:HGNC Symbol%3BAcc:HGNC:7455];gene_id=ENSG000001...
## 2810441    MT  ...  ID=gene:ENSG00000198763;Name=MT-ND2;biotype=protein_coding;description=mitochondrially encoded NADH:ubiquinone oxidoreductase core subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:7456];gene_id=ENSG000001...
## 2810460    MT  ...  ID=gene:ENSG00000198804;Name=MT-CO1;biotype=protein_coding;description=mitochondrially encoded cytochrome c oxidase I [Source:HGNC Symbol%3BAcc:HGNC:7419];gene_id=ENSG00000198804;logic_name=mt_gen...
## 2810470    MT  ...  ID=gene:ENSG00000198712;Name=MT-CO2;biotype=protein_coding;description=mitochondrially encoded cytochrome c oxidase II [Source:HGNC Symbol%3BAcc:HGNC:7421];gene_id=ENSG00000198712;logic_name=mt_ge...
## 2810477    MT  ...  ID=gene:ENSG00000228253;Name=MT-ATP8;biotype=protein_coding;description=mitochondrially encoded ATP synthase membrane subunit 8 [Source:HGNC Symbol%3BAcc:HGNC:7415];gene_id=ENSG00000228253;logic_n...
## 
## [5 rows x 9 columns]


# Without regular expressions, the 'attribute' column can be split on ';' and
# expanded into one new column per field.
# NOTE(review): the assignment is positional. It assumes every gene row has
# exactly seven ';'-separated fields in this order; verify before reusing it
# on other feature types.

df_mtattrg[['ID', 'Name', 'Biotype', 'description', 'gene_id', 'logic_name', 'version']] = df_mtattrg['attribute'].str.split(';', expand=True)

df_mtattrg.head()
##         seqid source  ...                                 logic_name    version
## 2810428    MT  insdc  ...  logic_name=mt_genbank_import_homo_sapiens  version=2
## 2810441    MT  insdc  ...  logic_name=mt_genbank_import_homo_sapiens  version=3
## 2810460    MT  insdc  ...  logic_name=mt_genbank_import_homo_sapiens  version=2
## 2810470    MT  insdc  ...  logic_name=mt_genbank_import_homo_sapiens  version=1
## 2810477    MT  insdc  ...  logic_name=mt_genbank_import_homo_sapiens  version=1
## 
## [5 rows x 16 columns]

# Strip the leading 'key=' text from each expanded column so that only the
# value is left, then discard the raw attribute string.
attr_prefixes = [
    ('ID', r'ID=gene:'),
    ('Name', r'Name='),
    ('Biotype', r'biotype='),
    ('description', r'description='),
    ('gene_id', r'gene_id='),
    ('logic_name', r'logic_name='),
    ('version', r'version='),
]

for attr_col, attr_prefix in attr_prefixes:
    df_mtattrg[attr_col] = df_mtattrg[attr_col].str.replace(attr_prefix, '')

df_mtattrg.drop(columns='attribute', inplace=True)

df_mtattrg
##         seqid source  ...                      logic_name  version
## 2810428    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810441    MT  insdc  ...  mt_genbank_import_homo_sapiens        3
## 2810460    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810470    MT  insdc  ...  mt_genbank_import_homo_sapiens        1
## 2810477    MT  insdc  ...  mt_genbank_import_homo_sapiens        1
## 2810481    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810485    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810492    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810499    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810503    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810516    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810520    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 2810527    MT  insdc  ...  mt_genbank_import_homo_sapiens        2
## 
## [13 rows x 15 columns]

ncRNA_genes in the mitochondria

Subset mitochondria nc_RNA genes


df_mt_ncrnagene = df_mt.copy()

df_mt_ncrnagene = df_mt_ncrnagene[df_mt_ncrnagene['type']=='ncRNA_gene']

df_mt_ncrnagene.head()
##         seqid  ...                                                                                                                                                                                                attribute
## 2810412    MT  ...  ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Phe (UUU/C) [Source:HGNC Symbol%3BAcc:HGNC:7481];gene_id=ENSG00000210049;logic_name=mt_genbank_import_ho...
## 2810415    MT  ...  ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=Mt_rRNA;description=mitochondrially encoded 12S rRNA [Source:HGNC Symbol%3BAcc:HGNC:7470];gene_id=ENSG00000211459;logic_name=mt_genbank_import_homo_sap...
## 2810418    MT  ...  ID=gene:ENSG00000210077;Name=MT-TV;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Val (GUN) [Source:HGNC Symbol%3BAcc:HGNC:7500];gene_id=ENSG00000210077;logic_name=mt_genbank_import_homo...
## 2810421    MT  ...  ID=gene:ENSG00000210082;Name=MT-RNR2;biotype=Mt_rRNA;description=mitochondrially encoded 16S rRNA [Source:HGNC Symbol%3BAcc:HGNC:7471];gene_id=ENSG00000210082;logic_name=mt_genbank_import_homo_sap...
## 2810424    MT  ...  ID=gene:ENSG00000209082;Name=MT-TL1;biotype=Mt_tRNA;description=mitochondrially encoded tRNA-Leu (UUA/G) 1 [Source:HGNC Symbol%3BAcc:HGNC:7490];gene_id=ENSG00000209082;logic_name=mt_genbank_import...
## 
## [5 rows x 9 columns]
df_mt_ncrnagene=df_mt_ncrnagene.copy()

df_mt_ncrnagene['gene_name']=df_mt_ncrnagene['attribute'].apply(extract_gene_name)

df_mt_ncrnagene.drop('attribute', axis=1, inplace=True)

List the names of the mitochondria nc_RNA genes


# print the name of the nc_RNA genes as a list

print(sorted(df_mt_ncrnagene['gene_name'].to_list()))
## ['MT-RNR1', 'MT-RNR2', 'MT-TA', 'MT-TC', 'MT-TD', 'MT-TE', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TK', 'MT-TL1', 'MT-TL2', 'MT-TM', 'MT-TN', 'MT-TP', 'MT-TQ', 'MT-TR', 'MT-TS1', 'MT-TS2', 'MT-TT', 'MT-TV', 'MT-TW', 'MT-TY']

Determine the lengths of mitochondria nc_RNA genes


pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 15)


df_mt_ncrnagene['length'] = df_mt_ncrnagene['end']-df_mt_ncrnagene['start'] + 1

df_mt_ncrnagene
##         seqid source        type  start    end score strand phase gene_name  \
## 2810412    MT  insdc  ncRNA_gene    577    647     .      +     .     MT-TF   
## 2810415    MT  insdc  ncRNA_gene    648   1601     .      +     .   MT-RNR1   
## 2810418    MT  insdc  ncRNA_gene   1602   1670     .      +     .     MT-TV   
## 2810421    MT  insdc  ncRNA_gene   1671   3229     .      +     .   MT-RNR2   
## 2810424    MT  insdc  ncRNA_gene   3230   3304     .      +     .    MT-TL1   
## 2810432    MT  insdc  ncRNA_gene   4263   4331     .      +     .     MT-TI   
## 2810435    MT  insdc  ncRNA_gene   4329   4400     .      -     .     MT-TQ   
## 2810438    MT  insdc  ncRNA_gene   4402   4469     .      +     .     MT-TM   
## 2810445    MT  insdc  ncRNA_gene   5512   5579     .      +     .     MT-TW   
## 2810448    MT  insdc  ncRNA_gene   5587   5655     .      -     .     MT-TA   
## 2810451    MT  insdc  ncRNA_gene   5657   5729     .      -     .     MT-TN   
## 2810454    MT  insdc  ncRNA_gene   5761   5826     .      -     .     MT-TC   
## 2810457    MT  insdc  ncRNA_gene   5826   5891     .      -     .     MT-TY   
## 2810464    MT  insdc  ncRNA_gene   7446   7514     .      -     .    MT-TS1   
## 2810467    MT  insdc  ncRNA_gene   7518   7585     .      +     .     MT-TD   
## 2810474    MT  insdc  ncRNA_gene   8295   8364     .      +     .     MT-TK   
## 2810489    MT  insdc  ncRNA_gene   9991  10058     .      +     .     MT-TG   
## 2810496    MT  insdc  ncRNA_gene  10405  10469     .      +     .     MT-TR   
## 2810507    MT  insdc  ncRNA_gene  12138  12206     .      +     .     MT-TH   
## 2810510    MT  insdc  ncRNA_gene  12207  12265     .      +     .    MT-TS2   
## 2810513    MT  insdc  ncRNA_gene  12266  12336     .      +     .    MT-TL2   
## 2810524    MT  insdc  ncRNA_gene  14674  14742     .      -     .     MT-TE   
## 2810531    MT  insdc  ncRNA_gene  15888  15953     .      +     .     MT-TT   
## 2810534    MT  insdc  ncRNA_gene  15956  16023     .      -     .     MT-TP   
## 
##          length  
## 2810412      71  
## 2810415     954  
## 2810418      69  
## 2810421    1559  
## 2810424      75  
## 2810432      69  
## 2810435      72  
## 2810438      68  
## 2810445      68  
## 2810448      69  
## 2810451      73  
## 2810454      66  
## 2810457      66  
## 2810464      69  
## 2810467      68  
## 2810474      70  
## 2810489      68  
## 2810496      65  
## 2810507      69  
## 2810510      59  
## 2810513      71  
## 2810524      69  
## 2810531      66  
## 2810534      68

# sort the length of the genes (descending)

df_mt_ncrnagene = df_mt_ncrnagene[['gene_name', 'length']].sort_values('length', ascending=False)

df_mt_ncrnagene
##         gene_name  length
## 2810421   MT-RNR2    1559
## 2810415   MT-RNR1     954
## 2810424    MT-TL1      75
## 2810451     MT-TN      73
## 2810435     MT-TQ      72
## 2810412     MT-TF      71
## 2810513    MT-TL2      71
## 2810474     MT-TK      70
## 2810418     MT-TV      69
## 2810432     MT-TI      69
## 2810524     MT-TE      69
## 2810448     MT-TA      69
## 2810507     MT-TH      69
## 2810464    MT-TS1      69
## 2810489     MT-TG      68
## 2810534     MT-TP      68
## 2810467     MT-TD      68
## 2810445     MT-TW      68
## 2810438     MT-TM      68
## 2810454     MT-TC      66
## 2810531     MT-TT      66
## 2810457     MT-TY      66
## 2810496     MT-TR      65
## 2810510    MT-TS2      59

Barplot for lengths of mitochondrial nc-RNA genes


f, ax = plt.subplots(figsize=(12, 6))

sb.barplot(y='gene_name', x='length', data=df_mt_ncrnagene)

plt.xticks(rotation=25)
## (array([   0.,  200.,  400.,  600.,  800., 1000., 1200., 1400., 1600.,
##        1800.]), <a list of 10 Text xticklabel objects>)
plt.title('Mitochondrial nc_rna genes by their base length')

plt.show()

Histogram and kernel density plot for the lengths of the mitochondrial nc-RNA genes


# Histogram with a kernel density estimate of the ncRNA gene lengths, drawn on
# an explicitly created square figure.
# NOTE(review): distplot is deprecated in newer seaborn releases; check the
# installed version before upgrading.
f, ax = plt.subplots(figsize=(8, 8))

sb.distplot(df_mt_ncrnagene['length'], hist=True, rug=False, ax=ax)

ax.set_xlabel('nc_rna gene length')

ax.set_title('Distribution of lengths of mitochondria nc_rna genes')

plt.show()

Box plot for the lengths of the mitochondrial nc-RNA genes


# Box plot of the ncRNA gene lengths.
f, ax = plt.subplots(figsize=(8, 4))

# Plot df_mt_ncrnagene here: df_mtgn holds the protein-coding genes and would
# simply repeat the earlier gene box plot under an ncRNA title.
sb.boxplot(x='length', data=df_mt_ncrnagene)

plt.xlabel('nc_rna gene length')

plt.title('Distribution of lengths of mitochondria nc_rna genes')

plt.show()