Resources and libraries

library(reticulate)

import sys

print(sys.version)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]

import sys

# Silence all warnings, but only when no explicit warning options were given
# on the command line / environment (those populate sys.warnoptions).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

import pandas as pd

import numpy as np

import seaborn as sb

sb.set_style('whitegrid')

import matplotlib.pyplot as plt

import matplotlib as mpl

mpl.use('ps') # generate postscript output by default

# Limit printed DataFrames to 20 rows and 20 columns, with a display width of
# 200 characters.
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 200

Obtain and glimpse the data


source ~/.profile

# Download in the foreground: the following steps read this file immediately,
# so a background (-b) download could still be incomplete when they run.
# -q keeps wget quiet; -c resumes a partially downloaded file.
wget -qc ftp://ftp.ensembl.org/pub/release-98/gff3/homo_sapiens/Homo_sapiens.GRCh38.98.gff3.gz -O Homo_sapiens.GRCh38.98.gff3.gz

First few rows of the data


source ~/.profile

zcat Homo_sapiens.GRCh38.98.gff3.gz | head -n 24
## ##gff-version 3
## ##sequence-region   1 1 248956422
## ##sequence-region   10 1 133797422
## ##sequence-region   11 1 135086622
## ##sequence-region   12 1 133275309
## ##sequence-region   13 1 114364328
## ##sequence-region   14 1 107043718
## ##sequence-region   15 1 101991189
## ##sequence-region   16 1 90338345
## ##sequence-region   17 1 83257441
## ##sequence-region   18 1 80373285
## ##sequence-region   19 1 58617616
## ##sequence-region   2 1 242193529
## ##sequence-region   20 1 64444167
## ##sequence-region   21 1 46709983
## ##sequence-region   22 1 50818468
## ##sequence-region   3 1 198295559
## ##sequence-region   4 1 190214555
## ##sequence-region   5 1 181538259
## ##sequence-region   6 1 170805979
## ##sequence-region   7 1 159345973
## ##sequence-region   8 1 145138636
## ##sequence-region   9 1 138394717
## ##sequence-region   GL000008.2 1 209709

Last few rows of the data


source ~/.profile

zcat Homo_sapiens.GRCh38.98.gff3.gz | tail
## Y    havana  pseudogene  26626520    26627159    .   -   .   ID=gene:ENSG00000231514;Name=CCNQP2;biotype=processed_pseudogene;description=CCNQ pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:38436];gene_id=ENSG00000231514;logic_name=havana_homo_sapiens;version=1
## Y    havana  pseudogenic_transcript  26626520    26627159    .   -   .   ID=transcript:ENST00000435741;Parent=gene:ENSG00000231514;Name=CCNQP2-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000435741;transcript_support_level=NA;version=1
## Y    havana  exon    26626520    26627159    .   -   .   Parent=transcript:ENST00000435741;Name=ENSE00001616687;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001616687;rank=1;version=1
## ###
## Y    .   biological_region   26626966    26627137    0.994   -   .   external_name=rank %3D 1;logic_name=firstef
## Y    .   biological_region   26627457    26628186    0.997   +   .   external_name=rank %3D 1;logic_name=firstef
## Y    havana  pseudogene  56855244    56855488    .   +   .   ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=processed_pseudogene;description=C-terminal binding protein 2 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:23940];gene_id=ENSG00000235857;logic_name=havana_homo_sapiens;version=1
## Y    havana  pseudogenic_transcript  56855244    56855488    .   +   .   ID=transcript:ENST00000431853;Parent=gene:ENSG00000235857;Name=CTBP2P1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000431853;transcript_support_level=NA;version=1
## Y    havana  exon    56855244    56855488    .   +   .   Parent=transcript:ENST00000431853;Name=ENSE00001794473;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001794473;rank=1;version=1
## ###

Read the tab-separated data into a pandas DataFrame


# Load the gzipped GFF3 file as a headerless, tab-separated table.
# comment='#' skips the '##...' directive lines shown in the raw file above.
# NOTE(review): pandas applies `comment` anywhere in a line, so a '#' inside a
# field would truncate that row — confirm no attribute value contains '#'.
df = pd.read_csv('Homo_sapiens.GRCh38.98.gff3.gz', compression='gzip', sep='\t', comment='#', low_memory=False, header=None)

df.head()
##    0        1                  2      3          4        5  6  7                                                  8
## 0  1  Ensembl         chromosome      1  248956422        .  .  .  ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1  1        .  biological_region  10469      11240  1.3e+03  .  .           external_name=oe %3D 0.79;logic_name=cpg
## 2  1        .  biological_region  10650      10657    0.999  +  .                                 logic_name=eponine
## 3  1        .  biological_region  10655      10657    0.999  -  .                                 logic_name=eponine
## 4  1        .  biological_region  10678      10687    0.999  +  .                                 logic_name=eponine

df.tail()
##          0       1                       2         3         4      5  6  7                                                  8
## 2911081  Y       .       biological_region  26626966  26627137  0.994  -  .        external_name=rank %3D 1;logic_name=firstef
## 2911082  Y       .       biological_region  26627457  26628186  0.997  +  .        external_name=rank %3D 1;logic_name=firstef
## 2911083  Y  havana              pseudogene  56855244  56855488      .  +  .  ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=p...
## 2911084  Y  havana  pseudogenic_transcript  56855244  56855488      .  +  .  ID=transcript:ENST00000431853;Parent=gene:ENSG...
## 2911085  Y  havana                    exon  56855244  56855488      .  +  .  Parent=transcript:ENST00000431853;Name=ENSE000...

Assign column names

col_names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attribute']
df.columns=col_names
df.head()
##   seqid   source               type  start        end    score strand phase                                          attribute
## 0     1  Ensembl         chromosome      1  248956422        .      .     .  ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1     1        .  biological_region  10469      11240  1.3e+03      .     .           external_name=oe %3D 0.79;logic_name=cpg
## 2     1        .  biological_region  10650      10657    0.999      +     .                                 logic_name=eponine
## 3     1        .  biological_region  10655      10657    0.999      -     .                                 logic_name=eponine
## 4     1        .  biological_region  10678      10687    0.999      +     .                                 logic_name=eponine

print(df.info())
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 2911086 entries, 0 to 2911085
## Data columns (total 9 columns):
## seqid        object
## source       object
## type         object
## start        int64
## end          int64
## score        object
## strand       object
## phase        object
## attribute    object
## dtypes: int64(2), object(7)
## memory usage: 199.9+ MB
## None

Various elements in the human genome

There are 194 unique seqids: chromosomes 1 to 22, X, Y, and mitochondrial (MT) DNA (25 in total), plus 169 others (i.e., scaffolds).


df['seqid'].unique()
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
##        '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9',
##        'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
##        'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
##        'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
##        'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
##        'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
##        'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
##        'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
##        'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
##        'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
##        'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
##        'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
##        'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
##        'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
##        'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
##        'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
##        'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
##        'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
##        'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
##        'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
##        'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
##        'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
##        'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
##        'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
##        'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
##        'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
##        'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
##        'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
##        'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
##        'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
##        'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
##        'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
##        'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
##        'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
##        'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
##        'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
##        'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
##        'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
##        'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
##        'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
##        'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
##        'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
##        'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
##        'KI270757.1', 'MT', 'X', 'Y'], dtype=object)

df['seqid'].unique().shape
## (194,)

df['seqid'].value_counts()
## 1             266965
## 2             216479
## 3             185364
## 17            174091
## 19            172147
##                ...  
## KI270395.1         1
## KI270379.1         1
## KI270382.1         1
## KI270340.1         1
## KI270730.1         1
## Name: seqid, Length: 194, dtype: int64

Chromosomes


# seqids are stored as strings (see the array above), so the chromosome labels
# are built as strings too: autosomes '1'..'22', then the sex chromosomes.
chrs = [str(number) for number in range(1, 23)] + ['X', 'Y']

df_chrs = df[df['seqid'].isin(chrs)]

df_chrs.head()
##   seqid   source               type  start        end    score strand phase                                          attribute
## 0     1  Ensembl         chromosome      1  248956422        .      .     .  ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1     1        .  biological_region  10469      11240  1.3e+03      .     .           external_name=oe %3D 0.79;logic_name=cpg
## 2     1        .  biological_region  10650      10657    0.999      +     .                                 logic_name=eponine
## 3     1        .  biological_region  10655      10657    0.999      -     .                                 logic_name=eponine
## 4     1        .  biological_region  10678      10687    0.999      +     .                                 logic_name=eponine

df_chrs['seqid'].unique()
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
##        '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'X', 'Y'],
##       dtype=object)
df_chrs['seqid'].unique().shape
## (24,)

# number of entries by seqid/chromosomes

pd.set_option("display.max_rows", 24)

df_chrs['seqid'].value_counts()
## 1     266965
## 2     216479
## 3     185364
## 17    174091
## 19    172147
## 11    169346
## 12    164355
## 7     139890
## 16    135858
## 6     133533
## 5     129573
## 4     121926
## 10    109692
## 15    107924
## 8     106448
## 9     104055
## 14     99668
## X      92643
## 20     69131
## 22     64369
## 18     53030
## 13     48805
## 21     34510
## Y       7906
## Name: seqid, dtype: int64

# Horizontal bar chart of entry counts per chromosome, most frequent first.
f, ax = plt.subplots(figsize=(16, 10))
chr_order = df_chrs['seqid'].value_counts().index
sb.countplot(y="seqid", data=df_chrs, order=chr_order, ax=ax)
plt.title('Number of entries by chromosomes in the human genome')
plt.show()

Scaffolds

There are 169 unique scaffolds.


# Scaffolds: every entry whose seqid is neither a chromosome nor MT.
# `~` is the element-wise NOT for boolean masks (NumPy rejects unary `-` on
# boolean arrays), so it is used here instead of negating with `-`.
df_scflds = df[~df['seqid'].isin(chrs) & (df['seqid'] != 'MT')]

df_scflds.head()
##               seqid   source               type  start     end score strand phase                                          attribute
## 2807159  GL000008.2  Ensembl           scaffold      1  209709     .      .     .  ID=scaffold:GL000008.2;Alias=chr4_GL000008v2_r...
## 2807160  GL000008.2        .  biological_region  83284   83394     1      -     .        external_name=rank %3D 1;logic_name=firstef
## 2807161  GL000008.2        .  biological_region  83318   83804   342      .     .           external_name=oe %3D 0.85;logic_name=cpg
## 2807162  GL000008.2        .  biological_region  83763   84145     1      +     .        external_name=rank %3D 1;logic_name=firstef
## 2807163  GL000009.2  Ensembl           scaffold      1  201709     .      .     .  ID=scaffold:GL000009.2;Alias=chr14_GL000009v2_...

df_scflds['seqid'].unique()
## array(['GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
##        'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
##        'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
##        'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
##        'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
##        'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
##        'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
##        'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
##        'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
##        'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
##        'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
##        'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
##        'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
##        'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
##        'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
##        'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
##        'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
##        'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
##        'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
##        'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
##        'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
##        'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
##        'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
##        'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
##        'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
##        'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
##        'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
##        'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
##        'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
##        'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
##        'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
##        'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
##        'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
##        'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
##        'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
##        'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
##        'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
##        'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
##        'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
##        'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
##        'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
##        'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
##        'KI270757.1'], dtype=object)

df_scflds['seqid'].unique().shape
## (169,)

pd.set_option("display.max_rows", 169)

df_scflds['seqid'].value_counts()
## KI270733.1    422
## GL000220.1    408
## KI270728.1    368
## KI270712.1    353
## KI270711.1    148
## KI270749.1    144
## KI270734.1    118
## KI270727.1    112
## KI270725.1    103
## KI270717.1     90
## GL000213.1     71
## KI270729.1     63
## GL000225.1     58
## KI270721.1     56
## KI270732.1     48
## KI270709.1     46
## KI270747.1     45
## KI270714.1     35
## GL000194.1     34
## KI270731.1     28
## GL000195.1     26
## KI270713.1     26
## KI270442.1     25
## GL000219.1     24
## GL000216.2     23
## KI270754.1     22
## KI270735.1     21
## GL000205.2     20
## KI270753.1     19
## KI270738.1     19
## KI270744.1     18
## KI270726.1     18
## GL000218.1     14
## KI270724.1     11
## KI270741.1     10
## KI270723.1     10
## KI270719.1      9
## GL000009.2      9
## KI270743.1      8
## GL000224.1      7
## KI270750.1      7
## KI270742.1      4
## KI270722.1      4
## GL000214.1      4
## KI270752.1      4
## GL000008.2      4
## KI270706.1      4
## KI270708.1      3
## KI270745.1      3
## GL000221.1      3
## KI270751.1      3
## KI270539.1      2
## KI270748.1      2
## KI270528.1      1
## KI270538.1      1
## KI270320.1      1
## KI270382.1      1
## KI270544.1      1
## KI270715.1      1
## KI270720.1      1
## KI270390.1      1
## KI270589.1      1
## KI270418.1      1
## KI270511.1      1
## KI270422.1      1
## KI270310.1      1
## KI270757.1      1
## KI270373.1      1
## KI270374.1      1
## KI270593.1      1
## KI270419.1      1
## KI270362.1      1
## KI270394.1      1
## KI270316.1      1
## GL000226.1      1
## KI270391.1      1
## KI270335.1      1
## KI270333.1      1
## KI270303.1      1
## KI270334.1      1
## KI270378.1      1
## KI270465.1      1
## KI270710.1      1
## KI270337.1      1
## KI270707.1      1
## KI270317.1      1
## GL000208.1      1
## KI270530.1      1
## KI270519.1      1
## KI270508.1      1
## KI270395.1      1
## KI270322.1      1
## KI270438.1      1
## KI270548.1      1
## KI270425.1      1
## KI270383.1      1
## KI270366.1      1
## KI270467.1      1
## KI270587.1      1
## KI270376.1      1
## KI270515.1      1
## KI270417.1      1
## KI270582.1      1
## KI270329.1      1
## KI270740.1      1
## KI270364.1      1
## KI270730.1      1
## KI270579.1      1
## KI270375.1      1
## KI270756.1      1
## KI270371.1      1
## KI270302.1      1
## KI270718.1      1
## KI270580.1      1
## KI270435.1      1
## KI270412.1      1
## KI270315.1      1
## KI270448.1      1
## KI270414.1      1
## KI270379.1      1
## KI270393.1      1
## KI270516.1      1
## KI270384.1      1
## KI270363.1      1
## KI270311.1      1
## KI270340.1      1
## KI270330.1      1
## KI270509.1      1
## KI270584.1      1
## KI270590.1      1
## KI270429.1      1
## KI270716.1      1
## KI270736.1      1
## KI270381.1      1
## KI270737.1      1
## KI270510.1      1
## KI270423.1      1
## KI270581.1      1
## KI270312.1      1
## KI270389.1      1
## KI270583.1      1
## KI270529.1      1
## KI270420.1      1
## KI270739.1      1
## KI270517.1      1
## KI270588.1      1
## KI270385.1      1
## KI270518.1      1
## KI270522.1      1
## KI270372.1      1
## KI270591.1      1
## KI270466.1      1
## KI270512.1      1
## KI270468.1      1
## KI270304.1      1
## KI270411.1      1
## KI270746.1      1
## KI270392.1      1
## KI270386.1      1
## KI270388.1      1
## KI270336.1      1
## KI270396.1      1
## KI270305.1      1
## KI270424.1      1
## KI270338.1      1
## KI270387.1      1
## KI270507.1      1
## KI270755.1      1
## KI270521.1      1
## Name: seqid, dtype: int64

K-scaffolds


df_kscflds = df_scflds[df_scflds['seqid'].str.startswith('K')]

df_kscflds.head()
##               seqid   source      type  start   end score strand phase                                          attribute
## 2807866  KI270302.1  Ensembl  scaffold      1  2274     .      .     .  ID=scaffold:KI270302.1;Alias=chrUn_KI270302v1,...
## 2807867  KI270303.1  Ensembl  scaffold      1  1942     .      .     .  ID=scaffold:KI270303.1;Alias=chrUn_KI270303v1,...
## 2807868  KI270304.1  Ensembl  scaffold      1  2165     .      .     .  ID=scaffold:KI270304.1;Alias=chrUn_KI270304v1,...
## 2807869  KI270305.1  Ensembl  scaffold      1  1472     .      .     .  ID=scaffold:KI270305.1;Alias=chrUn_KI270305v1,...
## 2807870  KI270310.1  Ensembl  scaffold      1  1201     .      .     .  ID=scaffold:KI270310.1;Alias=chrUn_KI270310v1,...

# Horizontal bar chart of entry counts per K-scaffold, most frequent first.
f, ax = plt.subplots(figsize=(16, 48))
kscfld_order = df_kscflds['seqid'].value_counts().index
sb.countplot(y="seqid", data=df_kscflds, order=kscfld_order, ax=ax)
plt.title('Number of entries by K-scaffolds')
plt.show()

G-scaffolds


df_gscflds = df_scflds[df_scflds['seqid'].str.startswith('G')]

df_gscflds.head()
##               seqid   source               type  start     end score strand phase                                          attribute
## 2807159  GL000008.2  Ensembl           scaffold      1  209709     .      .     .  ID=scaffold:GL000008.2;Alias=chr4_GL000008v2_r...
## 2807160  GL000008.2        .  biological_region  83284   83394     1      -     .        external_name=rank %3D 1;logic_name=firstef
## 2807161  GL000008.2        .  biological_region  83318   83804   342      .     .           external_name=oe %3D 0.85;logic_name=cpg
## 2807162  GL000008.2        .  biological_region  83763   84145     1      +     .        external_name=rank %3D 1;logic_name=firstef
## 2807163  GL000009.2  Ensembl           scaffold      1  201709     .      .     .  ID=scaffold:GL000009.2;Alias=chr14_GL000009v2_...

# Horizontal bar chart of entry counts per G-scaffold, most frequent first.
f, ax = plt.subplots(figsize=(16, 8))
gscfld_order = df_gscflds['seqid'].value_counts().index
sb.countplot(y="seqid", data=df_gscflds, order=gscfld_order, ax=ax)
plt.title('Number of entries by G-scaffolds')
plt.show()

Mitochondrial genome


df_mt = df[df['seqid']=='MT']

df_mt.head()
##         seqid   source        type  start    end score strand phase                                          attribute
## 2810411    MT  Ensembl  chromosome      1  16569     .      .     .   ID=chromosome:MT;Alias=chrM,J01415.2,NC_012920.1
## 2810412    MT    insdc  ncRNA_gene    577    647     .      +     .  ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_...
## 2810413    MT  ensembl        tRNA    577    647     .      +     .  ID=transcript:ENST00000387314;Parent=gene:ENSG...
## 2810414    MT  ensembl        exon    577    647     .      +     .  Parent=transcript:ENST00000387314;Name=ENSE000...
## 2810415    MT    insdc  ncRNA_gene    648   1601     .      +     .  ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=M...

# There are 126 mitochondrial entries
df_mt['seqid'].value_counts()
## MT    126
## Name: seqid, dtype: int64

Sources of the sequence elements

There are 9 unique sources for the genome sequencing data.


df['source'].unique()
## array(['Ensembl', '.', 'havana', 'mirbase', 'ensembl_havana', 'ensembl',
##        'havana_tagene', 'ensembl_havana_tagene', 'insdc'], dtype=object)

df['source'].unique().shape
## (9,)

df['source'].value_counts()
## havana                   1755580
## ensembl_havana            637206
## ensembl                   223622
## .                         182510
## havana_tagene             106213
## mirbase                     5637
## Ensembl                      194
## ensembl_havana_tagene         87
## insdc                         37
## Name: source, dtype: int64

# Horizontal bar chart of entry counts per annotation source, most frequent first.
f, ax = plt.subplots(figsize=(16, 4))
source_order = df['source'].value_counts().index
sb.countplot(y="source", data=df, order=source_order, ax=ax)
plt.title('Number of entries by source')
plt.show()

Types of sequences in the genome

There are 26 unique sequence types in the human genome.


df['type'].unique()
## array(['chromosome', 'biological_region', 'pseudogene', 'lnc_RNA', 'exon',
##        'pseudogenic_transcript', 'ncRNA_gene', 'miRNA', 'gene', 'mRNA',
##        'five_prime_UTR', 'CDS', 'three_prime_UTR', 'snRNA', 'ncRNA',
##        'unconfirmed_transcript', 'snoRNA', 'scRNA', 'rRNA',
##        'V_gene_segment', 'D_gene_segment', 'J_gene_segment',
##        'C_gene_segment', 'vaultRNA_primary_transcript', 'scaffold',
##        'tRNA'], dtype=object)

df['type'].unique().shape
## (26,)

pd.set_option('display.max_rows', 26)

df['type'].value_counts()
## exon                           1371695
## CDS                             762023
## biological_region               182510
## three_prime_UTR                 153974
## five_prime_UTR                  152699
## lnc_RNA                         103513
## mRNA                             99916
## ncRNA_gene                       23934
## gene                             21487
## pseudogenic_transcript           15251
## pseudogene                       15202
## ncRNA                             2235
## snRNA                             1915
## miRNA                             1879
## unconfirmed_transcript            1155
## snoRNA                             954
## V_gene_segment                     250
## scaffold                           169
## J_gene_segment                      97
## rRNA                                60
## scRNA                               50
## D_gene_segment                      41
## C_gene_segment                      29
## chromosome                          25
## tRNA                                22
## vaultRNA_primary_transcript          1
## Name: type, dtype: int64

# Horizontal bar chart of entry counts per sequence type, most frequent first.

f, ax = plt.subplots(figsize=(16, 12))

sb.countplot(y="type", data=df, order = df['type'].value_counts().index)

# Tilt the type labels by 25 degrees.
plt.yticks(rotation=25)
## (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
##        17, 18, 19, 20, 21, 22, 23, 24, 25]), <a list of 26 Text yticklabel objects>)
# Title typo fixed: 'sequnce' -> 'sequence'.
plt.title('Number of entries by sequence type')

plt.show()