library(reticulate)
import sys
# Report the interpreter version this analysis ran under.
print(sys.version, file=sys.stdout)
## 3.7.5 (default, Oct 31 2019, 15:18:51) [MSC v.1916 64 bit (AMD64)]
import sys
import warnings

# Silence all warnings for the session unless the user requested specific
# warning behaviour (-W / PYTHONWARNINGS populate sys.warnoptions).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import matplotlib as mpl

# Select the backend before pyplot is imported (seaborn imports pyplot as
# well): older Matplotlib releases ignore a use() call made afterwards.
mpl.use('ps')  # generate postscript output by default

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

sb.set_style('whitegrid')

# Limit how much of a DataFrame pandas prints by default.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 200)
source ~/.profile
# Download in the foreground (no -b) so the file is complete before later
# steps read it; -q keeps wget quiet and -c resumes a partial download.
wget -qc ftp://ftp.ensembl.org/pub/release-98/gff3/homo_sapiens/Homo_sapiens.GRCh38.98.gff3.gz -O Homo_sapiens.GRCh38.98.gff3.gz
. ~/.profile
# Show the first 24 lines of the decompressed annotation file.
gzip -dc Homo_sapiens.GRCh38.98.gff3.gz | head -n 24
## ##gff-version 3
## ##sequence-region 1 1 248956422
## ##sequence-region 10 1 133797422
## ##sequence-region 11 1 135086622
## ##sequence-region 12 1 133275309
## ##sequence-region 13 1 114364328
## ##sequence-region 14 1 107043718
## ##sequence-region 15 1 101991189
## ##sequence-region 16 1 90338345
## ##sequence-region 17 1 83257441
## ##sequence-region 18 1 80373285
## ##sequence-region 19 1 58617616
## ##sequence-region 2 1 242193529
## ##sequence-region 20 1 64444167
## ##sequence-region 21 1 46709983
## ##sequence-region 22 1 50818468
## ##sequence-region 3 1 198295559
## ##sequence-region 4 1 190214555
## ##sequence-region 5 1 181538259
## ##sequence-region 6 1 170805979
## ##sequence-region 7 1 159345973
## ##sequence-region 8 1 145138636
## ##sequence-region 9 1 138394717
## ##sequence-region GL000008.2 1 209709
. ~/.profile
# Show the last 10 lines of the decompressed annotation file.
gzip -dc Homo_sapiens.GRCh38.98.gff3.gz | tail -n 10
## Y havana pseudogene 26626520 26627159 . - . ID=gene:ENSG00000231514;Name=CCNQP2;biotype=processed_pseudogene;description=CCNQ pseudogene 2 [Source:HGNC Symbol%3BAcc:HGNC:38436];gene_id=ENSG00000231514;logic_name=havana_homo_sapiens;version=1
## Y havana pseudogenic_transcript 26626520 26627159 . - . ID=transcript:ENST00000435741;Parent=gene:ENSG00000231514;Name=CCNQP2-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000435741;transcript_support_level=NA;version=1
## Y havana exon 26626520 26627159 . - . Parent=transcript:ENST00000435741;Name=ENSE00001616687;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001616687;rank=1;version=1
## ###
## Y . biological_region 26626966 26627137 0.994 - . external_name=rank %3D 1;logic_name=firstef
## Y . biological_region 26627457 26628186 0.997 + . external_name=rank %3D 1;logic_name=firstef
## Y havana pseudogene 56855244 56855488 . + . ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=processed_pseudogene;description=C-terminal binding protein 2 pseudogene 1 [Source:HGNC Symbol%3BAcc:HGNC:23940];gene_id=ENSG00000235857;logic_name=havana_homo_sapiens;version=1
## Y havana pseudogenic_transcript 56855244 56855488 . + . ID=transcript:ENST00000431853;Parent=gene:ENSG00000235857;Name=CTBP2P1-201;biotype=processed_pseudogene;tag=basic;transcript_id=ENST00000431853;transcript_support_level=NA;version=1
## Y havana exon 56855244 56855488 . + . Parent=transcript:ENST00000431853;Name=ENSE00001794473;constitutive=1;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00001794473;rank=1;version=1
## ###
Read the gzipped GFF3 file into a pandas DataFrame as a tab-separated (CSV-like) file.
# Load the gzipped GFF3 file as a headerless, tab-separated table;
# text following a '#' is treated as a comment and skipped.
df = pd.read_csv(
    'Homo_sapiens.GRCh38.98.gff3.gz',
    sep='\t',
    header=None,
    comment='#',
    compression='gzip',
    low_memory=False,
)
df.head(n=5)
## 0 1 2 3 4 5 6 7 8
## 0 1 Ensembl chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1 1 . biological_region 10469 11240 1.3e+03 . . external_name=oe %3D 0.79;logic_name=cpg
## 2 1 . biological_region 10650 10657 0.999 + . logic_name=eponine
## 3 1 . biological_region 10655 10657 0.999 - . logic_name=eponine
## 4 1 . biological_region 10678 10687 0.999 + . logic_name=eponine
# Last five rows of the raw table.
df.tail(n=5)
## 0 1 2 3 4 5 6 7 8
## 2911081 Y . biological_region 26626966 26627137 0.994 - . external_name=rank %3D 1;logic_name=firstef
## 2911082 Y . biological_region 26627457 26628186 0.997 + . external_name=rank %3D 1;logic_name=firstef
## 2911083 Y havana pseudogene 56855244 56855488 . + . ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=p...
## 2911084 Y havana pseudogenic_transcript 56855244 56855488 . + . ID=transcript:ENST00000431853;Parent=gene:ENSG...
## 2911085 Y havana exon 56855244 56855488 . + . Parent=transcript:ENST00000431853;Name=ENSE000...
# Names for the nine tab-separated columns, in file order.
col_names = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attribute',
]
df.columns = col_names
df.head(n=5)
## seqid source type start end score strand phase attribute
## 0 1 Ensembl chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1 1 . biological_region 10469 11240 1.3e+03 . . external_name=oe %3D 0.79;logic_name=cpg
## 2 1 . biological_region 10650 10657 0.999 + . logic_name=eponine
## 3 1 . biological_region 10655 10657 0.999 - . logic_name=eponine
## 4 1 . biological_region 10678 10687 0.999 + . logic_name=eponine
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called bare; wrapping it in print() only appends a stray "None".
df.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 2911086 entries, 0 to 2911085
## Data columns (total 9 columns):
## seqid object
## source object
## type object
## start int64
## end int64
## score object
## strand object
## phase object
## attribute object
## dtypes: int64(2), object(7)
## memory usage: 199.9+ MB
There are 194 unique seqids. These are chromosomes 1 to 22, X, Y, and mitochondrial (MT) DNA, as well as 169 others (i.e., scaffolds).
# Distinct seqid values, in order of first appearance.
pd.unique(df['seqid'])
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
## '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9',
## 'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
## 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
## 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
## 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
## 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
## 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
## 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
## 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
## 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
## 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
## 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
## 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
## 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
## 'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
## 'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
## 'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
## 'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
## 'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
## 'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
## 'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
## 'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
## 'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
## 'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
## 'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
## 'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
## 'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
## 'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
## 'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
## 'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
## 'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
## 'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
## 'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
## 'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
## 'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
## 'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
## 'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
## 'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
## 'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
## 'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
## 'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
## 'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
## 'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
## 'KI270757.1', 'MT', 'X', 'Y'], dtype=object)
# Shape of the array of distinct seqid values.
np.shape(df['seqid'].unique())
## (194,)
# Rows per seqid, most frequent first.
df['seqid'].value_counts(sort=True, ascending=False)
## 1 266965
## 2 216479
## 3 185364
## 17 174091
## 19 172147
## ...
## KI270395.1 1
## KI270379.1 1
## KI270382.1 1
## KI270340.1 1
## KI270730.1 1
## Name: seqid, Length: 194, dtype: int64
# seqid values are stored as strings, so the chromosome labels are built as
# strings too: '1' .. '22' followed by the sex chromosomes.
chrs = [str(number) for number in range(1, 23)] + ['X', 'Y']
# Keep only the rows whose seqid is one of the labels in chrs.
df_chrs = df.loc[df['seqid'].isin(chrs)]
df_chrs.head(n=5)
## seqid source type start end score strand phase attribute
## 0 1 Ensembl chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000...
## 1 1 . biological_region 10469 11240 1.3e+03 . . external_name=oe %3D 0.79;logic_name=cpg
## 2 1 . biological_region 10650 10657 0.999 + . logic_name=eponine
## 3 1 . biological_region 10655 10657 0.999 - . logic_name=eponine
## 4 1 . biological_region 10678 10687 0.999 + . logic_name=eponine
# Distinct seqid values left after the chromosome filter.
pd.unique(df_chrs['seqid'])
## array(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
## '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'X', 'Y'],
## dtype=object)
# Shape of the array of distinct chromosome seqids.
np.shape(df_chrs['seqid'].unique())
## (24,)
# Rows per chromosome; raise the row cap to 24 before printing the counts.
pd.options.display.max_rows = 24
df_chrs['seqid'].value_counts(sort=True, ascending=False)
## 1 266965
## 2 216479
## 3 185364
## 17 174091
## 19 172147
## 11 169346
## 12 164355
## 7 139890
## 16 135858
## 6 133533
## 5 129573
## 4 121926
## 10 109692
## 15 107924
## 8 106448
## 9 104055
## 14 99668
## X 92643
## 20 69131
## 22 64369
## 18 53030
## 13 48805
## 21 34510
## Y 7906
## Name: seqid, dtype: int64
# Horizontal bar chart of row counts per chromosome, largest first.
f, ax = plt.subplots(figsize=(16, 10))
sb.countplot(
    data=df_chrs,
    y="seqid",
    order=df_chrs['seqid'].value_counts().index,
    ax=ax,
)
plt.title('Number of entries by chromosomes in the human genome')
plt.show()
There are 169 unique scaffolds.
# Scaffolds are the rows whose seqid is neither one of the chromosomes in
# chrs nor the mitochondrial genome ('MT'). The mask is inverted with `~`,
# the boolean-negation operator; unary `-` is not a supported way to negate
# boolean arrays in NumPy.
df_scflds = df[~df['seqid'].isin(chrs + ['MT'])]
df_scflds.head()
## seqid source type start end score strand phase attribute
## 2807159 GL000008.2 Ensembl scaffold 1 209709 . . . ID=scaffold:GL000008.2;Alias=chr4_GL000008v2_r...
## 2807160 GL000008.2 . biological_region 83284 83394 1 - . external_name=rank %3D 1;logic_name=firstef
## 2807161 GL000008.2 . biological_region 83318 83804 342 . . external_name=oe %3D 0.85;logic_name=cpg
## 2807162 GL000008.2 . biological_region 83763 84145 1 + . external_name=rank %3D 1;logic_name=firstef
## 2807163 GL000009.2 Ensembl scaffold 1 201709 . . . ID=scaffold:GL000009.2;Alias=chr14_GL000009v2_...
# Distinct scaffold seqids, in order of first appearance.
pd.unique(df_scflds['seqid'])
## array(['GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1',
## 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1',
## 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1',
## 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1',
## 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1',
## 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1',
## 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1',
## 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1',
## 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1',
## 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1',
## 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1',
## 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1',
## 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1',
## 'KI270384.1', 'KI270385.1', 'KI270386.1', 'KI270387.1',
## 'KI270388.1', 'KI270389.1', 'KI270390.1', 'KI270391.1',
## 'KI270392.1', 'KI270393.1', 'KI270394.1', 'KI270395.1',
## 'KI270396.1', 'KI270411.1', 'KI270412.1', 'KI270414.1',
## 'KI270417.1', 'KI270418.1', 'KI270419.1', 'KI270420.1',
## 'KI270422.1', 'KI270423.1', 'KI270424.1', 'KI270425.1',
## 'KI270429.1', 'KI270435.1', 'KI270438.1', 'KI270442.1',
## 'KI270448.1', 'KI270465.1', 'KI270466.1', 'KI270467.1',
## 'KI270468.1', 'KI270507.1', 'KI270508.1', 'KI270509.1',
## 'KI270510.1', 'KI270511.1', 'KI270512.1', 'KI270515.1',
## 'KI270516.1', 'KI270517.1', 'KI270518.1', 'KI270519.1',
## 'KI270521.1', 'KI270522.1', 'KI270528.1', 'KI270529.1',
## 'KI270530.1', 'KI270538.1', 'KI270539.1', 'KI270544.1',
## 'KI270548.1', 'KI270579.1', 'KI270580.1', 'KI270581.1',
## 'KI270582.1', 'KI270583.1', 'KI270584.1', 'KI270587.1',
## 'KI270588.1', 'KI270589.1', 'KI270590.1', 'KI270591.1',
## 'KI270593.1', 'KI270706.1', 'KI270707.1', 'KI270708.1',
## 'KI270709.1', 'KI270710.1', 'KI270711.1', 'KI270712.1',
## 'KI270713.1', 'KI270714.1', 'KI270715.1', 'KI270716.1',
## 'KI270717.1', 'KI270718.1', 'KI270719.1', 'KI270720.1',
## 'KI270721.1', 'KI270722.1', 'KI270723.1', 'KI270724.1',
## 'KI270725.1', 'KI270726.1', 'KI270727.1', 'KI270728.1',
## 'KI270729.1', 'KI270730.1', 'KI270731.1', 'KI270732.1',
## 'KI270733.1', 'KI270734.1', 'KI270735.1', 'KI270736.1',
## 'KI270737.1', 'KI270738.1', 'KI270739.1', 'KI270740.1',
## 'KI270741.1', 'KI270742.1', 'KI270743.1', 'KI270744.1',
## 'KI270745.1', 'KI270746.1', 'KI270747.1', 'KI270748.1',
## 'KI270749.1', 'KI270750.1', 'KI270751.1', 'KI270752.1',
## 'KI270753.1', 'KI270754.1', 'KI270755.1', 'KI270756.1',
## 'KI270757.1'], dtype=object)
# Shape of the array of distinct scaffold seqids.
np.shape(df_scflds['seqid'].unique())
## (169,)
# Raise the row cap to 169 before printing the per-scaffold row counts.
pd.options.display.max_rows = 169
df_scflds['seqid'].value_counts(sort=True, ascending=False)
## KI270733.1 422
## GL000220.1 408
## KI270728.1 368
## KI270712.1 353
## KI270711.1 148
## KI270749.1 144
## KI270734.1 118
## KI270727.1 112
## KI270725.1 103
## KI270717.1 90
## GL000213.1 71
## KI270729.1 63
## GL000225.1 58
## KI270721.1 56
## KI270732.1 48
## KI270709.1 46
## KI270747.1 45
## KI270714.1 35
## GL000194.1 34
## KI270731.1 28
## GL000195.1 26
## KI270713.1 26
## KI270442.1 25
## GL000219.1 24
## GL000216.2 23
## KI270754.1 22
## KI270735.1 21
## GL000205.2 20
## KI270753.1 19
## KI270738.1 19
## KI270744.1 18
## KI270726.1 18
## GL000218.1 14
## KI270724.1 11
## KI270741.1 10
## KI270723.1 10
## KI270719.1 9
## GL000009.2 9
## KI270743.1 8
## GL000224.1 7
## KI270750.1 7
## KI270742.1 4
## KI270722.1 4
## GL000214.1 4
## KI270752.1 4
## GL000008.2 4
## KI270706.1 4
## KI270708.1 3
## KI270745.1 3
## GL000221.1 3
## KI270751.1 3
## KI270539.1 2
## KI270748.1 2
## KI270528.1 1
## KI270538.1 1
## KI270320.1 1
## KI270382.1 1
## KI270544.1 1
## KI270715.1 1
## KI270720.1 1
## KI270390.1 1
## KI270589.1 1
## KI270418.1 1
## KI270511.1 1
## KI270422.1 1
## KI270310.1 1
## KI270757.1 1
## KI270373.1 1
## KI270374.1 1
## KI270593.1 1
## KI270419.1 1
## KI270362.1 1
## KI270394.1 1
## KI270316.1 1
## GL000226.1 1
## KI270391.1 1
## KI270335.1 1
## KI270333.1 1
## KI270303.1 1
## KI270334.1 1
## KI270378.1 1
## KI270465.1 1
## KI270710.1 1
## KI270337.1 1
## KI270707.1 1
## KI270317.1 1
## GL000208.1 1
## KI270530.1 1
## KI270519.1 1
## KI270508.1 1
## KI270395.1 1
## KI270322.1 1
## KI270438.1 1
## KI270548.1 1
## KI270425.1 1
## KI270383.1 1
## KI270366.1 1
## KI270467.1 1
## KI270587.1 1
## KI270376.1 1
## KI270515.1 1
## KI270417.1 1
## KI270582.1 1
## KI270329.1 1
## KI270740.1 1
## KI270364.1 1
## KI270730.1 1
## KI270579.1 1
## KI270375.1 1
## KI270756.1 1
## KI270371.1 1
## KI270302.1 1
## KI270718.1 1
## KI270580.1 1
## KI270435.1 1
## KI270412.1 1
## KI270315.1 1
## KI270448.1 1
## KI270414.1 1
## KI270379.1 1
## KI270393.1 1
## KI270516.1 1
## KI270384.1 1
## KI270363.1 1
## KI270311.1 1
## KI270340.1 1
## KI270330.1 1
## KI270509.1 1
## KI270584.1 1
## KI270590.1 1
## KI270429.1 1
## KI270716.1 1
## KI270736.1 1
## KI270381.1 1
## KI270737.1 1
## KI270510.1 1
## KI270423.1 1
## KI270581.1 1
## KI270312.1 1
## KI270389.1 1
## KI270583.1 1
## KI270529.1 1
## KI270420.1 1
## KI270739.1 1
## KI270517.1 1
## KI270588.1 1
## KI270385.1 1
## KI270518.1 1
## KI270522.1 1
## KI270372.1 1
## KI270591.1 1
## KI270466.1 1
## KI270512.1 1
## KI270468.1 1
## KI270304.1 1
## KI270411.1 1
## KI270746.1 1
## KI270392.1 1
## KI270386.1 1
## KI270388.1 1
## KI270336.1 1
## KI270396.1 1
## KI270305.1 1
## KI270424.1 1
## KI270338.1 1
## KI270387.1 1
## KI270507.1 1
## KI270755.1 1
## KI270521.1 1
## Name: seqid, dtype: int64
# Scaffold rows whose seqid begins with 'K'.
df_kscflds = df_scflds.loc[df_scflds['seqid'].str.startswith('K')]
df_kscflds.head(n=5)
## seqid source type start end score strand phase attribute
## 2807866 KI270302.1 Ensembl scaffold 1 2274 . . . ID=scaffold:KI270302.1;Alias=chrUn_KI270302v1,...
## 2807867 KI270303.1 Ensembl scaffold 1 1942 . . . ID=scaffold:KI270303.1;Alias=chrUn_KI270303v1,...
## 2807868 KI270304.1 Ensembl scaffold 1 2165 . . . ID=scaffold:KI270304.1;Alias=chrUn_KI270304v1,...
## 2807869 KI270305.1 Ensembl scaffold 1 1472 . . . ID=scaffold:KI270305.1;Alias=chrUn_KI270305v1,...
## 2807870 KI270310.1 Ensembl scaffold 1 1201 . . . ID=scaffold:KI270310.1;Alias=chrUn_KI270310v1,...
# Horizontal bar chart of row counts per K-scaffold, largest first.
f, ax = plt.subplots(figsize=(16, 48))
sb.countplot(
    data=df_kscflds,
    y="seqid",
    order=df_kscflds['seqid'].value_counts().index,
    ax=ax,
)
plt.title('Number of entries by K-scaffolds')
plt.show()
# Scaffold rows whose seqid begins with 'G'.
df_gscflds = df_scflds.loc[df_scflds['seqid'].str.startswith('G')]
df_gscflds.head(n=5)
## seqid source type start end score strand phase attribute
## 2807159 GL000008.2 Ensembl scaffold 1 209709 . . . ID=scaffold:GL000008.2;Alias=chr4_GL000008v2_r...
## 2807160 GL000008.2 . biological_region 83284 83394 1 - . external_name=rank %3D 1;logic_name=firstef
## 2807161 GL000008.2 . biological_region 83318 83804 342 . . external_name=oe %3D 0.85;logic_name=cpg
## 2807162 GL000008.2 . biological_region 83763 84145 1 + . external_name=rank %3D 1;logic_name=firstef
## 2807163 GL000009.2 Ensembl scaffold 1 201709 . . . ID=scaffold:GL000009.2;Alias=chr14_GL000009v2_...
# Horizontal bar chart of row counts per G-scaffold, largest first.
f, ax = plt.subplots(figsize=(16, 8))
sb.countplot(
    data=df_gscflds,
    y="seqid",
    order=df_gscflds['seqid'].value_counts().index,
    ax=ax,
)
plt.title('Number of entries by G-scaffolds')
plt.show()
# Rows whose seqid is exactly 'MT'.
df_mt = df.loc[df['seqid'].eq('MT')]
df_mt.head(n=5)
## seqid source type start end score strand phase attribute
## 2810411 MT Ensembl chromosome 1 16569 . . . ID=chromosome:MT;Alias=chrM,J01415.2,NC_012920.1
## 2810412 MT insdc ncRNA_gene 577 647 . + . ID=gene:ENSG00000210049;Name=MT-TF;biotype=Mt_...
## 2810413 MT ensembl tRNA 577 647 . + . ID=transcript:ENST00000387314;Parent=gene:ENSG...
## 2810414 MT ensembl exon 577 647 . + . Parent=transcript:ENST00000387314;Name=ENSE000...
## 2810415 MT insdc ncRNA_gene 648 1601 . + . ID=gene:ENSG00000211459;Name=MT-RNR1;biotype=M...
# There are 126 mitochondrial entries
df_mt['seqid'].value_counts(sort=True, ascending=False)
## MT 126
## Name: seqid, dtype: int64
There are 9 unique sources for the genome sequencing data.
# Distinct values of the source column, in order of first appearance.
pd.unique(df['source'])
## array(['Ensembl', '.', 'havana', 'mirbase', 'ensembl_havana', 'ensembl',
## 'havana_tagene', 'ensembl_havana_tagene', 'insdc'], dtype=object)
# Shape of the array of distinct source values.
np.shape(df['source'].unique())
## (9,)
# Rows per source, most frequent first.
df['source'].value_counts(sort=True, ascending=False)
## havana 1755580
## ensembl_havana 637206
## ensembl 223622
## . 182510
## havana_tagene 106213
## mirbase 5637
## Ensembl 194
## ensembl_havana_tagene 87
## insdc 37
## Name: source, dtype: int64
# Horizontal bar chart of row counts per source, largest first.
f, ax = plt.subplots(figsize=(16, 4))
sb.countplot(
    data=df,
    y="source",
    order=df['source'].value_counts().index,
    ax=ax,
)
plt.title('Number of entries by source')
plt.show()
There are 26 unique sequence types in the human genome.
# Distinct values of the type column, in order of first appearance.
pd.unique(df['type'])
## array(['chromosome', 'biological_region', 'pseudogene', 'lnc_RNA', 'exon',
## 'pseudogenic_transcript', 'ncRNA_gene', 'miRNA', 'gene', 'mRNA',
## 'five_prime_UTR', 'CDS', 'three_prime_UTR', 'snRNA', 'ncRNA',
## 'unconfirmed_transcript', 'snoRNA', 'scRNA', 'rRNA',
## 'V_gene_segment', 'D_gene_segment', 'J_gene_segment',
## 'C_gene_segment', 'vaultRNA_primary_transcript', 'scaffold',
## 'tRNA'], dtype=object)
# Shape of the array of distinct type values.
np.shape(df['type'].unique())
## (26,)
# Raise the row cap to 26 before printing the per-type row counts.
pd.options.display.max_rows = 26
df['type'].value_counts(sort=True, ascending=False)
## exon 1371695
## CDS 762023
## biological_region 182510
## three_prime_UTR 153974
## five_prime_UTR 152699
## lnc_RNA 103513
## mRNA 99916
## ncRNA_gene 23934
## gene 21487
## pseudogenic_transcript 15251
## pseudogene 15202
## ncRNA 2235
## snRNA 1915
## miRNA 1879
## unconfirmed_transcript 1155
## snoRNA 954
## V_gene_segment 250
## scaffold 169
## J_gene_segment 97
## rRNA 60
## scRNA 50
## D_gene_segment 41
## C_gene_segment 29
## chromosome 25
## tRNA 22
## vaultRNA_primary_transcript 1
## Name: type, dtype: int64
# Horizontal bar chart of row counts per sequence type, largest first.
f, ax = plt.subplots(figsize=(16, 12))
sb.countplot(y="type", data=df, order=df['type'].value_counts().index, ax=ax)
# Tilt the tick labels on the y axis by 25 degrees.
plt.yticks(rotation=25)
## (array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
## 17, 18, 19, 20, 21, 22, 23, 24, 25]), <a list of 26 Text yticklabel objects>)
# Title spelling corrected ('sequnce' -> 'sequence').
plt.title('Number of entries by sequence type')
plt.show()