Purpose: Locate the CSMD1 gene in the human genome dataset
# Set the biomart server and dataset
# Start by listing the biomart web services that are available
listMarts()
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 109
## 2 ENSEMBL_MART_MOUSE Mouse strains 109
## 3 ENSEMBL_MART_SNP Ensembl Variation 109
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 109
# useMart() opens a connection to one specific biomart database;
# here the one registered under the name "ensembl"
ensembl <- useMart(biomart = "ensembl")
# Ask that mart which datasets it offers and peek at the first few
datasets <- listDatasets(mart = ensembl)
head(datasets)
## dataset description version
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1) ASM259213v1
## 2 acalliptera_gene_ensembl Eastern happy genes (fAstCal1.2) fAstCal1.2
## 3 acarolinensis_gene_ensembl Green anole genes (AnoCar2.0v2) AnoCar2.0v2
## 4 acchrysaetos_gene_ensembl Golden eagle genes (bAquChr1.2) bAquChr1.2
## 5 acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5) Midas_v5
## 6 amelanoleuca_gene_ensembl Giant panda genes (ASM200744v2) ASM200744v2
# Point the existing connection at the human gene dataset
ensembl <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
## Ensembl site unresponsive, trying useast mirror
# A biomaRt query is built from filters, attributes and values;
# list the filters and attributes that this dataset supports
filters <- listFilters(mart = ensembl)
attributes <- listAttributes(mart = ensembl)
# listAttributes(dataset) %>% view
# Get the gene start and end positions (together with name, description and
# biotype) for the human genome, limited to just chromosome 8
chrom <- 8
genes <- getBM(
  attributes = c(
    "chromosome_name", "start_position", "end_position",
    "external_gene_name", "description", "gene_biotype"
  ),
  filters = "chromosome_name",
  values = chrom,
  mart = ensembl
)
# Which gene biotypes were returned for this chromosome?
unique(genes[["gene_biotype"]])
## [1] "processed_pseudogene" "lncRNA" "protein_coding" "unprocessed_pseudogene" "transcribed_processed_pseudogene"
## [6] "TEC" "miRNA" "rRNA_pseudogene" "snoRNA" "misc_RNA"
## [11] "transcribed_unprocessed_pseudogene" "unitary_pseudogene" "snRNA" "transcribed_unitary_pseudogene" "IG_V_pseudogene"
## [16] "rRNA"
## Keep only protein-coding genes that have a non-empty gene name.
## The query above is already limited to chromosome 8, so no separate
## MT exclusion is applied here.
pc_genes <- genes %>%
  dplyr::filter(gene_biotype %in% "protein_coding") %>%
  # Rename the biomaRt columns to short names and drop the rest
  dplyr::select(
    gene = external_gene_name,
    CHR = chromosome_name,
    START = start_position,
    STOP = end_position,
    type = gene_biotype,
    gene_desc_source = description
  ) %>%
  # Order by chromosome, then by start position
  dplyr::arrange(CHR, START) %>%
  # Discard records whose gene name is empty
  dplyr::filter(nchar(gene) > 0)
# Show the first 10 rows of the resulting data frame
head(pc_genes, 10)
## gene CHR START STOP type gene_desc_source
## 1 OR4F21 8 166086 167024 protein_coding olfactory receptor family 4 subfamily F member 21 [Source:HGNC Symbol;Acc:HGNC:19583]
## 2 ZNF596 8 232137 264703 protein_coding zinc finger protein 596 [Source:HGNC Symbol;Acc:HGNC:27268]
## 3 FBXO25 8 406428 477967 protein_coding F-box protein 25 [Source:HGNC Symbol;Acc:HGNC:13596]
## 4 TDRP 8 489803 545781 protein_coding testis development related protein [Source:HGNC Symbol;Acc:HGNC:26951]
## 5 ERICH1 8 614746 738106 protein_coding glutamate rich 1 [Source:HGNC Symbol;Acc:HGNC:27234]
## 6 DLGAP2 8 737628 1708476 protein_coding DLG associated protein 2 [Source:HGNC Symbol;Acc:HGNC:2906]
## 7 CLN8 8 1755778 1801711 protein_coding CLN8 transmembrane ER and ERGIC protein [Source:HGNC Symbol;Acc:HGNC:2079]
## 8 KBTBD11-OT1 8 1763888 1958627 protein_coding KBTBD11 overlapping transcript 1 [Source:NCBI gene (formerly Entrezgene);Acc:104266957]
## 9 ARHGEF10 8 1823926 1958641 protein_coding Rho guanine nucleotide exchange factor 10 [Source:HGNC Symbol;Acc:HGNC:14103]
## 10 KBTBD11 8 1973677 2006936 protein_coding kelch repeat and BTB domain containing 11 [Source:HGNC Symbol;Acc:HGNC:29104]
# Look for the CSMD1 gene.
# filter() is namespace-qualified, like the other dplyr verbs in this script,
# so the call cannot resolve to stats::filter() when dplyr is not attached.
# The column is referenced by its bare name, as dplyr's data masking intends.
CSMD1 <- dplyr::filter(pc_genes, gene == "CSMD1")
# Make a failed lookup visible instead of printing an empty data frame
if (nrow(CSMD1) == 0) {
  warning("CSMD1 was not found among the protein-coding genes", call. = FALSE)
}
print(CSMD1)
# Located! The gene starts at 2935353 and stops at 4994972. It is a protein coding gene and the description is 'CUB and Sushi multiple domains 1
# [Source:HGNC Symbol;Acc:HGNC:14026]'
# (coordinates as returned by the Ensembl release queried when this was run)