Purpose: Locate the CSMD1 gene in the human genome dataset

Step 1: Connect to the BioMart database and choose the human dataset

# Show which BioMart web services the server offers
listMarts()
##                biomart                version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 109
## 2   ENSEMBL_MART_MOUSE      Mouse strains 109
## 3     ENSEMBL_MART_SNP  Ensembl Variation 109
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 109
# Open a connection to the "ensembl" mart, then enumerate the datasets it hosts
ensembl <- useMart(biomart = "ensembl")
datasets <- listDatasets(mart = ensembl)
head(x = datasets)  # peek at the first few available datasets
##                        dataset                           description     version
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1) ASM259213v1
## 2     acalliptera_gene_ensembl      Eastern happy genes (fAstCal1.2)  fAstCal1.2
## 3   acarolinensis_gene_ensembl       Green anole genes (AnoCar2.0v2) AnoCar2.0v2
## 4    acchrysaetos_gene_ensembl       Golden eagle genes (bAquChr1.2)  bAquChr1.2
## 5    acitrinellus_gene_ensembl        Midas cichlid genes (Midas_v5)    Midas_v5
## 6    amelanoleuca_gene_ensembl       Giant panda genes (ASM200744v2) ASM200744v2
# Select the human gene dataset on the existing mart connection
ensembl <- useDataset("hsapiens_gene_ensembl", mart = ensembl)
## Ensembl site unresponsive, trying useast mirror

Step 2: Build a query to look at just protein coding genes on chromosome 8

# Build the biomaRt query: specify filters, attributes and values.
# The full filter and attribute catalogues are fetched so the valid field
# names can be looked up when writing the query below.
filters <- listFilters(ensembl)
attributes <- listAttributes(ensembl)

# Interactive helper (not run): listAttributes(dataset) %>% view

# Get the gene start and end positions for the human genome,
# limited to just chromosome 8
chrom <- 8
genes <- getBM(
    attributes = c(
        "chromosome_name", "start_position", "end_position",
        "external_gene_name", "description", "gene_biotype"
    ),
    filters = "chromosome_name",
    values = chrom,
    mart = ensembl
)

# List the distinct gene biotypes returned by the query
unique(genes$gene_biotype)
##  [1] "processed_pseudogene"               "lncRNA"                             "protein_coding"                     "unprocessed_pseudogene"             "transcribed_processed_pseudogene"  
##  [6] "TEC"                                "miRNA"                              "rRNA_pseudogene"                    "snoRNA"                             "misc_RNA"                          
## [11] "transcribed_unprocessed_pseudogene" "unitary_pseudogene"                 "snRNA"                              "transcribed_unitary_pseudogene"     "IG_V_pseudogene"                   
## [16] "rRNA"
# Keep only protein-coding genes that have a gene symbol, rename the columns
# and order by genomic position. No mitochondrial (MT) filter is applied here:
# the getBM() query is already restricted to chromosome 8.
pc_genes <- genes %>%
    dplyr::filter(gene_biotype == "protein_coding") %>%
    dplyr::select(
        gene = external_gene_name,
        CHR = chromosome_name,
        START = start_position,
        STOP = end_position,
        type = gene_biotype,
        gene_desc_source = description
    ) %>%
    dplyr::arrange(CHR, START) %>%
    # Drop records whose gene symbol is empty
    dplyr::filter(nchar(gene) > 0)

# Preview the ten leading rows of the protein-coding gene table
head(x = pc_genes, n = 10L)
##           gene CHR   START    STOP           type                                                                        gene_desc_source
## 1       OR4F21   8  166086  167024 protein_coding   olfactory receptor family 4 subfamily F member 21 [Source:HGNC Symbol;Acc:HGNC:19583]
## 2       ZNF596   8  232137  264703 protein_coding                             zinc finger protein 596 [Source:HGNC Symbol;Acc:HGNC:27268]
## 3       FBXO25   8  406428  477967 protein_coding                                    F-box protein 25 [Source:HGNC Symbol;Acc:HGNC:13596]
## 4         TDRP   8  489803  545781 protein_coding                  testis development related protein [Source:HGNC Symbol;Acc:HGNC:26951]
## 5       ERICH1   8  614746  738106 protein_coding                                    glutamate rich 1 [Source:HGNC Symbol;Acc:HGNC:27234]
## 6       DLGAP2   8  737628 1708476 protein_coding                             DLG associated protein 2 [Source:HGNC Symbol;Acc:HGNC:2906]
## 7         CLN8   8 1755778 1801711 protein_coding              CLN8 transmembrane ER and ERGIC protein [Source:HGNC Symbol;Acc:HGNC:2079]
## 8  KBTBD11-OT1   8 1763888 1958627 protein_coding KBTBD11 overlapping transcript 1 [Source:NCBI gene (formerly Entrezgene);Acc:104266957]
## 9     ARHGEF10   8 1823926 1958641 protein_coding           Rho guanine nucleotide exchange factor 10 [Source:HGNC Symbol;Acc:HGNC:14103]
## 10     KBTBD11   8 1973677 2006936 protein_coding           kelch repeat and BTB domain containing 11 [Source:HGNC Symbol;Acc:HGNC:29104]

Step 3: Locate the gene

# Look for the CSMD1 gene. filter() is namespaced like the other dplyr calls
# in this file so the lookup cannot resolve to stats::filter() when dplyr is
# not attached; the column is referenced directly (data masking) rather than
# through pc_genes$gene.
CSMD1 <- dplyr::filter(pc_genes, gene == "CSMD1")
print(CSMD1)
# Located! The gene starts at 2935353 and stops at 4994972. It is a protein
# coding gene and the description is
# 'CUB and Sushi multiple domains 1 [Source:HGNC Symbol;Acc:HGNC:14026]'