R Markdown

This is an R Markdown document to show you the search and merge steps you will need to run for this tutorial.

# Step 1: Load the SNP table from First100SNPs.csv in the working directory
# (per the tutorial it holds the SNP IDs, positions, and genotypes)
mySNPs <- read.csv(file = "First100SNPs.csv", header = TRUE)
# Pull the rsid column out on its own; Step 3 hands these IDs to the
# Biomart search as the values to look up
myRSIDs <- mySNPs$rsid

# Step 2: Install (if needed) and load biomaRt, then set up the mart parameters
# One-time installation: uncomment the lines below if biomaRt is not installed.
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("biomaRt")

library(biomaRt)

# Create the mart object for the ENSEMBL_MART_SNP database with the
# hsapiens_snp dataset selected. The p-value search itself happens in Step 3,
# which passes `ensembl` as its `mart` argument.
ensembl <- useMart(biomart = "ENSEMBL_MART_SNP",
                   dataset = "hsapiens_snp")

# Show every attribute (name, description, page) offered by the selected
# dataset; 'refsnp_id' and 'p_value' from this list are requested in Step 3
listAttributes(mart = ensembl)
##                               name
## 1                        refsnp_id
## 2                    refsnp_source
## 3        refsnp_source_description
## 4                         chr_name
## 5                      chrom_start
## 6                        chrom_end
## 7                     chrom_strand
## 8                           allele
## 9                        mapweight
## 10                       validated
## 11                        allele_1
## 12                    minor_allele
## 13               minor_allele_freq
## 14              minor_allele_count
## 15           clinical_significance
## 16                    synonym_name
## 17                  synonym_source
## 18      synonym_source_description
## 19                 variation_names
## 20                      study_name
## 21                      study_type
## 22              study_external_ref
## 23               study_description
## 24                     source_name
## 25                 associated_gene
## 26                  phenotype_name
## 27           phenotype_description
## 28  associated_variant_risk_allele
## 29                         p_value
## 30                        set_name
## 31                 set_description
## 32                           title
## 33                         authors
## 34                            year
## 35                            pmid
## 36                           pmcid
## 37                         ucsc_id
## 38                             doi
## 39          ensembl_gene_stable_id
## 40               ensembl_gene_name
## 41    ensembl_transcript_stable_id
## 42 ensembl_transcript_chrom_strand
## 43                    ensembl_type
## 44             consequence_type_tv
## 45       consequence_allele_string
## 46          ensembl_peptide_allele
## 47                      cdna_start
## 48                        cdna_end
## 49               translation_start
## 50                 translation_end
## 51                       cds_start
## 52                         cds_end
## 53          distance_to_transcript
## 54             polyphen_prediction
## 55                  polyphen_score
## 56                 sift_prediction
## 57                      sift_score
## 58           reg_feature_stable_id
## 59               reg_allele_string
## 60           reg_consequence_types
## 61         motif_feature_stable_id
## 62             motif_allele_string
## 63         motif_consequence_types
## 64   motif_in_informative_position
## 65               motif_score_delta
## 66                      motif_name
## 67                     motif_start
## 68                             snp
## 69                  upstream_flank
## 70                downstream_flank
## 71                        chr_name
## 72                     chrom_start
## 73                       chrom_end
## 74                    chrom_strand
## 75                       refsnp_id
## 76                   refsnp_source
## 77                          allele
## 78                       validated
## 79                       mapweight
## 80          ensembl_peptide_allele
##                                        description      page
## 1                                     Variant name       snp
## 2                                   Variant source       snp
## 3                       Variant source description       snp
## 4                         Chromosome/scaffold name       snp
## 5          Chromosome/scaffold position start (bp)       snp
## 6            Chromosome/scaffold position end (bp)       snp
## 7                                           Strand       snp
## 8                                  Variant alleles       snp
## 9                                        Mapweight       snp
## 10                     Variant supporting evidence       snp
## 11                                Ancestral allele       snp
## 12                              Minor allele (ALL)       snp
## 13 Global minor allele frequency (all individuals)       snp
## 14     Global minor allele count (all individuals)       snp
## 15                           Clinical significance       snp
## 16                                    Synonym name       snp
## 17                                  Synonym source       snp
## 18                      Synonym source description       snp
## 19                        Associated variant names       snp
## 20                                      Study name       snp
## 21                                      Study type       snp
## 22                        Study External Reference       snp
## 23                               Study Description       snp
## 24                                     Source name       snp
## 25                  Associated gene with phenotype       snp
## 26                                  Phenotype name       snp
## 27                           Phenotype description       snp
## 28                  Associated variant risk allele       snp
## 29                                         P value       snp
## 30                                Variant Set Name       snp
## 31                         Variant Set Description       snp
## 32                                           Title       snp
## 33                                         Authors       snp
## 34                                            Year       snp
## 35                                       PubMed ID       snp
## 36                    PMC reference number (PMCID)       snp
## 37                                         UCSC ID       snp
## 38                       Digital Object Identifier       snp
## 39                                  Gene stable ID       snp
## 40                                       Gene Name       snp
## 41                            Transcript stable ID       snp
## 42                               Transcript strand       snp
## 43                                         Biotype       snp
## 44                             Variant consequence       snp
## 45                     Consequence specific allele       snp
## 46                                  Protein allele       snp
## 47                      Variant start in cDNA (bp)       snp
## 48                        Variant end in cDNA (bp)       snp
## 49               Variant start in translation (aa)       snp
## 50                 Variant end in translation (aa)       snp
## 51                       Variant start in CDS (bp)       snp
## 52                         Variant end in CDS (bp)       snp
## 53                          Distance to transcript       snp
## 54                             PolyPhen prediction       snp
## 55                                  PolyPhen score       snp
## 56                                 SIFT prediction       snp
## 57                                      SIFT score       snp
## 58                    Regulatory feature stable ID       snp
## 59                Regulatory feature allele string       snp
## 60             Regulatory feature consequence type       snp
## 61                         Motif Feature stable ID       snp
## 62                     Motif feature allele string       snp
## 63                  Motif feature consequence type       snp
## 64                       High information position       snp
## 65                              Motif score change       snp
## 66                                      Motif name       snp
## 67                                  Motif position       snp
## 68                               Variant sequences sequences
## 69                                  upstream_flank sequences
## 70                                downstream_flank sequences
## 71                        Chromosome/scaffold name sequences
## 72         Chromosome/scaffold position start (bp) sequences
## 73           Chromosome/scaffold position end (bp) sequences
## 74                                          Strand sequences
## 75                                    Variant name sequences
## 76                                  Variant source sequences
## 77                                 Variant alleles sequences
## 78                     Variant supporting evidence sequences
## 79                                       Mapweight sequences
## 80                                  Protein allele sequences
# Step 3: Query Biomart and collect the results directly into a data frame
# Requested columns: the SNP ID and its p-value. The rsIDs gathered in Step 1
# are supplied as the values for the 'snp_filter' filter.
searchResults <- getBM(
  attributes = c("refsnp_id", "p_value"),
  filters = "snp_filter",
  values = myRSIDs,
  mart = ensembl
)

# Step 4: Join the local SNP table with the Biomart results on the SNP ID
# The ID column is called 'rsid' in mySNPs and 'refsnp_id' in searchResults.
# With merge()'s default settings only IDs present in both tables are kept.
mergedDFs <- merge(
  x = mySNPs,
  y = searchResults,
  by.x = "rsid",
  by.y = "refsnp_id"
)

# Step 5: Keep only the significant SNPs (p-value below 1E-10)
# which() returns the positions where the comparison is TRUE, so rows whose
# p_value is NA are dropped along with the non-significant ones.
sigDF <- mergedDFs[which(mergedDFs$p_value < 1E-10), , drop = FALSE]

# Step 6: Save the significant SNPs to sigDF.csv in the working directory
# Row names are left out, and quote = FALSE writes text fields without
# surrounding double quotes.
# NOTE(review): with quoting off, a field that itself contains a comma would
# shift the columns when the file is read back -- confirm no column can.
write.csv(x = sigDF, file = "sigDF.csv", quote = FALSE, row.names = FALSE)