This is an R Markdown document to show you the search and merge steps you will need to run for this tutorial.
# Step 1: Read in file with SNP IDs, position, and genotype
mySNPs <- read.csv("First100SNPs.csv")
# Fail early with a clear message if the ID column is absent. Without this
# check, `$` would silently return NULL (or partially match a similarly named
# column) and the problem would only show up in later steps.
if (!"rsid" %in% names(mySNPs)) {
  stop("First100SNPs.csv must contain a column named 'rsid'", call. = FALSE)
}
# Keep just the rsid column as its own vector; [[ ]] matches the name exactly
myRSIDs <- mySNPs[["rsid"]]
# Step 2: Install (once) and load biomaRt, then connect to the SNP mart
# One-time installation -- uncomment the lines below if biomaRt is missing:
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("biomaRt")
library(biomaRt)
# Connect to the "ENSEMBL_MART_SNP" mart and select the "hsapiens_snp" dataset;
# the resulting mart object is reused by the search in Step 3
ensembl <- useMart(biomart = "ENSEMBL_MART_SNP", dataset = "hsapiens_snp")
# Show every attribute that can be requested from this dataset
listAttributes(ensembl)
## name
## 1 refsnp_id
## 2 refsnp_source
## 3 refsnp_source_description
## 4 chr_name
## 5 chrom_start
## 6 chrom_end
## 7 chrom_strand
## 8 allele
## 9 mapweight
## 10 validated
## 11 allele_1
## 12 minor_allele
## 13 minor_allele_freq
## 14 minor_allele_count
## 15 clinical_significance
## 16 synonym_name
## 17 synonym_source
## 18 synonym_source_description
## 19 variation_names
## 20 study_name
## 21 study_type
## 22 study_external_ref
## 23 study_description
## 24 source_name
## 25 associated_gene
## 26 phenotype_name
## 27 phenotype_description
## 28 associated_variant_risk_allele
## 29 p_value
## 30 set_name
## 31 set_description
## 32 title
## 33 authors
## 34 year
## 35 pmid
## 36 pmcid
## 37 ucsc_id
## 38 doi
## 39 ensembl_gene_stable_id
## 40 ensembl_gene_name
## 41 ensembl_transcript_stable_id
## 42 ensembl_transcript_chrom_strand
## 43 ensembl_type
## 44 consequence_type_tv
## 45 consequence_allele_string
## 46 ensembl_peptide_allele
## 47 cdna_start
## 48 cdna_end
## 49 translation_start
## 50 translation_end
## 51 cds_start
## 52 cds_end
## 53 distance_to_transcript
## 54 polyphen_prediction
## 55 polyphen_score
## 56 sift_prediction
## 57 sift_score
## 58 reg_feature_stable_id
## 59 reg_allele_string
## 60 reg_consequence_types
## 61 motif_feature_stable_id
## 62 motif_allele_string
## 63 motif_consequence_types
## 64 motif_in_informative_position
## 65 motif_score_delta
## 66 motif_name
## 67 motif_start
## 68 snp
## 69 upstream_flank
## 70 downstream_flank
## 71 chr_name
## 72 chrom_start
## 73 chrom_end
## 74 chrom_strand
## 75 refsnp_id
## 76 refsnp_source
## 77 allele
## 78 validated
## 79 mapweight
## 80 ensembl_peptide_allele
## description page
## 1 Variant name snp
## 2 Variant source snp
## 3 Variant source description snp
## 4 Chromosome/scaffold name snp
## 5 Chromosome/scaffold position start (bp) snp
## 6 Chromosome/scaffold position end (bp) snp
## 7 Strand snp
## 8 Variant alleles snp
## 9 Mapweight snp
## 10 Variant supporting evidence snp
## 11 Ancestral allele snp
## 12 Minor allele (ALL) snp
## 13 Global minor allele frequency (all individuals) snp
## 14 Global minor allele count (all individuals) snp
## 15 Clinical significance snp
## 16 Synonym name snp
## 17 Synonym source snp
## 18 Synonym source description snp
## 19 Associated variant names snp
## 20 Study name snp
## 21 Study type snp
## 22 Study External Reference snp
## 23 Study Description snp
## 24 Source name snp
## 25 Associated gene with phenotype snp
## 26 Phenotype name snp
## 27 Phenotype description snp
## 28 Associated variant risk allele snp
## 29 P value snp
## 30 Variant Set Name snp
## 31 Variant Set Description snp
## 32 Title snp
## 33 Authors snp
## 34 Year snp
## 35 PubMed ID snp
## 36 PMC reference number (PMCID) snp
## 37 UCSC ID snp
## 38 Digital Object Identifier snp
## 39 Gene stable ID snp
## 40 Gene Name snp
## 41 Transcript stable ID snp
## 42 Transcript strand snp
## 43 Biotype snp
## 44 Variant consequence snp
## 45 Consequence specific allele snp
## 46 Protein allele snp
## 47 Variant start in cDNA (bp) snp
## 48 Variant end in cDNA (bp) snp
## 49 Variant start in translation (aa) snp
## 50 Variant end in translation (aa) snp
## 51 Variant start in CDS (bp) snp
## 52 Variant end in CDS (bp) snp
## 53 Distance to transcript snp
## 54 PolyPhen prediction snp
## 55 PolyPhen score snp
## 56 SIFT prediction snp
## 57 SIFT score snp
## 58 Regulatory feature stable ID snp
## 59 Regulatory feature allele string snp
## 60 Regulatory feature consequence type snp
## 61 Motif Feature stable ID snp
## 62 Motif feature allele string snp
## 63 Motif feature consequence type snp
## 64 High information position snp
## 65 Motif score change snp
## 66 Motif name snp
## 67 Motif position snp
## 68 Variant sequences sequences
## 69 upstream_flank sequences
## 70 downstream_flank sequences
## 71 Chromosome/scaffold name sequences
## 72 Chromosome/scaffold position start (bp) sequences
## 73 Chromosome/scaffold position end (bp) sequences
## 74 Strand sequences
## 75 Variant name sequences
## 76 Variant source sequences
## 77 Variant alleles sequences
## 78 Variant supporting evidence sequences
## 79 Mapweight sequences
## 80 Protein allele sequences
# Step 3: Query BioMart for each SNP ID and collect the results in a dataframe
# Requests the SNP ID ('refsnp_id') and 'p_value' attributes, filtering the
# dataset down to the IDs held in myRSIDs
searchResults <- getBM(
  attributes = c("refsnp_id", "p_value"),
  filters = "snp_filter",
  values = myRSIDs,
  mart = ensembl
)
# Step 4: Join mySNPs to searchResults on the SNP ID
# The ID column is named 'rsid' in mySNPs and 'refsnp_id' in searchResults
mergedDFs <- merge(
  x = mySNPs,
  y = searchResults,
  by.x = "rsid",
  by.y = "refsnp_id"
)
# Step 5: Keep only the significant SNPs (p_value below 1E-10)
# which() returns just the positions where the comparison is TRUE, so rows
# with a missing p_value are left out
sigDF <- mergedDFs[which(mergedDFs$p_value < 1E-10), , drop = FALSE]
# Step 6: Save the significant SNPs to sigDF.csv
# Row names are omitted and values are written without surrounding quotes
write.csv(
  x = sigDF,
  file = "sigDF.csv",
  quote = FALSE,
  row.names = FALSE
)