#Step 1: Download the First100SNPs.csv file from Canvas and upload it to a
#new Posit Workspace



# Step 2: Load the genotype spreadsheet with read.csv() and pull out one column.
# Note: this is the format used by companies like 23andMe and Ancestry to
# report genotypes.
mySNPs <- read.csv(file = "First100SNPs.csv")
# Keep just the rsid column in its own variable; these IDs are the values
# passed to the BioMart query later in the script.
myRSIDs <- mySNPs[["rsid"]]

#Step 3: Install Bioconductor and the biomaRt package (uncomment the lines
#below on first run), then load biomaRt
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("biomaRt")
library(biomaRt)

# Run a BioMart search to download the minor allele frequency for each SNP.
# First connect to the ENSEMBL_MART_SNP mart and select its hsapiens_snp
# dataset; the resulting connection object is reused by the query below.
ensembl <- useMart(biomart = "ENSEMBL_MART_SNP",
                   dataset = "hsapiens_snp")

# Request two attributes per SNP (its ID and its minor allele frequency),
# restricting the search through the 'snp_filter' filter to the rsids taken
# from the genotype file.
searchResults <- getBM(attributes = c("refsnp_id", "minor_allele_freq"),
                       filters = "snp_filter",
                       values = myRSIDs,
                       mart = ensembl)

# Step 4: Merge mySNPs and searchResults.
# The SNP ID column is named 'rsid' in mySNPs and 'refsnp_id' in
# searchResults, so the key column is given separately for each side.
# With merge()'s defaults, only rows whose ID appears in both tables are kept.
mergedDFs <- merge(x = mySNPs,
                   y = searchResults,
                   by.x = "rsid",
                   by.y = "refsnp_id")

# Step 5: Find rare mutations (MAF < 0.01).
# which() returns only the row positions where the comparison is TRUE, so
# SNPs whose frequency is missing (NA) are left out of the result.
rareDF <- mergedDFs[which(mergedDFs$minor_allele_freq < 0.01), ,
                    drop = FALSE]

# Step 6: Save the rare SNPs as a .csv file.
# Row names are omitted and values are written without surrounding quotes.
write.csv(x = rareDF,
          file = "rare.csv",
          quote = FALSE,
          row.names = FALSE)