#Part 1 find a list of phenotypes long enough to produce filterable list
myPheno <-read.csv('phenotype_options.csv',header=FALSE)
library(stringr)
## Warning: package 'stringr' was built under R version 4.2.2
myPheno <-myPheno[str_detect(myPheno$V1, "aciduria"), ] 


#Part 2 perform BioMart search to get best columns/attributes and 
#highest number of SNPs with best filter settings for your particular topic
library(biomaRt)
ensembl=useMart("ENSEMBL_MART_SNP",
                dataset="hsapiens_snp")

#Make a variable for your attributes you want to download
#Must have more than 6 to get full credit
att1 <- c('refsnp_id','phenotype_description','associated_variant_risk_allele','p_value')

#Run biomart search with settings listed above
searchResults <-getBM(att1,
                      filter='phenotype_description',#change if you read in list of rsids
                      values=myPheno, mart=ensembl)

#Part 3-read in 23andMe file and merge
allSNPs <- read.csv('FormattedSNPS.csv')

myMerge <- merge(allSNPs,searchResults,by.x="rsid",by.y="refsnp_id")

#Use unique and length functions to count unique rsids
#In this example, I have 32 SNPs to filter
length(unique(myMerge$rsid))
## [1] 32
#Part 4- keep only rows for which JW has the associated_variant_risk_allele
#Create a boolean vector to keep track of rows matching this rule
tester <- vector()

#get rid of rows with no associated_variant_risk_allele
myMerge2 <- subset(myMerge, associated_variant_risk_allele !="")

#Use loop to go through each row of data and compare columns 4 and 10
for (i in 1:nrow(myMerge2)){
    tester[i] <- grepl(myMerge2[i,6], myMerge2[i,4])
}

#Then you can use boolean vector to filter dataframe (keep only TRUE rows)
myMerge3 <- myMerge2[tester,]

#Use unique and length functions to count unique rsids
#In this example, I have 15 unique SNPs to filter
length(unique(myMerge3$rsid))
## [1] 15
#Part 5-run simple filter to get 8-12 of the most interesting SNPs for the report
#If you matched genotype to minor_allele, it makes sense to filter based on MAF
#If you matched genotype to minor_allele, it makes sense to filter based on p-values (keep rows with p-value)
#Pick a filter that makes sense for your disease (chromosome usually not a good choice)
myMerge4 <- subset(myMerge3, chromosome == 19)

#Use unique and length functions to count unique rsids
#In this example, I have 2 unique SNPs to write about in detail in the 
#report due during final exam week (goal is 8-12)
#This data frame has 7 unique SNPs
length(unique(myMerge4$rsid))
## [1] 7