#Load required libraries

library(biomaRt, quietly = TRUE)
library(dplyr, quietly = TRUE)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Import data

gData <- read.csv('FormattedSNPS.csv')

#Find options for ensembl mart

#Look at options for mart
myMarts <- listEnsembl()
#select Variation Dataset
ensembl <- useEnsembl(biomart = "snp")
## Ensembl site unresponsive, trying asia mirror
## Ensembl site unresponsive, trying useast mirror
#Look at options for dataset in that mart
myDatasets <- listDatasets(ensembl)
#Reset ensembl object to the dataset
ensembl <- useEnsembl(biomart = "snp",dataset="hsapiens_snp")

#Find options for filters

myFilters <- listFilters(ensembl)
myPhenotypes <- read.csv('phenotype_options.csv', header=FALSE)
projectPheno <- "Matrix metalloproteinase levels"

#Search with phenotype, select 6+ columns

myAttributes <- listAttributes(ensembl)
#Select attributes (choose at least 6 related to your search term/phenotype)
#There are some here that are empty and shouldn't be included
att1 <- c('refsnp_id',
          'allele_1',
          'minor_allele',
          'minor_allele_freq',
          'phenotype_name',
          'phenotype_description',
          'clinical_significance',
          'associated_variant_risk_allele',
          'p_value',
          'associated_gene',
          'validated')

searchResults <-getBM(att1,
                      filters= 'phenotype_description',
                      values= projectPheno, 
                      mart=ensembl)

#Merge data

myMerge<- merge(gData,searchResults, by.x="rsid", by.y="refsnp_id")

#Use function from learnr lesson

my_search_fxn <- function(allele, genotype){
    #Check if allele letter is empty
    if (allele == ""){
        # if it is, return FALSE to drop the row.
        FALSE
        
    } else {
        # if the allele is not empty, 
        # use grepl() to search for the allele somewhere
        # in the two genotype letters.
        
        grepl(allele, genotype)
        
    }
}

#Filter data

#Filter only rows where associated_variant_risk_allele is in genotype
#This example results in only one SNP and one row of data
#Most phenotypes will have to add an additional filter to get between 8-12 SNPs
# Storage logical vector
our_logic_vector<-logical(nrow(myMerge))

# Run the loop
for (i in 1:nrow(myMerge)){
    our_logic_vector[i]<-my_search_fxn(myMerge[i,"associated_variant_risk_allele"],
                            myMerge[i,"genotype"])
}
myMerge %>%
    filter(our_logic_vector) -> filteredSNPs