This notebook describes the analysis of the GWAS data generated using the ASReml script for the UCR miniCore cowpea population screened in 2022 by Hayley Sussman. GWAS was performed by Magda Julkowska.

Plotting the data

#install.packages("qqman")
#install.packages("qvalue")
library(qqman)

##

## For example usage please run: vignette('qqman')

##

## Citation appreciated but not required:

## Turner, (2018). qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. Journal of Open Source Software, 3(25), 731, https://doi.org/10.21105/joss.00731.

##

library(cowplot)

cowpea_GWAS <- list.files(pattern="gwas2029_new.rda")
cowpea_GWAS

##  [1] "day13_FvP_over_FmP.Cntrl_gwas2029_new.rda"    
##  [2] "day13_FvP_over_FmP.Drght_gwas2029_new.rda"    
##  [3] "day13_leaf_thickness.Cntrl_gwas2029_new.rda"  
##  [4] "day13_leaf_thickness.Drght_gwas2029_new.rda"  
##  [5] "day13_Leaf.Temperature.Cntrl_gwas2029_new.rda"
##  [6] "day13_Leaf.Temperature.Drght_gwas2029_new.rda"
##  [7] "day13_NPQt.Cntrl_gwas2029_new.rda"            
##  [8] "day13_NPQt.Drght_gwas2029_new.rda"            
##  [9] "day13_PhiNPQ.Cntrl_gwas2029_new.rda"          
## [10] "day13_PhiNPQ.Drght_gwas2029_new.rda"          
## [11] "day13_SPAD.Cntrl_gwas2029_new.rda"            
## [12] "day13_SPAD.Drght_gwas2029_new.rda"            
## [13] "day6_FvP_over_FmP.Cntrl_gwas2029_new.rda"     
## [14] "day6_FvP_over_FmP.Drght_gwas2029_new.rda"     
## [15] "day6_leaf_thickness.Cntrl_gwas2029_new.rda"   
## [16] "day6_leaf_thickness.Drght_gwas2029_new.rda"   
## [17] "day6_Leaf.Temperature.Cntrl_gwas2029_new.rda" 
## [18] "day6_Leaf.Temperature.Drght_gwas2029_new.rda" 
## [19] "day6_NPQt.Cntrl_gwas2029_new.rda"             
## [20] "day6_NPQt.Drght_gwas2029_new.rda"             
## [21] "day6_PhiNPQ.Cntrl_gwas2029_new.rda"           
## [22] "day6_PhiNPQ.Drght_gwas2029_new.rda"           
## [23] "day6_SPAD.Cntrl_gwas2029_new.rda"             
## [24] "day6_SPAD.Drght_gwas2029_new.rda"             
## [25] "DW.Cntrl_gwas2029_new.rda"                    
## [26] "DW.Drght_gwas2029_new.rda"                    
## [27] "EVT.med.Cntrl_gwas2029_new.rda"               
## [28] "EVT.med.Drght_gwas2029_new.rda"               
## [29] "EVT.p.FW.Cntrl_gwas2029_new.rda"              
## [30] "EVT.p.FW.Drght_gwas2029_new.rda"              
## [31] "EVT.sum.Cntrl_gwas2029_new.rda"               
## [32] "EVT.sum.Drght_gwas2029_new.rda"               
## [33] "FW.Cntrl_gwas2029_new.rda"                    
## [34] "FW.Drght_gwas2029_new.rda"                    
## [35] "GR.Cntrl_gwas2029_new.rda"                    
## [36] "GR.Drght_gwas2029_new.rda"                    
## [37] "WC.Cntrl_gwas2029_new.rda"                    
## [38] "WC.Drght_gwas2029_new.rda"

load(cowpea_GWAS[1])
# Prepare table for graphing:
args <- commandArgs(trailingOnly=TRUE)
results$Pval <-as.numeric(as.character(results$Pval))
a <- nrow(results)
b <- 0.05/a
head(results)

results$Chr <- substr(results$SNP, 1, 2)
results$Chr <- as.numeric(as.character(results$Chr))
head(results)

title <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[1])
manhattan(results, chr="Chr",bp="Pos", p="Pval", snp="SNP", main=title, ylim=c(0,12), col = c("turquoise3", "purple4"), suggestiveline=FALSE, genomewideline=-log10(b))

pdf(paste("Manhattan.", title, ".pdf"))
manhattan(results, chr="Chr",bp="Pos", p="Pval", snp="SNP", main=title, ylim=c(0,12), col = c("turquoise3", "purple4"), suggestiveline=FALSE, genomewideline=-log10(b))
dev.off()

## quartz_off_screen 
##                 2

qq(results$Pval, main = title, cex = 1.5, las = 1)

pdf(paste("QQplot.", title, ".pdf"))
qq(results$Pval, main = title, cex = 1.5, las = 1)
dev.off()

## quartz_off_screen 
##                 2

Let’s think about how to visualize the effect size that is also included in our GWAS result files:

max_val <- max(results$beta) * 1.2
min_val <- min(results$beta) * 1.2
  
manhattan(results, chr="Chr",bp="Pos", logp = FALSE, p="beta", snp="SNP", main=title, ylim=c(min_val, max_val), col = c("turquoise3", "purple4"), suggestiveline = F, genomewideline = F, ylab = "beta")

length(cowpea_GWAS)

## [1] 38

for(i in 1:38){
  load(cowpea_GWAS[i])
  # Prepare table for graphing:
  args <- commandArgs(trailingOnly=TRUE)
  results$Pval <-as.numeric(as.character(results$Pval))
  a <- nrow(results)
  b <- 0.05/a
  
  
  results$Chr <- substr(results$SNP, 1, 2)
  results$Chr <- as.numeric(as.character(results$Chr))
  head(results)
  title <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[i])
  
  pdf(paste("Manhattan.", title, ".pdf"))
  manhattan(results, chr="Chr",bp="Pos", p="Pval", snp="SNP", main=title, ylim=c(0,12), col = c("turquoise3", "purple4"), suggestiveline=FALSE, genomewideline=-log10(b))
  dev.off()
  
  max_val <- max(results$beta) * 1.2
  min_val <- min(results$beta) * 1.2
  
  pdf(paste("Effect.Size.", title, ".pdf"))
  manhattan(results, chr="Chr",bp="Pos", logp = FALSE, p="beta", snp="SNP", main=title, ylim=c(min_val, max_val), col = c("turquoise3", "purple4"), suggestiveline = F, genomewideline = F, ylab = "beta")
  dev.off()
  
  pdf(paste("QQplot.", title, ".pdf"))
  qq(results$Pval, main = title, cex = 1.5, las = 1)
  dev.off()
  
}

Isolating the most significant associations

Now - let’s have a look at the most significant SNPs that we have identified thus far:

i=3
load(cowpea_GWAS[i])
results$Pval <-as.numeric(as.character(results$Pval))
results$LOD <- -log10(results$Pval)
results$Chr <- substr(results$SNP, 1, 2)
results$Chr <- as.numeric(as.character(results$Chr))
results$trait <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[i])
LOD5 <- subset(results, results$LOD > -log10(b))
LOD5

for(i in 4:38){
  load(cowpea_GWAS[i])
  results$Pval <-as.numeric(as.character(results$Pval))
  results$LOD <- -log10(results$Pval)
  results$Chr <- substr(results$SNP, 1, 2)
  results$Chr <- as.numeric(as.character(results$Chr))
  results$trait <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[i])
  temp <- subset(results, results$LOD > -log10(b))
  LOD5 <- rbind(LOD5, temp)
}
i

## [1] 38

dim(LOD5)

## [1] 59 13

LOD5

And now - I want to organize this file - so we can actually see where the interesting loci are. For this purpose - we need to order SNPs per Chromosome and Position:

Bonf <- LOD5[order(LOD5$Chr, LOD5$Pos),]
head(Bonf)

Looks gravy!

Cool - since we said we are not going to consider associations represented only by 2 accessions - we will now subset all of our associations for MAC > 2:

Bonf2 <- subset(Bonf, Bonf$MAC > 2)
dim(Bonf2)

## [1] 58 13

OK - now in order to go through the SNPs - we need to calculate the distance between the neighboring SNPs:

Bonf2$dist <- 0
for(i in 2:nrow(Bonf2)){
  Bonf2$dist[i] <- Bonf2$Pos[i] - Bonf2$Pos[i-1]
}

Bonf2

write.csv(Bonf2, "Cowpea_Bonferroni_loci_2022.csv", row.names = FALSE)

Further inspection of most interesting loci:

So - I added some information to the table above - mainly if its of interest to us at this moment and how individual SNPs cluster into loci. Table look like this at the moment:

MMJ_Bonf <- read.csv("Cowpea_Bonferroni_loci_2022_MMJ.csv")
MMJ_Bonf

MMJ_Bonf <- subset(MMJ_Bonf, MMJ_Bonf$Of.Interest > 0)
MMJ_Bonf

Let’s generate more permissive dataset with all of the associations - since we are looking at the other SNPs associated with our loci of interest:

i=1
load(cowpea_GWAS[i])
results$Pval <-as.numeric(as.character(results$Pval))
results$LOD <- -log10(results$Pval)
results$Chr <- substr(results$SNP, 1, 2)
results$Chr <- as.numeric(as.character(results$Chr))
results$trait <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[i])
LOD3 <- subset(results, results$LOD > 2)
LOD3

for(i in 2:38){
  load(cowpea_GWAS[i])
  results$Pval <-as.numeric(as.character(results$Pval))
  results$LOD <- -log10(results$Pval)
  results$Chr <- substr(results$SNP, 1, 2)
  results$Chr <- as.numeric(as.character(results$Chr))
  results$trait <- gsub("_gwas2029_new.rda", "", cowpea_GWAS[i])
  temp <- subset(results, results$LOD > 2)
  LOD3 <- rbind(LOD3, temp)
}
i

## [1] 38

dim(LOD3)

## [1] 5777   13

LOD3

First locus - Chr. 1 pos. 41703755

library(ggplot2)
library(ggpubr)

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:cowplot':
## 
##     get_legend

unique(MMJ_Bonf$Locus)[1]

## [1] "Locus3"

length(unique(MMJ_Bonf$Locus))

## [1] 12

for(i in 1:12){
  want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
  want
  Chr_i_want <- want$Chr[1]
  Pos_min <- min(want$Pos)
  Pos_max <- max(want$Pos)
  temp <- subset(LOD3, LOD3$Chr == Chr_i_want)
  temp <- subset(temp, temp$Pos > Pos_min - 100000)
  temp <- subset(temp, temp$Pos < Pos_max + 100000)
  temp
  plot(ggscatter(temp, x = "Pos", y = "LOD", size = "MAC", color = "trait") + theme(legend.position = "right"))
}

OK - nothing too alarming comes up.

calculating the LD

Now - let’s calculate the LD for the most interesting loci, to determine the size of the interval we should be using for candidate gene:

load("Cowpea_for_GAPIT.RData")

head(myGeno)

myGeno2 <- as.data.frame(myGeno)
myGeno2$chrom <- gsub("Vu", "", myGeno2$chrom)
myGeno2$chrom <- as.numeric(myGeno2$chrom)
colnames(myGeno2)[1] <- "SNP"
myGeno2$SNP <- paste(myGeno2$chrom, myGeno2$pos, sep="_")
colnames(myGeno2)[1] <- "rs#"
myGeno2

Locus 3

i=1
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   9

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                1_41684831 1_41685068 1_41694570 1_41695540 1_41695883
## X1393.1.2.3... "C"        "C"        "A"        "G"        "C"       
## X24.125.B.1    "C"        "C"        "A"        "G"        "C"       
## X58.53         "C"        "C"        "A"        "G"        "C"       
## X58.57         "C"        "C"        "A"        "G"        "C"       
## Apagbaala      "C"        "C"        "A"        "G"        "C"       
## Bambey.21      "C"        "C"        "A"        "G"        "C"       
##                1_41696204 1_41697520 1_41703755 1_41713095
## X1393.1.2.3... "T"        "G"        "A"        "G"       
## X24.125.B.1    "T"        "G"        "A"        "G"       
## X58.53         "T"        "G"        "A"        "G"       
## X58.57         "T"        "G"        "A"        "G"       
## Apagbaala      "T"        "G"        "A"        "G"       
## Bambey.21      "T"        "G"        "A"        "G"

dim(locusT)

## [1] 313   9

dim(good_locus)

## [1] 313   9

library(genetics)

## Loading required package: combinat

## 
## Attaching package: 'combinat'

## The following object is masked from 'package:utils':
## 
##     combn

## Loading required package: gdata

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

##

## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

## Loading required package: gtools

## Loading required package: MASS

## Loading required package: mvtnorm

##

## NOTE: THIS PACKAGE IS NOW OBSOLETE.

##

##   The R-Genetics project has developed an set of enhanced genetics

##   packages to replace 'genetics'. Please visit the project homepage

##   at http://rgenetics.org for informtion.

##

## 
## Attaching package: 'genetics'

## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order

library(LDheatmap)

mymatrix_D <- matrix(, nrow = 9, ncol = 9)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("1_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 41684831 41685068 41694570 41695540 41695883 41696204 41697520 41703755
## [9] 41713095

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "1_41703755", color = heat.colors(20), add.key = TRUE)

# Locus 8

i=2
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   7

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                4_4552884 4_4554780 4_4555735 4_4575833 4_4601417 4_4601474
## X1393.1.2.3... "A"       "C"       "A"       "T"       "T"       "A"      
## X24.125.B.1    "A"       "C"       "A"       "T"       "T"       "A"      
## X58.53         "A"       "C"       "A"       "T"       "T"       "A"      
## X58.57         "A"       "C"       "A"       "T"       "T"       "A"      
## Apagbaala      "G"       "C"       "G"       "C"       "T"       "A"      
## Bambey.21      "A"       "C"       "A"       "T"       "T"       "A"      
##                4_4619510
## X1393.1.2.3... "C"      
## X24.125.B.1    "C"      
## X58.53         "C"      
## X58.57         "C"      
## Apagbaala      "C"      
## Bambey.21      "C"

dim(locusT)

## [1] 313   7

dim(good_locus)

## [1] 313   7

mymatrix_D <- matrix(, nrow = 7, ncol = 7)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("4_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 4552884 4554780 4555735 4575833 4601417 4601474 4619510

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "4_4601474", color = heat.colors(20), add.key = TRUE)

Locus 10

i=3
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313  19

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                5_1943907 5_1944871 5_1954620 5_1960123 5_1965515 5_1971429
## X1393.1.2.3... "C"       "A"       "C"       "C"       "G"       "A"      
## X24.125.B.1    "C"       "A"       "C"       "C"       "G"       "A"      
## X58.53         "T"       "G"       "C"       "T"       "G"       "G"      
## X58.57         "T"       "G"       "C"       "T"       "G"       "G"      
## Apagbaala      "T"       "G"       "A"       "T"       "A"       "G"      
## Bambey.21      "C"       "A"       "C"       "C"       "G"       "A"      
##                5_1971708 5_1978101 5_1982672 5_1983863 5_1983944 5_1985616
## X1393.1.2.3... "A"       "G"       "G"       "T"       "A"       "A"      
## X24.125.B.1    "A"       "G"       "G"       "T"       "A"       "A"      
## X58.53         "G"       "G"       "G"       "T"       "A"       "A"      
## X58.57         "G"       "G"       "G"       "T"       "A"       "A"      
## Apagbaala      "G"       "G"       "A"       "A"       "G"       "A"      
## Bambey.21      "A"       "G"       "G"       "T"       "A"       "A"      
##                5_1987481 5_1988010 5_1996175 5_2001697 5_2005290 5_2017086
## X1393.1.2.3... "T"       "T"       "A"       "G"       "C"       "C"      
## X24.125.B.1    "T"       "T"       "A"       "G"       "C"       "T"      
## X58.53         "T"       "T"       "C"       "G"       "C"       "T"      
## X58.57         "T"       "T"       "C"       "G"       "C"       "C"      
## Apagbaala      "T"       "G"       "C"       "A"       "T"       "T"      
## Bambey.21      "T"       "T"       "A"       "G"       "C"       "C"      
##                5_2017433
## X1393.1.2.3... "T"      
## X24.125.B.1    "C"      
## X58.53         "C"      
## X58.57         "T"      
## Apagbaala      "C"      
## Bambey.21      "T"

dim(locusT)

## [1] 313  19

dim(good_locus)

## [1] 313  19

mymatrix_D <- matrix(, nrow = 19, ncol = 19)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("5_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

##  [1] 1943907 1944871 1954620 1960123 1965515 1971429 1971708 1978101 1982672
## [10] 1983863 1983944 1985616 1987481 1988010 1996175 2001697 2005290 2017086
## [19] 2017433

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "5_1985616", color = heat.colors(20), add.key = TRUE)

Locus 11

i=4
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

unique(want$SNP)

## [1] "05_3329312" "05_3348824"

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313  11

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                5_3289895 5_3296554 5_3299310 5_3310368 5_3311762 5_3316827
## X1393.1.2.3... "A"       "C"       "T"       "T"       "G"       "T"      
## X24.125.B.1    "A"       "C"       "T"       "T"       "G"       "T"      
## X58.53         "A"       "C"       "T"       "T"       "G"       "T"      
## X58.57         "A"       "C"       "T"       "T"       "G"       "T"      
## Apagbaala      "A"       "C"       "T"       "T"       "G"       "T"      
## Bambey.21      "A"       "C"       "T"       "T"       "G"       "T"      
##                5_3329312 5_3347469 5_3348824 5_3370789 5_3397166
## X1393.1.2.3... "A"       "T"       "A"       "T"       "G"      
## X24.125.B.1    "A"       "T"       "A"       "T"       "G"      
## X58.53         "A"       "T"       "A"       "T"       "G"      
## X58.57         "A"       "T"       "A"       "T"       "G"      
## Apagbaala      "A"       "T"       "A"       "T"       "G"      
## Bambey.21      "A"       "T"       "A"       "T"       "G"

dim(locusT)

## [1] 313  11

dim(good_locus)

## [1] 313  11

mymatrix_D <- matrix(, nrow = 11, ncol = 11)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("5_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

##  [1] 3289895 3296554 3299310 3310368 3311762 3316827 3329312 3347469 3348824
## [10] 3370789 3397166

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = c("5_3329312", "5_3348824"), color = heat.colors(20), add.key = TRUE)

Locus 12

i=5
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

unique(want$SNP)

## [1] "05_39714266"

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313  11

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                5_39666642 5_39698557 5_39700341 5_39708588 5_39713643
## X1393.1.2.3... "G"        "T"        "C"        "T"        "A"       
## X24.125.B.1    "G"        "T"        "C"        "T"        "G"       
## X58.53         "G"        "T"        "C"        "T"        "A"       
## X58.57         "G"        "T"        "C"        "T"        "G"       
## Apagbaala      "G"        "T"        "C"        "T"        "G"       
## Bambey.21      "G"        "T"        "C"        "T"        "A"       
##                5_39714266 5_39720198 5_39741525 5_39743714 5_39754613
## X1393.1.2.3... "C"        "T"        "G"        "A"        "G"       
## X24.125.B.1    "C"        "T"        "G"        "A"        "G"       
## X58.53         "C"        "T"        "G"        "A"        "G"       
## X58.57         "C"        "T"        "G"        "A"        "G"       
## Apagbaala      "C"        "T"        "G"        "A"        "G"       
## Bambey.21      "C"        "T"        "G"        "A"        "G"       
##                5_39754999
## X1393.1.2.3... "A"       
## X24.125.B.1    "A"       
## X58.53         "A"       
## X58.57         "G"       
## Apagbaala      "A"       
## Bambey.21      "A"

dim(locusT)

## [1] 313  11

dim(good_locus)

## [1] 313  11

mymatrix_D <- matrix(, nrow = 11, ncol = 11)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("5_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

##  [1] 39666642 39698557 39700341 39708588 39713643 39714266 39720198 39741525
##  [9] 39743714 39754613 39754999

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "5_39714266", color = heat.colors(20), add.key = TRUE)

Locus 13

i=6
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   7

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                5_43955799 5_43958832 5_43958851 5_43988144 5_43988488
## X1393.1.2.3... "G"        "C"        "C"        "C"        "A"       
## X24.125.B.1    "G"        "C"        "C"        "C"        "G"       
## X58.53         "G"        "C"        "C"        "C"        "A"       
## X58.57         "G"        "C"        "C"        "C"        "A"       
## Apagbaala      "G"        "C"        "C"        "C"        "A"       
## Bambey.21      "G"        "C"        "C"        "C"        "A"       
##                5_44016902 5_44020824
## X1393.1.2.3... "G"        "A"       
## X24.125.B.1    "A"        "A"       
## X58.53         "A"        "A"       
## X58.57         "A"        "A"       
## Apagbaala      "A"        "A"       
## Bambey.21      "A"        "A"

dim(locusT)

## [1] 313   7

dim(good_locus)

## [1] 313   7

mymatrix_D <- matrix(, nrow = 7, ncol = 7)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("5_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 43955799 43958832 43958851 43988144 43988488 44016902 44020824

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "5_43988488", color = heat.colors(20), add.key = TRUE)

Locus 18

i=7
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   2

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                8_18095449 8_18133758
## X1393.1.2.3... "T"        "A"       
## X24.125.B.1    "T"        "G"       
## X58.53         "T"        "A"       
## X58.57         "C"        "A"       
## Apagbaala      "T"        "A"       
## Bambey.21      "T"        "A"

dim(locusT)

## [1] 313   2

dim(good_locus)

## [1] 313   2

mymatrix_D <- matrix(, nrow = 2, ncol = 2)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("8_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 18095449 18133758

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "8_18133758", color = heat.colors(20), add.key = TRUE)

Locus 19

i=8
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   6

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                8_21251025 8_21257958 8_21300057 8_21304811 8_21323000
## X1393.1.2.3... "G"        "A"        "C"        "C"        "T"       
## X24.125.B.1    "G"        "A"        "T"        "C"        "T"       
## X58.53         "G"        "A"        "C"        "C"        "T"       
## X58.57         "G"        "A"        "C"        "C"        "T"       
## Apagbaala      "A"        "G"        "C"        "T"        "T"       
## Bambey.21      "G"        "A"        "C"        "C"        "T"       
##                8_21341346
## X1393.1.2.3... "C"       
## X24.125.B.1    "C"       
## X58.53         "C"       
## X58.57         "C"       
## Apagbaala      "C"       
## Bambey.21      "C"

dim(locusT)

## [1] 313   6

dim(good_locus)

## [1] 313   6

mymatrix_D <- matrix(, nrow = 6, ncol = 6)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("8_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 21251025 21257958 21300057 21304811 21323000 21341346

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "8_21300057", color = heat.colors(20), add.key = TRUE)

Locus 20

i=9
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

Locus 21

i=10
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   7

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                8_27780564 8_27782280 8_27786635 8_27807782 8_27811479
## X1393.1.2.3... "G"        "T"        "T"        "C"        "C"       
## X24.125.B.1    "G"        "T"        "T"        "T"        "C"       
## X58.53         "G"        "T"        "T"        "C"        "C"       
## X58.57         "G"        "T"        "T"        "C"        "C"       
## Apagbaala      "G"        "T"        "T"        "C"        "C"       
## Bambey.21      "G"        "T"        "T"        "C"        "C"       
##                8_27854827 8_27856646
## X1393.1.2.3... "G"        "C"       
## X24.125.B.1    "G"        "C"       
## X58.53         "G"        "C"       
## X58.57         "G"        "C"       
## Apagbaala      "G"        "C"       
## Bambey.21      "G"        "C"

dim(locusT)

## [1] 313   7

dim(good_locus)

## [1] 313   7

mymatrix_D <- matrix(, nrow = 7, ncol = 7)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("8_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 27780564 27782280 27786635 27807782 27811479 27854827 27856646

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "8_27807782", color = heat.colors(20), add.key = TRUE)

Locus 23

i=11
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313   2

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                9_11513685 9_11539746
## X1393.1.2.3... "A"        "G"       
## X24.125.B.1    "A"        "G"       
## X58.53         "A"        "G"       
## X58.57         "A"        "G"       
## Apagbaala      "A"        "G"       
## Bambey.21      "A"        "G"

dim(locusT)

## [1] 313   2

dim(good_locus)

## [1] 313   2

mymatrix_D <- matrix(, nrow = 2, ncol = 2)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("9_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

## [1] 11513685 11539746

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "9_11513685", color = heat.colors(20), add.key = TRUE)

Locus 24

i=12
want <- subset(MMJ_Bonf, MMJ_Bonf$Locus == unique(MMJ_Bonf$Locus)[i])
want

locus <- subset(myGeno2, myGeno2$chrom == want$Chr[1])
locus$pos <- as.numeric(as.character(locus$pos))
locus <- subset(locus, locus$pos > (min(want$Pos) - 50000))
locus <- subset(locus, locus$pos < (max(want$Pos) + 50000))
head(locus)

locus2 <- locus[,12:324]
rownames(locus2) <- locus$`rs#`
head(locus2)

locusT <- t(locus2)
dim(locusT)

## [1] 313  10

plot(locus$pos)

now we need to select only SNPs that show variance within our population - so more than one allele!

if(length(unique(locusT[,1])) > 1){
  good_locus <- locusT[,1]
}

counter <- 2
for(i in 2:ncol(locusT)){
  if(length(unique(locusT[,i])) > 1){
  good_locus <- cbind(good_locus, locusT[,i])
  colnames(good_locus)[counter] <- colnames(locusT)[i]
  counter <- counter + 1
}
}
colnames(good_locus)[1] <- colnames(locusT)[1]
head(good_locus)

##                11_4673257 11_4679434 11_4681184 11_4692202 11_4695060
## X1393.1.2.3... "T"        "T"        "C"        "G"        "T"       
## X24.125.B.1    "T"        "C"        "C"        "G"        "T"       
## X58.53         "T"        "T"        "C"        "G"        "T"       
## X58.57         "T"        "C"        "C"        "G"        "T"       
## Apagbaala      "T"        "C"        "C"        "G"        "T"       
## Bambey.21      "T"        "T"        "C"        "G"        "T"       
##                11_4695911 11_4716529 11_4722273 11_4722991 11_4723675
## X1393.1.2.3... "G"        "A"        "C"        "G"        "A"       
## X24.125.B.1    "A"        "G"        "C"        "G"        "A"       
## X58.53         "G"        "A"        "C"        "G"        "A"       
## X58.57         "A"        "A"        "C"        "G"        "A"       
## Apagbaala      "A"        "A"        "C"        "G"        "A"       
## Bambey.21      "G"        "A"        "C"        "G"        "A"

dim(locusT)

## [1] 313  10

dim(good_locus)

## [1] 313  10

mymatrix_D <- matrix(, nrow = 10, ncol = 10)
colnames(mymatrix_D) <- colnames(good_locus)
rownames(mymatrix_D) <- colnames(good_locus)

for(c in 1:ncol(mymatrix_D)){
  for(r in 1:nrow(mymatrix_D)){
    geno1 <- genotype(good_locus[,c], good_locus[,c])
    geno2 <- genotype(good_locus[,r], good_locus[,r])
    LD_matrix <- LD(geno1, geno2)
    mymatrix_D[r,c]<- LD_matrix$`D'`
  }
}

SNPdist <- gsub("11_", "", colnames(mymatrix_D))
SNPdist <- as.numeric(as.character(SNPdist))
SNPdist

##  [1] 4673257 4679434 4681184 4692202 4695060 4695911 4716529 4722273 4722991
## [10] 4723675

LDheatmap(mymatrix_D, SNPdist, LDmeasure = "D'", title= "Pairwise LD in D'", add.map = TRUE, name = "LD_locus",
          SNP.name = "11_4716529", color = heat.colors(20), add.key = TRUE)

GWAS Cowpea analysis

Plotting the data

Isolating the most significant associations

Further inspection of most interesting loci:

calculating the LD

Locus 3

Locus 10

Locus 11

Locus 12

Locus 13

Locus 18

Locus 19

Locus 20

Locus 21

Locus 23

Locus 24