phenotype = read.delim("~/phenotypes.txt",sep='\t',header=T)
  library(VariantAnnotation)
  vcfFile = readVcf("~/rm_G6174_G5913_AT0721/dp10_final/dp10_final.vcf.gz",'hg19')
  sample_list = samples(header(vcfFile))
  pheno=phenotype[phenotype$PATIENT %in% sample_list,]
  phenotype$GOUTSUM[phenotype$GOUTSUM == 3] = NA
  g = geno(vcfFile)$GT

Let’s start with the most associated SNP, the ABCG2 one.

  rs2231142 =  t(t((g[which(rownames(g)== "rs2231142"),])))
  rs2231142 = merge(pheno, rs2231142,by.x="PATIENT",by.y="row.names")
  summary(lm(rs2231142$SURICACID  ~ rs2231142$V1))
## 
## Call:
## lm(formula = rs2231142$SURICACID ~ rs2231142$V1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29567 -0.09599 -0.01567  0.10401  0.32401 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.39599    0.00713  55.537  < 2e-16 ***
## rs2231142$V10/1  0.06967    0.01321   5.274 2.11e-07 ***
## rs2231142$V11/1  0.11115    0.02411   4.611 5.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1218 on 437 degrees of freedom
## Multiple R-squared:  0.08842,    Adjusted R-squared:  0.08425 
## F-statistic: 21.19 on 2 and 437 DF,  p-value: 1.64e-09
  summary(lm(rs2231142$SURICACID  ~ rs2231142$V1+rs2231142$SEX + rs2231142$AGECOL + rs2231142$BMI))
## 
## Call:
## lm(formula = rs2231142$SURICACID ~ rs2231142$V1 + rs2231142$SEX + 
##     rs2231142$AGECOL + rs2231142$BMI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29652 -0.07413 -0.00119  0.06473  0.32598 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.3973993  0.0303344  13.101  < 2e-16 ***
## rs2231142$V10/1   0.0430159  0.0110061   3.908 0.000108 ***
## rs2231142$V11/1   0.0750766  0.0205823   3.648 0.000297 ***
## rs2231142$SEX    -0.1230596  0.0101241 -12.155  < 2e-16 ***
## rs2231142$AGECOL  0.0003731  0.0003464   1.077 0.282131    
## rs2231142$BMI     0.0046349  0.0006258   7.407    7e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09908 on 426 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.3914, Adjusted R-squared:  0.3842 
## F-statistic: 54.78 on 5 and 426 DF,  p-value: < 2.2e-16
  summary(lm(as.numeric(rs2231142$V1) ~ rs2231142$SEX))
## 
## Call:
## lm(formula = as.numeric(rs2231142$V1) ~ rs2231142$SEX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4575 -0.4575 -0.2975  0.5425  1.7025 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.61743    0.08630  18.743  < 2e-16 ***
## rs2231142$SEX -0.15998    0.05988  -2.672  0.00782 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6025 on 438 degrees of freedom
## Multiple R-squared:  0.01604,    Adjusted R-squared:  0.01379 
## F-statistic: 7.139 on 1 and 438 DF,  p-value: 0.007824

Lrp2 locus where we find the association in a splice variant after adjustment let’s make sure we do some adjustment.

Lets start investigating rs2302694 - the Lrp2 locus.

  rs_2302694 = g[which(rownames(g)== "rs2302694"),]
  rs_2302694 = t(t(rs_2302694))
  rs_2302694 = merge(pheno,rs_2302694,by.x="PATIENT",by.y="row.names")  
  boxplot(rs_2302694$SURICACID ~ rs_2302694$GOUTSUM)

Now testing the gout associated loci, the 3’ UTR variant in the CYTIP gene.

Testing the gout association.

  rs267992 = t(t((g[which(rownames(g)== "rs267992"),])))
  rs267992 = merge(pheno, rs267992,by.x="PATIENT",by.y="row.names")
  m = lm(rs267992$SURICACID ~ rs267992$V1 + rs267992$SEX + rs267992$AGECOL  + rs267992$BMI) 

Subset the sample into hyper, and normo uricemics.

  temp=rs267992[rs267992$SURICACID > 0.40,]
  m = (lm(temp$GOUTSUM ~ temp$V1 +temp$SURICACID  + temp$AGECOL + temp$SEX + temp$BMI))

Lets plot this stuff.

  library(ggplot2)

Another variant rs193220 found in intergenic space, near C142orf.

    rs193220 = t(t((g[which(rownames(g)== "rs193220"),])))
  rs193220 = merge(pheno, rs193220,by.x="PATIENT",by.y="row.names")
  summary(lm(rs193220$SURICACID ~ rs193220$V1 + rs193220$SEX + rs193220$AGECOL + rs193220$BMI)) 
## 
## Call:
## lm(formula = rs193220$SURICACID ~ rs193220$V1 + rs193220$SEX + 
##     rs193220$AGECOL + rs193220$BMI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26726 -0.07525 -0.00099  0.06692  0.30866 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.4079111  0.0321819  12.675  < 2e-16 ***
## rs193220$V10/1   0.0135055  0.0134711   1.003    0.317    
## rs193220$V11/1   0.0033334  0.0140193   0.238    0.812    
## rs193220$SEX    -0.1297425  0.0103178 -12.575  < 2e-16 ***
## rs193220$AGECOL  0.0001944  0.0003557   0.546    0.585    
## rs193220$BMI     0.0050940  0.0006344   8.029 9.66e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1017 on 426 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.3589, Adjusted R-squared:  0.3514 
## F-statistic:  47.7 on 5 and 426 DF,  p-value: < 2.2e-16
  summary(glm(as.factor(rs193220$GOUTSUM) ~ rs193220$V1 +rs193220$SURICACID + rs193220$SEX + rs193220$AGECOL + rs193220$BMI,family="binomial"))
## 
## Call:
## glm(formula = as.factor(rs193220$GOUTSUM) ~ rs193220$V1 + rs193220$SURICACID + 
##     rs193220$SEX + rs193220$AGECOL + rs193220$BMI, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0172  -0.1137  -0.0154   0.2560   2.7882  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -18.62423    2.59293  -7.183 6.83e-13 ***
## rs193220$V10/1      -1.12336    0.61421  -1.829 0.067405 .  
## rs193220$V11/1      -2.49067    0.67887  -3.669 0.000244 ***
## rs193220$SURICACID  33.91111    4.25859   7.963 1.68e-15 ***
## rs193220$SEX        -1.67849    0.56873  -2.951 0.003165 ** 
## rs193220$AGECOL      0.07980    0.01653   4.829 1.38e-06 ***
## rs193220$BMI         0.05848    0.02778   2.105 0.035321 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 564.51  on 419  degrees of freedom
## Residual deviance: 163.88  on 413  degrees of freedom
##   (20 observations deleted due to missingness)
## AIC: 177.88
## 
## Number of Fisher Scoring iterations: 7
 a=  rs193220[sapply(rs193220, function(x){return(is.numeric(x) && sum(is.na(x))<=30)})]
  temp=rs193220[rs193220$SURICACID > 0.40,]
  f = (glm(as.factor(temp$GOUTSUM) ~    temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
  anova(f,test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: as.factor(temp$GOUTSUM)
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          219     242.94              
## temp$AGECOL  1   8.4503       218     234.49 0.0036499 ** 
## temp$SEX     1  14.0062       217     220.48 0.0001822 ***
## temp$BMI     1  10.2082       216     210.27 0.0013982 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  AIC(f)
## [1] 218.2706
  f = (glm(as.factor(temp$GOUTSUM) ~ temp$SURICACID  + temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
  anova(f,test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: as.factor(temp$GOUTSUM)
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                             219     242.94             
## temp$SURICACID  1   39.655       218     203.28 3.03e-10 ***
## temp$AGECOL     1   15.721       217     187.56 7.34e-05 ***
## temp$SEX        1    7.843       216     179.72   0.0051 ** 
## temp$BMI        1    5.635       215     174.08   0.0176 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  AIC(f)
## [1] 184.0803
  f = (glm(as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) + temp$SURICACID + temp$AGECOL +temp$SEX + temp$BMI, family='binomial'))
  anova(f,test='Chisq')
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: as.factor(temp$GOUTSUM)
## 
## Terms added sequentially (first to last)
## 
## 
##                     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                  219     242.94              
## as.numeric(temp$V1)  1    9.231       218     233.70  0.002380 ** 
## temp$SURICACID       1   41.714       217     191.99 1.057e-10 ***
## temp$AGECOL          1   22.726       216     169.26 1.868e-06 ***
## temp$SEX             1    8.225       215     161.04  0.004131 ** 
## temp$BMI             1    4.859       214     156.18  0.027509 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  AIC(f)
## [1] 168.1808

Allipurinol candidate.

Another variant rs36075327 found in the ALDH1A2 gene. Might be involved in allipurinol metabolism.

    rs372460491 = t(t((g[which(rownames(g)== "rs372460491"),])))
  rs372460491 = merge(pheno, rs372460491,by.x="PATIENT",by.y="row.names")
  summary(lm(rs372460491$SURICACID ~ rs372460491$ALLOP +as.numeric(rs372460491$V1) + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI)) 
## 
## Call:
## lm(formula = rs372460491$SURICACID ~ rs372460491$ALLOP + as.numeric(rs372460491$V1) + 
##     rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22288 -0.06223 -0.01727  0.05880  0.28811 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 0.3282966  0.0348945   9.408  < 2e-16 ***
## rs372460491$ALLOP           0.1222809  0.0111874  10.930  < 2e-16 ***
## as.numeric(rs372460491$V1) -0.0073903  0.0100970  -0.732    0.465    
## rs372460491$SEX            -0.1017625  0.0114549  -8.884  < 2e-16 ***
## rs372460491$AGECOL         -0.0002204  0.0003627  -0.608    0.544    
## rs372460491$BMI             0.0029852  0.0006295   4.742  3.2e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08707 on 316 degrees of freedom
##   (118 observations deleted due to missingness)
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5369 
## F-statistic: 75.42 on 5 and 316 DF,  p-value: < 2.2e-16
  summary(glm(as.factor(rs372460491$GOUTSUM) ~  as.numeric(rs372460491$V1)  + rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI,family="binomial"))
## 
## Call:
## glm(formula = as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) + 
##     rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + 
##     rs372460491$BMI, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2323  -0.1104  -0.0193   0.3399   1.6882  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -20.65090    2.80432  -7.364 1.79e-13 ***
## as.numeric(rs372460491$V1)   1.20400    0.48147   2.501  0.01240 *  
## rs372460491$SURICACID       33.93444    4.31197   7.870 3.55e-15 ***
## rs372460491$SEX             -1.70325    0.55420  -3.073  0.00212 ** 
## rs372460491$AGECOL           0.06765    0.01530   4.421 9.82e-06 ***
## rs372460491$BMI              0.05147    0.02845   1.809  0.07044 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 564.51  on 419  degrees of freedom
## Residual deviance: 174.73  on 414  degrees of freedom
##   (20 observations deleted due to missingness)
## AIC: 186.73
## 
## Number of Fisher Scoring iterations: 7
  summary(glm(as.factor(rs372460491$GOUTSUM) ~  as.numeric(rs372460491$V1)  + rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI,family="binomial")) 
## 
## Call:
## glm(formula = as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) + 
##     rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + 
##     rs372460491$BMI, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2323  -0.1104  -0.0193   0.3399   1.6882  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -20.65090    2.80432  -7.364 1.79e-13 ***
## as.numeric(rs372460491$V1)   1.20400    0.48147   2.501  0.01240 *  
## rs372460491$SURICACID       33.93444    4.31197   7.870 3.55e-15 ***
## rs372460491$SEX             -1.70325    0.55420  -3.073  0.00212 ** 
## rs372460491$AGECOL           0.06765    0.01530   4.421 9.82e-06 ***
## rs372460491$BMI              0.05147    0.02845   1.809  0.07044 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 564.51  on 419  degrees of freedom
## Residual deviance: 174.73  on 414  degrees of freedom
##   (20 observations deleted due to missingness)
## AIC: 186.73
## 
## Number of Fisher Scoring iterations: 7
  temp=rs372460491[rs372460491$SURICACID > 0.40,]
  summary(glm(as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) + temp$SURICACID  + temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
## 
## Call:
## glm(formula = as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) + 
##     temp$SURICACID + temp$AGECOL + temp$SEX + temp$BMI, family = "binomial")
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.06120   0.05703   0.30119   0.62935   1.55374  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -16.78413    3.19293  -5.257 1.47e-07 ***
## as.numeric(temp$V1)   1.36923    0.53824   2.544 0.010962 *  
## temp$SURICACID       25.81394    5.11570   5.046 4.51e-07 ***
## temp$AGECOL           0.07407    0.01666   4.446 8.75e-06 ***
## temp$SEX             -1.96265    0.57106  -3.437 0.000589 ***
## temp$BMI              0.05607    0.02903   1.931 0.053460 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 242.94  on 219  degrees of freedom
## Residual deviance: 166.10  on 214  degrees of freedom
##   (7 observations deleted due to missingness)
## AIC: 178.1
## 
## Number of Fisher Scoring iterations: 6

rs193220

THis SNP is the most associated with GOUT in the adjusted set of data.

Manhattan plot

edit the function so that it takes a genome list with the lengths.

Making a manhattan plot of the polynesian GWAS data.

  raw_pvalues = read.table("~/rm_G6174_G5913_AT0721/dp10_final/results/raw_pvalues.txt",header=T,stringsAsFactors = F)
  adjust_pvalues = read.table("~/rm_G6174_G5913_AT0721/dp10_final/results/full_adjusted_pvalues.txt",  header=T, stringsAsFactors = F)
  # put data in the GWAS results dataframe
  position_and_chromosome = function(x, pvalue_col=11){
    split_s = strsplit(x=as.character(x[1]),split = ":",fixed=T)
    chr = strsplit(split_s[[1]][1],split="chr", fixed=T)
    chr = (chr[[1]][2])
    pos = (split_s[[1]][2])
    pvalue = as.numeric(x[pvalue_col])
    return(c(x[1],chr,pos,pvalue))    
}
  name_cols = c("SNP","CHR","BP","P") 
  r_df = as.data.frame(t(apply(raw_pvalues,1,position_and_chromosome)))
  r_df[,4] = as.numeric(as.character(r_df[,4]))
  r_df[,3] = as.numeric(as.character(r_df[,3]))
  r_df[,2] = as.numeric(as.character(r_df[,2]))
## Warning: NAs introduced by coercion
  r_df[is.na(r_df[,2]),2] = 23
  colnames(r_df) = name_cols
  adj_df = as.data.frame(t(apply(adjust_pvalues,1,function(x){position_and_chromosome(x,pvalue_col = 9)})))
  adj_df[,4]   = as.numeric(as.character(adj_df[,4]))
  adj_df[,3] = as.numeric(as.character(adj_df[,3]))
  adj_df[,2] = as.numeric(as.character(adj_df[,2]))
## Warning: NAs introduced by coercion
  adj_df[is.na(adj_df[,2]),2] = 23
  colnames(adj_df) = name_cols
  library(qqman)
  manhattan(r_df,main="Unadjusted manhattan plot for Urate \n Y ~ SNP", col = c("blue4", "orange3"),cex.axis=0.9 ,suggestiveline = -log10(1e-03),genomewideline = -log10(1.067692e-05),ylim=c(0,10))

    manhattan(adj_df,main="Adjusted manhattan plot for Urate \n Y ~ BMI MDS1 MDS2 SEX AGECOL",col = c("blue4", "orange3"),cex.axis=0.9 ,suggestiveline = -log10(1e-03),genomewideline = -log10(1.067692e-05),ylim=c(0,10))

Use FDR tool to estimate the false-discovery rate in the common SNPS within the resequencing dataset.

  library(fdrtool)
## Warning: package 'fdrtool' was built under R version 3.1.2
  eta_r=fdrtool(r_df[,4],statistic ="pvalue")
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting

  eta_adj=fdrtool(adj_df[,4],statistic = "pvalue")
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting