phenotype = read.delim("~/phenotypes.txt",sep='\t',header=T)
library(VariantAnnotation)
vcfFile = readVcf("~/rm_G6174_G5913_AT0721/dp10_final/dp10_final.vcf.gz",'hg19')
sample_list = samples(header(vcfFile))
pheno=phenotype[phenotype$PATIENT %in% sample_list,]
phenotype$GOUTSUM[phenotype$GOUTSUM == 3] = NA
g = geno(vcfFile)$GT
Let’s start with the most associated SNP, the ABCG2 one.
rs2231142 = t(t((g[which(rownames(g)== "rs2231142"),])))
rs2231142 = merge(pheno, rs2231142,by.x="PATIENT",by.y="row.names")
summary(lm(rs2231142$SURICACID ~ rs2231142$V1))
##
## Call:
## lm(formula = rs2231142$SURICACID ~ rs2231142$V1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29567 -0.09599 -0.01567 0.10401 0.32401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.39599 0.00713 55.537 < 2e-16 ***
## rs2231142$V10/1 0.06967 0.01321 5.274 2.11e-07 ***
## rs2231142$V11/1 0.11115 0.02411 4.611 5.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1218 on 437 degrees of freedom
## Multiple R-squared: 0.08842, Adjusted R-squared: 0.08425
## F-statistic: 21.19 on 2 and 437 DF, p-value: 1.64e-09
summary(lm(rs2231142$SURICACID ~ rs2231142$V1+rs2231142$SEX + rs2231142$AGECOL + rs2231142$BMI))
##
## Call:
## lm(formula = rs2231142$SURICACID ~ rs2231142$V1 + rs2231142$SEX +
## rs2231142$AGECOL + rs2231142$BMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29652 -0.07413 -0.00119 0.06473 0.32598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3973993 0.0303344 13.101 < 2e-16 ***
## rs2231142$V10/1 0.0430159 0.0110061 3.908 0.000108 ***
## rs2231142$V11/1 0.0750766 0.0205823 3.648 0.000297 ***
## rs2231142$SEX -0.1230596 0.0101241 -12.155 < 2e-16 ***
## rs2231142$AGECOL 0.0003731 0.0003464 1.077 0.282131
## rs2231142$BMI 0.0046349 0.0006258 7.407 7e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09908 on 426 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.3914, Adjusted R-squared: 0.3842
## F-statistic: 54.78 on 5 and 426 DF, p-value: < 2.2e-16
summary(lm(as.numeric(rs2231142$V1) ~ rs2231142$SEX))
##
## Call:
## lm(formula = as.numeric(rs2231142$V1) ~ rs2231142$SEX)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4575 -0.4575 -0.2975 0.5425 1.7025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.61743 0.08630 18.743 < 2e-16 ***
## rs2231142$SEX -0.15998 0.05988 -2.672 0.00782 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6025 on 438 degrees of freedom
## Multiple R-squared: 0.01604, Adjusted R-squared: 0.01379
## F-statistic: 7.139 on 1 and 438 DF, p-value: 0.007824
Lrp2 locus where we find the association in a splice variant after adjustment let’s make sure we do some adjustment.
Lets start investigating rs2302694 - the Lrp2 locus.
rs_2302694 = g[which(rownames(g)== "rs2302694"),]
rs_2302694 = t(t(rs_2302694))
rs_2302694 = merge(pheno,rs_2302694,by.x="PATIENT",by.y="row.names")
boxplot(rs_2302694$SURICACID ~ rs_2302694$GOUTSUM)
Now testing the gout associated loci, the 3’ UTR variant in the CYTIP gene.
Testing the gout association.
rs267992 = t(t((g[which(rownames(g)== "rs267992"),])))
rs267992 = merge(pheno, rs267992,by.x="PATIENT",by.y="row.names")
m = lm(rs267992$SURICACID ~ rs267992$V1 + rs267992$SEX + rs267992$AGECOL + rs267992$BMI)
Subset the sample into hyper, and normo uricemics.
temp=rs267992[rs267992$SURICACID > 0.40,]
m = (lm(temp$GOUTSUM ~ temp$V1 +temp$SURICACID + temp$AGECOL + temp$SEX + temp$BMI))
Lets plot this stuff.
library(ggplot2)
Another variant rs193220 found in intergenic space, near C142orf.
rs193220 = t(t((g[which(rownames(g)== "rs193220"),])))
rs193220 = merge(pheno, rs193220,by.x="PATIENT",by.y="row.names")
summary(lm(rs193220$SURICACID ~ rs193220$V1 + rs193220$SEX + rs193220$AGECOL + rs193220$BMI))
##
## Call:
## lm(formula = rs193220$SURICACID ~ rs193220$V1 + rs193220$SEX +
## rs193220$AGECOL + rs193220$BMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26726 -0.07525 -0.00099 0.06692 0.30866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4079111 0.0321819 12.675 < 2e-16 ***
## rs193220$V10/1 0.0135055 0.0134711 1.003 0.317
## rs193220$V11/1 0.0033334 0.0140193 0.238 0.812
## rs193220$SEX -0.1297425 0.0103178 -12.575 < 2e-16 ***
## rs193220$AGECOL 0.0001944 0.0003557 0.546 0.585
## rs193220$BMI 0.0050940 0.0006344 8.029 9.66e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1017 on 426 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.3589, Adjusted R-squared: 0.3514
## F-statistic: 47.7 on 5 and 426 DF, p-value: < 2.2e-16
summary(glm(as.factor(rs193220$GOUTSUM) ~ rs193220$V1 +rs193220$SURICACID + rs193220$SEX + rs193220$AGECOL + rs193220$BMI,family="binomial"))
##
## Call:
## glm(formula = as.factor(rs193220$GOUTSUM) ~ rs193220$V1 + rs193220$SURICACID +
## rs193220$SEX + rs193220$AGECOL + rs193220$BMI, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0172 -0.1137 -0.0154 0.2560 2.7882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -18.62423 2.59293 -7.183 6.83e-13 ***
## rs193220$V10/1 -1.12336 0.61421 -1.829 0.067405 .
## rs193220$V11/1 -2.49067 0.67887 -3.669 0.000244 ***
## rs193220$SURICACID 33.91111 4.25859 7.963 1.68e-15 ***
## rs193220$SEX -1.67849 0.56873 -2.951 0.003165 **
## rs193220$AGECOL 0.07980 0.01653 4.829 1.38e-06 ***
## rs193220$BMI 0.05848 0.02778 2.105 0.035321 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 564.51 on 419 degrees of freedom
## Residual deviance: 163.88 on 413 degrees of freedom
## (20 observations deleted due to missingness)
## AIC: 177.88
##
## Number of Fisher Scoring iterations: 7
a= rs193220[sapply(rs193220, function(x){return(is.numeric(x) && sum(is.na(x))<=30)})]
temp=rs193220[rs193220$SURICACID > 0.40,]
f = (glm(as.factor(temp$GOUTSUM) ~ temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
anova(f,test='Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: as.factor(temp$GOUTSUM)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 219 242.94
## temp$AGECOL 1 8.4503 218 234.49 0.0036499 **
## temp$SEX 1 14.0062 217 220.48 0.0001822 ***
## temp$BMI 1 10.2082 216 210.27 0.0013982 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(f)
## [1] 218.2706
f = (glm(as.factor(temp$GOUTSUM) ~ temp$SURICACID + temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
anova(f,test='Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: as.factor(temp$GOUTSUM)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 219 242.94
## temp$SURICACID 1 39.655 218 203.28 3.03e-10 ***
## temp$AGECOL 1 15.721 217 187.56 7.34e-05 ***
## temp$SEX 1 7.843 216 179.72 0.0051 **
## temp$BMI 1 5.635 215 174.08 0.0176 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(f)
## [1] 184.0803
f = (glm(as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) + temp$SURICACID + temp$AGECOL +temp$SEX + temp$BMI, family='binomial'))
anova(f,test='Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: as.factor(temp$GOUTSUM)
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 219 242.94
## as.numeric(temp$V1) 1 9.231 218 233.70 0.002380 **
## temp$SURICACID 1 41.714 217 191.99 1.057e-10 ***
## temp$AGECOL 1 22.726 216 169.26 1.868e-06 ***
## temp$SEX 1 8.225 215 161.04 0.004131 **
## temp$BMI 1 4.859 214 156.18 0.027509 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(f)
## [1] 168.1808
Another variant rs36075327 found in the ALDH1A2 gene. Might be involved in allipurinol metabolism.
rs372460491 = t(t((g[which(rownames(g)== "rs372460491"),])))
rs372460491 = merge(pheno, rs372460491,by.x="PATIENT",by.y="row.names")
summary(lm(rs372460491$SURICACID ~ rs372460491$ALLOP +as.numeric(rs372460491$V1) + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI))
##
## Call:
## lm(formula = rs372460491$SURICACID ~ rs372460491$ALLOP + as.numeric(rs372460491$V1) +
## rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22288 -0.06223 -0.01727 0.05880 0.28811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3282966 0.0348945 9.408 < 2e-16 ***
## rs372460491$ALLOP 0.1222809 0.0111874 10.930 < 2e-16 ***
## as.numeric(rs372460491$V1) -0.0073903 0.0100970 -0.732 0.465
## rs372460491$SEX -0.1017625 0.0114549 -8.884 < 2e-16 ***
## rs372460491$AGECOL -0.0002204 0.0003627 -0.608 0.544
## rs372460491$BMI 0.0029852 0.0006295 4.742 3.2e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08707 on 316 degrees of freedom
## (118 observations deleted due to missingness)
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5369
## F-statistic: 75.42 on 5 and 316 DF, p-value: < 2.2e-16
summary(glm(as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) + rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI,family="binomial"))
##
## Call:
## glm(formula = as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) +
## rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL +
## rs372460491$BMI, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2323 -0.1104 -0.0193 0.3399 1.6882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.65090 2.80432 -7.364 1.79e-13 ***
## as.numeric(rs372460491$V1) 1.20400 0.48147 2.501 0.01240 *
## rs372460491$SURICACID 33.93444 4.31197 7.870 3.55e-15 ***
## rs372460491$SEX -1.70325 0.55420 -3.073 0.00212 **
## rs372460491$AGECOL 0.06765 0.01530 4.421 9.82e-06 ***
## rs372460491$BMI 0.05147 0.02845 1.809 0.07044 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 564.51 on 419 degrees of freedom
## Residual deviance: 174.73 on 414 degrees of freedom
## (20 observations deleted due to missingness)
## AIC: 186.73
##
## Number of Fisher Scoring iterations: 7
summary(glm(as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) + rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL + rs372460491$BMI,family="binomial"))
##
## Call:
## glm(formula = as.factor(rs372460491$GOUTSUM) ~ as.numeric(rs372460491$V1) +
## rs372460491$SURICACID + rs372460491$SEX + rs372460491$AGECOL +
## rs372460491$BMI, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2323 -0.1104 -0.0193 0.3399 1.6882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -20.65090 2.80432 -7.364 1.79e-13 ***
## as.numeric(rs372460491$V1) 1.20400 0.48147 2.501 0.01240 *
## rs372460491$SURICACID 33.93444 4.31197 7.870 3.55e-15 ***
## rs372460491$SEX -1.70325 0.55420 -3.073 0.00212 **
## rs372460491$AGECOL 0.06765 0.01530 4.421 9.82e-06 ***
## rs372460491$BMI 0.05147 0.02845 1.809 0.07044 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 564.51 on 419 degrees of freedom
## Residual deviance: 174.73 on 414 degrees of freedom
## (20 observations deleted due to missingness)
## AIC: 186.73
##
## Number of Fisher Scoring iterations: 7
temp=rs372460491[rs372460491$SURICACID > 0.40,]
summary(glm(as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) + temp$SURICACID + temp$AGECOL + temp$SEX + temp$BMI,family="binomial"))
##
## Call:
## glm(formula = as.factor(temp$GOUTSUM) ~ as.numeric(temp$V1) +
## temp$SURICACID + temp$AGECOL + temp$SEX + temp$BMI, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.06120 0.05703 0.30119 0.62935 1.55374
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.78413 3.19293 -5.257 1.47e-07 ***
## as.numeric(temp$V1) 1.36923 0.53824 2.544 0.010962 *
## temp$SURICACID 25.81394 5.11570 5.046 4.51e-07 ***
## temp$AGECOL 0.07407 0.01666 4.446 8.75e-06 ***
## temp$SEX -1.96265 0.57106 -3.437 0.000589 ***
## temp$BMI 0.05607 0.02903 1.931 0.053460 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 242.94 on 219 degrees of freedom
## Residual deviance: 166.10 on 214 degrees of freedom
## (7 observations deleted due to missingness)
## AIC: 178.1
##
## Number of Fisher Scoring iterations: 6
THis SNP is the most associated with GOUT in the adjusted set of data.
edit the function so that it takes a genome list with the lengths.
Making a manhattan plot of the polynesian GWAS data.
raw_pvalues = read.table("~/rm_G6174_G5913_AT0721/dp10_final/results/raw_pvalues.txt",header=T,stringsAsFactors = F)
adjust_pvalues = read.table("~/rm_G6174_G5913_AT0721/dp10_final/results/full_adjusted_pvalues.txt", header=T, stringsAsFactors = F)
# put data in the GWAS results dataframe
position_and_chromosome = function(x, pvalue_col=11){
split_s = strsplit(x=as.character(x[1]),split = ":",fixed=T)
chr = strsplit(split_s[[1]][1],split="chr", fixed=T)
chr = (chr[[1]][2])
pos = (split_s[[1]][2])
pvalue = as.numeric(x[pvalue_col])
return(c(x[1],chr,pos,pvalue))
}
name_cols = c("SNP","CHR","BP","P")
r_df = as.data.frame(t(apply(raw_pvalues,1,position_and_chromosome)))
r_df[,4] = as.numeric(as.character(r_df[,4]))
r_df[,3] = as.numeric(as.character(r_df[,3]))
r_df[,2] = as.numeric(as.character(r_df[,2]))
## Warning: NAs introduced by coercion
r_df[is.na(r_df[,2]),2] = 23
colnames(r_df) = name_cols
adj_df = as.data.frame(t(apply(adjust_pvalues,1,function(x){position_and_chromosome(x,pvalue_col = 9)})))
adj_df[,4] = as.numeric(as.character(adj_df[,4]))
adj_df[,3] = as.numeric(as.character(adj_df[,3]))
adj_df[,2] = as.numeric(as.character(adj_df[,2]))
## Warning: NAs introduced by coercion
adj_df[is.na(adj_df[,2]),2] = 23
colnames(adj_df) = name_cols
library(qqman)
manhattan(r_df,main="Unadjusted manhattan plot for Urate \n Y ~ SNP", col = c("blue4", "orange3"),cex.axis=0.9 ,suggestiveline = -log10(1e-03),genomewideline = -log10(1.067692e-05),ylim=c(0,10))
manhattan(adj_df,main="Adjusted manhattan plot for Urate \n Y ~ BMI MDS1 MDS2 SEX AGECOL",col = c("blue4", "orange3"),cex.axis=0.9 ,suggestiveline = -log10(1e-03),genomewideline = -log10(1.067692e-05),ylim=c(0,10))
Use FDR tool to estimate the false-discovery rate in the common SNPS within the resequencing dataset.
library(fdrtool)
## Warning: package 'fdrtool' was built under R version 3.1.2
eta_r=fdrtool(r_df[,4],statistic ="pvalue")
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting
eta_adj=fdrtool(adj_df[,4],statistic = "pvalue")
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting