# read tsv gene_trait_association and target indication
require (tidyverse)
## Loading required package: tidyverse
## Error: package or namespace load failed for 'tidyverse' in namespaceExport(ns, exports):
## undefined exports: %>%
require(tidyr)
## Loading required package: tidyr
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v dplyr 0.8.5
## v tibble 2.1.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.3
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
setwd("~/Home051219onward/King2019/genetic-evidence-approval-master/genetic-evidence-approval-master/data")
GeneTrait <- read_tsv("gene_trait_assoc.tsv")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## CHR_A = col_double(),
## BP_A = col_double(),
## SNP_A = col_character(),
## CHR_B = col_double(),
## BP_B = col_double(),
## SNP_B = col_character(),
## R2 = col_double(),
## ensembl_id = col_character(),
## eQTL_pval = col_double(),
## Distance = col_double(),
## Deleterious = col_character(),
## RDB = col_character(),
## ECat = col_character(),
## MAPPED_TRAIT = col_character(),
## SNP_ID_CURRENT = col_double(),
## NStudy = col_double(),
## SourceScore = col_double(),
## LDScore = col_double(),
## RegScore = col_double(),
## NAssocScore = col_double()
## # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 42043 parsing failures.
## row col expected actual file
## 61333 MapType 1/0/T/F/TRUE/FALSE manual 'gene_trait_assoc.tsv'
## 61333 Inheritance 1/0/T/F/TRUE/FALSE Autosomal recessive 'gene_trait_assoc.tsv'
## 61333 Phenotype 1/0/T/F/TRUE/FALSE Immunodeficiency 38 'gene_trait_assoc.tsv'
## 61333 AssociationCode 1/0/T/F/TRUE/FALSE (3) 'gene_trait_assoc.tsv'
## 61333 PhenotypeMIM 1/0/T/F/TRUE/FALSE 616126 'gene_trait_assoc.tsv'
## ..... ............... .................. ................... ......................
## See problems(...) for more details.
TargetIndicationNMSH <- read_tsv("target_indication_nmsh.tsv")
## Parsed with column specification:
## cols(
## MSH = col_character(),
## ensembl_id = col_character(),
## lApprovedUS.EU = col_logical(),
## Phase.Latest = col_character(),
## First.Added = col_date(format = ""),
## NelsonStatus = col_character(),
## symbol = col_character()
## )
GeneTraitTargetIndication <- full_join(TargetIndicationNMSH, GeneTrait, by = "symbol", suffix = c("TargetIndication", "GeneTrait"))
GTTI <- GeneTraitTargetIndication %>% filter(xMHCGene == FALSE)
print (head(GTTI))
## # A tibble: 6 x 48
## MSHTargetIndica~ ensembl_idTarge~ lApprovedUS.EU Phase.Latest First.Added
## <chr> <chr> <lgl> <chr> <date>
## 1 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## 2 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## 3 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## 4 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## 5 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## 6 Abortion, Spont~ ENSG00000119535 FALSE unknown NA
## # ... with 43 more variables: NelsonStatus <chr>, symbol <chr>, CHR_A <dbl>,
## # BP_A <dbl>, SNP_A <chr>, CHR_B <dbl>, BP_B <dbl>, SNP_B <chr>, R2 <dbl>,
## # ensembl_idGeneTrait <chr>, eQTL <lgl>, eQTL_pval <dbl>, DHS <lgl>,
## # Distance <dbl>, Deleterious <chr>, missense <lgl>, distance <lgl>,
## # RDB <chr>, ECat <chr>, MAPPED_TRAIT <chr>, SNP_ID_CURRENT <dbl>,
## # NStudy <dbl>, SourceScore <dbl>, LDScore <dbl>, RegScore <dbl>,
## # NAssocScore <dbl>, TotalScore <dbl>, MSHGeneTrait <chr>, Rank <dbl>,
## # Source <chr>, MapType <lgl>, Inheritance <lgl>, first_added <date>,
## # Phenotype <lgl>, AssociationCode <lgl>, MultifactorialSusceptibility <lgl>,
## # has_mim <lgl>, PhenotypeMIM <lgl>, UI <lgl>, Somatic <lgl>, Updated <lgl>,
## # Created <lgl>, xMHCGene <lgl>
require(epitools)
## Loading required package: epitools
library(epitools)
# is aprroval dependent on OMIM?
GTTI_OMIM <- GTTI %>% filter (Source == "OMIM")
GTTI_OMIM$Phase.Latest_Logical <- GTTI_OMIM$Phase.Latest == "Approved"
table(GTTI_OMIM$Phase.Latest_Logical)
##
## FALSE TRUE
## 22873 3112
GTTI_OMIM_Nelson <- GTTI_OMIM %>% filter(NelsonStatus != "Unused", NelsonStatus != "Absent")
predictor <- table(GTTI_OMIM_Nelson$has_mim)
print(predictor)
##
## FALSE TRUE
## 486 7991
modelglm_GTTI_OMIM_Nelson <- (glm(as.formula("lApprovedUS.EU ~ has_mim"), data = GTTI_OMIM_Nelson, family = "binomial"))
summary(modelglm_GTTI_OMIM_Nelson)
##
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ has_mim"), family = "binomial",
## data = GTTI_OMIM_Nelson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7562 -0.7562 -0.7562 -0.7458 1.6826
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1374 0.1058 -10.75 <2e-16 ***
## has_mimTRUE 0.0316 0.1089 0.29 0.772
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9502.4 on 8476 degrees of freedom
## Residual deviance: 9502.3 on 8475 degrees of freedom
## AIC: 9506.3
##
## Number of Fisher Scoring iterations: 4
oddsratio(GTTI_OMIM_Nelson$lApprovedUS.EU, GTTI_OMIM_Nelson$has_mim)
## $data
## Outcome
## Predictor FALSE TRUE Total
## FALSE 368 6004 6372
## TRUE 118 1987 2105
## Total 486 7991 8477
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## FALSE 1.000000 NA NA
## TRUE 1.031167 0.835615 1.281406
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## FALSE NA NA NA
## TRUE 0.7776981 0.828767 0.7717103
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# repeat analysis if not previously approved in Nelson dataset
GTTI_OMIM_Nelson_NotApproved <- GTTI_OMIM_Nelson %>% filter (NelsonStatus!= "Approved")
modelglm_GTTI_OMIM_Nelson_NotApproved <- (glm(as.formula("lApprovedUS.EU ~ has_mim"), data = GTTI_OMIM_Nelson_NotApproved, family = "binomial"))
summary(modelglm_GTTI_OMIM_Nelson_NotApproved)
##
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ has_mim"), family = "binomial",
## data = GTTI_OMIM_Nelson_NotApproved)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2985 -0.2985 -0.2985 -0.2985 2.6587
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5047 0.3060 -11.453 <2e-16 ***
## has_mimTRUE 0.4162 0.3122 1.333 0.183
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2329.6 on 6594 degrees of freedom
## Residual deviance: 2327.6 on 6593 degrees of freedom
## AIC: 2331.6
##
## Number of Fisher Scoring iterations: 6
oddsratio(GTTI_OMIM_Nelson_NotApproved$lApprovedUS.EU, GTTI_OMIM_Nelson_NotApproved$has_mim)
## $data
## Outcome
## Predictor FALSE TRUE Total
## FALSE 366 5947 6313
## TRUE 11 271 282
## Total 377 6218 6595
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## FALSE 1.000000 NA NA
## TRUE 1.495347 0.8507782 2.938625
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## FALSE NA NA NA
## TRUE 0.1723723 0.2359679 0.1794612
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# use this dataset for prediction
GTTI_OMIM_notNelson <- GTTI_OMIM %>% filter(NelsonStatus %in% c("Unused", "Absent", NA))
GTTI_OMIM_notNelson$Approval <- predict(modelglm_GTTI_OMIM_Nelson,
newdata = GTTI_OMIM_notNelson, type ="response")
sum(GTTI_OMIM_notNelson$Approval>0.5)
## [1] 0
modelglm_GTTI_OMIM <- (glm(as.formula("Phase.Latest_Logical ~ has_mim"), data = GTTI_OMIM, family = "binomial"))
summary(modelglm_GTTI_OMIM)
##
## Call:
## glm(formula = as.formula("Phase.Latest_Logical ~ has_mim"), family = "binomial",
## data = GTTI_OMIM)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5087 -0.5087 -0.5087 -0.5087 2.1537
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.21578 0.07893 -28.074 < 2e-16 ***
## has_mimTRUE 0.23621 0.08135 2.904 0.00369 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19044 on 25984 degrees of freedom
## Residual deviance: 19035 on 25983 degrees of freedom
## (3724 observations deleted due to missingness)
## AIC: 19039
##
## Number of Fisher Scoring iterations: 4
OR_OMIM <- oddsratio(GTTI_OMIM$Phase.Latest_Logical, GTTI_OMIM$has_mim)
print(OR_OMIM)
## $data
## Outcome
## Predictor FALSE TRUE Total
## FALSE 1632 21241 22873
## TRUE 178 2934 3112
## Total 1810 24175 25985
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## FALSE 1.000000 NA NA
## TRUE 1.265441 1.082046 1.488869
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## FALSE NA NA NA
## TRUE 0.0029362 0.003392414 0.003617224
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
require(epitools)
library(epitools)
# is aprroval dependent on OMIM?
GTTI_GWAS <- GTTI %>% filter (Source == "GWAS:A")
GTTI_GWAS$Phase.Latest_Logical <- GTTI_GWAS$Phase.Latest == "Approved"
table(GTTI_GWAS$Phase.Latest_Logical)
##
## FALSE TRUE
## 97203 9391
GTTI_GWAS$Deleterious_Pr <- (GTTI_GWAS$Deleterious)%>% replace_na(replace = "NA")%>% factor()
GTTI_GWAS$Deleterious_Pr <- relevel(GTTI_GWAS$Deleterious_Pr, ref = "NA")
table(GTTI_GWAS$Deleterious_Pr)
##
## NA HIGH LOW MODERATE MODIFIER
## 41239 1977 1357 20300 80857
GTTI_GWAS_Nelson <- GTTI_GWAS %>% filter(NelsonStatus != "Unused", NelsonStatus != "Absent")
predictor <- table(GTTI_GWAS_Nelson$Deleterious_Pr)
print(predictor)
##
## NA HIGH LOW MODERATE MODIFIER
## 6873 796 323 4208 19761
modelglm_GTTI_GWAS_Nelson <- (glm(as.formula("lApprovedUS.EU ~ Deleterious_Pr"), data = GTTI_GWAS_Nelson, family = "binomial"))
summary(modelglm_GTTI_GWAS_Nelson)
##
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ Deleterious_Pr"),
## family = "binomial", data = GTTI_GWAS_Nelson)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5405 -0.6377 -0.6377 -0.6358 2.1956
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.43952 0.03065 -46.970 < 2e-16 ***
## Deleterious_PrHIGH 2.26182 0.08284 27.303 < 2e-16 ***
## Deleterious_PrLOW -0.87676 0.19699 -4.451 8.56e-06 ***
## Deleterious_PrMODERATE -0.05673 0.05029 -1.128 0.259
## Deleterious_PrMODIFIER -0.04996 0.03573 -1.399 0.162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31743 on 31960 degrees of freedom
## Residual deviance: 30763 on 31956 degrees of freedom
## AIC: 30773
##
## Number of Fisher Scoring iterations: 4
# riskratio.boot(GTTI_GWAS_Nelson$lApprovedUS.EU, GTTI_GWAS_Nelson$Deleterious_Pr)
# repeat analysis if not previously approved in Nelson dataset
GTTI_GWAS_Nelson_NotApproved <- GTTI_GWAS_Nelson %>% filter (NelsonStatus!= "Approved")
modelglm_GTTI_GWAS_Nelson_NotApproved <- (glm(as.formula("lApprovedUS.EU ~ Deleterious_Pr"), data = GTTI_GWAS_Nelson_NotApproved, family = "binomial"))
summary(modelglm_GTTI_GWAS_Nelson_NotApproved)
##
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ Deleterious_Pr"),
## family = "binomial", data = GTTI_GWAS_Nelson_NotApproved)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2949 -0.2678 -0.2678 -0.2207 2.9278
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.70257 0.08680 -42.657 < 2e-16 ***
## Deleterious_PrHIGH -0.56945 0.58745 -0.969 0.332
## Deleterious_PrLOW -0.56712 0.51061 -1.111 0.267
## Deleterious_PrMODERATE 0.58891 0.12021 4.899 9.64e-07 ***
## Deleterious_PrMODIFIER 0.39268 0.09645 4.071 4.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7712.4 on 26339 degrees of freedom
## Residual deviance: 7677.6 on 26335 degrees of freedom
## AIC: 7687.6
##
## Number of Fisher Scoring iterations: 6
# oddsratio(GTTI_GWAS_Nelson_NotApproved$lApprovedUS.EU,GTTI_GWAS_Nelson_NotApproved$Deleterious_Pr)
# use this dataset for prediction
GTTI_GWAS_notNelson <- GTTI_GWAS %>% filter(NelsonStatus %in% c("Unused", "Absent", NA))
GTTI_GWAS_notNelson$Approval <- predict(modelglm_GTTI_GWAS_Nelson,
newdata = GTTI_GWAS_notNelson, type ="response")
sum(GTTI_GWAS_notNelson$Approval>0.5)
## [1] 1181
GTTI_GWAS_notNelson$Approval_Logical <- ifelse(GTTI_GWAS_notNelson$Approval > 0.5, TRUE, FALSE)
table(GTTI_GWAS_notNelson$lApprovedUS.EU, GTTI_GWAS_notNelson$Approval_Logical)
##
## FALSE TRUE
## FALSE 72492 961
## TRUE 1121 59
GTTI_GWAS$Deleterious_Pr2 <- !is.na(GTTI_GWAS$Deleterious)
modelglm_GTTI_GWAS <- (glm(as.formula("Phase.Latest_Logical ~ Deleterious_Pr2"), data = GTTI_GWAS, family = "binomial"))
summary(modelglm_GTTI_GWAS)
##
## Call:
## glm(formula = as.formula("Phase.Latest_Logical ~ Deleterious_Pr2"),
## family = "binomial", data = GTTI_GWAS)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4403 -0.4403 -0.4403 -0.3923 2.2817
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.52605 0.02422 -104.295 <2e-16 ***
## Deleterious_Pr2TRUE 0.24107 0.02707 8.907 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 63556 on 106593 degrees of freedom
## Residual deviance: 63473 on 106592 degrees of freedom
## (39136 observations deleted due to missingness)
## AIC: 63477
##
## Number of Fisher Scoring iterations: 5
OR_GWAS <- oddsratio(GTTI_GWAS$Phase.Latest_Logical, GTTI_GWAS$Deleterious_Pr2)
print(OR_GWAS)
## $data
## Outcome
## Predictor FALSE TRUE Total
## FALSE 23020 74183 97203
## TRUE 1841 7550 9391
## Total 24861 81733 106594
##
## $measure
## odds ratio with 95% C.I.
## Predictor estimate lower upper
## FALSE 1.000000 NA NA
## TRUE 1.272528 1.207064 1.342201
##
## $p.value
## two-sided
## Predictor midp.exact fisher.exact chi.square
## FALSE NA NA NA
## TRUE 0 1.098695e-19 4.458191e-19
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
ORtable <- rbind(OR_OMIM$measure[2,], OR_GWAS$measure[2,])
row.names(ORtable) <- c("OMIM", "GWAS")
ORtable <- as.data.frame(ORtable)
ORtable$Genetics <-row.names(ORtable)
glimpse (ORtable)
## Observations: 2
## Variables: 4
## $ estimate <dbl> 1.265441, 1.272528
## $ lower <dbl> 1.082046, 1.207064
## $ upper <dbl> 1.488869, 1.342201
## $ Genetics <chr> "OMIM", "GWAS"
library(ggplot2)
ggplot(ORtable, aes(x=Genetics, y =estimate))+
geom_pointrange(aes(ymin = lower, ymax = upper))
