# read tsv gene_trait_association and target indication
require (tidyverse)
## Loading required package: tidyverse
## Error: package or namespace load failed for 'tidyverse' in namespaceExport(ns, exports):
##  undefined exports: %>%
require(tidyr)
## Loading required package: tidyr
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v dplyr   0.8.5
## v tibble  2.1.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.3
## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
setwd("~/Home051219onward/King2019/genetic-evidence-approval-master/genetic-evidence-approval-master/data")
GeneTrait <- read_tsv("gene_trait_assoc.tsv")
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   CHR_A = col_double(),
##   BP_A = col_double(),
##   SNP_A = col_character(),
##   CHR_B = col_double(),
##   BP_B = col_double(),
##   SNP_B = col_character(),
##   R2 = col_double(),
##   ensembl_id = col_character(),
##   eQTL_pval = col_double(),
##   Distance = col_double(),
##   Deleterious = col_character(),
##   RDB = col_character(),
##   ECat = col_character(),
##   MAPPED_TRAIT = col_character(),
##   SNP_ID_CURRENT = col_double(),
##   NStudy = col_double(),
##   SourceScore = col_double(),
##   LDScore = col_double(),
##   RegScore = col_double(),
##   NAssocScore = col_double()
##   # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 42043 parsing failures.
##   row             col           expected              actual                   file
## 61333 MapType         1/0/T/F/TRUE/FALSE manual              'gene_trait_assoc.tsv'
## 61333 Inheritance     1/0/T/F/TRUE/FALSE Autosomal recessive 'gene_trait_assoc.tsv'
## 61333 Phenotype       1/0/T/F/TRUE/FALSE Immunodeficiency 38 'gene_trait_assoc.tsv'
## 61333 AssociationCode 1/0/T/F/TRUE/FALSE (3)                 'gene_trait_assoc.tsv'
## 61333 PhenotypeMIM    1/0/T/F/TRUE/FALSE 616126              'gene_trait_assoc.tsv'
## ..... ............... .................. ................... ......................
## See problems(...) for more details.
TargetIndicationNMSH <- read_tsv("target_indication_nmsh.tsv")
## Parsed with column specification:
## cols(
##   MSH = col_character(),
##   ensembl_id = col_character(),
##   lApprovedUS.EU = col_logical(),
##   Phase.Latest = col_character(),
##   First.Added = col_date(format = ""),
##   NelsonStatus = col_character(),
##   symbol = col_character()
## )
GeneTraitTargetIndication <- full_join(TargetIndicationNMSH, GeneTrait, by = "symbol", suffix = c("TargetIndication", "GeneTrait"))
GTTI <- GeneTraitTargetIndication %>% filter(xMHCGene == FALSE)
print (head(GTTI))
## # A tibble: 6 x 48
##   MSHTargetIndica~ ensembl_idTarge~ lApprovedUS.EU Phase.Latest First.Added
##   <chr>            <chr>            <lgl>          <chr>        <date>     
## 1 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## 2 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## 3 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## 4 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## 5 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## 6 Abortion, Spont~ ENSG00000119535  FALSE          unknown      NA         
## # ... with 43 more variables: NelsonStatus <chr>, symbol <chr>, CHR_A <dbl>,
## #   BP_A <dbl>, SNP_A <chr>, CHR_B <dbl>, BP_B <dbl>, SNP_B <chr>, R2 <dbl>,
## #   ensembl_idGeneTrait <chr>, eQTL <lgl>, eQTL_pval <dbl>, DHS <lgl>,
## #   Distance <dbl>, Deleterious <chr>, missense <lgl>, distance <lgl>,
## #   RDB <chr>, ECat <chr>, MAPPED_TRAIT <chr>, SNP_ID_CURRENT <dbl>,
## #   NStudy <dbl>, SourceScore <dbl>, LDScore <dbl>, RegScore <dbl>,
## #   NAssocScore <dbl>, TotalScore <dbl>, MSHGeneTrait <chr>, Rank <dbl>,
## #   Source <chr>, MapType <lgl>, Inheritance <lgl>, first_added <date>,
## #   Phenotype <lgl>, AssociationCode <lgl>, MultifactorialSusceptibility <lgl>,
## #   has_mim <lgl>, PhenotypeMIM <lgl>, UI <lgl>, Somatic <lgl>, Updated <lgl>,
## #   Created <lgl>, xMHCGene <lgl>
require(epitools)
## Loading required package: epitools
library(epitools)

# is aprroval dependent on OMIM?
GTTI_OMIM <- GTTI %>% filter (Source == "OMIM")
GTTI_OMIM$Phase.Latest_Logical <- GTTI_OMIM$Phase.Latest == "Approved"
table(GTTI_OMIM$Phase.Latest_Logical)
## 
## FALSE  TRUE 
## 22873  3112
GTTI_OMIM_Nelson <- GTTI_OMIM %>% filter(NelsonStatus != "Unused",  NelsonStatus != "Absent")
predictor <- table(GTTI_OMIM_Nelson$has_mim)
print(predictor)
## 
## FALSE  TRUE 
##   486  7991
modelglm_GTTI_OMIM_Nelson <- (glm(as.formula("lApprovedUS.EU ~ has_mim"), data = GTTI_OMIM_Nelson, family = "binomial"))
summary(modelglm_GTTI_OMIM_Nelson)
## 
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ has_mim"), family = "binomial", 
##     data = GTTI_OMIM_Nelson)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7562  -0.7562  -0.7562  -0.7458   1.6826  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.1374     0.1058  -10.75   <2e-16 ***
## has_mimTRUE   0.0316     0.1089    0.29    0.772    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9502.4  on 8476  degrees of freedom
## Residual deviance: 9502.3  on 8475  degrees of freedom
## AIC: 9506.3
## 
## Number of Fisher Scoring iterations: 4
oddsratio(GTTI_OMIM_Nelson$lApprovedUS.EU, GTTI_OMIM_Nelson$has_mim) 
## $data
##          Outcome
## Predictor FALSE TRUE Total
##     FALSE   368 6004  6372
##     TRUE    118 1987  2105
##     Total   486 7991  8477
## 
## $measure
##          odds ratio with 95% C.I.
## Predictor estimate    lower    upper
##     FALSE 1.000000       NA       NA
##     TRUE  1.031167 0.835615 1.281406
## 
## $p.value
##          two-sided
## Predictor midp.exact fisher.exact chi.square
##     FALSE         NA           NA         NA
##     TRUE   0.7776981     0.828767  0.7717103
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# repeat analysis if not previously approved in Nelson dataset
GTTI_OMIM_Nelson_NotApproved <- GTTI_OMIM_Nelson %>% filter (NelsonStatus!= "Approved")

modelglm_GTTI_OMIM_Nelson_NotApproved <- (glm(as.formula("lApprovedUS.EU ~ has_mim"), data = GTTI_OMIM_Nelson_NotApproved, family = "binomial"))

summary(modelglm_GTTI_OMIM_Nelson_NotApproved)                                          
## 
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ has_mim"), family = "binomial", 
##     data = GTTI_OMIM_Nelson_NotApproved)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2985  -0.2985  -0.2985  -0.2985   2.6587  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.5047     0.3060 -11.453   <2e-16 ***
## has_mimTRUE   0.4162     0.3122   1.333    0.183    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2329.6  on 6594  degrees of freedom
## Residual deviance: 2327.6  on 6593  degrees of freedom
## AIC: 2331.6
## 
## Number of Fisher Scoring iterations: 6
oddsratio(GTTI_OMIM_Nelson_NotApproved$lApprovedUS.EU, GTTI_OMIM_Nelson_NotApproved$has_mim)
## $data
##          Outcome
## Predictor FALSE TRUE Total
##     FALSE   366 5947  6313
##     TRUE     11  271   282
##     Total   377 6218  6595
## 
## $measure
##          odds ratio with 95% C.I.
## Predictor estimate     lower    upper
##     FALSE 1.000000        NA       NA
##     TRUE  1.495347 0.8507782 2.938625
## 
## $p.value
##          two-sided
## Predictor midp.exact fisher.exact chi.square
##     FALSE         NA           NA         NA
##     TRUE   0.1723723    0.2359679  0.1794612
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
# use this dataset for prediction 
GTTI_OMIM_notNelson <- GTTI_OMIM %>% filter(NelsonStatus %in% c("Unused",  "Absent", NA))
GTTI_OMIM_notNelson$Approval <- predict(modelglm_GTTI_OMIM_Nelson, 
                                        newdata = GTTI_OMIM_notNelson, type ="response")
sum(GTTI_OMIM_notNelson$Approval>0.5)
## [1] 0
modelglm_GTTI_OMIM <- (glm(as.formula("Phase.Latest_Logical ~ has_mim"), data = GTTI_OMIM, family = "binomial"))
summary(modelglm_GTTI_OMIM)
## 
## Call:
## glm(formula = as.formula("Phase.Latest_Logical ~ has_mim"), family = "binomial", 
##     data = GTTI_OMIM)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5087  -0.5087  -0.5087  -0.5087   2.1537  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.21578    0.07893 -28.074  < 2e-16 ***
## has_mimTRUE  0.23621    0.08135   2.904  0.00369 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19044  on 25984  degrees of freedom
## Residual deviance: 19035  on 25983  degrees of freedom
##   (3724 observations deleted due to missingness)
## AIC: 19039
## 
## Number of Fisher Scoring iterations: 4
OR_OMIM <- oddsratio(GTTI_OMIM$Phase.Latest_Logical, GTTI_OMIM$has_mim) 
print(OR_OMIM)
## $data
##          Outcome
## Predictor FALSE  TRUE Total
##     FALSE  1632 21241 22873
##     TRUE    178  2934  3112
##     Total  1810 24175 25985
## 
## $measure
##          odds ratio with 95% C.I.
## Predictor estimate    lower    upper
##     FALSE 1.000000       NA       NA
##     TRUE  1.265441 1.082046 1.488869
## 
## $p.value
##          two-sided
## Predictor midp.exact fisher.exact  chi.square
##     FALSE         NA           NA          NA
##     TRUE   0.0029362  0.003392414 0.003617224
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
require(epitools)
library(epitools)

# is aprroval dependent on OMIM?
GTTI_GWAS <- GTTI %>% filter (Source == "GWAS:A")
GTTI_GWAS$Phase.Latest_Logical <- GTTI_GWAS$Phase.Latest == "Approved"
table(GTTI_GWAS$Phase.Latest_Logical)
## 
## FALSE  TRUE 
## 97203  9391
GTTI_GWAS$Deleterious_Pr <- (GTTI_GWAS$Deleterious)%>% replace_na(replace = "NA")%>% factor()
GTTI_GWAS$Deleterious_Pr <- relevel(GTTI_GWAS$Deleterious_Pr, ref = "NA")
table(GTTI_GWAS$Deleterious_Pr)
## 
##       NA     HIGH      LOW MODERATE MODIFIER 
##    41239     1977     1357    20300    80857
GTTI_GWAS_Nelson <- GTTI_GWAS %>% filter(NelsonStatus != "Unused",  NelsonStatus != "Absent")


predictor <- table(GTTI_GWAS_Nelson$Deleterious_Pr)
print(predictor)
## 
##       NA     HIGH      LOW MODERATE MODIFIER 
##     6873      796      323     4208    19761
modelglm_GTTI_GWAS_Nelson <- (glm(as.formula("lApprovedUS.EU ~ Deleterious_Pr"), data = GTTI_GWAS_Nelson, family = "binomial"))
summary(modelglm_GTTI_GWAS_Nelson)
## 
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ Deleterious_Pr"), 
##     family = "binomial", data = GTTI_GWAS_Nelson)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5405  -0.6377  -0.6377  -0.6358   2.1956  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.43952    0.03065 -46.970  < 2e-16 ***
## Deleterious_PrHIGH      2.26182    0.08284  27.303  < 2e-16 ***
## Deleterious_PrLOW      -0.87676    0.19699  -4.451 8.56e-06 ***
## Deleterious_PrMODERATE -0.05673    0.05029  -1.128    0.259    
## Deleterious_PrMODIFIER -0.04996    0.03573  -1.399    0.162    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31743  on 31960  degrees of freedom
## Residual deviance: 30763  on 31956  degrees of freedom
## AIC: 30773
## 
## Number of Fisher Scoring iterations: 4
# riskratio.boot(GTTI_GWAS_Nelson$lApprovedUS.EU, GTTI_GWAS_Nelson$Deleterious_Pr) 


# repeat analysis if not previously approved in Nelson dataset
GTTI_GWAS_Nelson_NotApproved <- GTTI_GWAS_Nelson %>% filter (NelsonStatus!= "Approved")

modelglm_GTTI_GWAS_Nelson_NotApproved <- (glm(as.formula("lApprovedUS.EU ~ Deleterious_Pr"), data = GTTI_GWAS_Nelson_NotApproved, family = "binomial"))

summary(modelglm_GTTI_GWAS_Nelson_NotApproved)                                          
## 
## Call:
## glm(formula = as.formula("lApprovedUS.EU ~ Deleterious_Pr"), 
##     family = "binomial", data = GTTI_GWAS_Nelson_NotApproved)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.2949  -0.2678  -0.2678  -0.2207   2.9278  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -3.70257    0.08680 -42.657  < 2e-16 ***
## Deleterious_PrHIGH     -0.56945    0.58745  -0.969    0.332    
## Deleterious_PrLOW      -0.56712    0.51061  -1.111    0.267    
## Deleterious_PrMODERATE  0.58891    0.12021   4.899 9.64e-07 ***
## Deleterious_PrMODIFIER  0.39268    0.09645   4.071 4.68e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7712.4  on 26339  degrees of freedom
## Residual deviance: 7677.6  on 26335  degrees of freedom
## AIC: 7687.6
## 
## Number of Fisher Scoring iterations: 6
# oddsratio(GTTI_GWAS_Nelson_NotApproved$lApprovedUS.EU,GTTI_GWAS_Nelson_NotApproved$Deleterious_Pr)

# use this dataset for prediction 
GTTI_GWAS_notNelson <- GTTI_GWAS %>% filter(NelsonStatus %in% c("Unused",  "Absent", NA))
GTTI_GWAS_notNelson$Approval <- predict(modelglm_GTTI_GWAS_Nelson, 
                                        newdata = GTTI_GWAS_notNelson, type ="response")
sum(GTTI_GWAS_notNelson$Approval>0.5)
## [1] 1181
GTTI_GWAS_notNelson$Approval_Logical <- ifelse(GTTI_GWAS_notNelson$Approval > 0.5, TRUE, FALSE)
table(GTTI_GWAS_notNelson$lApprovedUS.EU, GTTI_GWAS_notNelson$Approval_Logical)
##        
##         FALSE  TRUE
##   FALSE 72492   961
##   TRUE   1121    59
GTTI_GWAS$Deleterious_Pr2 <- !is.na(GTTI_GWAS$Deleterious)
modelglm_GTTI_GWAS <- (glm(as.formula("Phase.Latest_Logical ~ Deleterious_Pr2"), data = GTTI_GWAS, family = "binomial"))
summary(modelglm_GTTI_GWAS)
## 
## Call:
## glm(formula = as.formula("Phase.Latest_Logical ~ Deleterious_Pr2"), 
##     family = "binomial", data = GTTI_GWAS)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4403  -0.4403  -0.4403  -0.3923   2.2817  
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)         -2.52605    0.02422 -104.295   <2e-16 ***
## Deleterious_Pr2TRUE  0.24107    0.02707    8.907   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63556  on 106593  degrees of freedom
## Residual deviance: 63473  on 106592  degrees of freedom
##   (39136 observations deleted due to missingness)
## AIC: 63477
## 
## Number of Fisher Scoring iterations: 5
OR_GWAS <- oddsratio(GTTI_GWAS$Phase.Latest_Logical, GTTI_GWAS$Deleterious_Pr2) 
print(OR_GWAS)
## $data
##          Outcome
## Predictor FALSE  TRUE  Total
##     FALSE 23020 74183  97203
##     TRUE   1841  7550   9391
##     Total 24861 81733 106594
## 
## $measure
##          odds ratio with 95% C.I.
## Predictor estimate    lower    upper
##     FALSE 1.000000       NA       NA
##     TRUE  1.272528 1.207064 1.342201
## 
## $p.value
##          two-sided
## Predictor midp.exact fisher.exact   chi.square
##     FALSE         NA           NA           NA
##     TRUE           0 1.098695e-19 4.458191e-19
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"
ORtable <- rbind(OR_OMIM$measure[2,], OR_GWAS$measure[2,])
row.names(ORtable) <- c("OMIM", "GWAS")
ORtable <- as.data.frame(ORtable)
ORtable$Genetics <-row.names(ORtable)
glimpse (ORtable)
## Observations: 2
## Variables: 4
## $ estimate <dbl> 1.265441, 1.272528
## $ lower    <dbl> 1.082046, 1.207064
## $ upper    <dbl> 1.488869, 1.342201
## $ Genetics <chr> "OMIM", "GWAS"
library(ggplot2)
ggplot(ORtable, aes(x=Genetics, y =estimate))+
geom_pointrange(aes(ymin = lower, ymax = upper))