library(readxl)
library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 3.2.1     <U+221A> readr   1.3.1
## <U+221A> tibble  2.1.3     <U+221A> purrr   0.3.2
## <U+221A> tidyr   1.0.0     <U+221A> stringr 1.4.0
## <U+221A> ggplot2 3.2.1     <U+221A> forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(psycho)
## Registered S3 method overwritten by 'MuMIn':
##   method         from
##   predict.merMod lme4
## Registered S3 methods overwritten by 'huge':
##   method    from   
##   plot.sim  BDgraph
##   print.sim BDgraph
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## message: psycho's `analyze()` is deprecated in favour of the report package. Check it out at https://github.com/easystats/report
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(car)
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(tibble)
library(nnet)

resp <- read_excel("C:/Users/HP/Desktop/resp.xlsx")

summary(resp)
##     gender              SITEID           SUBJID          TRTPN      
##  Length:582         Min.   :  1.00   Min.   :  1.0   Min.   :1.000  
##  Class :character   1st Qu.: 23.00   1st Qu.:197.2   1st Qu.:1.000  
##  Mode  :character   Median : 52.00   Median :408.0   Median :1.000  
##                     Mean   : 59.42   Mean   :402.4   Mean   :1.498  
##                     3rd Qu.: 99.00   3rd Qu.:599.8   3rd Qu.:2.000  
##                     Max.   :141.00   Max.   :797.0   Max.   :2.000  
##  responseCategory  
##  Length:582        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
resp$gender <- as.factor(resp$gender)
resp$responseCategory<- as.factor(resp$responseCategory)
resp$TRTPN <- as.factor(resp$TRTPN)
str(resp)
## Classes 'tbl_df', 'tbl' and 'data.frame':    582 obs. of  5 variables:
##  $ gender          : Factor w/ 2 levels "FEMALE","MALE": 2 1 2 2 1 1 2 1 2 2 ...
##  $ SITEID          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ SUBJID          : num  27 39 126 154 161 198 221 280 540 557 ...
##  $ TRTPN           : Factor w/ 2 levels "1","2": 2 1 2 1 1 1 1 1 1 1 ...
##  $ responseCategory: Factor w/ 5 levels "CR","NE","PD",..: 5 3 3 5 3 5 5 5 5 5 ...
#Recoding target variable into binary outcome ('responder (1, event)-nonresponder(0)') 
response <- function(res){
  res <- as.character(res)
  if (res=='CR'| res=='PR'){
    return(1)
  }else{
    return(0)
  }
}

resp$responseCategory <- sapply(resp$responseCategory,response)
table(resp$responseCategory)
## 
##   0   1 
## 490  92
resp$responseCategory<- as.factor(resp$responseCategory)

#creating training and test subset
library(caret)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lava':
##   method    from
##   plot.sim  huge
##   print.sim huge
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
library(caTools)
set.seed(101)

split = sample.split(resp$responseCategory, SplitRatio = 0.70)

final.train = subset(resp, split == TRUE)
final.test = subset(resp, split == FALSE)

summary(final.train)
##     gender        SITEID           SUBJID      TRTPN   responseCategory
##  FEMALE:188   Min.   :  1.00   Min.   :  4.0   1:207   0:343           
##  MALE  :219   1st Qu.: 23.00   1st Qu.:190.0   2:200   1: 64           
##               Median : 51.00   Median :415.0                           
##               Mean   : 59.25   Mean   :398.6                           
##               3rd Qu.: 97.50   3rd Qu.:591.5                           
##               Max.   :141.00   Max.   :797.0
summary(final.test)
##     gender        SITEID           SUBJID      TRTPN  responseCategory
##  FEMALE: 75   Min.   :  1.00   Min.   :  1.0   1:85   0:147           
##  MALE  :100   1st Qu.: 21.00   1st Qu.:222.5   2:90   1: 28           
##               Median : 55.00   Median :406.0                          
##               Mean   : 59.83   Mean   :411.4                          
##               3rd Qu.:102.00   3rd Qu.:616.5                          
##               Max.   :141.00   Max.   :795.0
#logit model with TRTPN as predictor


logit <- glm(formula = responseCategory ~ TRTPN, family = binomial(logit), data = resp)
summary(logit)
## 
## Call:
## glm(formula = responseCategory ~ TRTPN, family = binomial(logit), 
##     data = resp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6526  -0.6526  -0.5149  -0.5149   2.0427  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.4385     0.1486  -9.677   <2e-16 ***
## TRTPN2       -0.5153     0.2320  -2.222   0.0263 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.05  on 581  degrees of freedom
## Residual deviance: 503.01  on 580  degrees of freedom
## AIC: 507.01
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(logit), confint(logit)))
## Waiting for profiling to be done...
##                    OR     2.5 %    97.5 %
## (Intercept) 0.2372881 0.1756712 0.3149589
## TRTPN2      0.5973003 0.3765543 0.9371668
#Odds ratio is exponented reggresion coefficient in logistic regression that says to us
#how much times is event more probable when predictor changes for one unit (OR=OR(x+1)/OR(x)).
#For categorical predictors, odd ratio is saying to us how much times is an event more likely to occur
#for respondetns in one category of the predictor. In our example, how much time the respondets from trial 1 will
#more likely to have good response to the treatment.probability OR=p/(1-p); for probability of 0,5(50%), odd ratio
#is 1, meaning that both events have the same probability of occurance. 
#In our case, TRTPN treatmant has 0,59 time less chances to give response to the treatment than treatment 1.  
#Confidence interval (can be arbitrary selected) is an interval that tell us with which possibility 
#our result is in that range. For our results, for this sample, we are sure with 95% that our odds ratio for TRTPN 
#goes from 0.37 to 0.93. Larger samples decrease the standard error which improves confidence interval (making it more narrow). 

logit1 <- glm(formula = responseCategory ~ TRTPN + gender + factor(gender):factor(TRTPN), family = binomial(logit), data = resp)

#P value for the interaction term is 0.415 which is high. We can accept the null hypothesis
# There is no impact of the interaction on the response (target variable) so we could 
#drop interaction from the logit model.

summary(logit1)
## 
## Call:
## glm(formula = responseCategory ~ TRTPN + gender + factor(gender):factor(TRTPN), 
##     family = binomial(logit), data = resp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6902  -0.6109  -0.5302  -0.5035   2.0631  
## 
## Coefficients: (3 not defined because of singularities)
##                                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)                          -1.2028     0.4103  -2.931  0.00337
## TRTPN2                               -0.6881     0.3105  -2.216  0.02669
## genderMALE                           -0.1106     0.3587  -0.308  0.75779
## factor(gender)FEMALE:factor(TRTPN)1  -0.3814     0.4675  -0.816  0.41464
## factor(gender)MALE:factor(TRTPN)1         NA         NA      NA       NA
## factor(gender)FEMALE:factor(TRTPN)2       NA         NA      NA       NA
## factor(gender)MALE:factor(TRTPN)2         NA         NA      NA       NA
##                                       
## (Intercept)                         **
## TRTPN2                              * 
## genderMALE                            
## factor(gender)FEMALE:factor(TRTPN)1   
## factor(gender)MALE:factor(TRTPN)1     
## factor(gender)FEMALE:factor(TRTPN)2   
## factor(gender)MALE:factor(TRTPN)2     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 508.05  on 581  degrees of freedom
## Residual deviance: 502.09  on 578  degrees of freedom
## AIC: 510.09
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(logit1), confint(logit1)))
## Waiting for profiling to be done...
##                                            OR     2.5 %    97.5 %
## (Intercept)                         0.3003647 0.1327742 0.6678051
## TRTPN2                              0.5025338 0.2698128 0.9166996
## genderMALE                          0.8952703 0.4440637 1.8300400
## factor(gender)FEMALE:factor(TRTPN)1 0.6829305 0.2728769 1.7155189
## factor(gender)MALE:factor(TRTPN)1          NA        NA        NA
## factor(gender)FEMALE:factor(TRTPN)2        NA        NA        NA
## factor(gender)MALE:factor(TRTPN)2          NA        NA        NA
table(resp$gender, resp$TRTPN)
##         
##            1   2
##   FEMALE 141 122
##   MALE   151 168
#odds ratio for treatment 1 vs treatment 2 | Male, Wald method for confidence intervals

or <- (151/319)/(168/319)
or
## [1] 0.8988095
#standard eror for interaction term is 0.46
#confidence interval (95%) for particular odd ratio is
#upper cri
exp(log(or) + (0.46 * 1.96))
## [1] 2.214255
#lower cri
exp(log(or) - (0.46 * 1.96))
## [1] 0.3648445
#odd ratio for treatment 1 vs treatment 2 | Female

or1 <- (141/263)/(122/263)
or1
## [1] 1.155738
#standard eror for interaction term is 0.46
#confidence interval (95%) for particular odd ratio is
#upper cri
exp(log(or1) + (0.46 * 1.96))
## [1] 2.847208
#lower cri
exp(log(or1) - (0.46 * 1.96))
## [1] 0.4691367
#Odd ratio for Female vs Male | treatment 1
or2 <- (141/292)/(151/292)
or2
## [1] 0.9337748
#standard eror for interaction term is 0.46
#confidence interval (95%) for particular odd ratio is
#upper cri
exp(log(or2) + (0.46 * 1.96))
## [1] 2.300393
#lower cri
exp(log(or2) - (0.46 * 1.96))
## [1] 0.3790376
#Odds ratio Female vs Male | treatment 2.

or3 <- (122/290)/(168/290)
or3
## [1] 0.7261905
#standard eror for interaction term is 0.46
#confidence interval (95%) for particular odd ratio is
#upper cri
exp(log(or3) + (0.46 * 1.96))
## [1] 1.789
#lower cri
exp(log(or3) - (0.46 * 1.96))
## [1] 0.294775