packages

# install if required
if(!require(googlesheets4)){install.packages("googlesheets4")}
if(!require(ggplot2)){install.packages("ggplot2")}
if(!require(GGally)){install.packages("GGally")}
if(!require(reshape2)){install.packages("reshape2")}
if(!require(lme4)){install.packages("lme4")}
if(!require(compiler)){install.packages("compiler")}
if(!require(parallel)){install.packages("parallel")}
if(!require(boot)){install.packages("boot")}
if(!require(lattice)){install.packages("lattice")}
# Load
library(googlesheets4); gs4_deauth()
library(ggplot2)
library(GGally)
library(reshape2)
library(lme4)
library(compiler)
library(parallel)
library(boot)
library(lattice)

Data import

Read Cancer data link

Base de datos en Google-Sheet

ss= "https://docs.google.com/spreadsheets/d/11pMP8yY7VNi3g6UAYC7J5-do3QMjQCAlDDgtAUEn9ec/edit?usp=sharing"
hoja="cancer"
rango="A2:AA8527"
can <- read_sheet(ss,
                  sheet=hoja,
                  range=rango,
                  col_names = TRUE,
                  col_types = NULL,
                  na= "NA")
dim(can)
## [1] 8525   27

edit/reformat

can <- within(can, 
              {
  Married <- factor(Married, levels = 0:1, labels = c("no", "yes"))
  DID <- factor(DID)
  HID <- factor(HID)
  FamilyHx <- factor(FamilyHx)
  SmokingHx <- factor(SmokingHx)
  Sex <- factor(Sex)
  CancerStage <- factor(CancerStage)
  School <- factor(School)
  }
  )
str(can)
## tibble [8,525 x 27] (S3: tbl_df/tbl/data.frame)
##  $ remission   : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ntumors     : num [1:8525] 0 0 0 0 0 0 0 0 2 0 ...
##  $ tumorsize   : num [1:8525] 68 64.7 51.6 86.4 53.4 ...
##  $ co2         : num [1:8525] 1.53 1.68 1.53 1.45 1.57 ...
##  $ pain        : num [1:8525] 4 2 6 3 3 4 3 3 4 5 ...
##  $ wound       : num [1:8525] 4 3 3 3 4 5 4 3 4 4 ...
##  $ mobility    : num [1:8525] 2 2 2 2 2 2 2 3 3 3 ...
##  $ nmorphine   : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lungcapacity: num [1:8525] 0.801 0.326 0.565 0.848 0.886 ...
##  $ Age         : num [1:8525] 65 53.9 53.3 41.4 46.8 ...
##  $ Married     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 2 1 2 1 ...
##  $ FamilyHx    : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SmokingHx   : Factor w/ 3 levels "a.former","current",..: 1 1 3 1 3 3 2 1 1 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 1 2 2 2 ...
##  $ CancerStage : Factor w/ 4 levels "I","II","III",..: 2 2 2 1 2 1 2 2 2 2 ...
##  $ LengthofStay: num [1:8525] 6 6 5 5 6 5 4 5 6 7 ...
##  $ WBC         : num [1:8525] 6088 6700 6043 7163 6443 ...
##  $ RBC         : num [1:8525] 4.87 4.68 5.01 5.27 4.98 ...
##  $ BMI         : num [1:8525] 24.1 29.4 29.5 21.6 29.8 ...
##  $ IL6         : num [1:8525] 3.7 2.63 13.9 3.01 3.89 ...
##  $ CRP         : num [1:8525] 8.086 0.803 4.034 2.126 1.349 ...
##  $ DID         : Factor w/ 407 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Experience  : num [1:8525] 25 25 25 25 25 25 25 25 25 25 ...
##  $ School      : Factor w/ 2 levels "average","top": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lawsuits    : num [1:8525] 3 3 3 3 3 3 3 3 3 3 ...
##  $ HID         : Factor w/ 35 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Medicaid    : num [1:8525] 0.606 0.606 0.606 0.606 0.606 ...

Checking for multicoinearity

ggpairs(can[ , c("IL6", "CRP", "LengthofStay", "Experience")])

Exploratory PLOTS

tmp <- melt(can[, c("remission", "IL6", "CRP", "LengthofStay", "Experience")], id.vars="remission")
ggplot(tmp, aes(factor(remission), y = value, fill=factor(remission))) +
  geom_boxplot() +
  facet_wrap(~variable, scales="free_y")

response = ntumors

Min. Squares
p <- ggplot(aes(y=ntumors, x= Age, col=FamilyHx), data=can)
p +
  geom_point() +
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

##### Deviance (Poisson errors)

p <- ggplot(aes(y=ntumors, x= Age, col=FamilyHx), data=can)
p +
  geom_point() +
  geom_smooth(method="glm", method.args=list(family="poisson"))
## `geom_smooth()` using formula 'y ~ x'

response= ntumors

Regression (Linear model)
lm <- lm(ntumors ~ FamilyHx * Age,
           data = can)
#
summary(lm)
## 
## Call:
## lm(formula = ntumors ~ FamilyHx * Age, data = can)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1453 -1.6114 -0.3041  1.3434  7.3265 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.180688   0.208082  -5.674 1.44e-08 ***
## FamilyHxyes      3.568717   0.472595   7.551 4.75e-14 ***
## Age              0.069948   0.004055  17.248  < 2e-16 ***
## FamilyHxyes:Age -0.003145   0.009177  -0.343    0.732    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.107 on 8521 degrees of freedom
## Multiple R-squared:  0.3177, Adjusted R-squared:  0.3174 
## F-statistic:  1322 on 3 and 8521 DF,  p-value: < 2.2e-16
Poisson Regression (Deviance model)
m <- glm(ntumors ~ FamilyHx * Age,
           data = can, 
           family = poisson)
#
summary(m)
## 
## Call:
## glm(formula = ntumors ~ FamilyHx * Age, family = poisson, data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5042  -1.0872  -0.1826   0.7301   3.8796  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -0.639799   0.065199  -9.813   <2e-16 ***
## FamilyHxyes      1.807329   0.106564  16.960   <2e-16 ***
## Age              0.029269   0.001243  23.553   <2e-16 ***
## FamilyHxyes:Age -0.017760   0.002043  -8.694   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 19849  on 8524  degrees of freedom
## Residual deviance: 14815  on 8521  degrees of freedom
## AIC: 36096
## 
## Number of Fisher Scoring iterations: 5
m <- glm(ntumors ~ FamilyHx + (Age * CancerStage),
           data = can, 
           family = poisson)
#
summary(m)
## 
## Call:
## glm(formula = ntumors ~ FamilyHx + (Age * CancerStage), family = poisson, 
##     data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5016  -0.9363  -0.1095   0.6683   3.7067  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.233165   0.112919   2.065 0.038933 *  
## FamilyHxyes         0.805729   0.013098  61.516  < 2e-16 ***
## Age                 0.009773   0.002345   4.168 3.08e-05 ***
## CancerStageII       0.135405   0.147401   0.919 0.358298    
## CancerStageIII      0.438324   0.167670   2.614 0.008943 ** 
## CancerStageIV       0.690586   0.179630   3.845 0.000121 ***
## Age:CancerStageII  -0.002011   0.002984  -0.674 0.500269    
## Age:CancerStageIII -0.003468   0.003289  -1.054 0.291680    
## Age:CancerStageIV  -0.001076   0.003397  -0.317 0.751366    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 19849  on 8524  degrees of freedom
## Residual deviance: 13791  on 8516  degrees of freedom
## AIC: 35081
## 
## Number of Fisher Scoring iterations: 5

Response = Remission

Logistic model

Plot
Response = remission
Factor = Sex
Predictor = Age
p <- ggplot(aes(y= remission, x= Age, col= Sex ), data=can)
p +
  geom_smooth(method="glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'

Model
m <- glm(remission ~ Sex * Age,
         data = can,
         family = binomial)
summary(m)
## 
## Call:
## glm(formula = remission ~ Sex * Age, family = binomial, data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1367  -0.8588  -0.7890   1.4588   1.9013  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.884523   0.252706   3.500 0.000465 ***
## Sexmale     -0.184346   0.397621  -0.464 0.642918    
## Age         -0.035000   0.004974  -7.036 1.98e-12 ***
## Sexmale:Age  0.004734   0.007821   0.605 0.545013    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance: 10276  on 8521  degrees of freedom
## AIC: 10284
## 
## Number of Fisher Scoring iterations: 4
PLOT
Response = remission
Factor = CancerStage
Predictor = Age
p <- ggplot(aes(y= remission, x= Age, col= CancerStage ), data=can)
p +
  geom_smooth(method="glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'

Model
m <- glm(remission ~ CancerStage * Age,
         data = can,
         family = binomial)
summary(m)
## 
## Call:
## glm(formula = remission ~ CancerStage * Age, family = binomial, 
##     data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0687  -0.8598  -0.7594   1.3786   2.1709  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)         0.009950   0.347183   0.029   0.9771  
## CancerStageII      -0.791813   0.482402  -1.641   0.1007  
## CancerStageIII     -0.030048   0.650567  -0.046   0.9632  
## CancerStageIV      -2.581626   1.172406  -2.202   0.0277 *
## Age                -0.010305   0.007263  -1.419   0.1559  
## CancerStageII:Age   0.009818   0.009767   1.005   0.3148  
## CancerStageIII:Age -0.011297   0.012618  -0.895   0.3706  
## CancerStageIV:Age   0.017321   0.020983   0.825   0.4091  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance: 10039  on 8517  degrees of freedom
## AIC: 10055
## 
## Number of Fisher Scoring iterations: 4

Global Model (FIXED EFFECTS)

m <- glm(remission ~ Sex + Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience,
         data = can,
         family = binomial)
#
summary(m)
## 
## Call:
## glm(formula = remission ~ Sex + Age + FamilyHx + IL6 + CRP + 
##     CancerStage + LengthofStay + Experience, family = binomial, 
##     data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5402  -0.8828  -0.6541   1.2331   2.5875  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.045183   0.251837  -4.150 3.32e-05 ***
## Sexmale         0.054308   0.050342   1.079 0.280685    
## Age            -0.010407   0.004652  -2.237 0.025284 *  
## FamilyHxyes    -0.881383   0.074276 -11.866  < 2e-16 ***
## IL6            -0.039059   0.008939  -4.369 1.25e-05 ***
## CRP            -0.018408   0.008091  -2.275 0.022896 *  
## CancerStageII  -0.209446   0.059625  -3.513 0.000444 ***
## CancerStageIII -0.522096   0.078493  -6.652 2.90e-11 ***
## CancerStageIV  -1.438941   0.131152 -10.972  < 2e-16 ***
## LengthofStay   -0.034507   0.028092  -1.228 0.219313    
## Experience      0.086535   0.006179  14.004  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance:  9654  on 8514  degrees of freedom
## AIC: 9676
## 
## Number of Fisher Scoring iterations: 4

glMM

random Interpcept (DID)

mm <- glmer(remission ~  Sex + Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience +
              (1 | DID),
            data = can,
            family = binomial,
            control = glmerControl(optimizer = "bobyqa"),
            nAGQ = 10)
#
summary(mm, cor=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Adaptive
##   Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
##  Family: binomial  ( logit )
## Formula: remission ~ Sex + Age + FamilyHx + IL6 + CRP + CancerStage +  
##     LengthofStay + Experience + (1 | DID)
##    Data: can
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   7189.5   7274.1  -3582.8   7165.5     8513 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2500 -0.4220 -0.1861  0.3563  7.8029 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  DID    (Intercept) 4.278    2.068   
## Number of obs: 8525, groups:  DID, 407
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.656352   0.590981  -2.803  0.00507 ** 
## Sexmale         0.063142   0.065863   0.959  0.33772    
## Age            -0.016086   0.006067  -2.651  0.00802 ** 
## FamilyHxyes    -1.308113   0.095559 -13.689  < 2e-16 ***
## IL6            -0.058460   0.011740  -4.980 6.37e-07 ***
## CRP            -0.023302   0.010384  -2.244  0.02483 *  
## CancerStageII  -0.323973   0.078594  -4.122 3.75e-05 ***
## CancerStageIII -0.865806   0.102722  -8.429  < 2e-16 ***
## CancerStageIV  -2.166400   0.165944 -13.055  < 2e-16 ***
## LengthofStay   -0.041568   0.036457  -1.140  0.25421    
## Experience      0.125855   0.028155   4.470 7.82e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

random Interpcept (DID) & slopes (LengthofStay|DID)

mm <- glmer(remission ~  Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience +
              (1 + LengthofStay | DID),
           data = can,
           family = binomial,
           control = glmerControl(optimizer = "bobyqa"),
           nAGQ = 1)
summary(mm, cor= FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay +  
##     Experience + (1 + LengthofStay | DID)
##    Data: can
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   7165.5   7257.1  -3569.7   7139.5     8512 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1986 -0.4207 -0.1806  0.3652  7.9920 
## 
## Random effects:
##  Groups Name         Variance Std.Dev. Corr 
##  DID    (Intercept)  1.3344   1.1552        
##         LengthofStay 0.1258   0.3547   -0.14
## Number of obs: 8525, groups:  DID, 407
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.693966   0.562758  -1.233 0.217520    
## Age            -0.015508   0.005961  -2.601 0.009284 ** 
## FamilyHxyes    -1.344064   0.095553 -14.066  < 2e-16 ***
## IL6            -0.058416   0.011534  -5.065 4.09e-07 ***
## CRP            -0.021955   0.010197  -2.153 0.031313 *  
## CancerStageII  -0.302916   0.076624  -3.953 7.71e-05 ***
## CancerStageIII -0.869792   0.101627  -8.559  < 2e-16 ***
## CancerStageIV  -2.289410   0.169943 -13.472  < 2e-16 ***
## LengthofStay   -0.169884   0.045198  -3.759 0.000171 ***
## Experience      0.106703   0.025551   4.176 2.97e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

random Interpcepts (nested = HID/DID) & slope (LengthofStay|DID)

mm <- glmer(remission ~   Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience + 
              (1 + LengthofStay | DID) +
              (1|HID),
           data = can,
           family = binomial,
           nAGQ = 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 1.83149 (tol = 0.002, component 1)
summary(mm, cor=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay +  
##     Experience + (1 + LengthofStay | DID) + (1 | HID)
##    Data: can
## 
##      AIC      BIC   logLik deviance df.resid 
##   7147.8   7246.5  -3559.9   7119.8     8511 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4853 -0.4217 -0.1780  0.3630  8.3223 
## 
## Random effects:
##  Groups Name         Variance Std.Dev. Corr 
##  DID    (Intercept)  0.2558   0.5058        
##         LengthofStay 0.1380   0.3715   -0.10
##  HID    (Intercept)  0.5328   0.7299        
## Number of obs: 8525, groups:  DID, 407; HID, 35
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.54691    0.54608  -1.002 0.316577    
## Age            -0.01528    0.00595  -2.568 0.010239 *  
## FamilyHxyes    -1.34014    0.09544 -14.042  < 2e-16 ***
## IL6            -0.05863    0.01150  -5.096 3.47e-07 ***
## CRP            -0.02105    0.01018  -2.069 0.038571 *  
## CancerStageII  -0.29409    0.07639  -3.850 0.000118 ***
## CancerStageIII -0.86536    0.10157  -8.520  < 2e-16 ***
## CancerStageIV  -2.29499    0.17097 -13.424  < 2e-16 ***
## LengthofStay   -0.18974    0.04524  -4.194 2.74e-05 ***
## Experience      0.10459    0.02392   4.373 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 1.83149 (tol = 0.002, component 1)