packages
# install if required
if(!require(googlesheets4)){install.packages("googlesheets4")}
if(!require(ggplot2)){install.packages("ggplot2")}
if(!require(GGally)){install.packages("GGally")}
if(!require(reshape2)){install.packages("reshape2")}
if(!require(lme4)){install.packages("lme4")}
if(!require(compiler)){install.packages("compiler")}
if(!require(parallel)){install.packages("parallel")}
if(!require(boot)){install.packages("boot")}
if(!require(lattice)){install.packages("lattice")}
# Load
library(googlesheets4); gs4_deauth()
library(ggplot2)
library(GGally)
library(reshape2)
library(lme4)
library(compiler)
library(parallel)
library(boot)
library(lattice)
Read Cancer data link
Base de datos en Google-Sheet
ss= "https://docs.google.com/spreadsheets/d/11pMP8yY7VNi3g6UAYC7J5-do3QMjQCAlDDgtAUEn9ec/edit?usp=sharing"
hoja="cancer"
rango="A2:AA8527"
can <- read_sheet(ss,
sheet=hoja,
range=rango,
col_names = TRUE,
col_types = NULL,
na= "NA")
dim(can)
## [1] 8525 27
can <- within(can,
{
Married <- factor(Married, levels = 0:1, labels = c("no", "yes"))
DID <- factor(DID)
HID <- factor(HID)
FamilyHx <- factor(FamilyHx)
SmokingHx <- factor(SmokingHx)
Sex <- factor(Sex)
CancerStage <- factor(CancerStage)
School <- factor(School)
}
)
str(can)
## tibble [8,525 x 27] (S3: tbl_df/tbl/data.frame)
## $ remission : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
## $ ntumors : num [1:8525] 0 0 0 0 0 0 0 0 2 0 ...
## $ tumorsize : num [1:8525] 68 64.7 51.6 86.4 53.4 ...
## $ co2 : num [1:8525] 1.53 1.68 1.53 1.45 1.57 ...
## $ pain : num [1:8525] 4 2 6 3 3 4 3 3 4 5 ...
## $ wound : num [1:8525] 4 3 3 3 4 5 4 3 4 4 ...
## $ mobility : num [1:8525] 2 2 2 2 2 2 2 3 3 3 ...
## $ nmorphine : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
## $ lungcapacity: num [1:8525] 0.801 0.326 0.565 0.848 0.886 ...
## $ Age : num [1:8525] 65 53.9 53.3 41.4 46.8 ...
## $ Married : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 2 1 2 1 ...
## $ FamilyHx : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SmokingHx : Factor w/ 3 levels "a.former","current",..: 1 1 3 1 3 3 2 1 1 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 1 2 2 2 ...
## $ CancerStage : Factor w/ 4 levels "I","II","III",..: 2 2 2 1 2 1 2 2 2 2 ...
## $ LengthofStay: num [1:8525] 6 6 5 5 6 5 4 5 6 7 ...
## $ WBC : num [1:8525] 6088 6700 6043 7163 6443 ...
## $ RBC : num [1:8525] 4.87 4.68 5.01 5.27 4.98 ...
## $ BMI : num [1:8525] 24.1 29.4 29.5 21.6 29.8 ...
## $ IL6 : num [1:8525] 3.7 2.63 13.9 3.01 3.89 ...
## $ CRP : num [1:8525] 8.086 0.803 4.034 2.126 1.349 ...
## $ DID : Factor w/ 407 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Experience : num [1:8525] 25 25 25 25 25 25 25 25 25 25 ...
## $ School : Factor w/ 2 levels "average","top": 1 1 1 1 1 1 1 1 1 1 ...
## $ Lawsuits : num [1:8525] 3 3 3 3 3 3 3 3 3 3 ...
## $ HID : Factor w/ 35 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Medicaid : num [1:8525] 0.606 0.606 0.606 0.606 0.606 ...
ggpairs(can[ , c("IL6", "CRP", "LengthofStay", "Experience")])
tmp <- melt(can[, c("remission", "IL6", "CRP", "LengthofStay", "Experience")], id.vars="remission")
ggplot(tmp, aes(factor(remission), y = value, fill=factor(remission))) +
geom_boxplot() +
facet_wrap(~variable, scales="free_y")
ntumorsp <- ggplot(aes(y=ntumors, x= Age, col=FamilyHx), data=can)
p +
geom_point() +
geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'
##### Deviance (Poisson errors)
p <- ggplot(aes(y=ntumors, x= Age, col=FamilyHx), data=can)
p +
geom_point() +
geom_smooth(method="glm", method.args=list(family="poisson"))
## `geom_smooth()` using formula 'y ~ x'
ntumorslm <- lm(ntumors ~ FamilyHx * Age,
data = can)
#
summary(lm)
##
## Call:
## lm(formula = ntumors ~ FamilyHx * Age, data = can)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1453 -1.6114 -0.3041 1.3434 7.3265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.180688 0.208082 -5.674 1.44e-08 ***
## FamilyHxyes 3.568717 0.472595 7.551 4.75e-14 ***
## Age 0.069948 0.004055 17.248 < 2e-16 ***
## FamilyHxyes:Age -0.003145 0.009177 -0.343 0.732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.107 on 8521 degrees of freedom
## Multiple R-squared: 0.3177, Adjusted R-squared: 0.3174
## F-statistic: 1322 on 3 and 8521 DF, p-value: < 2.2e-16
m <- glm(ntumors ~ FamilyHx * Age,
data = can,
family = poisson)
#
summary(m)
##
## Call:
## glm(formula = ntumors ~ FamilyHx * Age, family = poisson, data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5042 -1.0872 -0.1826 0.7301 3.8796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.639799 0.065199 -9.813 <2e-16 ***
## FamilyHxyes 1.807329 0.106564 16.960 <2e-16 ***
## Age 0.029269 0.001243 23.553 <2e-16 ***
## FamilyHxyes:Age -0.017760 0.002043 -8.694 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 19849 on 8524 degrees of freedom
## Residual deviance: 14815 on 8521 degrees of freedom
## AIC: 36096
##
## Number of Fisher Scoring iterations: 5
m <- glm(ntumors ~ FamilyHx + (Age * CancerStage),
data = can,
family = poisson)
#
summary(m)
##
## Call:
## glm(formula = ntumors ~ FamilyHx + (Age * CancerStage), family = poisson,
## data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5016 -0.9363 -0.1095 0.6683 3.7067
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.233165 0.112919 2.065 0.038933 *
## FamilyHxyes 0.805729 0.013098 61.516 < 2e-16 ***
## Age 0.009773 0.002345 4.168 3.08e-05 ***
## CancerStageII 0.135405 0.147401 0.919 0.358298
## CancerStageIII 0.438324 0.167670 2.614 0.008943 **
## CancerStageIV 0.690586 0.179630 3.845 0.000121 ***
## Age:CancerStageII -0.002011 0.002984 -0.674 0.500269
## Age:CancerStageIII -0.003468 0.003289 -1.054 0.291680
## Age:CancerStageIV -0.001076 0.003397 -0.317 0.751366
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 19849 on 8524 degrees of freedom
## Residual deviance: 13791 on 8516 degrees of freedom
## AIC: 35081
##
## Number of Fisher Scoring iterations: 5
RemissionremissionSexAgep <- ggplot(aes(y= remission, x= Age, col= Sex ), data=can)
p +
geom_smooth(method="glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'
m <- glm(remission ~ Sex * Age,
data = can,
family = binomial)
summary(m)
##
## Call:
## glm(formula = remission ~ Sex * Age, family = binomial, data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1367 -0.8588 -0.7890 1.4588 1.9013
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.884523 0.252706 3.500 0.000465 ***
## Sexmale -0.184346 0.397621 -0.464 0.642918
## Age -0.035000 0.004974 -7.036 1.98e-12 ***
## Sexmale:Age 0.004734 0.007821 0.605 0.545013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 10276 on 8521 degrees of freedom
## AIC: 10284
##
## Number of Fisher Scoring iterations: 4
remissionCancerStageAgep <- ggplot(aes(y= remission, x= Age, col= CancerStage ), data=can)
p +
geom_smooth(method="glm", method.args=list(family="binomial"))
## `geom_smooth()` using formula 'y ~ x'
m <- glm(remission ~ CancerStage * Age,
data = can,
family = binomial)
summary(m)
##
## Call:
## glm(formula = remission ~ CancerStage * Age, family = binomial,
## data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0687 -0.8598 -0.7594 1.3786 2.1709
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.009950 0.347183 0.029 0.9771
## CancerStageII -0.791813 0.482402 -1.641 0.1007
## CancerStageIII -0.030048 0.650567 -0.046 0.9632
## CancerStageIV -2.581626 1.172406 -2.202 0.0277 *
## Age -0.010305 0.007263 -1.419 0.1559
## CancerStageII:Age 0.009818 0.009767 1.005 0.3148
## CancerStageIII:Age -0.011297 0.012618 -0.895 0.3706
## CancerStageIV:Age 0.017321 0.020983 0.825 0.4091
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 10039 on 8517 degrees of freedom
## AIC: 10055
##
## Number of Fisher Scoring iterations: 4
m <- glm(remission ~ Sex + Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience,
data = can,
family = binomial)
#
summary(m)
##
## Call:
## glm(formula = remission ~ Sex + Age + FamilyHx + IL6 + CRP +
## CancerStage + LengthofStay + Experience, family = binomial,
## data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5402 -0.8828 -0.6541 1.2331 2.5875
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.045183 0.251837 -4.150 3.32e-05 ***
## Sexmale 0.054308 0.050342 1.079 0.280685
## Age -0.010407 0.004652 -2.237 0.025284 *
## FamilyHxyes -0.881383 0.074276 -11.866 < 2e-16 ***
## IL6 -0.039059 0.008939 -4.369 1.25e-05 ***
## CRP -0.018408 0.008091 -2.275 0.022896 *
## CancerStageII -0.209446 0.059625 -3.513 0.000444 ***
## CancerStageIII -0.522096 0.078493 -6.652 2.90e-11 ***
## CancerStageIV -1.438941 0.131152 -10.972 < 2e-16 ***
## LengthofStay -0.034507 0.028092 -1.228 0.219313
## Experience 0.086535 0.006179 14.004 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 9654 on 8514 degrees of freedom
## AIC: 9676
##
## Number of Fisher Scoring iterations: 4
mm <- glmer(remission ~ Sex + Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience +
(1 | DID),
data = can,
family = binomial,
control = glmerControl(optimizer = "bobyqa"),
nAGQ = 10)
#
summary(mm, cor=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Adaptive
## Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
## Family: binomial ( logit )
## Formula: remission ~ Sex + Age + FamilyHx + IL6 + CRP + CancerStage +
## LengthofStay + Experience + (1 | DID)
## Data: can
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 7189.5 7274.1 -3582.8 7165.5 8513
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2500 -0.4220 -0.1861 0.3563 7.8029
##
## Random effects:
## Groups Name Variance Std.Dev.
## DID (Intercept) 4.278 2.068
## Number of obs: 8525, groups: DID, 407
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.656352 0.590981 -2.803 0.00507 **
## Sexmale 0.063142 0.065863 0.959 0.33772
## Age -0.016086 0.006067 -2.651 0.00802 **
## FamilyHxyes -1.308113 0.095559 -13.689 < 2e-16 ***
## IL6 -0.058460 0.011740 -4.980 6.37e-07 ***
## CRP -0.023302 0.010384 -2.244 0.02483 *
## CancerStageII -0.323973 0.078594 -4.122 3.75e-05 ***
## CancerStageIII -0.865806 0.102722 -8.429 < 2e-16 ***
## CancerStageIV -2.166400 0.165944 -13.055 < 2e-16 ***
## LengthofStay -0.041568 0.036457 -1.140 0.25421
## Experience 0.125855 0.028155 4.470 7.82e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mm <- glmer(remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience +
(1 + LengthofStay | DID),
data = can,
family = binomial,
control = glmerControl(optimizer = "bobyqa"),
nAGQ = 1)
summary(mm, cor= FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay +
## Experience + (1 + LengthofStay | DID)
## Data: can
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 7165.5 7257.1 -3569.7 7139.5 8512
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1986 -0.4207 -0.1806 0.3652 7.9920
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## DID (Intercept) 1.3344 1.1552
## LengthofStay 0.1258 0.3547 -0.14
## Number of obs: 8525, groups: DID, 407
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.693966 0.562758 -1.233 0.217520
## Age -0.015508 0.005961 -2.601 0.009284 **
## FamilyHxyes -1.344064 0.095553 -14.066 < 2e-16 ***
## IL6 -0.058416 0.011534 -5.065 4.09e-07 ***
## CRP -0.021955 0.010197 -2.153 0.031313 *
## CancerStageII -0.302916 0.076624 -3.953 7.71e-05 ***
## CancerStageIII -0.869792 0.101627 -8.559 < 2e-16 ***
## CancerStageIV -2.289410 0.169943 -13.472 < 2e-16 ***
## LengthofStay -0.169884 0.045198 -3.759 0.000171 ***
## Experience 0.106703 0.025551 4.176 2.97e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mm <- glmer(remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay + Experience +
(1 + LengthofStay | DID) +
(1|HID),
data = can,
family = binomial,
nAGQ = 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 1.83149 (tol = 0.002, component 1)
summary(mm, cor=FALSE)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: remission ~ Age + FamilyHx + IL6 + CRP + CancerStage + LengthofStay +
## Experience + (1 + LengthofStay | DID) + (1 | HID)
## Data: can
##
## AIC BIC logLik deviance df.resid
## 7147.8 7246.5 -3559.9 7119.8 8511
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4853 -0.4217 -0.1780 0.3630 8.3223
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## DID (Intercept) 0.2558 0.5058
## LengthofStay 0.1380 0.3715 -0.10
## HID (Intercept) 0.5328 0.7299
## Number of obs: 8525, groups: DID, 407; HID, 35
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.54691 0.54608 -1.002 0.316577
## Age -0.01528 0.00595 -2.568 0.010239 *
## FamilyHxyes -1.34014 0.09544 -14.042 < 2e-16 ***
## IL6 -0.05863 0.01150 -5.096 3.47e-07 ***
## CRP -0.02105 0.01018 -2.069 0.038571 *
## CancerStageII -0.29409 0.07639 -3.850 0.000118 ***
## CancerStageIII -0.86536 0.10157 -8.520 < 2e-16 ***
## CancerStageIV -2.29499 0.17097 -13.424 < 2e-16 ***
## LengthofStay -0.18974 0.04524 -4.194 2.74e-05 ***
## Experience 0.10459 0.02392 4.373 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 1.83149 (tol = 0.002, component 1)