Packages

# Install any missing package, then attach it.
# requireNamespace() only checks availability and does not attach, so each
# package is attached exactly once by the library() calls below, which stop
# with an error if the installation did not succeed.
if (!requireNamespace("googlesheets4", quietly = TRUE)) {
  install.packages("googlesheets4")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(googlesheets4)  # read_sheet()
library(ggplot2)        # ggplot(), geom_*(), position_jitter()

Data import

Read the cancer data from the Google Sheets link
_________________________________________________________

Base de datos en Google-Sheet
_________________________________________________________

# Where the data lives: spreadsheet URL, worksheet name and cell range.
ss <- "https://docs.google.com/spreadsheets/d/11pMP8yY7VNi3g6UAYC7J5-do3QMjQCAlDDgtAUEn9ec/edit?usp=sharing"
hoja <- "cancer"
rango <- "A2:AA8527"

# Read that range into `can`. The first row of the range supplies the column
# names, column types are left for read_sheet() to guess (col_types = NULL),
# and cells containing "NA" are treated as missing.
can <- read_sheet(
  ss,
  sheet = hoja,
  range = rango,
  col_names = TRUE,
  col_types = NULL,
  na = "NA"
)

# Rows and columns actually read.
dim(can)
## [1] 8525   27

Edit/reformat database

# Convert the categorical columns of `can` to factors, one column at a time.
# Married is recoded from the values 0/1 to the labels "no"/"yes"; the other
# columns keep their existing values as factor levels.
can$Married <- factor(can$Married, levels = 0:1, labels = c("no", "yes"))
can$DID <- factor(can$DID)
can$HID <- factor(can$HID)
can$FamilyHx <- factor(can$FamilyHx)
can$SmokingHx <- factor(can$SmokingHx)
can$Sex <- factor(can$Sex)
can$CancerStage <- factor(can$CancerStage)
can$School <- factor(can$School)

# Inspect the resulting column types.
str(can)
## tibble [8,525 x 27] (S3: tbl_df/tbl/data.frame)
##  $ remission   : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ntumors     : num [1:8525] 0 0 0 0 0 0 0 0 2 0 ...
##  $ tumorsize   : num [1:8525] 68 64.7 51.6 86.4 53.4 ...
##  $ co2         : num [1:8525] 1.53 1.68 1.53 1.45 1.57 ...
##  $ pain        : num [1:8525] 4 2 6 3 3 4 3 3 4 5 ...
##  $ wound       : num [1:8525] 4 3 3 3 4 5 4 3 4 4 ...
##  $ mobility    : num [1:8525] 2 2 2 2 2 2 2 3 3 3 ...
##  $ nmorphine   : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lungcapacity: num [1:8525] 0.801 0.326 0.565 0.848 0.886 ...
##  $ Age         : num [1:8525] 65 53.9 53.3 41.4 46.8 ...
##  $ Married     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 2 1 2 1 ...
##  $ FamilyHx    : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ SmokingHx   : Factor w/ 3 levels "a.former","current",..: 1 1 3 1 3 3 2 1 1 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 1 2 2 2 ...
##  $ CancerStage : Factor w/ 4 levels "I","II","III",..: 2 2 2 1 2 1 2 2 2 2 ...
##  $ LengthofStay: num [1:8525] 6 6 5 5 6 5 4 5 6 7 ...
##  $ WBC         : num [1:8525] 6088 6700 6043 7163 6443 ...
##  $ RBC         : num [1:8525] 4.87 4.68 5.01 5.27 4.98 ...
##  $ BMI         : num [1:8525] 24.1 29.4 29.5 21.6 29.8 ...
##  $ IL6         : num [1:8525] 3.7 2.63 13.9 3.01 3.89 ...
##  $ CRP         : num [1:8525] 8.086 0.803 4.034 2.126 1.349 ...
##  $ DID         : Factor w/ 407 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Experience  : num [1:8525] 25 25 25 25 25 25 25 25 25 25 ...
##  $ School      : Factor w/ 2 levels "average","top": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lawsuits    : num [1:8525] 3 3 3 3 3 3 3 3 3 3 ...
##  $ HID         : Factor w/ 35 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Medicaid    : num [1:8525] 0.606 0.606 0.606 0.606 0.606 ...
# Distribution of WBC.
hist(can$WBC)

# First few values of the two variables plotted below.
head(can$WBC)
## [1] 6087.649 6700.310 6042.809 7162.697 6443.440 6800.549
head(can$Age)
## [1] 64.96824 53.91714 53.34730 41.36804 46.80042 51.92936
# Scatter plot of WBC against Age. The formula uses column names together with
# `data = can` (rather than can$WBC ~ can$Age) so the axis labels read
# "Age"/"WBC" and the fitted model below is not tied to the global vectors.
plot(WBC ~ Age, data = can)

# Same scatter plot with solid points, plus the least-squares line.
plot(WBC ~ Age, data = can, pch = 20)
# Fitting with `data = can` keeps `ml` usable with predict(newdata = ...),
# which a formula written with `can$` columns would silently ignore.
ml <- lm(WBC ~ Age, data = can)
abline(ml)

# WBC by remission status, coloured by status: raw points only.
ggplot(
  data = can,
  mapping = aes(
    x = as.factor(remission),
    y = WBC,
    col = as.factor(remission)
  )
) +
  geom_point()

# Same plot with the points jittered to reduce overplotting.
ggplot(
  data = can,
  mapping = aes(
    x = as.factor(remission),
    y = WBC,
    col = as.factor(remission)
  )
) +
  geom_point(position = position_jitter(width = 0.2, height = 0.03))

# Box plots first, jittered points drawn on top of them.
ggplot(
  data = can,
  mapping = aes(
    x = as.factor(remission),
    y = WBC,
    col = as.factor(remission)
  )
) +
  geom_boxplot() +
  geom_point(position = position_jitter(width = 0.2, height = 0.03))

# Layers reversed: jittered points first, box plots drawn on top of them.
ggplot(
  data = can,
  mapping = aes(
    x = as.factor(remission),
    y = WBC,
    col = as.factor(remission)
  )
) +
  geom_point(position = position_jitter(width = 0.2, height = 0.03)) +
  geom_boxplot()

# Two-sided Welch t-test comparing mean WBC between the two remission groups.
t.test(WBC ~ remission, data=can)
## 
##  Welch Two Sample t-test
## 
## data:  WBC by remission
## t = -1.6783, df = 4734.2, p-value = 0.09335
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -85.902778   6.660036
## sample estimates:
## mean in group 0 mean in group 1 
##        5985.863        6025.485
# One-sided version: the alternative is that the mean in group 0 is lower than
# the mean in group 1 (difference in means less than 0).
# NOTE(review): this direction matches the sign seen in the two-sided result
# above; confirm the one-sided hypothesis was chosen before looking at the data.
t.test(WBC ~ remission, data=can, alternative= "less")
## 
##  Welch Two Sample t-test
## 
## data:  WBC by remission
## t = -1.6783, df = 4734.2, p-value = 0.04667
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##        -Inf -0.7831199
## sample estimates:
## mean in group 0 mean in group 1 
##        5985.863        6025.485
# Cross-tabulate CancerStage (rows) against remission (columns).
table(can$CancerStage, can$remission)
##      
##          0    1
##   I   1580  978
##   II  2357 1052
##   III 1301  404
##   IV   766   87
# Store the same contingency table so it can be passed to chisq.test() below.
etapaC <- table(can$CancerStage, can$remission)
etapaC
##      
##          0    1
##   I   1580  978
##   II  2357 1052
##   III 1301  404
##   IV   766   87
# Pearson chi-squared test of independence between CancerStage and remission.
chisq.test(etapaC)
## 
##  Pearson's Chi-squared test
## 
## data:  etapaC
## X-squared = 276.83, df = 3, p-value < 2.2e-16
# Standardized residuals per cell, to see which cells depart most from
# independence (the test is recomputed here just to extract them).
chisq.test(etapaC)$stdres
##      
##                0          1
##   I   -11.473145  11.473145
##   II   -2.126563   2.126563
##   III   5.944949  -5.944949
##   IV   13.068985 -13.068985
# Contingency table of FamilyHx (rows) against remission (columns).
FamHx <- table(can$FamilyHx, can$remission)
FamHx
##      
##          0    1
##   no  4565 2255
##   yes 1439  266
# Chi-squared test of independence; for this 2 x 2 table chisq.test() applies
# Yates' continuity correction, as the output header shows.
chisq.test(FamHx)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  FamHx
## X-squared = 198.89, df = 1, p-value < 2.2e-16
# Standardized residuals per cell (test recomputed to extract them).
chisq.test(FamHx)$stdres
##      
##              0        1
##   no  -14.1326  14.1326
##   yes  14.1326 -14.1326
# Contingency table of SmokingHx (rows) against remission (columns).
SmokeHx <- table(can$SmokingHx, can$remission)
SmokeHx
##           
##               0    1
##   a.former 1190  515
##   current  1140  565
##   never    3674 1441
# Pearson chi-squared test of independence between SmokingHx and remission.
chisq.test(SmokeHx)
## 
##  Pearson's Chi-squared test
## 
## data:  SmokeHx
## X-squared = 15.551, df = 2, p-value = 0.0004199
# Standardized residuals per cell (test recomputed to extract them).
chisq.test(SmokeHx)$stdres
##           
##                    0         1
##   a.former -0.640773  0.640773
##   current  -3.607315  3.607315
##   never     3.468549 -3.468549
# ntumors against Age: raw points.
ggplot(data = can, mapping = aes(x = Age, y = ntumors)) +
  geom_point()

# Very light jitter in both directions.
ggplot(data = can, mapping = aes(x = Age, y = ntumors)) +
  geom_point(position = position_jitter(height = 0.02, width = 0.02))

# Stronger jitter, so the integer-valued counts separate visibly.
ggplot(data = can, mapping = aes(x = Age, y = ntumors)) +
  geom_point(position = position_jitter(height = 0.2, width = 0.2))

# Same jittered points with a straight-line (lm) fit overlaid.
ggplot(data = can, mapping = aes(x = Age, y = ntumors)) +
  geom_point(position = position_jitter(height = 0.2, width = 0.2)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Same jittered points with a Poisson GLM fit overlaid instead.
ggplot(data = can, mapping = aes(x = Age, y = ntumors)) +
  geom_point(position = position_jitter(height = 0.2, width = 0.2)) +
  geom_smooth(method = "glm", method.args = list(family = "poisson"))
## `geom_smooth()` using formula 'y ~ x'

# Distribution of ntumors itself.
hist(can$ntumors)

# Distribution of the residuals from a linear fit of ntumors on Age.
hist(lm(can$ntumors ~ can$Age)$residuals)

# Experience plotted against row index, first with the default symbol and
# then with solid points.
plot(can$Experience)

plot(can$Experience, pch = 20)

# First and last values of Experience. The earlier attempt used the misspelled
# column name `Experoi`, which only produced an "Unknown or uninitialised
# column" warning and NULL; it is dropped in favour of the correct name.
head(can$Experience)
## [1] 25 25 25 25 25 25
tail(can$Experience)
## [1] 24 24 24 24 24 24
# Cross-tabulate Experience (rows) against remission (columns).
table(can$Experience, can$remission)
##     
##        0   1
##   7   38   0
##   8   12  16
##   9  127  25
##   10  93  18
##   11 164  24
##   12 233  40
##   13 422 113
##   14 521 136
##   15 516 210
##   16 561 203
##   17 448 190
##   18 621 254
##   19 559 241
##   20 373 231
##   21 394 217
##   22 304 179
##   23 239 174
##   24 130 113
##   25  84  66
##   26  47  29
##   27  63  19
##   28  37  16
##   29  18   7
# Store the Experience x remission contingency table for the tests below.
# NOTE(review): `exp` is also the name of base R's exponential function. Calls
# such as exp(1) still resolve to the function, but a distinct name would be
# clearer; rename only after checking for later uses of this object.
exp <- table(can$Experience, can$remission)
# Pearson chi-squared test of independence between Experience and remission.
chisq.test(exp)
## 
##  Pearson's Chi-squared test
## 
## data:  exp
## X-squared = 279.04, df = 22, p-value < 2.2e-16
# Standardized residuals per cell (test recomputed to extract them).
chisq.test(exp)$stdres
##     
##                0           1
##   7   4.00339066 -4.00339066
##   8  -3.20209117  3.20209117
##   9   3.57765238 -3.57765238
##   10  3.10355519 -3.10355519
##   11  5.10587712 -5.10587712
##   12  5.49035579 -5.49035579
##   13  4.42397972 -4.42397972
##   14  5.18670973 -5.18670973
##   15  0.39890476 -0.39890476
##   16  1.90507816 -1.90507816
##   17 -0.12010098  0.12010098
##   18  0.37173094 -0.37173094
##   19 -0.36014338  0.36014338
##   20 -4.84553706  4.84553706
##   21 -3.34129127  3.34129127
##   22 -3.71281602  3.71281602
##   23 -5.73320646  5.73320646
##   24 -5.86722426  5.86722426
##   25 -3.90659835  3.90659835
##   26 -1.64752583  1.64752583
##   27  1.27628902 -1.27628902
##   28 -0.09870667  0.09870667
##   29  0.17246690 -0.17246690
# Logistic regression of remission on Experience.
m <- glm(
  remission ~ Experience,
  family = binomial,
  data = can
)
summary(m)
## 
## Call:
## glm(formula = remission ~ Experience, family = binomial, data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1909  -0.8690  -0.7568   1.3757   1.9203  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.320558   0.111349  -20.84   <2e-16 ***
## Experience   0.081116   0.005985   13.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance: 10164  on 8523  degrees of freedom
## AIC: 10168
## 
## Number of Fisher Scoring iterations: 4
# Base plot of remission against Experience; printed alone it has no layers.
p <- ggplot(data = can, mapping = aes(x = Experience, y = remission))
p

# Jittered points, mostly vertical jitter.
p +
  geom_point(position = position_jitter(height = 0.2, width = 0.02))

# Same, with points coloured by remission status.
p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.2, width = 0.02)
  )

# Less vertical jitter; the next four plots try horizontal widths
# 0.2, 0.5, 0.3 and 0.4 in turn.
p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.2)
  )

p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.5)
  )

p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.3)
  )

p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.4)
  )

# Final version: width 0.4 plus a fitted binomial GLM curve.
p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.4)
  ) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'

# Same jittered plot and binomial GLM curve, now with Age on the x axis.
p <- ggplot(data = can, mapping = aes(x = Age, y = remission))
p +
  geom_point(
    mapping = aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.4)
  ) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'

# Logistic regression of remission on Age.
m <- glm(
  remission ~ Age,
  family = binomial,
  data = can
)
summary(m)
## 
## Call:
## glm(formula = remission ~ Age, family = binomial, data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1517  -0.8570  -0.7889   1.4594   1.9129  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.810430   0.195069   4.155 3.26e-05 ***
## Age         -0.033095   0.003838  -8.624  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance: 10277  on 8523  degrees of freedom
## AIC: 10281
## 
## Number of Fisher Scoring iterations: 4
# Logistic regression of remission on Age and Experience together.
m <- glm(
  remission ~ Age + Experience,
  family = binomial,
  data = can
)
summary(m)
## 
## Call:
## glm(formula = remission ~ Age + Experience, family = binomial, 
##     data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3586  -0.8615  -0.7355   1.3546   2.1027  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.579138   0.221016  -2.620  0.00878 ** 
## Age         -0.035050   0.003889  -9.012  < 2e-16 ***
## Experience   0.083113   0.006024  13.796  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10353  on 8524  degrees of freedom
## Residual deviance: 10081  on 8522  degrees of freedom
## AIC: 10087
## 
## Number of Fisher Scoring iterations: 4
# Logistic regression of remission on Age, Experience and CancerStage
# (CancerStage enters as a factor, so each stage gets its own coefficient
# relative to the first level).
m <- glm(
  remission ~ Age + Experience + CancerStage,
  family = binomial,
  data = can
)
summary(m)
## 
## Call:
## glm(formula = remission ~ Age + Experience + CancerStage, family = binomial, 
##     data = can)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3917  -0.8766  -0.7135   1.2908   2.3318  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.565954   0.234559  -6.676 2.45e-11 ***
## Age            -0.008714   0.004346  -2.005   0.0449 *  
## Experience      0.084201   0.006103  13.796  < 2e-16 ***
## CancerStageII  -0.301294   0.057706  -5.221 1.78e-07 ***
## CancerStageIII -0.659473   0.075110  -8.780  < 2e-16 ***
## CancerStageIV  -1.646902   0.126937 -12.974  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10352.6  on 8524  degrees of freedom
## Residual deviance:  9846.3  on 8519  degrees of freedom
## AIC: 9858.3
## 
## Number of Fisher Scoring iterations: 4
# Range and first values of Medicaid: it only spans about 0.14 to 0.82.
range(can$Medicaid)
## [1] 0.1415814 0.8187299
head(can$Medicaid)
## [1] 0.6058667 0.6058667 0.6058667 0.6058667 0.6058667 0.6058667
# Remission against Medicaid with a fitted binomial GLM curve.
# position_jitter() widths are in data units and applied in both directions,
# so the width of 0.4 used for the Age/Experience plots would spread the
# points over more than the whole Medicaid range. A small width (0.01) keeps
# the points close to their true x position while still reducing overplotting.
p <- ggplot(aes(x = Medicaid, y = remission), data = can)
p +
  geom_point(
    aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.01)
  ) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'

# Medicaid by remission status: base plot with the axes swapped relative to
# the previous one (status on x, Medicaid on y).
p <- ggplot(data = can, mapping = aes(x = as.factor(remission), y = Medicaid))
p +
  geom_point()

# Wide horizontal jitter.
p +
  geom_point(position = position_jitter(width = 0.5))

# Narrow horizontal jitter.
p +
  geom_point(position = position_jitter(width = 0.02))

# Narrow jitter with box plots drawn on top of the points.
p +
  geom_point(position = position_jitter(width = 0.02)) +
  geom_boxplot()

# Two-sided Welch t-test comparing mean Medicaid between the remission groups.
t.test(can$Medicaid ~ as.factor(can$remission))
## 
##  Welch Two Sample t-test
## 
## data:  can$Medicaid by as.factor(can$remission)
## t = -5.4477, df = 4748.6, p-value = 5.361e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03631188 -0.01709301
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5046166       0.5313190
# One-sided version: the alternative is that the mean in group 0 is lower than
# the mean in group 1 (difference in means less than 0).
# NOTE(review): this direction matches the sign seen in the two-sided result
# above; confirm the one-sided hypothesis was chosen before looking at the data.
t.test(can$Medicaid ~ as.factor(can$remission), alternative="less")
## 
##  Welch Two Sample t-test
## 
## data:  can$Medicaid by as.factor(can$remission)
## t = -5.4477, df = 4748.6, p-value = 2.68e-08
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##         -Inf -0.01863844
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5046166       0.5313190
# Medicaid by remission status once more: narrowly jittered points with box
# plots drawn on top.
p +
  geom_point(position = position_jitter(width = 0.02)) +
  geom_boxplot()