# Packages
# Install any package that is missing, then attach both.
# requireNamespace() only checks that a package can be loaded; it does not
# attach it, so each package is attached exactly once by library() below.
if (!requireNamespace("googlesheets4", quietly = TRUE)) {
  install.packages("googlesheets4")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(googlesheets4)
library(ggplot2)
# Read the cancer data from the Google Sheets link
# _________________________________________________________
# _________________________________________________________
# Location of the data: spreadsheet URL, worksheet name and cell range.
# NOTE(review): if this sheet is shared publicly, calling gs4_deauth() first
# would skip the Google sign-in prompt -- confirm the sharing settings.
ss <- "https://docs.google.com/spreadsheets/d/11pMP8yY7VNi3g6UAYC7J5-do3QMjQCAlDDgtAUEn9ec/edit?usp=sharing"
hoja <- "cancer"
rango <- "A2:AA8527"

# Read the range into `can`. The first row of the range supplies the column
# names, column types are left to be guessed (col_types = NULL), and cells
# holding the string "NA" are read as missing values.
can <- read_sheet(
  ss,
  sheet = hoja,
  range = rango,
  col_names = TRUE,
  col_types = NULL,
  na = "NA"
)
dim(can)
## [1] 8525 27
# Recode the categorical columns of `can` as factors. Married is coded 0/1
# and gets explicit "no"/"yes" labels; the other columns keep their observed
# values as factor levels.
can$Married <- factor(can$Married, levels = 0:1, labels = c("no", "yes"))
factor_cols <- c(
  "DID", "HID", "FamilyHx", "SmokingHx", "Sex", "CancerStage", "School"
)
can[factor_cols] <- lapply(can[factor_cols], factor)
str(can)
## tibble [8,525 x 27] (S3: tbl_df/tbl/data.frame)
## $ remission : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
## $ ntumors : num [1:8525] 0 0 0 0 0 0 0 0 2 0 ...
## $ tumorsize : num [1:8525] 68 64.7 51.6 86.4 53.4 ...
## $ co2 : num [1:8525] 1.53 1.68 1.53 1.45 1.57 ...
## $ pain : num [1:8525] 4 2 6 3 3 4 3 3 4 5 ...
## $ wound : num [1:8525] 4 3 3 3 4 5 4 3 4 4 ...
## $ mobility : num [1:8525] 2 2 2 2 2 2 2 3 3 3 ...
## $ nmorphine : num [1:8525] 0 0 0 0 0 0 0 0 0 0 ...
## $ lungcapacity: num [1:8525] 0.801 0.326 0.565 0.848 0.886 ...
## $ Age : num [1:8525] 65 53.9 53.3 41.4 46.8 ...
## $ Married : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 2 1 2 1 ...
## $ FamilyHx : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ SmokingHx : Factor w/ 3 levels "a.former","current",..: 1 1 3 1 3 3 2 1 1 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 2 2 2 1 2 2 2 ...
## $ CancerStage : Factor w/ 4 levels "I","II","III",..: 2 2 2 1 2 1 2 2 2 2 ...
## $ LengthofStay: num [1:8525] 6 6 5 5 6 5 4 5 6 7 ...
## $ WBC : num [1:8525] 6088 6700 6043 7163 6443 ...
## $ RBC : num [1:8525] 4.87 4.68 5.01 5.27 4.98 ...
## $ BMI : num [1:8525] 24.1 29.4 29.5 21.6 29.8 ...
## $ IL6 : num [1:8525] 3.7 2.63 13.9 3.01 3.89 ...
## $ CRP : num [1:8525] 8.086 0.803 4.034 2.126 1.349 ...
## $ DID : Factor w/ 407 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Experience : num [1:8525] 25 25 25 25 25 25 25 25 25 25 ...
## $ School : Factor w/ 2 levels "average","top": 1 1 1 1 1 1 1 1 1 1 ...
## $ Lawsuits : num [1:8525] 3 3 3 3 3 3 3 3 3 3 ...
## $ HID : Factor w/ 35 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Medicaid : num [1:8525] 0.606 0.606 0.606 0.606 0.606 ...
# Distribution of WBC and the first values of WBC and Age.
hist(can$WBC)
head(can$WBC)
## [1] 6087.649 6700.310 6042.809 7162.697 6443.440 6800.549
head(can$Age)
## [1] 64.96824 53.91714 53.34730 41.36804 46.80042 51.92936

# Scatter plot of WBC against Age (default symbols, then solid dots) with the
# least-squares line overlaid. The formulas use `data = can` rather than
# `can$...` so that axis labels and coefficient names read "WBC"/"Age" and the
# fitted model can be used with predict(newdata = ...).
plot(WBC ~ Age, data = can)
plot(WBC ~ Age, data = can, pch = 20)
ml <- lm(WBC ~ Age, data = can)
abline(ml)
# WBC by remission status: raw points, jittered points, then box plots drawn
# first under and then over the jittered points.
wbc_by_remission <- ggplot(
  data = can,
  mapping = aes(
    x = as.factor(remission),
    y = WBC,
    colour = as.factor(remission)
  )
)
# Builds a fresh jittered point layer on every call.
wbc_jitter <- function() {
  geom_point(position = position_jitter(width = 0.2, height = 0.03))
}
wbc_by_remission + geom_point()
wbc_by_remission + wbc_jitter()
wbc_by_remission + geom_boxplot() + wbc_jitter()
wbc_by_remission + wbc_jitter() + geom_boxplot()

# Two-sided Welch t-test comparing mean WBC between the remission groups.
t.test(WBC ~ remission, data = can)
##
## Welch Two Sample t-test
##
## data: WBC by remission
## t = -1.6783, df = 4734.2, p-value = 0.09335
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -85.902778 6.660036
## sample estimates:
## mean in group 0 mean in group 1
## 5985.863 6025.485
# One-sided version of the same Welch test: the alternative is that the
# difference in mean WBC between remission group 0 and group 1 is less than 0.
t.test(WBC ~ remission, data=can, alternative= "less")
##
## Welch Two Sample t-test
##
## data: WBC by remission
## t = -1.6783, df = 4734.2, p-value = 0.04667
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.7831199
## sample estimates:
## mean in group 0 mean in group 1
## 5985.863 6025.485
# Cross-tabulate cancer stage (rows) against remission (columns). The table is
# built once and stored so the test below can reuse it.
etapaC <- table(can$CancerStage, can$remission)
etapaC
##
## 0 1
## I 1580 978
## II 2357 1052
## III 1301 404
## IV 766 87

# Pearson chi-squared test on the table. The result is stored so that the
# standardized residuals come from the same object instead of a second run.
etapaC_chisq <- chisq.test(etapaC)
etapaC_chisq
##
## Pearson's Chi-squared test
##
## data: etapaC
## X-squared = 276.83, df = 3, p-value < 2.2e-16
etapaC_chisq$stdres
##
## 0 1
## I -11.473145 11.473145
## II -2.126563 2.126563
## III 5.944949 -5.944949
## IV 13.068985 -13.068985
# Family history (rows) against remission (columns).
FamHx <- table(can$FamilyHx, can$remission)
FamHx
##
## 0 1
## no 4565 2255
## yes 1439 266

# Chi-squared test on the 2x2 table, followed by its standardized residuals.
FamHx_chisq <- chisq.test(FamHx)
FamHx_chisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: FamHx
## X-squared = 198.89, df = 1, p-value < 2.2e-16
FamHx_chisq$stdres
##
## 0 1
## no -14.1326 14.1326
## yes 14.1326 -14.1326
# Smoking history (rows) against remission (columns).
SmokeHx <- table(can$SmokingHx, can$remission)
SmokeHx
##
## 0 1
## a.former 1190 515
## current 1140 565
## never 3674 1441

# Chi-squared test on the table, followed by its standardized residuals.
SmokeHx_chisq <- chisq.test(SmokeHx)
SmokeHx_chisq
##
## Pearson's Chi-squared test
##
## data: SmokeHx
## X-squared = 15.551, df = 2, p-value = 0.0004199
SmokeHx_chisq$stdres
##
## 0 1
## a.former -0.640773 0.640773
## current -3.607315 3.607315
## never 3.468549 -3.468549
# ntumors against Age: raw points, two jitter amounts, then a linear and a
# Poisson-GLM smoother over the wider jitter.
ntumors_by_age <- ggplot(data = can, mapping = aes(x = Age, y = ntumors))
# Point layer jittered by the same amount vertically and horizontally.
ntumors_jitter <- function(amount) {
  geom_point(position = position_jitter(height = amount, width = amount))
}
ntumors_by_age + geom_point()
ntumors_by_age + ntumors_jitter(0.02)
ntumors_by_age + ntumors_jitter(0.2)
ntumors_by_age + ntumors_jitter(0.2) + geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
ntumors_by_age + ntumors_jitter(0.2) +
  geom_smooth(method = "glm", method.args = list(family = "poisson"))
## `geom_smooth()` using formula 'y ~ x'

# Histograms of ntumors and of the residuals from a straight-line fit on Age.
hist(can$ntumors)
hist(lm(can$ntumors ~ can$Age)$residuals)
# Index plots of the Experience column (default symbols, then solid dots),
# followed by its first and last values.
plot(can$Experience)
plot(can$Experience, pch = 20)
head(can$Experience)
## [1] 25 25 25 25 25 25
tail(can$Experience)
## [1] 24 24 24 24 24 24
# Remission counts (columns) for each distinct value of Experience (rows).
table(can$Experience, can$remission)
##
## 0 1
## 7 38 0
## 8 12 16
## 9 127 25
## 10 93 18
## 11 164 24
## 12 233 40
## 13 422 113
## 14 521 136
## 15 516 210
## 16 561 203
## 17 448 190
## 18 621 254
## 19 559 241
## 20 373 231
## 21 394 217
## 22 304 179
## 23 239 174
## 24 130 113
## 25 84 66
## 26 47 29
## 27 63 19
## 28 37 16
## 29 18 7
# Store the Experience-by-remission table and run the chi-squared test on it
# once, reusing the stored result for the standardized residuals.
# NOTE(review): the name `exp` is also the base exponential function; a more
# specific name would avoid confusion -- check for other uses before renaming.
exp <- table(can$Experience, can$remission)
exp_chisq <- chisq.test(exp)
exp_chisq
##
## Pearson's Chi-squared test
##
## data: exp
## X-squared = 279.04, df = 22, p-value < 2.2e-16
exp_chisq$stdres
##
## 0 1
## 7 4.00339066 -4.00339066
## 8 -3.20209117 3.20209117
## 9 3.57765238 -3.57765238
## 10 3.10355519 -3.10355519
## 11 5.10587712 -5.10587712
## 12 5.49035579 -5.49035579
## 13 4.42397972 -4.42397972
## 14 5.18670973 -5.18670973
## 15 0.39890476 -0.39890476
## 16 1.90507816 -1.90507816
## 17 -0.12010098 0.12010098
## 18 0.37173094 -0.37173094
## 19 -0.36014338 0.36014338
## 20 -4.84553706 4.84553706
## 21 -3.34129127 3.34129127
## 22 -3.71281602 3.71281602
## 23 -5.73320646 5.73320646
## 24 -5.86722426 5.86722426
## 25 -3.90659835 3.90659835
## 26 -1.64752583 1.64752583
## 27 1.27628902 -1.27628902
## 28 -0.09870667 0.09870667
## 29 0.17246690 -0.17246690
# Logistic regression of remission on Experience alone.
m <- glm(
  formula = remission ~ Experience,
  family = binomial,
  data = can
)
summary(m)
##
## Call:
## glm(formula = remission ~ Experience, family = binomial, data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1909 -0.8690 -0.7568 1.3757 1.9203
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.320558 0.111349 -20.84 <2e-16 ***
## Experience 0.081116 0.005985 13.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 10164 on 8523 degrees of freedom
## AIC: 10168
##
## Number of Fisher Scoring iterations: 4
# Remission (0/1) against Experience. `p` holds the bare plot; the helpers
# below build the layers that are added to it repeatedly.
p <- ggplot(data = can, mapping = aes(x = Experience, y = remission))
# Points coloured by remission status with the given amount of jitter.
remission_points <- function(height, width) {
  geom_point(
    mapping = aes(colour = as.factor(remission)),
    position = position_jitter(height = height, width = width)
  )
}
# Binomial-GLM smoother.
logistic_smooth <- function() {
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
}
p
p + geom_point(position = position_jitter(height = 0.2, width = 0.02))
p + remission_points(0.2, 0.02)
p + remission_points(0.05, 0.2)
p + remission_points(0.05, 0.5)
p + remission_points(0.05, 0.3)
p + remission_points(0.05, 0.4)
p + remission_points(0.05, 0.4) + logistic_smooth()
## `geom_smooth()` using formula 'y ~ x'

# Same display with Age on the horizontal axis.
p <- ggplot(data = can, mapping = aes(x = Age, y = remission))
p + remission_points(0.05, 0.4) + logistic_smooth()
## `geom_smooth()` using formula 'y ~ x'
# Logistic regression of remission on Age alone (overwrites the previous `m`).
m <- glm(
  formula = remission ~ Age,
  family = binomial,
  data = can
)
summary(m)
##
## Call:
## glm(formula = remission ~ Age, family = binomial, data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1517 -0.8570 -0.7889 1.4594 1.9129
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.810430 0.195069 4.155 3.26e-05 ***
## Age -0.033095 0.003838 -8.624 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 10277 on 8523 degrees of freedom
## AIC: 10281
##
## Number of Fisher Scoring iterations: 4
# Logistic regression of remission on Age and Experience together
# (overwrites the previous `m`).
m <- glm(
  formula = remission ~ Age + Experience,
  family = binomial,
  data = can
)
summary(m)
##
## Call:
## glm(formula = remission ~ Age + Experience, family = binomial,
## data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3586 -0.8615 -0.7355 1.3546 2.1027
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.579138 0.221016 -2.620 0.00878 **
## Age -0.035050 0.003889 -9.012 < 2e-16 ***
## Experience 0.083113 0.006024 13.796 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10353 on 8524 degrees of freedom
## Residual deviance: 10081 on 8522 degrees of freedom
## AIC: 10087
##
## Number of Fisher Scoring iterations: 4
# Logistic regression of remission on Age, Experience and the CancerStage
# factor (overwrites the previous `m`).
m <- glm(
  formula = remission ~ Age + Experience + CancerStage,
  family = binomial,
  data = can
)
summary(m)
##
## Call:
## glm(formula = remission ~ Age + Experience + CancerStage, family = binomial,
## data = can)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3917 -0.8766 -0.7135 1.2908 2.3318
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.565954 0.234559 -6.676 2.45e-11 ***
## Age -0.008714 0.004346 -2.005 0.0449 *
## Experience 0.084201 0.006103 13.796 < 2e-16 ***
## CancerStageII -0.301294 0.057706 -5.221 1.78e-07 ***
## CancerStageIII -0.659473 0.075110 -8.780 < 2e-16 ***
## CancerStageIV -1.646902 0.126937 -12.974 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10352.6 on 8524 degrees of freedom
## Residual deviance: 9846.3 on 8519 degrees of freedom
## AIC: 9858.3
##
## Number of Fisher Scoring iterations: 4
# Observed range and first values of the Medicaid column.
range(can$Medicaid)
## [1] 0.1415814 0.8187299
head(can$Medicaid)
## [1] 0.6058667 0.6058667 0.6058667 0.6058667 0.6058667 0.6058667
# Remission (0/1) against Medicaid with a fitted logistic curve.
# Medicaid only spans about 0.14-0.82, so the horizontal jitter is kept small
# (0.01). A width of 0.4, as used for the Experience and Age plots, would
# displace points by more than half of the axis range.
p <- ggplot(aes(x = Medicaid, y = remission), data = can)
p +
  geom_point(
    aes(col = as.factor(remission)),
    position = position_jitter(height = 0.05, width = 0.01)
  ) +
  geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'
# Medicaid by remission status: raw points, two jitter widths, then box plots
# drawn over the narrowly jittered points.
p <- ggplot(data = can, mapping = aes(x = as.factor(remission), y = Medicaid))
# Point layer with horizontal jitter of the given width.
medicaid_points <- function(width) {
  geom_point(position = position_jitter(width = width))
}
p + geom_point()
p + medicaid_points(0.5)
p + medicaid_points(0.02)
p + medicaid_points(0.02) + geom_boxplot()

# Two-sided Welch t-test of Medicaid between the remission groups.
t.test(can$Medicaid ~ as.factor(can$remission))
##
## Welch Two Sample t-test
##
## data: can$Medicaid by as.factor(can$remission)
## t = -5.4477, df = 4748.6, p-value = 5.361e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03631188 -0.01709301
## sample estimates:
## mean in group 0 mean in group 1
## 0.5046166 0.5313190
# One-sided version of the same Welch test: the alternative is that the
# difference in mean Medicaid between remission group 0 and group 1 is less
# than 0.
t.test(can$Medicaid ~ as.factor(can$remission), alternative="less")
##
## Welch Two Sample t-test
##
## data: can$Medicaid by as.factor(can$remission)
## t = -5.4477, df = 4748.6, p-value = 2.68e-08
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.01863844
## sample estimates:
## mean in group 0 mean in group 1
## 0.5046166 0.5313190
# Draw the Medicaid-by-remission box plots over jittered points once more
# (`p` still holds the plot with remission on the x axis and Medicaid on y).
p + geom_point(position=position_jitter(width=0.02)) + geom_boxplot()