Here, I’m going to install and load the packages needed for the assignment
# Packages this analysis depends on.
required_packages <- c("tidyverse", "ggpubr", "survival", "survminer", "readr")

# Install only the packages that are not already available, so re-running the
# script does not download and reinstall everything each time.
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach each package once.
library(tidyverse)
library(ggpubr)
library(survival)
library(survminer)
library(readr)
# Read the lung cancer CSV, skipping the column named X1, then display it.
# NOTE(review): the statements that follow operate on an object called `lung`,
# not `lung_cancer` -- presumably the dataset bundled with the survival
# package; confirm which data source was intended.
lung_cancer <- read_csv(
  file = "lung-cancer.csv",
  col_types = cols(X1 = col_skip())
)
lung_cancer
## # A tibble: 228 x 10
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
## 7 7 310 2 68 2 2 70 60 384 10
## 8 11 361 2 71 2 2 60 80 538 1
## 9 1 218 2 53 1 1 70 80 825 16
## 10 7 166 2 61 1 2 70 70 271 34
## # … with 218 more rows
# Keep only the rows with sex == 2 (stored as `lungfemale`). `sex` is a numeric
# column, so compare it with the number 2 rather than the string "2", which
# only worked through implicit type coercion.
lungfemale <- lung %>%
  filter(sex == 2)

# Mean age within that subset.
lungfemale %>%
  summarize(average = mean(age))
## average
## 1 61.07778
# Mean of meal.cal within the `lungfemale` subset; missing values are ignored.
lungfemale %>%
  summarize(
    average = mean(meal.cal, na.rm = TRUE)
  )
## average
## 1 840.7015
Among female patients, the average age is 61.1 years and the average calories consumed at meals is 841 cal.
# Mean of `time` for rows with status 1 (`censored`) and status 2 (`dead`).
# `status` is a numeric column, so compare it with numbers rather than strings.
censored <- lung %>%
  filter(status == 1) %>%
  summarize(average = mean(time))
dead <- lung %>%
  filter(status == 2) %>%
  summarize(average = mean(time))

# Number of rows for each status value.
lung %>%
  group_by(status) %>%
  tally()
## # A tibble: 2 x 2
## status n
## <dbl> <int>
## 1 1 63
## 2 2 165
# Number of rows for every combination of status and sex.
lung %>%
  group_by(status, sex) %>%
  tally()
## # A tibble: 4 x 3
## # Groups: status [2]
## status sex n
## <dbl> <dbl> <int>
## 1 1 1 26
## 2 1 2 37
## 3 2 1 112
## 4 2 2 53
# Absolute difference between the two Karnofsky score columns (ph.karno and
# pat.karno). The earlier signed-difference assignment was overwritten
# immediately, so only the absolute version is kept.
lung <- lung %>%
  mutate(karnodiff = abs(ph.karno - pat.karno))

# Mean absolute difference, ignoring rows where either score is missing.
lung %>%
  summarize(average = mean(karnodiff, na.rm = TRUE))
## average
## 1 10.58036
# Two-sample t-test of meal.cal between the two sex groups. The formula keeps
# the same `lung$...` terms so the reported data label is unchanged.
calories_by_sex <- lung$meal.cal ~ lung$sex
t.test(calories_by_sex)
##
## Welch Two Sample t-test
##
## data: lung$meal.cal by lung$sex
## t = 2.3533, df = 151.16, p-value = 0.01989
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 22.43394 257.25080
## sample estimates:
## mean in group 1 mean in group 2
## 980.5439 840.7015
# Fit survival curves of time/status, one curve per sex group.
fit <- survfit(
  formula = Surv(time, status) ~ sex,
  data = lung
)

# Default survival plot of the fitted curves.
ggsurvplot(fit = fit, data = lung)
a. Customize the plot by adding the p-value and the p-value method (pval.method), setting the line size to 2, showing no confidence interval, setting the ggtheme to theme_classic(), and showing no risk table.
# Customized survival plot: p-value and its test name shown, thicker lines,
# no confidence band, classic theme, no risk table.
ggsurvplot(
  fit,
  data = lung,
  pval = TRUE,
  pval.method = TRUE,
  size = 2,
  conf.int = FALSE,
  ggtheme = theme_classic(),
  risk.table = FALSE
)
# Plain histogram of meal.cal.
gghistogram(data = lung, x = "meal.cal")

# Same histogram with a line marking the mean.
gghistogram(data = lung, x = "meal.cal", add = "mean")

# Add a title and an x-axis label; the y-axis label is switched off.
gghistogram(
  data = lung,
  x = "meal.cal",
  add = "mean",
  title = "Calories Consumed at Meals",
  xlab = "Calories",
  ylab = FALSE
)

# Same plot, additionally filled by the `sex` column.
gghistogram(
  data = lung,
  x = "meal.cal",
  add = "mean",
  title = "Calories Consumed at Meals",
  xlab = "Calories",
  ylab = FALSE,
  fill = "sex"
)
# Convert the sex column to a factor so it is treated as a grouping variable.
lung[["sex"]] <- as.factor(lung[["sex"]])
Make sure the “sex” variable is now a factor with 2 levels (use str() and levels() to look at the structure and the levels of the variable).
# Show the factor levels of the sex column.
levels(lung[["sex"]])
## [1] "1" "2"
Now, use the previous code for gghistogram(). What is the difference?
# Histogram of meal.cal filled by sex (now a factor), with an explicit
# two-colour palette and a line marking the mean.
gghistogram(
  data = lung,
  x = "meal.cal",
  add = "mean",
  title = "Calories Consumed at Meals",
  xlab = "Calories",
  ylab = FALSE,
  fill = "sex",
  palette = c("green", "blue")
)
# Correlation between pat.karno and ph.karno, using only rows where both
# values are present.
with(lung, cor(pat.karno, ph.karno, use = "complete.obs"))
## [1] 0.5202974
# Basic scatterplot of pat.karno against ph.karno.
ggscatter(
  data = lung,
  x = "ph.karno",
  y = "pat.karno"
)
Customize the scatterplot with: a. color as sex, b. title as “Correlation Between Karnofsky Performance Done by Physicians and Patients”, c. x axis label as “Score by Physician”, d. y axis label as “Score by Patient”, e. add a linear regression line, f. modify the color of the regression line to light blue (color = “lightblue”), g. add the confidence intervals, h. add the group mean point to the plot, i. change the group mean point size to 5, j. add the “spearman” correlation coefficient to the plot, k. copy and paste the plot here.
# Customized scatterplot of pat.karno against ph.karno, coloured by sex, with
# a regression line, confidence interval, group mean points and a Spearman
# correlation label.
ggscatter(
  lung,
  x = "ph.karno",
  y = "pat.karno",
  color = "sex",
  # The title is written on one source line with a single explicit "\n"; the
  # earlier version wrapped the string literal across two lines, which put an
  # extra, unintended line break into the rendered title.
  title = "Correlation Between Karnofsky Performance \nScore Done by Physicians & Patients",
  xlab = "Score by Physician",
  ylab = "Score by Patient",
  add = "reg.line",
  add.params = list(color = "lightblue"),
  conf.int = TRUE,
  # NOTE(review): ellipse.type presumably has no visible effect unless
  # ellipse = TRUE is also set -- confirm whether an ellipse was intended.
  ellipse.type = "confidence",
  mean.point = TRUE,
  mean.point.size = 5,
  cor.coef = TRUE,
  cor.coeff.args = list(
    method = "spearman",
    label.x.npc = "middle",
    label.y.npc = "bottom"
  )
)
# Basic scatterplot of wt.loss against meal.cal.
ggscatter(
  data = lung,
  x = "meal.cal",
  y = "wt.loss"
)
# Keep only rows with meal.cal below 2000. `lung` itself is overwritten, so
# every later statement sees the reduced data. dplyr's filter() also drops
# rows where the condition is NA, i.e. rows with a missing meal.cal.
lung <- filter(lung, meal.cal < 2000)

# Scatterplot of wt.loss against meal.cal, coloured by sex, with a regression
# line, confidence interval, ellipses and group mean points.
ggscatter(
  lung,
  x = "meal.cal",
  y = "wt.loss",
  color = "sex",
  # The title is written on one source line with a single explicit "\n" (the
  # earlier version wrapped the literal across two lines, adding a stray line
  # break), and "Consumed and Meals" is corrected to "Consumed at Meals" to
  # match the histogram title used elsewhere in this file.
  title = "Correlation Between Calories Consumed at Meals \n and Weight Loss in the Last 6 Months, by Sex",
  xlab = "Calories Consumed",
  ylab = "Weight Loss",
  add = "reg.line",
  ellipse = TRUE,
  conf.int = TRUE,
  mean.point = TRUE
)