library(readr)
mhas <- read_csv("MHAS0121.csv")
## Rows: 9636 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): locsize01, died18, died21
## dbl (36): id, perwght01, mobirth, yrbirth, female, moint, yrint, schooling, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(survival)
library(asaur)
library(eha)
library(flextable)
library(officer)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(survminer)
## Loading required package: ggpubr
##
## Attaching package: 'ggpubr'
##
## The following objects are masked from 'package:flextable':
##
## border, font, rotate
##
##
## Attaching package: 'survminer'
##
## The following object is masked from 'package:survival':
##
## myeloma
## (1) the variable that measures the time each person was exposed to the risk of death before dying/being censored for other reasons
summary(mhas$yrcensor)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2001 2008 2018 2015 2021 2022 847
summary(mhas$yrint)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2001 2001 2001 2001 2001 2001
summary(mhas$mocensor)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 9.000 8.822 12.000 99.000 102
summary(mhas$moint)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 7.000 7.000 7.494 8.000 11.000
table(mhas$mocensor)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 88 99
## 1387 327 385 333 343 546 874 435 387 481 1171 2755 12 98
mhas <- mhas[!is.na(mhas$yrcensor), ]
mhas <- mhas[!is.na(mhas$mocensor), ]
mhas <- mhas[mhas$mocensor != 88, ]
mhas <- mhas[mhas$mocensor != 99, ]
mhas <- mhas[mhas$yrcensor > mhas$yrint | (mhas$yrcensor == mhas$yrint & mhas$mocensor >= mhas$moint), ]
mhas$time_exposed <- with(mhas, ((yrcensor + mocensor/12) - (yrint + moint/12)) * 12 + 1)
summary(mhas$time_exposed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 89.0 201.0 165.3 245.0 248.0
## (2) the variable that indicates whether the event occurred or not (i.e., whether the person died in the interval)
summary(mhas$died0121)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 1.0000 0.5223 1.0000 1.0000 1224
mhas <- mhas[!is.na(mhas$died0121), ]
mhas$event_occurred <- mhas$died0121
## (3) the time in which the person entered and exited the risk set
summary(mhas$ageint)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.50 55.25 61.25 63.42 69.50 106.08
summary(mhas$agecensor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 51.17 72.75 77.58 78.20 83.67 114.42
mhas$entry_time <- mhas$ageint
mhas$exit_time <-mhas$agecensor
## a.Obtain a table with the Kaplan-Meier survival function as well as its standard errors and the cumulative hazard function.
surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))
km_fit <- survfit(surv_object ~ 1, data = mhas)
km_summary <- summary(km_fit)
surv_times <- km_summary$time # time point
surv_prob <- km_summary$surv # survival prop
std_error <- km_summary$std.err # SE
cum_hazard <- -log(surv_prob) # cumulative
km_results <- data.frame(
time = surv_times,
survival_prob = surv_prob,
std_error = std_error,
cumulative_hazard = cum_hazard
)
km_table <- flextable(km_results)
doc <- read_docx()
doc <- doc %>%
body_add_par("Kaplan-Meier Survival Results", style = "heading 1") %>%
body_add_flextable(km_table)
print(doc, target = "km_survival_results.docx")
## c.Using the built-in functions in your statistical software, graph the survival, cumulative hazard, and (smoothed, estimated) hazard.
surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))
km_fit <- survfit(surv_object ~ 1)
# 1. Kaplan-Meier survival curve
plot(km_fit, xlab = "Time", ylab = "Survival Probability", main = "Kaplan-Meier Survival Curve")

# 2. Cumulative Hazard
km_summary <- summary(km_fit)
surv_times <- km_summary$time
surv_prob <- km_summary$surv
cum_hazard <- -log(surv_prob)
plot(surv_times, cum_hazard, type = "l", xlab = "Time", ylab = "Cumulative Hazard",
main = "Cumulative Hazard Function")

# 3. Smoothed Hazard Function
hazard_estimate <- c(0, diff(-log(surv_prob)) / diff(surv_times))
hazard_df <- data.frame(time = surv_times[-1], hazard = hazard_estimate[-1])
ggplot(hazard_df, aes(x = time, y = hazard)) +
geom_line() +
geom_smooth(method = "loess", se = FALSE) +
labs(x = "Time", y = "Hazard Rate", title = "Smoothed Hazard Function") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## d.Finally, graph the survival function by sex/gender, locality size, and schooling levels.
surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))
# 1.Kaplan-Meier Survival Function by Sex
km_fit_gender <- survfit(surv_object ~ female, data = mhas)
km_fit_gender_df <- tidy(km_fit_gender)
ggplot(km_fit_gender_df, aes(x = time, y = estimate, color = strata)) +
geom_step(size = 1) +
labs(x = "Time", y = "Survival Probability", color = "Sex (Female = 1, Male = 0)",
title = "Kaplan-Meier Survival Function by Sex") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Kaplan-Meier Survival Function by Locality Size
km_fit_locsize <- survfit(surv_object ~ locsize01, data = mhas)
km_fit_locsize_df <- tidy(km_fit_locsize)
ggplot(km_fit_locsize_df, aes(x = time, y = estimate, color = factor(strata))) +
geom_step(size = 1) +
labs(x = "Time", y = "Survival Probability", color = "Locality Size",
title = "Kaplan-Meier Survival Function by Locality Size") +
theme_minimal()

# 3. Kaplan-Meier Survival Function by Education Level
km_fit_educ <- survfit(surv_object ~ educlevel, data = mhas)
km_fit_educ_df <- tidy(km_fit_educ)
ggplot(km_fit_educ_df, aes(x = time, y = estimate, color = factor(strata))) +
geom_step(size = 1) +
labs(x = "Time", y = "Survival Probability", color = "Education Level",
title = "Kaplan-Meier Survival Function by Education Level") +
theme_minimal()

## e.Do they appear to be different for these groups? Is there a more formal test (i.e., that provides statistical significance) to compare them (e.g., using the sts command) otherwise?
survdiff(Surv(time_exposed, event_occurred) ~ female + locsize01 + educlevel, data = mhas)
## Call:
## survdiff(formula = Surv(time_exposed, event_occurred) ~ female +
## locsize01 + educlevel, data = mhas)
##
## n=7463, 12 observations deleted due to missingness.
##
## N Observed Expected
## female=0, locsize01=< 2,500 , educlevel=0 271 167 126.65
## female=0, locsize01=< 2,500 , educlevel=15 317 155 169.69
## female=0, locsize01=< 2,500 , educlevel=68 84 33 53.53
## female=0, locsize01=< 2,500 , educlevel=919 18 7 10.37
## female=0, locsize01=100,000 - + , educlevel=0 291 193 125.75
## female=0, locsize01=100,000 - + , educlevel=15 530 335 246.09
## female=0, locsize01=100,000 - + , educlevel=68 497 266 264.33
## female=0, locsize01=100,000 - + , educlevel=919 496 199 284.33
## female=0, locsize01=15,000 - 99,999, educlevel=0 123 78 54.20
## female=0, locsize01=15,000 - 99,999, educlevel=15 190 107 97.32
## female=0, locsize01=15,000 - 99,999, educlevel=68 130 61 75.66
## female=0, locsize01=15,000 - 99,999, educlevel=919 93 41 50.61
## female=0, locsize01=2,500 - 14,999 , educlevel=0 112 76 47.73
## female=0, locsize01=2,500 - 14,999 , educlevel=15 169 97 84.93
## female=0, locsize01=2,500 - 14,999 , educlevel=68 51 23 28.20
## female=0, locsize01=2,500 - 14,999 , educlevel=919 30 14 17.19
## female=1, locsize01=< 2,500 , educlevel=0 317 190 154.19
## female=1, locsize01=< 2,500 , educlevel=15 224 102 123.58
## female=1, locsize01=< 2,500 , educlevel=68 47 16 28.84
## female=1, locsize01=< 2,500 , educlevel=919 12 1 8.41
## female=1, locsize01=100,000 - + , educlevel=0 547 340 260.55
## female=1, locsize01=100,000 - + , educlevel=15 815 432 419.03
## female=1, locsize01=100,000 - + , educlevel=68 612 256 345.56
## female=1, locsize01=100,000 - + , educlevel=919 505 188 302.59
## female=1, locsize01=15,000 - 99,999, educlevel=0 192 118 88.95
## female=1, locsize01=15,000 - 99,999, educlevel=15 248 127 132.38
## female=1, locsize01=15,000 - 99,999, educlevel=68 123 58 70.99
## female=1, locsize01=15,000 - 99,999, educlevel=919 55 25 31.05
## female=1, locsize01=2,500 - 14,999 , educlevel=0 141 83 69.70
## female=1, locsize01=2,500 - 14,999 , educlevel=15 156 79 82.86
## female=1, locsize01=2,500 - 14,999 , educlevel=68 51 21 29.64
## female=1, locsize01=2,500 - 14,999 , educlevel=919 16 6 9.12
## (O-E)^2/E (O-E)^2/V
## female=0, locsize01=< 2,500 , educlevel=0 12.8555 13.3830
## female=0, locsize01=< 2,500 , educlevel=15 1.2711 1.3389
## female=0, locsize01=< 2,500 , educlevel=68 7.8745 8.0474
## female=0, locsize01=< 2,500 , educlevel=919 1.0953 1.1066
## female=0, locsize01=100,000 - + , educlevel=0 35.9642 37.4341
## female=0, locsize01=100,000 - + , educlevel=15 32.1224 34.5438
## female=0, locsize01=100,000 - + , educlevel=68 0.0105 0.0113
## female=0, locsize01=100,000 - + , educlevel=919 25.6064 27.8482
## female=0, locsize01=15,000 - 99,999, educlevel=0 10.4555 10.6778
## female=0, locsize01=15,000 - 99,999, educlevel=15 0.9619 0.9936
## female=0, locsize01=15,000 - 99,999, educlevel=68 2.8407 2.9188
## female=0, locsize01=15,000 - 99,999, educlevel=919 1.8233 1.8615
## female=0, locsize01=2,500 - 14,999 , educlevel=0 16.7403 17.0661
## female=0, locsize01=2,500 - 14,999 , educlevel=15 1.7158 1.7667
## female=0, locsize01=2,500 - 14,999 , educlevel=68 0.9573 0.9716
## female=0, locsize01=2,500 - 14,999 , educlevel=919 0.5936 0.6008
## female=1, locsize01=< 2,500 , educlevel=0 8.3177 8.7231
## female=1, locsize01=< 2,500 , educlevel=15 3.7677 3.9210
## female=1, locsize01=< 2,500 , educlevel=68 5.7198 5.8077
## female=1, locsize01=< 2,500 , educlevel=919 6.5245 6.5939
## female=1, locsize01=100,000 - + , educlevel=0 24.2280 26.1556
## female=1, locsize01=100,000 - + , educlevel=15 0.4013 0.4530
## female=1, locsize01=100,000 - + , educlevel=68 23.2102 25.6722
## female=1, locsize01=100,000 - + , educlevel=919 43.3926 47.4364
## female=1, locsize01=15,000 - 99,999, educlevel=0 9.4852 9.7767
## female=1, locsize01=15,000 - 99,999, educlevel=15 0.2183 0.2276
## female=1, locsize01=15,000 - 99,999, educlevel=68 2.3769 2.4392
## female=1, locsize01=15,000 - 99,999, educlevel=919 1.1795 1.1977
## female=1, locsize01=2,500 - 14,999 , educlevel=0 2.5396 2.6047
## female=1, locsize01=2,500 - 14,999 , educlevel=15 0.1794 0.1846
## female=1, locsize01=2,500 - 14,999 , educlevel=68 2.5199 2.5587
## female=1, locsize01=2,500 - 14,999 , educlevel=919 1.0674 1.0781
##
## Chisq= 291 on 31 degrees of freedom, p= <2e-16