library(readr)
mhas <- read_csv("MHAS0121.csv")
## Rows: 9636 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): locsize01, died18, died21
## dbl (36): id, perwght01, mobirth, yrbirth, female, moint, yrint, schooling, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(survival)
library(asaur)
library(eha)
library(flextable)
library(officer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom) 
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'ggpubr'
## 
## The following objects are masked from 'package:flextable':
## 
##     border, font, rotate
## 
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
## (1) the variable that measures the time each person was exposed to the risk of death before dying/being censored for other reasons

summary(mhas$yrcensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2001    2008    2018    2015    2021    2022     847
summary(mhas$yrint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2001    2001    2001    2001    2001    2001
summary(mhas$mocensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   9.000   8.822  12.000  99.000     102
summary(mhas$moint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   7.000   7.000   7.494   8.000  11.000
table(mhas$mocensor)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   88   99 
## 1387  327  385  333  343  546  874  435  387  481 1171 2755   12   98
mhas <- mhas[!is.na(mhas$yrcensor), ]
mhas <- mhas[!is.na(mhas$mocensor), ]

mhas <- mhas[mhas$mocensor != 88, ]
mhas <- mhas[mhas$mocensor != 99, ]


mhas <- mhas[mhas$yrcensor > mhas$yrint | (mhas$yrcensor == mhas$yrint & mhas$mocensor >= mhas$moint), ]
mhas$time_exposed <- with(mhas, ((yrcensor + mocensor/12) - (yrint + moint/12)) * 12 + 1)


summary(mhas$time_exposed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    89.0   201.0   165.3   245.0   248.0
## (2) the variable that indicates whether the event occurred or not (i.e., whether the person died in the interval)

summary(mhas$died0121)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  1.0000  0.5223  1.0000  1.0000    1224
mhas <- mhas[!is.na(mhas$died0121), ]

mhas$event_occurred <- mhas$died0121








## (3) the time in which the person entered and exited the risk set 

summary(mhas$ageint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.50   55.25   61.25   63.42   69.50  106.08
summary(mhas$agecensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   51.17   72.75   77.58   78.20   83.67  114.42
mhas$entry_time <- mhas$ageint
mhas$exit_time <-mhas$agecensor







##  a.Obtain a table with the Kaplan-Meier survival function as well as its standard errors and the cumulative hazard function. 


surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))

km_fit <- survfit(surv_object ~ 1, data = mhas)

km_summary <- summary(km_fit)


surv_times <- km_summary$time        # time point 
surv_prob <- km_summary$surv         # survival prop
std_error <- km_summary$std.err      # SE
cum_hazard <- -log(surv_prob)        # cumulative 

km_results <- data.frame(
  time = surv_times,
  survival_prob = surv_prob,
  std_error = std_error,
  cumulative_hazard = cum_hazard
)


km_table <- flextable(km_results)


doc <- read_docx()
doc <- doc %>%
  body_add_par("Kaplan-Meier Survival Results", style = "heading 1") %>%
  body_add_flextable(km_table)


print(doc, target = "km_survival_results.docx")











## c.Using the built-in functions in your statistical software, graph the survival, cumulative hazard, and (smoothed, estimated) hazard. 

surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))

km_fit <- survfit(surv_object ~ 1)


# 1. Kaplan-Meier survival curve

plot(km_fit, xlab = "Time", ylab = "Survival Probability", main = "Kaplan-Meier Survival Curve")

# 2. Cumulative Hazard

km_summary <- summary(km_fit)

surv_times <- km_summary$time
surv_prob <- km_summary$surv

cum_hazard <- -log(surv_prob)

plot(surv_times, cum_hazard, type = "l", xlab = "Time", ylab = "Cumulative Hazard",
     main = "Cumulative Hazard Function")

# 3. Smoothed Hazard Function

hazard_estimate <- c(0, diff(-log(surv_prob)) / diff(surv_times))

hazard_df <- data.frame(time = surv_times[-1], hazard = hazard_estimate[-1])

ggplot(hazard_df, aes(x = time, y = hazard)) +
  geom_line() + 
  geom_smooth(method = "loess", se = FALSE) +
  labs(x = "Time", y = "Hazard Rate", title = "Smoothed Hazard Function") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

## d.Finally, graph the survival function by sex/gender, locality size, and schooling levels. 


surv_object <- with(mhas, Surv(time = time_exposed, event = event_occurred))

# 1.Kaplan-Meier Survival Function by Sex

km_fit_gender <- survfit(surv_object ~ female, data = mhas)

km_fit_gender_df <- tidy(km_fit_gender)

ggplot(km_fit_gender_df, aes(x = time, y = estimate, color = strata)) +
  geom_step(size = 1) +
  labs(x = "Time", y = "Survival Probability", color = "Sex (Female = 1, Male = 0)",
       title = "Kaplan-Meier Survival Function by Sex") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Kaplan-Meier Survival Function by Locality Size

km_fit_locsize <- survfit(surv_object ~ locsize01, data = mhas)

km_fit_locsize_df <- tidy(km_fit_locsize)

ggplot(km_fit_locsize_df, aes(x = time, y = estimate, color = factor(strata))) +
  geom_step(size = 1) +
  labs(x = "Time", y = "Survival Probability", color = "Locality Size",
       title = "Kaplan-Meier Survival Function by Locality Size") +
  theme_minimal()

# 3. Kaplan-Meier Survival Function by Education Level

km_fit_educ <- survfit(surv_object ~ educlevel, data = mhas)

km_fit_educ_df <- tidy(km_fit_educ)

ggplot(km_fit_educ_df, aes(x = time, y = estimate, color = factor(strata))) +
  geom_step(size = 1) +
  labs(x = "Time", y = "Survival Probability", color = "Education Level",
       title = "Kaplan-Meier Survival Function by Education Level") +
  theme_minimal()

## e.Do they appear to be different for these groups? Is there a more formal test (i.e., that provides statistical significance) to compare them (e.g., using the sts command) otherwise?


survdiff(Surv(time_exposed, event_occurred) ~ female + locsize01 + educlevel, data = mhas)
## Call:
## survdiff(formula = Surv(time_exposed, event_occurred) ~ female + 
##     locsize01 + educlevel, data = mhas)
## 
## n=7463, 12 observations deleted due to missingness.
## 
##                                                      N Observed Expected
## female=0, locsize01=< 2,500        , educlevel=0   271      167   126.65
## female=0, locsize01=< 2,500        , educlevel=15  317      155   169.69
## female=0, locsize01=< 2,500        , educlevel=68   84       33    53.53
## female=0, locsize01=< 2,500        , educlevel=919  18        7    10.37
## female=0, locsize01=100,000 - +    , educlevel=0   291      193   125.75
## female=0, locsize01=100,000 - +    , educlevel=15  530      335   246.09
## female=0, locsize01=100,000 - +    , educlevel=68  497      266   264.33
## female=0, locsize01=100,000 - +    , educlevel=919 496      199   284.33
## female=0, locsize01=15,000 - 99,999, educlevel=0   123       78    54.20
## female=0, locsize01=15,000 - 99,999, educlevel=15  190      107    97.32
## female=0, locsize01=15,000 - 99,999, educlevel=68  130       61    75.66
## female=0, locsize01=15,000 - 99,999, educlevel=919  93       41    50.61
## female=0, locsize01=2,500 - 14,999 , educlevel=0   112       76    47.73
## female=0, locsize01=2,500 - 14,999 , educlevel=15  169       97    84.93
## female=0, locsize01=2,500 - 14,999 , educlevel=68   51       23    28.20
## female=0, locsize01=2,500 - 14,999 , educlevel=919  30       14    17.19
## female=1, locsize01=< 2,500        , educlevel=0   317      190   154.19
## female=1, locsize01=< 2,500        , educlevel=15  224      102   123.58
## female=1, locsize01=< 2,500        , educlevel=68   47       16    28.84
## female=1, locsize01=< 2,500        , educlevel=919  12        1     8.41
## female=1, locsize01=100,000 - +    , educlevel=0   547      340   260.55
## female=1, locsize01=100,000 - +    , educlevel=15  815      432   419.03
## female=1, locsize01=100,000 - +    , educlevel=68  612      256   345.56
## female=1, locsize01=100,000 - +    , educlevel=919 505      188   302.59
## female=1, locsize01=15,000 - 99,999, educlevel=0   192      118    88.95
## female=1, locsize01=15,000 - 99,999, educlevel=15  248      127   132.38
## female=1, locsize01=15,000 - 99,999, educlevel=68  123       58    70.99
## female=1, locsize01=15,000 - 99,999, educlevel=919  55       25    31.05
## female=1, locsize01=2,500 - 14,999 , educlevel=0   141       83    69.70
## female=1, locsize01=2,500 - 14,999 , educlevel=15  156       79    82.86
## female=1, locsize01=2,500 - 14,999 , educlevel=68   51       21    29.64
## female=1, locsize01=2,500 - 14,999 , educlevel=919  16        6     9.12
##                                                    (O-E)^2/E (O-E)^2/V
## female=0, locsize01=< 2,500        , educlevel=0     12.8555   13.3830
## female=0, locsize01=< 2,500        , educlevel=15     1.2711    1.3389
## female=0, locsize01=< 2,500        , educlevel=68     7.8745    8.0474
## female=0, locsize01=< 2,500        , educlevel=919    1.0953    1.1066
## female=0, locsize01=100,000 - +    , educlevel=0     35.9642   37.4341
## female=0, locsize01=100,000 - +    , educlevel=15    32.1224   34.5438
## female=0, locsize01=100,000 - +    , educlevel=68     0.0105    0.0113
## female=0, locsize01=100,000 - +    , educlevel=919   25.6064   27.8482
## female=0, locsize01=15,000 - 99,999, educlevel=0     10.4555   10.6778
## female=0, locsize01=15,000 - 99,999, educlevel=15     0.9619    0.9936
## female=0, locsize01=15,000 - 99,999, educlevel=68     2.8407    2.9188
## female=0, locsize01=15,000 - 99,999, educlevel=919    1.8233    1.8615
## female=0, locsize01=2,500 - 14,999 , educlevel=0     16.7403   17.0661
## female=0, locsize01=2,500 - 14,999 , educlevel=15     1.7158    1.7667
## female=0, locsize01=2,500 - 14,999 , educlevel=68     0.9573    0.9716
## female=0, locsize01=2,500 - 14,999 , educlevel=919    0.5936    0.6008
## female=1, locsize01=< 2,500        , educlevel=0      8.3177    8.7231
## female=1, locsize01=< 2,500        , educlevel=15     3.7677    3.9210
## female=1, locsize01=< 2,500        , educlevel=68     5.7198    5.8077
## female=1, locsize01=< 2,500        , educlevel=919    6.5245    6.5939
## female=1, locsize01=100,000 - +    , educlevel=0     24.2280   26.1556
## female=1, locsize01=100,000 - +    , educlevel=15     0.4013    0.4530
## female=1, locsize01=100,000 - +    , educlevel=68    23.2102   25.6722
## female=1, locsize01=100,000 - +    , educlevel=919   43.3926   47.4364
## female=1, locsize01=15,000 - 99,999, educlevel=0      9.4852    9.7767
## female=1, locsize01=15,000 - 99,999, educlevel=15     0.2183    0.2276
## female=1, locsize01=15,000 - 99,999, educlevel=68     2.3769    2.4392
## female=1, locsize01=15,000 - 99,999, educlevel=919    1.1795    1.1977
## female=1, locsize01=2,500 - 14,999 , educlevel=0      2.5396    2.6047
## female=1, locsize01=2,500 - 14,999 , educlevel=15     0.1794    0.1846
## female=1, locsize01=2,500 - 14,999 , educlevel=68     2.5199    2.5587
## female=1, locsize01=2,500 - 14,999 , educlevel=919    1.0674    1.0781
## 
##  Chisq= 291  on 31 degrees of freedom, p= <2e-16