remove()  #Remove objects from the workspace
 rm(list=ls())  #removes all objects from the current workspace (R memory)

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(readr)
data1<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\EHA\\MHAS0121.csv")

## Rows: 9636 Columns: 39

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): locsize01, died18, died21
## dbl (36): id, perwght01, mobirth, yrbirth, female, moint, yrint, schooling, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(survival)
library(ggsurvfit)

summary(data1$yrcensor)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2001    2008    2018    2015    2021    2022     847

summary(data1$yrint)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2001    2001    2001    2001    2001    2001

summary(data1$mocensor)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   9.000   8.822  12.000  99.000     102

summary(data1$moint)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   7.000   7.000   7.494   8.000  11.000

table(data1$mocensor)

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   88   99 
## 1387  327  385  333  343  546  874  435  387  481 1171 2755   12   98

data1 <- data1[!is.na(data1$yrcensor), ]
data1 <- data1[!is.na(data1$mocensor), ]
data1 <- data1[data1$yrcensor > data1$yrint | (data1$yrcensor == data1$yrint & data1$mocensor >= data1$moint), ]
data1$time_exposed <- with(data1, ((yrcensor + mocensor/12) - (yrint + moint/12)) * 12 + 1)
summary(data1$time_exposed)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    91.0   201.0   165.5   245.0   297.0

data1 <- data1[!is.na(data1$died0121), ]
data1$event_occurred <- data1$died0121

summary(data1$ageint)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.50   55.33   61.33   63.51   69.67  106.08

summary(data1$agecensor)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   51.17   72.75   77.67   78.30   83.75  114.42

data1$entry_time <- data1$ageint
data1$exit_time <-data1$agecensor

survobj <- with(data1, Surv(time = time_exposed, event = event_occurred))

fit <- survfit(survobj ~ 1, data = data1)

km_summary <- summary(fit)

surv_times <- km_summary$time        
surv_prob <- km_summary$surv         
std_error <- km_summary$std.err      
cum_hazard <- -log(surv_prob)        

km_results <- data.frame(
  time = surv_times,
  survival_prob = surv_prob,
  std_error = std_error,
  cumulative_hazard = cum_hazard
)

library(flextable)
km_table <- flextable(km_results)

library(officer)
library(dplyr)
doc <- read_docx()
doc <- doc %>%
  body_add_par("Kaplan-Meier Survival Results") %>%
  body_add_flextable(km_table)
print(doc, target = "Survival_Results.docx")

library(survival)
survobj <- with(data1, Surv(time = time_exposed, event = event_occurred))

fit <- survfit(survobj ~ 1)

1. Kaplan-Meier survival curve

plot(fit, xlab = "Time", ylab = "Survival Probability", main = "Kaplan-Meier Survival Curve")

km_summary <- summary(fit)

surv_times <- km_summary$time
surv_prob <- km_summary$surv

cum_hazard <- -log(surv_prob)

plot(surv_times, cum_hazard, type = "l", xlab = "Time", ylab = "Cumulative Hazard",
     main = "Cumulative Hazard Curve")

library(ggplot2)
hazard_estimate <- c(0, diff(-log(surv_prob)) / diff(surv_times))

hazard_df <- data.frame(time = surv_times[-1], hazard = hazard_estimate[-1])

ggplot(hazard_df, aes(x = time, y = hazard)) +
  geom_line() + 
  geom_smooth(color="red",method = "loess", se = FALSE) +
  labs(x = "Time", y = "Hazard", title = "Smoothed Hazard Curve") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).