getwd()
## [1] "/Users/limasaha/Library/CloudStorage/OneDrive-UniversityofTexasatSanAntonio/RPubs_Uploaded"
setwd("/Users/limasaha/Downloads/")
##Load the data for downloads
Assignmentdata2<- read.csv("/Users/limasaha/Downloads/MHAS0121.csv")
###Check the coulmn names for understanding the data and information
colnames(Assignmentdata2)
## [1] "id" "locsize01" "perwght01" "died18"
## [5] "died21" "mobirth" "yrbirth" "female"
## [9] "moint" "yrint" "schooling" "died0103"
## [13] "refused0103" "lost0103" "followup0103" "died0312"
## [17] "refused0312" "lost0312" "followup0312" "died1215"
## [21] "refused1215" "lost1215" "followup1215" "died1518"
## [25] "refused1518" "lost1518" "followup1518" "died1821"
## [29] "refused1821" "lost1821" "followup1821" "died0121"
## [33] "mocensor" "yrcensor" "ageint" "agecensor"
## [37] "ageintexact" "exposure_months" "educlevel"
#install.packages("survival")
#install.packages("ggsurvfit")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(survival)
## Warning: package 'survival' was built under R version 4.3.3
library(ggsurvfit)
## Warning: package 'ggsurvfit' was built under R version 4.3.3
## Loading required package: ggplot2
####Using the built-in functions in your statistical software, graph the survival, cumulative hazard, and (smoothed, estimated) hazard. ######Making Death_age and Death_event variable
###This example shows the use of an age-at-death outcome. If the respondent died during the follow up period, their age at death is the difference between their year of mortality and their year of birth, calculated as the year of the survey minus their age.The current year of mortality follow up is 2014, that is the censoring year for mortality. ###Here I am follwing the step from Dr. Corey Sparks documentation
Assignmentdata2 <- Assignmentdata2 %>%
mutate(death_age = ifelse( died0121 ==1,
yrcensor - yrbirth,
2021 - yrbirth ),
d.event = ifelse(died0121 == 1, 1, 0))
age_fit <- survfit(Surv(death_age, d.event) ~ 1,
data = Assignmentdata2)
age_fit %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() +
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_step()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_segment()`).
######Interpretation: In this graph, y axis= percent of Life expectancy (survival probability) and the x axis=time/age. We can see that it starts from 50 years age and the probabilty starts from 0-100%. After 75 years old, people are more likely to be died which means they have lower life expectancy. During this age, we can see that the decline of the curve gradually from 75 to 100 years old which is notable fall. The confidence interval is smaller which means the certain estimation.
### Fit the Kaplan-Meier model
# Get summary statistics from the survival fit
km_summary <- summary(age_fit)
# Create a table with time, survival, standard errors, and cumulative hazard
km_table <- data.frame(
time = km_summary$time,
survival = km_summary$surv,
std_error = km_summary$std.err,
cumulative_hazard = -log(km_summary$surv)
)
# Display the table
print(km_table)
## time survival std_error cumulative_hazard
## 1 51 0.999735625 0.0001869169 0.0002644104
## 2 52 0.999206874 0.0003236640 0.0007934409
## 3 53 0.997752809 0.0005444111 0.0022497197
## 4 54 0.996563120 0.0006728684 0.0034428000
## 5 55 0.994976867 0.0008128106 0.0050357912
## 6 56 0.991936550 0.0010282481 0.0080961355
## 7 57 0.989424983 0.0011760546 0.0106313294
## 8 58 0.986649042 0.0013195720 0.0134408837
## 9 59 0.982947786 0.0014885079 0.0171992774
## 10 60 0.977528090 0.0017040414 0.0227282511
## 11 61 0.972504957 0.0018800467 0.0278801063
## 12 62 0.965234633 0.0021061310 0.0353840640
## 13 63 0.957435558 0.0023209927 0.0434968620
## 14 64 0.951222736 0.0024765401 0.0500070312
## 15 65 0.939590218 0.0027391664 0.0623114369
## 16 66 0.928883014 0.0029550356 0.0737724750
## 17 67 0.917779247 0.0031583145 0.0857983895
## 18 68 0.900330469 0.0034441156 0.1049933950
## 19 69 0.880634501 0.0037276299 0.1271126075
## 20 70 0.861335096 0.0039734238 0.1492716565
## 21 71 0.846530073 0.0041440823 0.1666095521
## 22 72 0.827297042 0.0043592069 0.1895914679
## 23 73 0.809201288 0.0045566969 0.2117075825
## 24 74 0.787077691 0.0047905858 0.2394283180
## 25 75 0.765453784 0.0050125002 0.2672864396
## 26 76 0.741967296 0.0052449808 0.2984501124
## 27 77 0.719461439 0.0054614216 0.3292523488
## 28 78 0.693773285 0.0057025938 0.3656100502
## 29 79 0.664198580 0.0059683543 0.4091741074
## 30 80 0.632247018 0.0062396024 0.4584751104
## 31 81 0.599358213 0.0064929341 0.5118958409
## 32 82 0.567683814 0.0067220763 0.5661906815
## 33 83 0.527750194 0.0069815691 0.6391322255
## 34 84 0.488945032 0.0071974281 0.7155052043
## 35 85 0.451828464 0.0073661866 0.7944526761
## 36 86 0.411525995 0.0075213240 0.8878830907
## 37 87 0.380339511 0.0076217890 0.9666909762
## 38 88 0.346131025 0.0077177147 1.0609378917
## 39 89 0.310009553 0.0077859995 1.1711521658
## 40 90 0.269870575 0.0078013090 1.3098127864
## 41 91 0.237228649 0.0077601547 1.4387308408
## 42 92 0.199771494 0.0076457768 1.6105810977
## 43 93 0.165664165 0.0074507255 1.7977926398
## 44 94 0.139111721 0.0072121030 1.9724779221
## 45 95 0.108470373 0.0068019035 2.2212782070
## 46 96 0.089966603 0.0064513479 2.4083167536
## 47 97 0.080126506 0.0062588512 2.5241485692
## 48 98 0.067862245 0.0060491167 2.6902754400
## 49 99 0.051331698 0.0056404764 2.9694468228
## 50 100 0.045628176 0.0054733025 3.0872298585
## 51 101 0.040558379 0.0053140526 3.2050128941
## 52 102 0.026243657 0.0047825151 3.6403309654
## 53 103 0.018370560 0.0042940844 3.9970059093
## 54 104 0.015746194 0.0040618760 4.1511565892
## 55 105 0.013996617 0.0039695119 4.2689396248
## 56 106 0.012247040 0.0038395787 4.4024710174
## 57 107 0.008164693 0.0034795613 4.8079361255
## 58 110 0.005443129 0.0032123177 5.2134012337
## 59 113 0.002721564 0.0025066317 5.9065484142
### Making the Death_Time variable
Assignmentdata2 <- Assignmentdata2 %>%
mutate(death_time = ifelse( died0121 ==1,
yrcensor - yrint ,
2021 - yrint ),
d5.event = ifelse(died0121 == 1 & death_time <= 5, 1, 0))
time_fit <- survfit(Surv(death_time, d5.event) ~ 1,
conf.type = "log",
data = Assignmentdata2)
time_fit %>%
ggsurvfit()+
xlim(-1, 6) +
add_censor_mark() +
add_confidence_interval()+
add_quantile(y_value = .975)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_step()`).
###cumulative hazard function
age_fit%>%
broom::tidy() %>%
select(time, estimate) %>%
mutate(cumhaz = -log(estimate)) %>%
mutate(haz = c(0, diff(cumhaz)))%>%
ggplot()+
aes(x=time, y=haz)+
geom_line()+
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))
###Interpretation: In this graph, we can see the hazard ration is 0.0 in age of 50 to 70 which is flat. After age of 100, we can see that the risk of death(risk of event) increasing. If the hazard ratio is increasing which means the risk of event is increasing also. According to the graph, we can conclude that age of 113 is the highest peak to risk of death.
### Alternative Hazrd ratio
p<-age_fit %>%
ggsurvfit(type = "risk")
test<- ggplot_build(p)$data %>%
as.data.frame() %>%
mutate(ft = c(0, diff(y)))%>%
mutate(st = 1-y)%>%
mutate(haz = ft/st)
test %>%
ggplot() +
aes(x = x, y = ft) +
geom_line()+
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
#### Finally, graph the survival function by sex/gender, locality size, and schooling levels.
###For gender variable
age_fit1 <- survfit(Surv(death_age, d.event) ~ female,
data = Assignmentdata2)
age_fit1 %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() +
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))+
labs(title = "Kaplan-Meier Survival by Gender")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_step()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_segment()`).
###Interpretation: In this graph, we can see that the survival probability of the female=0 (Assumed male) is little lower percent of probability of survive compared to Female=1(Assumed female) with their growing age after 70 years. So, we can conclude that Female=1(Assumed female) is better survival history(Life expectancy) after 70 to 105 years age. Then both are same otherwise.
#### Finally, graph the survival function by locality size
age_fit2 <- survfit(Surv(death_age, d.event) ~ locsize01,
data = Assignmentdata2)
age_fit2 %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() +
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))+
labs(title = "Kaplan-Meier Survival by the size of Locality of the Resident")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_segment()`).
###Interpretation: In this graph, we can see that the survival probability of the all groups in terms of locality size is little similar with their growing age. After 75 years, we can see the little discrepancy among them. The less than 2,500 local size have little higher percent of life expectancy among them. So, we can conclude that the less than 2,500 local size have better survival history(Life expectancy) after 75 years age which is not much notable. Then both are same otherwise.
#### Finally, graph the survival function by schooling levels
age_fit3 <- survfit(Surv(death_age, d.event) ~ educlevel,
data = Assignmentdata2)
age_fit3 %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile() +
scale_x_continuous(limits = c(50, 125), breaks = c(50, 75, 100, 125))+
labs(title = "Kaplan-Meier Survival by the Schooling Level")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_segment()`).
###Interpretation:In this graph, we can see that the survival probability of the all groups in terms of education level is little similar with their growing age. However, during 70 to 80, the no schooling group have little high percent of life expectancy compared to other group. After 85 years, we can see the little discrepancy among them. The higher education level have little higher life expectancy among them. So, we can conclude that higher education level have better survival history(Life expectancy) after 75 years age.Then both are same otherwise.
###Do they appear to be different for these groups? Is there a more formal test (i.e., that provides statistical significance) to compare them (e.g., using the sts command) otherwise?
###Log-Rank Test using survdiff()
# Fit the survival model stratified by gender
surv_difffgender <- survdiff(Surv(death_age, d.event) ~ female, data = Assignmentdata2)
# Print the results
print(surv_difffgender)
## Call:
## survdiff(formula = Surv(death_age, d.event) ~ female, data = Assignmentdata2)
##
## n=7565, 2071 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## female=0 3450 1898 1651 37.1 67
## female=1 4115 2096 2343 26.1 67
##
## Chisq= 67 on 1 degrees of freedom, p= 3e-16
###Interpretation: A log rank test calculated to see if there is a difference between male and female in terms of distribution of time to event occurrence.We can conclude that there is a statistically significant difference in survival probability between males and females.The p value is 3e-16 (p<0.001). We can reject the null hypothesis.
### Fit the survival model stratified by Locality Size
surv_diffflocality <- survdiff(Surv(death_age, d.event) ~ locsize01, data = Assignmentdata2)
# Print the results
print(surv_diffflocality)
## Call:
## survdiff(formula = Surv(death_age, d.event) ~ locsize01, data = Assignmentdata2)
##
## n=7565, 2071 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## locsize01= 2,500 - 14,999 739 411 430 0.832 0.991
## locsize01= 15,000 - 99,999 1167 627 602 1.029 1.281
## locsize01=< 2,500 1309 690 758 6.174 8.094
## locsize01=100,000 - + 4350 2266 2204 1.770 4.185
##
## Chisq= 10.4 on 3 degrees of freedom, p= 0.02
###Interpretation: A log rank test calculated to see if there is a difference among Locality ranging from 100,000+, 15,000 – 99,999 , 2,500 – 14,999 , and less than 2,500 in terms of distribution of time to event occurrence.We can conclude that there is a statistically significant difference in survival probability among Locality ranging from 100,000+, 15,000 – 99,999 , 2,500 – 14,999 , and less than 2,500.The p value is 0.02 (p<0.05). We can reject the null hypothesis.
# Fit the survival model stratified by Education Level.
surv_difffeducation <- survdiff(Surv(death_age, d.event) ~ educlevel, data = Assignmentdata2)
# Print the results
print(surv_difffeducation)
## Call:
## survdiff(formula = Surv(death_age, d.event) ~ educlevel, data = Assignmentdata2)
##
## n=7552, 2084 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## educlevel=0 2024 1275 1396 10.419 17.550
## educlevel=15 2676 1461 1439 0.323 0.536
## educlevel=68 1620 759 680 9.070 11.582
## educlevel=919 1232 488 468 0.895 1.073
##
## Chisq= 22.6 on 3 degrees of freedom, p= 5e-05
###Interpretation: A log rank test calculated to see if there is a difference among no formal schooling (0), 1 – 5 years (15), 6-8 years (68), and 9 or more years (because it is topped at 19 years, value is 919 in terms of distribution of time to event occurrence.We can conclude that there is a statistically significant difference in survival probability among no formal schooling (0), 1 – 5 years (15), 6-8 years (68), and 9 or more years (because it is topped at 19 years, value is 919.The p value is no formal schooling (0), 1 – 5 years (15), 6-8 years (68), and 9 or more years (because it is topped at 19 years, value is 919. The p value is (p<0.001). We can reject the null hypothesis.
# Fit the survival model stratified by gender
surv_difffgroup <- survdiff(Surv(death_age, d.event) ~ female+educlevel+locsize01, data = Assignmentdata2)
# Print the results
print(surv_difffgroup)
## Call:
## survdiff(formula = Surv(death_age, d.event) ~ female + educlevel +
## locsize01, data = Assignmentdata2)
##
## n=7552, 2084 observations deleted due to missingness.
##
## N Observed Expected
## female=0, educlevel=0 , locsize01= 2,500 - 14,999 114 78 80.63
## female=0, educlevel=0 , locsize01= 15,000 - 99,999 124 79 75.46
## female=0, educlevel=0 , locsize01=< 2,500 277 173 187.36
## female=0, educlevel=0 , locsize01=100,000 - + 296 198 193.38
## female=0, educlevel=15 , locsize01= 2,500 - 14,999 171 99 97.08
## female=0, educlevel=15 , locsize01= 15,000 - 99,999 191 108 95.95
## female=0, educlevel=15 , locsize01=< 2,500 319 157 157.72
## female=0, educlevel=15 , locsize01=100,000 - + 537 342 259.71
## female=0, educlevel=68 , locsize01= 2,500 - 14,999 52 24 16.79
## female=0, educlevel=68 , locsize01= 15,000 - 99,999 132 63 51.69
## female=0, educlevel=68 , locsize01=< 2,500 85 34 34.96
## female=0, educlevel=68 , locsize01=100,000 - + 506 275 187.69
## female=0, educlevel=919, locsize01= 2,500 - 14,999 30 14 8.99
## female=0, educlevel=919, locsize01= 15,000 - 99,999 93 41 22.98
## female=0, educlevel=919, locsize01=< 2,500 18 7 7.03
## female=0, educlevel=919, locsize01=100,000 - + 497 200 167.83
## female=1, educlevel=0 , locsize01= 2,500 - 14,999 144 86 108.00
## female=1, educlevel=0 , locsize01= 15,000 - 99,999 195 121 141.44
## female=1, educlevel=0 , locsize01=< 2,500 320 193 225.69
## female=1, educlevel=0 , locsize01=100,000 - + 554 347 383.63
## female=1, educlevel=15 , locsize01= 2,500 - 14,999 159 82 91.24
## female=1, educlevel=15 , locsize01= 15,000 - 99,999 250 129 133.16
## female=1, educlevel=15 , locsize01=< 2,500 229 107 120.66
## female=1, educlevel=15 , locsize01=100,000 - + 820 437 483.90
## female=1, educlevel=68 , locsize01= 2,500 - 14,999 51 21 18.35
## female=1, educlevel=68 , locsize01= 15,000 - 99,999 124 59 55.54
## female=1, educlevel=68 , locsize01=< 2,500 47 16 19.18
## female=1, educlevel=68 , locsize01=100,000 - + 623 267 296.23
## female=1, educlevel=919, locsize01= 2,500 - 14,999 16 6 7.71
## female=1, educlevel=919, locsize01= 15,000 - 99,999 55 25 23.60
## female=1, educlevel=919, locsize01=< 2,500 12 1 2.93
## female=1, educlevel=919, locsize01=100,000 - + 511 194 226.46
## (O-E)^2/E (O-E)^2/V
## female=0, educlevel=0 , locsize01= 2,500 - 14,999 0.08547 9.40e-02
## female=0, educlevel=0 , locsize01= 15,000 - 99,999 0.16654 1.80e-01
## female=0, educlevel=0 , locsize01=< 2,500 1.10089 1.24e+00
## female=0, educlevel=0 , locsize01=100,000 - + 0.11020 1.24e-01
## female=0, educlevel=15 , locsize01= 2,500 - 14,999 0.03789 4.09e-02
## female=0, educlevel=15 , locsize01= 15,000 - 99,999 1.51333 1.64e+00
## female=0, educlevel=15 , locsize01=< 2,500 0.00332 3.63e-03
## female=0, educlevel=15 , locsize01=100,000 - + 26.07178 2.94e+01
## female=0, educlevel=68 , locsize01= 2,500 - 14,999 3.09603 3.22e+00
## female=0, educlevel=68 , locsize01= 15,000 - 99,999 2.47385 2.61e+00
## female=0, educlevel=68 , locsize01=< 2,500 0.02654 2.81e-02
## female=0, educlevel=68 , locsize01=100,000 - + 40.61312 4.46e+01
## female=0, educlevel=919, locsize01= 2,500 - 14,999 2.79308 2.88e+00
## female=0, educlevel=919, locsize01= 15,000 - 99,999 14.13056 1.46e+01
## female=0, educlevel=919, locsize01=< 2,500 0.00012 1.27e-04
## female=0, educlevel=919, locsize01=100,000 - + 6.16530 6.74e+00
## female=1, educlevel=0 , locsize01= 2,500 - 14,999 4.48034 4.95e+00
## female=1, educlevel=0 , locsize01= 15,000 - 99,999 2.95485 3.29e+00
## female=1, educlevel=0 , locsize01=< 2,500 4.73464 5.38e+00
## female=1, educlevel=0 , locsize01=100,000 - + 3.49767 4.15e+00
## female=1, educlevel=15 , locsize01= 2,500 - 14,999 0.93584 1.01e+00
## female=1, educlevel=15 , locsize01= 15,000 - 99,999 0.13013 1.42e-01
## female=1, educlevel=15 , locsize01=< 2,500 1.54610 1.68e+00
## female=1, educlevel=15 , locsize01=100,000 - + 4.54589 5.49e+00
## female=1, educlevel=68 , locsize01= 2,500 - 14,999 0.38140 3.95e-01
## female=1, educlevel=68 , locsize01= 15,000 - 99,999 0.21617 2.32e-01
## female=1, educlevel=68 , locsize01=< 2,500 0.52761 5.50e-01
## female=1, educlevel=68 , locsize01=100,000 - + 2.88479 3.28e+00
## female=1, educlevel=919, locsize01= 2,500 - 14,999 0.38026 4.06e-01
## female=1, educlevel=919, locsize01= 15,000 - 99,999 0.08251 8.71e-02
## female=1, educlevel=919, locsize01=< 2,500 1.27355 1.30e+00
## female=1, educlevel=919, locsize01=100,000 - + 4.65315 5.18e+00
##
## Chisq= 140 on 31 degrees of freedom, p= 9e-16
###Interpretation:A log rank test calculated to see if there is a difference among gender, Locality size, and educational Level in terms of distribution of time to event occurrence.We can conclude that there is a statistically significant difference in survival probability based on socioeconomic factors (p<0.001). We can reject the null hypothesis.So, with their survival time, the socioeconomic factors also changed over the time and impacts on their survivality.