#1. Load Data and create Variables

library(readr)
mhas <- read_csv("MHAS0121.csv")
## Rows: 9636 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): locsize01, died18, died21
## dbl (36): id, perwght01, mobirth, yrbirth, female, moint, yrint, schooling, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(survival)
library(asaur)
library(eha)
library(flextable)
library(officer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom) 
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'ggpubr'
## 
## The following objects are masked from 'package:flextable':
## 
##     border, font, rotate
## 
## 
## Attaching package: 'survminer'
## 
## The following object is masked from 'package:survival':
## 
##     myeloma
summary(mhas$yrcensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2001    2008    2018    2015    2021    2022     847
summary(mhas$yrint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2001    2001    2001    2001    2001    2001
summary(mhas$mocensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   9.000   8.822  12.000  99.000     102
summary(mhas$moint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   7.000   7.000   7.494   8.000  11.000
table(mhas$mocensor)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   88   99 
## 1387  327  385  333  343  546  874  435  387  481 1171 2755   12   98
mhas <- mhas[!is.na(mhas$yrcensor), ]
mhas <- mhas[!is.na(mhas$mocensor), ]

mhas <- mhas[mhas$mocensor != 88, ]
mhas <- mhas[mhas$mocensor != 99, ]


mhas <- mhas[mhas$yrcensor > mhas$yrint | (mhas$yrcensor == mhas$yrint & mhas$mocensor >= mhas$moint), ]
mhas$time_exposed <- with(mhas, ((yrcensor + mocensor/12) - (yrint + moint/12)) * 12 + 1)


summary(mhas$time_exposed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    89.0   201.0   165.3   245.0   248.0
summary(mhas$died0121)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  1.0000  0.5223  1.0000  1.0000    1224
mhas <- mhas[!is.na(mhas$died0121), ]

mhas$event_occurred <- mhas$died0121




summary(mhas$ageint)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   50.50   55.25   61.25   63.42   69.50  106.08
summary(mhas$agecensor)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   51.17   72.75   77.58   78.20   83.67  114.42
mhas$entry_time <- mhas$ageint
mhas$exit_time <-mhas$agecensor

#2. Problem Set 3 ## Part I

##Part I.   Multivariate parametric proportional hazards models

### a.  Estimate parametric proportional hazards model with an exponential (i.e., constant) with no covariates. Instead of hazard ratios, obtain coefficients. Exponentiate the intercept of the model. (Later on, we will compare this to the crude death rate for people 50+ we will calculate in a later problem set).



model <- survreg(Surv(time_exposed, event_occurred) ~ 1, data = mhas, dist = "exponential")
summary(model)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ 1, data = mhas, 
##     dist = "exponential")
##             Value Std. Error   z      p
## (Intercept) 5.834      0.016 365 <2e-16
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -26679.5   Loglik(intercept only)= -26679.5
## Number of Newton-Raphson Iterations: 4 
## n= 7475
intercept_log <- coef(model)

baseline_hazard_rate <- exp(intercept_log)
baseline_hazard_rate
## (Intercept) 
##    341.6837
###b.   Estimate another exponential, proportional hazards model, using FEMALE as the only covariate. Obtain coefficients, not hazard ratios. 


#Interpret the coefficient for FEMALE after exponentiating it. Exponentiate the intercept of this model and interpret this figure.

model_female <- survreg(Surv(time_exposed, event_occurred) ~ female, data = mhas, dist = "exponential")
summary(model_female)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ female, 
##     data = mhas, dist = "exponential")
##              Value Std. Error      z      p
## (Intercept) 5.7728     0.0232 248.77 <2e-16
## female      0.1135     0.0320   3.54  4e-04
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -26673.2   Loglik(intercept only)= -26679.5
##  Chisq= 12.52 on 1 degrees of freedom, p= 4e-04 
## Number of Newton-Raphson Iterations: 4 
## n= 7475
coef_female_log <- coef(model_female)

exp_coef_female <- exp(coef_female_log)
exp_coef_female
## (Intercept)      female 
##  321.423802    1.120213
#Sum the constant and the coefficient for female, and exponentiate this number. 

intercept_log_female <- coef(model_female)

sum_log <- intercept_log_female + coef_female_log

hazard_rate_female <- exp(sum_log)
hazard_rate_female
##  (Intercept)       female 
## 1.033133e+05 1.254876e+00
###c.   Estimate another PH parametric model controlling for FEMALE, SCHOOLING (or categories of EDUCLEVEL, whatever you prefer, justifying your choice), and LOCSIZE01. Assuming the baseline hazard is distributed…

# Exponential

model_exp <- survreg(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas, dist = "exponential")
summary(model_exp)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas, dist = "exponential")
##                             Value Std. Error      z       p
## (Intercept)               5.64443    0.04212 134.00 < 2e-16
## female                    0.17639    0.03248   5.43 5.6e-08
## schooling                 0.06251    0.00442  14.13 < 2e-16
## locsize01100,000 - +     -0.21179    0.04600  -4.60 4.1e-06
## locsize0115,000 - 99,999 -0.15607    0.05638  -2.77  0.0056
## locsize012,500 - 14,999  -0.13834    0.06332  -2.18  0.0289
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -26500.9   Loglik(intercept only)= -26617.6
##  Chisq= 233.35 on 5 degrees of freedom, p= 2e-48 
## Number of Newton-Raphson Iterations: 4 
## n=7463 (12 observations deleted due to missingness)
# Gompertz

library(flexsurv)
## 
## Attaching package: 'flexsurv'
## The following objects are masked from 'package:eha':
## 
##     dgompertz, dllogis, hgompertz, Hgompertz, hllogis, Hllogis, hlnorm,
##     Hlnorm, hweibull, Hweibull, pgompertz, pllogis, qgompertz, qllogis,
##     rgompertz, rllogis
model_gompertz <- flexsurvreg(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas, dist = "gompertz")
summary(model_gompertz)
## female=0.544151145651883,schooling=4.28018223234624,locsize01100,000 - +=0.57523784001072,locsize0115,000 - 99,999=0.154629505560766,locsize012,500 - 14,999=0.0972799142436018 
##     time       est       lcl       ucl
## 1      1 0.9978860 0.9977595 0.9980190
## 2      2 0.9957706 0.9955185 0.9960356
## 3      2 0.9957706 0.9955185 0.9960356
## 4      3 0.9936539 0.9932771 0.9940498
## 5      3 0.9936539 0.9932771 0.9940498
## 6      4 0.9915359 0.9910351 0.9920616
## 7      5 0.9894165 0.9887927 0.9900709
## 8      5 0.9894165 0.9887927 0.9900709
## 9      6 0.9872959 0.9865499 0.9880778
## 10     6 0.9872959 0.9865499 0.9880778
## 11     7 0.9851740 0.9843066 0.9860824
## 12     8 0.9830508 0.9820629 0.9840846
## 13     8 0.9830508 0.9820629 0.9840846
## 14     9 0.9809264 0.9798188 0.9820844
## 15     9 0.9809264 0.9798188 0.9820844
## 16    10 0.9788007 0.9775744 0.9800819
## 17    11 0.9766738 0.9753296 0.9780770
## 18    11 0.9766738 0.9753296 0.9780770
## 19    12 0.9745457 0.9730844 0.9760697
## 20    12 0.9745457 0.9730844 0.9760697
## 21    13 0.9724163 0.9708413 0.9740602
## 22    14 0.9702858 0.9685991 0.9720483
## 23    14 0.9702858 0.9685991 0.9720483
## 24    15 0.9681541 0.9663570 0.9700341
## 25    15 0.9681541 0.9663570 0.9700341
## 26    16 0.9660212 0.9641153 0.9680177
## 27    17 0.9638871 0.9618737 0.9659989
## 28    17 0.9638871 0.9618737 0.9659989
## 29    18 0.9617519 0.9596323 0.9639779
## 30    18 0.9617519 0.9596323 0.9639779
## 31    19 0.9596155 0.9573915 0.9619546
## 32    20 0.9574781 0.9551532 0.9599297
## 33    20 0.9574781 0.9551532 0.9599297
## 34    21 0.9553395 0.9529153 0.9579081
## 35    21 0.9553395 0.9529153 0.9579081
## 36    22 0.9531998 0.9506780 0.9558790
## 37    23 0.9510590 0.9484344 0.9538478
## 38    23 0.9510590 0.9484344 0.9538478
## 39    24 0.9489172 0.9461902 0.9518160
## 40    24 0.9489172 0.9461902 0.9518160
## 41    25 0.9467743 0.9439492 0.9497864
## 42    26 0.9446303 0.9417064 0.9477562
## 43    26 0.9446303 0.9417064 0.9477562
## 44    27 0.9424854 0.9394584 0.9457243
## 45    27 0.9424854 0.9394584 0.9457243
## 46    28 0.9403394 0.9372125 0.9436909
## 47    29 0.9381923 0.9349678 0.9416559
## 48    29 0.9381923 0.9349678 0.9416559
## 49    30 0.9360443 0.9327232 0.9396156
## 50    30 0.9360443 0.9327232 0.9396156
## 51    31 0.9338953 0.9304785 0.9375687
## 52    32 0.9317454 0.9282338 0.9355195
## 53    32 0.9317454 0.9282338 0.9355195
## 54    33 0.9295945 0.9259891 0.9334681
## 55    33 0.9295945 0.9259891 0.9334681
## 56    34 0.9274426 0.9237446 0.9314151
## 57    35 0.9252898 0.9215003 0.9293651
## 58    35 0.9252898 0.9215003 0.9293651
## 59    36 0.9231361 0.9192560 0.9273102
## 60    36 0.9231361 0.9192560 0.9273102
## 61    37 0.9209815 0.9170118 0.9252485
## 62    38 0.9188260 0.9147677 0.9231846
## 63    38 0.9188260 0.9147677 0.9231846
## 64    39 0.9166696 0.9125260 0.9211185
## 65    39 0.9166696 0.9125260 0.9211185
## 66    40 0.9145124 0.9102889 0.9190526
## 67    41 0.9123543 0.9080523 0.9169895
## 68    41 0.9123543 0.9080523 0.9169895
## 69    42 0.9101953 0.9058161 0.9149247
## 70    42 0.9101953 0.9058161 0.9149247
## 71    43 0.9080356 0.9035804 0.9128581
## 72    44 0.9058750 0.9013453 0.9107899
## 73    44 0.9058750 0.9013453 0.9107899
## 74    45 0.9037137 0.8990982 0.9087155
## 75    45 0.9037137 0.8990982 0.9087155
## 76    46 0.9015516 0.8968455 0.9066446
## 77    47 0.8993887 0.8945926 0.9045731
## 78    47 0.8993887 0.8945926 0.9045731
## 79    48 0.8972250 0.8923395 0.9024921
## 80    48 0.8972250 0.8923395 0.9024921
## 81    49 0.8950607 0.8900973 0.9004092
## 82    50 0.8928956 0.8878637 0.8983305
## 83    50 0.8928956 0.8878637 0.8983305
## 84    51 0.8907297 0.8856307 0.8962532
## 85    51 0.8907297 0.8856307 0.8962532
## 86    52 0.8885632 0.8833983 0.8941744
## 87    53 0.8863960 0.8811607 0.8921014
## 88    53 0.8863960 0.8811607 0.8921014
## 89    54 0.8842282 0.8789134 0.8900172
## 90    54 0.8842282 0.8789134 0.8900172
## 91    55 0.8820597 0.8766595 0.8879310
## 92    56 0.8798905 0.8744013 0.8858446
## 93    56 0.8798905 0.8744013 0.8858446
## 94    57 0.8777208 0.8721608 0.8837582
## 95    57 0.8777208 0.8721608 0.8837582
## 96    58 0.8755504 0.8699206 0.8816628
## 97    59 0.8733794 0.8676767 0.8795706
## 98    59 0.8733794 0.8676767 0.8795706
## 99    60 0.8712079 0.8654295 0.8774769
## 100   61 0.8690357 0.8631825 0.8753816
## 101   62 0.8668631 0.8609356 0.8732850
## 102   62 0.8668631 0.8609356 0.8732850
## 103   63 0.8646899 0.8586889 0.8711868
## 104   63 0.8646899 0.8586889 0.8711868
## 105   64 0.8625162 0.8564423 0.8690872
## 106   65 0.8603419 0.8541964 0.8669862
## 107   65 0.8603419 0.8541964 0.8669862
## 108   66 0.8581672 0.8519512 0.8648838
## 109   66 0.8581672 0.8519512 0.8648838
## 110   67 0.8559920 0.8497063 0.8627800
## 111   68 0.8538164 0.8474616 0.8606784
## 112   68 0.8538164 0.8474616 0.8606784
## 113   69 0.8516403 0.8452172 0.8585867
## 114   69 0.8516403 0.8452172 0.8585867
## 115   70 0.8494638 0.8429731 0.8564938
## 116   71 0.8472868 0.8407446 0.8543911
## 117   71 0.8472868 0.8407446 0.8543911
## 118   72 0.8451095 0.8385253 0.8522750
## 119   72 0.8451095 0.8385253 0.8522750
## 120   73 0.8429318 0.8363070 0.8501720
## 121   74 0.8407537 0.8340897 0.8480745
## 122   74 0.8407537 0.8340897 0.8480745
## 123   75 0.8385753 0.8318581 0.8459761
## 124   75 0.8385753 0.8318581 0.8459761
## 125   76 0.8363965 0.8296138 0.8438717
## 126   77 0.8342174 0.8273697 0.8417740
## 127   77 0.8342174 0.8273697 0.8417740
## 128   78 0.8320380 0.8251264 0.8396756
## 129   78 0.8320380 0.8251264 0.8396756
## 130   79 0.8298583 0.8228835 0.8375699
## 131   80 0.8276783 0.8206411 0.8354558
## 132   80 0.8276783 0.8206411 0.8354558
## 133   81 0.8254981 0.8183990 0.8333406
## 134   81 0.8254981 0.8183990 0.8333406
## 135   82 0.8233176 0.8161629 0.8312243
## 136   83 0.8211369 0.8139332 0.8291070
## 137   83 0.8211369 0.8139332 0.8291070
## 138   84 0.8189560 0.8117224 0.8269883
## 139   84 0.8189560 0.8117224 0.8269883
## 140   85 0.8167749 0.8095127 0.8248681
## 141   86 0.8145936 0.8072763 0.8227469
## 142   86 0.8145936 0.8072763 0.8227469
## 143   87 0.8124122 0.8050194 0.8206124
## 144   87 0.8124122 0.8050194 0.8206124
## 145   88 0.8102306 0.8027898 0.8184667
## 146   89 0.8080488 0.8005560 0.8163356
## 147   89 0.8080488 0.8005560 0.8163356
## 148   90 0.8058670 0.7983163 0.8142101
## 149   90 0.8058670 0.7983163 0.8142101
## 150   91 0.8036851 0.7960771 0.8120747
## 151   92 0.8015030 0.7938384 0.8099105
## 152   92 0.8015030 0.7938384 0.8099105
## 153   93 0.7993209 0.7916003 0.8078004
## 154   93 0.7993209 0.7916003 0.8078004
## 155   94 0.7971388 0.7893627 0.8056907
## 156   95 0.7949566 0.7871256 0.8035709
## 157   95 0.7949566 0.7871256 0.8035709
## 158   96 0.7927744 0.7848892 0.8014411
## 159   96 0.7927744 0.7848892 0.8014411
## 160   97 0.7905922 0.7826533 0.7993105
## 161   98 0.7884101 0.7804183 0.7971786
## 162   98 0.7884101 0.7804183 0.7971786
## 163   99 0.7862279 0.7781845 0.7950459
## 164   99 0.7862279 0.7781845 0.7950459
## 165  100 0.7840458 0.7759516 0.7929124
## 166  101 0.7818638 0.7737433 0.7907782
## 167  101 0.7818638 0.7737433 0.7907782
## 168  102 0.7796819 0.7715461 0.7886427
## 169  102 0.7796819 0.7715461 0.7886427
## 170  103 0.7775000 0.7693502 0.7864987
## 171  104 0.7753183 0.7671340 0.7843258
## 172  104 0.7753183 0.7671340 0.7843258
## 173  105 0.7731367 0.7648983 0.7821513
## 174  105 0.7731367 0.7648983 0.7821513
## 175  106 0.7709553 0.7626632 0.7799754
## 176  107 0.7687740 0.7604287 0.7777981
## 177  107 0.7687740 0.7604287 0.7777981
## 178  108 0.7665929 0.7581950 0.7756196
## 179  108 0.7665929 0.7581950 0.7756196
## 180  109 0.7644120 0.7559619 0.7734399
## 181  110 0.7622313 0.7537295 0.7712589
## 182  110 0.7622313 0.7537295 0.7712589
## 183  111 0.7600509 0.7514978 0.7690767
## 184  111 0.7600509 0.7514978 0.7690767
## 185  112 0.7578707 0.7492669 0.7669155
## 186  113 0.7556908 0.7470367 0.7647643
## 187  113 0.7556908 0.7470367 0.7647643
## 188  114 0.7535112 0.7448073 0.7626124
## 189  114 0.7535112 0.7448073 0.7626124
## 190  115 0.7513319 0.7425787 0.7604600
## 191  116 0.7491529 0.7403509 0.7583069
## 192  116 0.7491529 0.7403509 0.7583069
## 193  117 0.7469742 0.7381239 0.7561534
## 194  117 0.7469742 0.7381239 0.7561534
## 195  118 0.7447959 0.7358978 0.7539992
## 196  119 0.7426180 0.7336725 0.7518420
## 197  119 0.7426180 0.7336725 0.7518420
## 198  120 0.7404404 0.7314482 0.7496673
## 199  120 0.7404404 0.7314482 0.7496673
## 200  121 0.7382633 0.7292339 0.7474920
## 201  122 0.7360866 0.7270252 0.7453666
## 202  122 0.7360866 0.7270252 0.7453666
## 203  123 0.7339103 0.7248177 0.7432416
## 204  123 0.7339103 0.7248177 0.7432416
## 205  124 0.7317345 0.7226113 0.7411222
## 206  125 0.7295592 0.7204064 0.7390029
## 207  125 0.7295592 0.7204064 0.7390029
## 208  126 0.7273844 0.7182027 0.7368763
## 209  126 0.7273844 0.7182027 0.7368763
## 210  127 0.7252101 0.7160003 0.7347087
## 211  128 0.7230363 0.7137991 0.7325405
## 212  128 0.7230363 0.7137991 0.7325405
## 213  129 0.7208631 0.7115990 0.7303988
## 214  129 0.7208631 0.7115990 0.7303988
## 215  130 0.7186905 0.7094000 0.7282754
## 216  131 0.7165184 0.7072072 0.7260830
## 217  131 0.7165184 0.7072072 0.7260830
## 218  132 0.7143469 0.7050301 0.7238781
## 219  132 0.7143469 0.7050301 0.7238781
## 220  133 0.7121761 0.7028435 0.7217293
## 221  134 0.7100059 0.7006583 0.7196064
## 222  134 0.7100059 0.7006583 0.7196064
## 223  135 0.7078363 0.6984745 0.7174154
## 224  135 0.7078363 0.6984745 0.7174154
## 225  136 0.7056675 0.6962921 0.7152237
## 226  137 0.7034993 0.6941112 0.7130315
## 227  137 0.7034993 0.6941112 0.7130315
## 228  138 0.7013318 0.6919318 0.7108497
## 229  138 0.7013318 0.6919318 0.7108497
## 230  140 0.6969991 0.6875774 0.7065406
## 231  146 0.6840200 0.6745396 0.6935642
## 232  146 0.6840200 0.6745396 0.6935642
## 233  148 0.6797005 0.6701060 0.6892434
## 234  149 0.6775420 0.6678908 0.6870827
## 235  149 0.6775420 0.6678908 0.6870827
## 236  150 0.6753846 0.6657245 0.6849441
## 237  161 0.6517191 0.6420383 0.6616482
## 238  162 0.6495743 0.6399536 0.6595332
## 239  162 0.6495743 0.6399536 0.6595332
## 240  163 0.6474306 0.6377829 0.6573880
## 241  164 0.6452882 0.6356789 0.6551990
## 242  164 0.6452882 0.6356789 0.6551990
## 243  165 0.6431469 0.6335770 0.6530122
## 244  165 0.6431469 0.6335770 0.6530122
## 245  166 0.6410069 0.6314787 0.6509251
## 246  167 0.6388681 0.6293827 0.6487954
## 247  167 0.6388681 0.6293827 0.6487954
## 248  168 0.6367306 0.6272344 0.6466972
## 249  168 0.6367306 0.6272344 0.6466972
## 250  169 0.6345944 0.6250416 0.6445978
## 251  170 0.6324595 0.6228503 0.6424993
## 252  170 0.6324595 0.6228503 0.6424993
## 253  171 0.6303259 0.6206603 0.6403976
## 254  171 0.6303259 0.6206603 0.6403976
## 255  172 0.6281937 0.6184347 0.6382435
## 256  173 0.6260629 0.6162480 0.6361207
## 257  173 0.6260629 0.6162480 0.6361207
## 258  174 0.6239334 0.6140963 0.6339783
## 259  175 0.6218053 0.6119173 0.6319000
## 260  179 0.6133074 0.6035029 0.6234583
## 261  196 0.5774737 0.5671515 0.5877691
## 262  197 0.5753813 0.5650235 0.5857148
## 263  198 0.5732908 0.5629180 0.5836620
## 264  198 0.5732908 0.5629180 0.5836620
## 265  199 0.5712022 0.5608713 0.5816109
## 266  200 0.5691154 0.5587696 0.5795615
## 267  200 0.5691154 0.5587696 0.5795615
## 268  201 0.5670305 0.5566452 0.5775137
## 269  201 0.5670305 0.5566452 0.5775137
## 270  202 0.5649476 0.5545653 0.5754677
## 271  203 0.5628666 0.5524892 0.5734233
## 272  203 0.5628666 0.5524892 0.5734233
## 273  204 0.5607876 0.5504157 0.5713807
## 274  204 0.5607876 0.5504157 0.5713807
## 275  205 0.5587105 0.5483447 0.5693311
## 276  206 0.5566354 0.5462713 0.5672799
## 277  206 0.5566354 0.5462713 0.5672799
## 278  207 0.5545624 0.5441753 0.5652305
## 279  207 0.5545624 0.5441753 0.5652305
## 280  208 0.5524914 0.5421188 0.5631827
## 281  209 0.5504224 0.5400670 0.5611368
## 282  209 0.5504224 0.5400670 0.5611368
## 283  210 0.5483556 0.5380262 0.5590926
## 284  210 0.5483556 0.5380262 0.5590926
## 285  211 0.5462908 0.5359701 0.5570501
## 286  212 0.5442281 0.5339249 0.5550095
## 287  212 0.5442281 0.5339249 0.5550095
## 288  213 0.5421676 0.5318825 0.5529934
## 289  213 0.5421676 0.5318825 0.5529934
## 290  214 0.5401092 0.5298163 0.5510231
## 291  215 0.5380530 0.5277690 0.5490355
## 292  216 0.5359990 0.5257270 0.5470168
## 293  217 0.5339472 0.5236838 0.5449985
## 294  218 0.5318977 0.5216255 0.5429849
## 295  218 0.5318977 0.5216255 0.5429849
## 296  219 0.5298504 0.5195556 0.5409863
## 297  219 0.5298504 0.5195556 0.5409863
## 298  220 0.5278053 0.5174895 0.5389585
## 299  221 0.5257626 0.5154258 0.5369503
## 300  221 0.5257626 0.5154258 0.5369503
## 301  242 0.4834273 0.4723510 0.4952719
## 302  242 0.4834273 0.4723510 0.4952719
## 303  243 0.4814397 0.4702397 0.4932875
## 304  243 0.4814397 0.4702397 0.4932875
## 305  244 0.4794549 0.4682065 0.4913247
## 306  245 0.4774728 0.4662321 0.4893690
## 307  245 0.4774728 0.4662321 0.4893690
## 308  246 0.4754934 0.4642393 0.4874158
## 309  246 0.4754934 0.4642393 0.4874158
## 310  247 0.4735169 0.4621664 0.4855135
## 311  248 0.4715431 0.4601385 0.4836078
# Weibull

model_weibull <- survreg(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas, dist = "weibull")
summary(model_weibull)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas, dist = "weibull")
##                             Value Std. Error      z       p
## (Intercept)               5.58183    0.03503 159.33 < 2e-16
## female                    0.15176    0.02689   5.64 1.7e-08
## schooling                 0.05354    0.00372  14.40 < 2e-16
## locsize01100,000 - +     -0.18185    0.03805  -4.78 1.8e-06
## locsize0115,000 - 99,999 -0.13379    0.04659  -2.87  0.0041
## locsize012,500 - 14,999  -0.11934    0.05231  -2.28  0.0225
## Log(scale)               -0.19140    0.01476 -12.97 < 2e-16
## 
## Scale= 0.826 
## 
## Weibull distribution
## Loglik(model)= -26422.5   Loglik(intercept only)= -26547.6
##  Chisq= 250.2 on 5 degrees of freedom, p= 5e-52 
## Number of Newton-Raphson Iterations: 5 
## n=7463 (12 observations deleted due to missingness)
# Log logistic

model_loglogistic <- survreg(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas, dist = "loglogistic")
summary(model_loglogistic)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas, dist = "loglogistic")
##                             Value Std. Error      z       p
## (Intercept)               5.28347    0.03953 133.67 < 2e-16
## female                    0.16658    0.03042   5.48 4.3e-08
## schooling                 0.05818    0.00397  14.66 < 2e-16
## locsize01100,000 - +     -0.20983    0.04301  -4.88 1.1e-06
## locsize0115,000 - 99,999 -0.14261    0.05262  -2.71  0.0067
## locsize012,500 - 14,999  -0.13227    0.05937  -2.23  0.0259
## Log(scale)               -0.35098    0.01425 -24.63 < 2e-16
## 
## Scale= 0.704 
## 
## Log logistic distribution
## Loglik(model)= -26458.3   Loglik(intercept only)= -26580.6
##  Chisq= 244.56 on 5 degrees of freedom, p= 8.1e-51 
## Number of Newton-Raphson Iterations: 4 
## n=7463 (12 observations deleted due to missingness)
# Log Normal

model_lognormal <- survreg(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas, dist = "lognormal")
summary(model_lognormal)
## 
## Call:
## survreg(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas, dist = "lognormal")
##                             Value Std. Error      z       p
## (Intercept)               5.29451    0.04534 116.77 < 2e-16
## female                    0.18573    0.03456   5.37 7.7e-08
## schooling                 0.06491    0.00437  14.86 < 2e-16
## locsize01100,000 - +     -0.24280    0.04898  -4.96 7.2e-07
## locsize0115,000 - 99,999 -0.15424    0.06015  -2.56    0.01
## locsize012,500 - 14,999  -0.13286    0.06789  -1.96    0.05
## Log(scale)                0.28647    0.01240  23.11 < 2e-16
## 
## Scale= 1.33 
## 
## Log Normal distribution
## Loglik(model)= -26590.8   Loglik(intercept only)= -26710.3
##  Chisq= 238.99 on 5 degrees of freedom, p= 1.3e-49 
## Number of Newton-Raphson Iterations: 3 
## n=7463 (12 observations deleted due to missingness)

Part II

## Part II Cox regression

###a.Estimate a non-parametric model via Cox regression. Add these HRs to the table.


cox_model <- coxph(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas)
summary(cox_model)
## Call:
## coxph(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas)
## 
##   n= 7463, number of events= 3894 
##    (12 observations deleted due to missingness)
## 
##                               coef exp(coef)  se(coef)       z Pr(>|z|)    
## female                   -0.185390  0.830780  0.032496  -5.705 1.16e-08 ***
## schooling                -0.065714  0.936399  0.004442 -14.795  < 2e-16 ***
## locsize01100,000 - +      0.216463  1.241677  0.046011   4.705 2.54e-06 ***
## locsize0115,000 - 99,999  0.163791  1.177968  0.056379   2.905  0.00367 ** 
## locsize012,500 - 14,999   0.145022  1.156064  0.063324   2.290  0.02201 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                          exp(coef) exp(-coef) lower .95 upper .95
## female                      0.8308     1.2037    0.7795    0.8854
## schooling                   0.9364     1.0679    0.9283    0.9446
## locsize01100,000 - +        1.2417     0.8054    1.1346    1.3589
## locsize0115,000 - 99,999    1.1780     0.8489    1.0547    1.3156
## locsize012,500 - 14,999     1.1561     0.8650    1.0211    1.3088
## 
## Concordance= 0.578  (se = 0.005 )
## Likelihood ratio test= 257.1  on 5 df,   p=<2e-16
## Wald test            = 237.3  on 5 df,   p=<2e-16
## Score (logrank) test = 238.8  on 5 df,   p=<2e-16
coef_cox <- coef(cox_model)
hr_cox <- exp(coef_cox)

hr_cox
##                   female                schooling     locsize01100,000 - + 
##                0.8307803                0.9363990                1.2416773 
## locsize0115,000 - 99,999  locsize012,500 - 14,999 
##                1.1779677                1.1560645

Part III

##III. Prediction, diagnostics, time-varying covariates.

### a. Choosing whatever model you prefer from I or II, assess whether the proportionality of hazards assumption is violated for FEMALE and SCHOOLING (separately). Does it hold for either/both? 


cox_model <- coxph(Surv(time_exposed, event_occurred) ~ female + schooling + locsize01, data = mhas)
summary(cox_model)
## Call:
## coxph(formula = Surv(time_exposed, event_occurred) ~ female + 
##     schooling + locsize01, data = mhas)
## 
##   n= 7463, number of events= 3894 
##    (12 observations deleted due to missingness)
## 
##                               coef exp(coef)  se(coef)       z Pr(>|z|)    
## female                   -0.185390  0.830780  0.032496  -5.705 1.16e-08 ***
## schooling                -0.065714  0.936399  0.004442 -14.795  < 2e-16 ***
## locsize01100,000 - +      0.216463  1.241677  0.046011   4.705 2.54e-06 ***
## locsize0115,000 - 99,999  0.163791  1.177968  0.056379   2.905  0.00367 ** 
## locsize012,500 - 14,999   0.145022  1.156064  0.063324   2.290  0.02201 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                          exp(coef) exp(-coef) lower .95 upper .95
## female                      0.8308     1.2037    0.7795    0.8854
## schooling                   0.9364     1.0679    0.9283    0.9446
## locsize01100,000 - +        1.2417     0.8054    1.1346    1.3589
## locsize0115,000 - 99,999    1.1780     0.8489    1.0547    1.3156
## locsize012,500 - 14,999     1.1561     0.8650    1.0211    1.3088
## 
## Concordance= 0.578  (se = 0.005 )
## Likelihood ratio test= 257.1  on 5 df,   p=<2e-16
## Wald test            = 237.3  on 5 df,   p=<2e-16
## Score (logrank) test = 238.8  on 5 df,   p=<2e-16
ph_test <- cox.zph(cox_model)

print(ph_test)
##           chisq df      p
## female     0.10  1 0.7517
## schooling  9.11  1 0.0025
## locsize01  2.07  3 0.5583
## GLOBAL    13.60  5 0.0184
plot(ph_test)

###b. Estimate a “Cox-stratified” model using a new dummy variable that divides people into whether they have less than 9 vs. 9 or more years of formal schooling as the stratification variable. Would your conclusions about the effect of education (linearly anyway) ###change doing this?

mhas$schooling_strata <- ifelse(mhas$schooling < 9, "<9 years", ">=9 years")


cox_stratified_model <- coxph(Surv(time_exposed, event_occurred) ~ female + locsize01 + strata(schooling_strata), data = mhas)
summary(cox_stratified_model)
## Call:
## coxph(formula = Surv(time_exposed, event_occurred) ~ female + 
##     locsize01 + strata(schooling_strata), data = mhas)
## 
##   n= 7463, number of events= 3894 
##    (12 observations deleted due to missingness)
## 
##                              coef exp(coef) se(coef)      z Pr(>|z|)    
## female                   -0.14619   0.86399  0.03235 -4.519  6.2e-06 ***
## locsize01100,000 - +      0.10516   1.11089  0.04508  2.333   0.0197 *  
## locsize0115,000 - 99,999  0.08491   1.08862  0.05601  1.516   0.1296    
## locsize012,500 - 14,999   0.10939   1.11560  0.06327  1.729   0.0838 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                          exp(coef) exp(-coef) lower .95 upper .95
## female                       0.864     1.1574    0.8109    0.9205
## locsize01100,000 - +         1.111     0.9002    1.0169    1.2135
## locsize0115,000 - 99,999     1.089     0.9186    0.9754    1.2149
## locsize012,500 - 14,999      1.116     0.8964    0.9855    1.2629
## 
## Concordance= 0.521  (se = 0.005 )
## Likelihood ratio test= 24.38  on 4 df,   p=7e-05
## Wald test            = 24.38  on 4 df,   p=7e-05
## Score (logrank) test = 24.42  on 4 df,   p=7e-05
### c.“Recover” and graph the baseline hazard under the Cox model.

# Recover the baseline hazard
baseline_hazard <- basehaz(cox_model, centered = FALSE)

# Plot the baseline hazard
plot(baseline_hazard$time, baseline_hazard$hazard, type = "l", xlab = "Time", ylab = "Baseline Hazard", main = "Baseline Hazard under Cox Model")

### d.Estimate predicted survival curves or smoothed hazards (whichever you prefer or find more intuitive) for 


mhas$locsize01 <- as.factor(mhas$locsize01)

surv_men_0 <- survfit(cox_model, newdata = data.frame(female = 0, schooling = 0, locsize01 = factor("100,000 - +", levels = levels(mhas$locsize01))))
surv_women_0 <- survfit(cox_model, newdata = data.frame(female = 1, schooling = 0, locsize01 = factor("100,000 - +", levels = levels(mhas$locsize01)))) 
surv_men_6 <- survfit(cox_model, newdata = data.frame(female = 0, schooling = 6, locsize01 = factor("100,000 - +", levels = levels(mhas$locsize01)))) 
surv_women_6 <- survfit(cox_model, newdata = data.frame(female = 1, schooling = 6, locsize01 = factor("100,000 - +", levels = levels(mhas$locsize01)))) 


plot(surv_men_0, xlab = "Time", ylab = "Survival Probability", col = "blue", lwd = 2, main = "Survival Curve", conf.int = FALSE)

lines(surv_women_0, col = "red", lwd = 2, conf.int = FALSE)
lines(surv_men_6, col = "green", lwd = 2, conf.int = FALSE)
lines(surv_women_6, col = "orange", lwd = 2, conf.int = FALSE)

legend("bottomleft", 
       legend = c("male_0", "female_0", "male_6", "female_6"), 
       col = c("blue", "red", "green", "orange"), 
       lwd = 2)