1. Explore the data using descriptive statistics:

# import data set
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(PredictABEL)
library(dplyr)
library(tidyverse)
pbcData = read_csv("pbctrial.csv")
## Rows: 312 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): sex, agecat
## dbl (16): case, drug, bil, histo, death, survyr, _st, _d, _t, _t0, ageyr, ag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## exploring the data
summary(pbcData)
##       case             drug            sex                 bil        
##  Min.   :  1.00   Min.   :0.0000   Length:312         Min.   : 0.300  
##  1st Qu.: 78.75   1st Qu.:0.0000   Class :character   1st Qu.: 0.800  
##  Median :156.50   Median :1.0000   Mode  :character   Median : 1.350  
##  Mean   :156.50   Mean   :0.5064                      Mean   : 3.256  
##  3rd Qu.:234.25   3rd Qu.:1.0000                      3rd Qu.: 3.425  
##  Max.   :312.00   Max.   :1.0000                      Max.   :28.000  
##      histo           death            survyr             _st   
##  Min.   :1.000   Min.   :0.0000   Min.   : 0.1123   Min.   :1  
##  1st Qu.:2.000   1st Qu.:0.0000   1st Qu.: 3.2630   1st Qu.:1  
##  Median :3.000   Median :0.0000   Median : 5.0397   Median :1  
##  Mean   :3.032   Mean   :0.4006   Mean   : 5.4969   Mean   :1  
##  3rd Qu.:4.000   3rd Qu.:1.0000   3rd Qu.: 7.3897   3rd Qu.:1  
##  Max.   :4.000   Max.   :1.0000   Max.   :12.4822   Max.   :1  
##        _d               _t               _t0        ageyr      
##  Min.   :0.0000   Min.   : 0.1123   Min.   :0   Min.   :26.30  
##  1st Qu.:0.0000   1st Qu.: 3.2630   1st Qu.:0   1st Qu.:42.27  
##  Median :0.0000   Median : 5.0397   Median :0   Median :49.83  
##  Mean   :0.4006   Mean   : 5.4969   Mean   :0   Mean   :50.05  
##  3rd Qu.:1.0000   3rd Qu.: 7.3897   3rd Qu.:0   3rd Qu.:56.75  
##  Max.   :1.0000   Max.   :12.4822   Max.   :0   Max.   :78.49  
##     agecat             agegr_2          agegr_3          hstage2      
##  Length:312         Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000   Median :0.0000   Median :0.0000  
##                     Mean   :0.3237   Mean   :0.3365   Mean   :0.2147  
##                     3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     hstage3          hstage4      
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000  
##  Mean   :0.3846   Mean   :0.3494  
##  3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000
## table 1 (outcome = death) - stratified by exposure 
## age
pbcData %>%
  tabyl(drug, agecat) %>%
  adorn_totals(where="row")%>%
  adorn_totals(where="col")%>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits=1) %>%
  adorn_ns()%>%
  adorn_title()
##             agecat                                     
##   drug    < 45 yrs   >= 55 yrs 45 - 55 yrs        Total
##      0 37.7%  (58) 28.6%  (44) 33.8%  (52) 100.0% (154)
##      1 30.4%  (48) 38.6%  (61) 31.0%  (49) 100.0% (158)
##  Total 34.0% (106) 33.7% (105) 32.4% (101) 100.0% (312)
## sex
pbcData %>%
  tabyl(drug, sex) %>%
  adorn_totals(where="row")%>%
  adorn_totals(where="col")%>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits=1) %>%
  adorn_ns()%>%
  adorn_title()
##                sex                        
##   drug      Female       Male        Total
##      0 90.3% (139)  9.7% (15) 100.0% (154)
##      1 86.7% (137) 13.3% (21) 100.0% (158)
##  Total 88.5% (276) 11.5% (36) 100.0% (312)
## histo
pbcData %>%
  tabyl(drug, histo) %>%
  adorn_totals(where="row")%>%
  adorn_totals(where="col")%>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits=1) %>%
  adorn_ns()%>%
  adorn_title()
##            histo                                                
##   drug         1          2           3           4        Total
##      0 2.6%  (4) 20.8% (32) 41.6%  (64) 35.1%  (54) 100.0% (154)
##      1 7.6% (12) 22.2% (35) 35.4%  (56) 34.8%  (55) 100.0% (158)
##  Total 5.1% (16) 21.5% (67) 38.5% (120) 34.9% (109) 100.0% (312)
## bill
summary(subset(pbcData$bil, pbcData$drug == 1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.300   0.800   1.400   2.873   3.200  20.000
sd(subset(pbcData$bil, pbcData$drug == 1))
## [1] 3.628855
summary(subset(pbcData$bil, pbcData$drug == 0))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.300   0.725   1.300   3.649   3.600  28.000
sd(subset(pbcData$bil, pbcData$drug == 0))
## [1] 5.281949
### mission data
variables <- c("case", "drug", "sex", "bil", "histo", "survyr", "ageyr")

#missing value check
for (i in variables) {
  missing_count <- sum(is.na(pbcData[[i]]))
  cat("Missing values in", i, ":", missing_count, "\n")
}
## Missing values in case : 0 
## Missing values in drug : 0 
## Missing values in sex : 0 
## Missing values in bil : 0 
## Missing values in histo : 0 
## Missing values in survyr : 0 
## Missing values in ageyr : 0

2. Define a survival object, defining the time variable (survyr) and the event (death == 1)

## survival analysis
## Define a survival object, defining the time variable (survyr) and the event (death == 1).
#install.packages("survival")
library(survival)

pbcData$SurvObj = with(pbcData, Surv(survyr, death == 1))

## drug failure incidence rates
pbcData %>%
  group_by(drug) %>%
  summarize(failure = sum(death==1), time = sum(survyr), incidence.rate=failure/time)
## # A tibble: 2 × 4
##    drug failure  time incidence.rate
##   <dbl>   <int> <dbl>          <dbl>
## 1     0      60  843.         0.0712
## 2     1      65  873.         0.0745

3. Explore differences in time to death by different baseline variables using graphs and complementary log-log plots

# estimate survival curves for entire sample
km.overall = survfit(SurvObj ~ 1, data = pbcData,
                     type="kaplan-meier", conf.type="log-log")
km.overall
## Call: survfit(formula = SurvObj ~ 1, data = pbcData, type = "kaplan-meier", 
##     conf.type = "log-log")
## 
##        n events median 0.95LCL 0.95UCL
## [1,] 312    125    9.3    8.45    10.5
summary(km.overall)
## Call: survfit(formula = SurvObj ~ 1, data = pbcData, type = "kaplan-meier", 
##     conf.type = "log-log")
## 
##    time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   0.112    312       1    0.997 0.00320        0.977        1.000
##   0.140    311       1    0.994 0.00452        0.975        0.998
##   0.195    310       1    0.990 0.00552        0.970        0.997
##   0.211    309       1    0.987 0.00637        0.966        0.995
##   0.301    308       1    0.984 0.00711        0.962        0.993
##   0.356    307       1    0.981 0.00778        0.958        0.991
##   0.359    306       1    0.978 0.00838        0.954        0.989
##   0.384    305       1    0.974 0.00895        0.949        0.987
##   0.490    304       1    0.971 0.00948        0.945        0.985
##   0.510    303       1    0.968 0.00997        0.941        0.983
##   0.523    302       1    0.965 0.01044        0.937        0.980
##   0.542    301       1    0.962 0.01089        0.933        0.978
##   0.567    300       1    0.958 0.01131        0.929        0.976
##   0.592    299       1    0.955 0.01172        0.925        0.973
##   0.611    298       1    0.952 0.01211        0.922        0.971
##   0.723    297       2    0.946 0.01285        0.914        0.966
##   0.833    295       1    0.942 0.01320        0.910        0.963
##   0.879    294       1    0.939 0.01354        0.906        0.961
##   0.893    293       1    0.936 0.01387        0.902        0.958
##   0.915    292       1    0.933 0.01418        0.899        0.956
##   0.953    291       1    0.929 0.01449        0.895        0.953
##   1.063    290       1    0.926 0.01479        0.891        0.950
##   1.096    289       1    0.923 0.01509        0.887        0.948
##   1.260    288       1    0.920 0.01537        0.884        0.945
##   1.411    287       1    0.917 0.01565        0.880        0.942
##   1.504    285       1    0.913 0.01592        0.876        0.940
##   1.512    284       1    0.910 0.01619        0.873        0.937
##   1.636    283       1    0.907 0.01644        0.869        0.934
##   1.674    282       1    0.904 0.01670        0.865        0.932
##   1.844    281       1    0.901 0.01695        0.862        0.929
##   1.901    280       1    0.897 0.01719        0.858        0.926
##   1.940    279       1    0.894 0.01742        0.854        0.924
##   2.008    277       1    0.891 0.01766        0.851        0.921
##   2.055    275       1    0.888 0.01789        0.847        0.918
##   2.088    274       1    0.884 0.01811        0.843        0.915
##   2.107    273       1    0.881 0.01833        0.840        0.912
##   2.153    272       1    0.878 0.01855        0.836        0.910
##   2.164    270       1    0.875 0.01877        0.833        0.907
##   2.184    269       1    0.871 0.01898        0.829        0.904
##   2.189    268       1    0.868 0.01918        0.825        0.901
##   2.258    267       1    0.865 0.01938        0.822        0.898
##   2.329    264       1    0.862 0.01958        0.818        0.896
##   2.337    263       1    0.858 0.01978        0.814        0.893
##   2.353    262       1    0.855 0.01998        0.811        0.890
##   2.438    260       1    0.852 0.02017        0.807        0.887
##   2.477    258       1    0.849 0.02036        0.804        0.884
##   2.548    257       1    0.845 0.02055        0.800        0.881
##   2.584    255       1    0.842 0.02073        0.796        0.878
##   2.660    254       1    0.839 0.02091        0.793        0.875
##   2.668    253       1    0.835 0.02109        0.789        0.872
##   2.685    252       1    0.832 0.02127        0.785        0.869
##   2.737    250       1    0.829 0.02144        0.782        0.866
##   2.740    249       1    0.825 0.02161        0.778        0.863
##   2.773    248       1    0.822 0.02178        0.775        0.860
##   2.841    246       1    0.819 0.02194        0.771        0.857
##   2.951    244       1    0.815 0.02211        0.767        0.854
##   2.959    243       1    0.812 0.02227        0.764        0.851
##   2.967    242       1    0.809 0.02243        0.760        0.848
##   3.156    239       1    0.805 0.02259        0.756        0.845
##   3.192    237       1    0.802 0.02275        0.753        0.842
##   3.205    236       1    0.798 0.02291        0.749        0.839
##   3.263    235       2    0.792 0.02321        0.742        0.833
##   3.321    233       1    0.788 0.02336        0.738        0.830
##   3.334    230       1    0.785 0.02350        0.734        0.827
##   3.384    227       1    0.781 0.02365        0.731        0.824
##   3.553    222       1    0.778 0.02381        0.727        0.820
##   3.699    214       1    0.774 0.02397        0.723        0.817
##   3.715    213       1    0.771 0.02413        0.719        0.814
##   3.726    212       1    0.767 0.02429        0.715        0.811
##   3.871    206       1    0.763 0.02446        0.711        0.807
##   3.910    203       1    0.759 0.02462        0.707        0.804
##   3.929    201       1    0.756 0.02479        0.703        0.800
##   3.956    198       1    0.752 0.02496        0.699        0.797
##   4.074    193       1    0.748 0.02513        0.695        0.793
##   4.088    192       1    0.744 0.02530        0.690        0.790
##   4.208    189       1    0.740 0.02547        0.686        0.786
##   4.318    184       1    0.736 0.02565        0.682        0.783
##   4.540    178       1    0.732 0.02583        0.677        0.779
##   4.608    175       1    0.728 0.02602        0.673        0.775
##   4.630    174       2    0.719 0.02639        0.664        0.767
##   4.770    169       1    0.715 0.02657        0.659        0.764
##   4.893    162       1    0.711 0.02677        0.654        0.760
##   5.005    159       1    0.706 0.02697        0.650        0.755
##   5.060    156       1    0.702 0.02718        0.645        0.751
##   5.274    151       1    0.697 0.02739        0.640        0.747
##   5.630    141       1    0.692 0.02764        0.634        0.743
##   5.701    140       1    0.687 0.02788        0.629        0.738
##   5.726    139       1    0.682 0.02812        0.624        0.734
##   5.767    138       1    0.677 0.02834        0.618        0.729
##   6.093    127       1    0.672 0.02862        0.612        0.725
##   6.181    123       1    0.667 0.02890        0.606        0.720
##   6.268    121       1    0.661 0.02918        0.600        0.715
##   6.293    119       1    0.655 0.02946        0.594        0.710
##   6.537    110       1    0.649 0.02979        0.588        0.704
##   6.575    109       1    0.644 0.03011        0.581        0.699
##   6.627    108       1    0.638 0.03041        0.575        0.694
##   6.756    103       1    0.631 0.03074        0.568        0.688
##   6.858    100       1    0.625 0.03108        0.561        0.683
##   6.959     96       1    0.619 0.03143        0.554        0.677
##   7.077     88       1    0.612 0.03185        0.546        0.671
##   7.118     87       1    0.604 0.03225        0.538        0.664
##   7.367     80       1    0.597 0.03272        0.530        0.658
##   7.586     76       1    0.589 0.03322        0.521        0.651
##   7.660     74       1    0.581 0.03371        0.512        0.644
##   7.800     71       1    0.573 0.03421        0.503        0.637
##   8.455     60       1    0.563 0.03495        0.492        0.629
##   8.466     59       1    0.554 0.03564        0.481        0.620
##   8.685     53       1    0.543 0.03646        0.469        0.612
##   8.827     52       1    0.533 0.03723        0.457        0.603
##   8.888     50       1    0.522 0.03798        0.445        0.594
##   8.992     48       1    0.511 0.03872        0.433        0.584
##   9.200     45       1    0.500 0.03949        0.420        0.574
##   9.301     43       1    0.488 0.04025        0.407        0.564
##   9.392     41       1    0.476 0.04099        0.394        0.554
##   9.438     40       1    0.465 0.04166        0.381        0.544
##   9.792     37       1    0.452 0.04238        0.368        0.533
##   9.819     34       1    0.439 0.04317        0.353        0.521
##  10.307     30       1    0.424 0.04414        0.337        0.509
##  10.518     27       1    0.408 0.04522        0.319        0.495
##  10.556     25       1    0.392 0.04626        0.302        0.481
##  11.175     17       1    0.369 0.04895        0.274        0.464
##  11.482     13       1    0.341 0.05278        0.240        0.444
# plot km curves
plot(km.overall)

# for drug group
# distribution
Freq <- table(pbcData$drug, useNA="ifany")
Percentage <- round((prop.table(Freq)*100),2)
cbind(Freq,Percentage)
##   Freq Percentage
## 0  154      49.36
## 1  158      50.64
# estimate survival curves 
km.drug = survfit(SurvObj ~ drug, data = pbcData,
                  type="kaplan-meier", conf.type="log-log")
plot(km.drug, main="Kaplan-Meier survival estimates", xlab="Time(years)", ylab="S(t)", col=c("blue", "red"))
legend("topright", legend=c("drug = 0", "drug = 1"), col=c("blue", "red"), lwd=1)

km.drug
## Call: survfit(formula = SurvObj ~ drug, data = pbcData, type = "kaplan-meier", 
##     conf.type = "log-log")
## 
##          n events median 0.95LCL 0.95UCL
## drug=0 154     60   9.39    8.47    10.6
## drug=1 158     65   8.99    6.96    11.5
summary(km.drug)
## Call: survfit(formula = SurvObj ~ drug, data = pbcData, type = "kaplan-meier", 
##     conf.type = "log-log")
## 
##                 drug=0 
##    time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   0.140    154       1    0.994 0.00647        0.955        0.999
##   0.211    153       1    0.987 0.00912        0.949        0.997
##   0.301    152       1    0.981 0.01114        0.941        0.994
##   0.356    151       1    0.974 0.01282        0.932        0.990
##   0.510    150       1    0.968 0.01428        0.924        0.986
##   0.523    149       1    0.961 0.01559        0.915        0.982
##   0.567    148       1    0.955 0.01679        0.907        0.978
##   0.592    147       1    0.948 0.01788        0.899        0.974
##   0.723    146       2    0.935 0.01986        0.883        0.965
##   0.833    144       1    0.929 0.02075        0.875        0.960
##   0.879    143       1    0.922 0.02160        0.867        0.955
##   0.893    142       1    0.916 0.02240        0.859        0.950
##   1.260    141       1    0.909 0.02317        0.851        0.945
##   1.504    140       1    0.903 0.02389        0.844        0.940
##   1.512    139       1    0.896 0.02459        0.836        0.935
##   1.636    138       1    0.890 0.02525        0.828        0.930
##   1.674    137       1    0.883 0.02589        0.821        0.925
##   1.940    136       1    0.877 0.02650        0.813        0.919
##   2.008    135       1    0.870 0.02709        0.806        0.914
##   2.107    134       1    0.864 0.02765        0.799        0.909
##   2.153    133       1    0.857 0.02820        0.791        0.904
##   2.164    131       1    0.851 0.02873        0.784        0.898
##   2.184    130       1    0.844 0.02925        0.776        0.893
##   2.329    128       1    0.837 0.02975        0.769        0.887
##   2.337    127       1    0.831 0.03024        0.762        0.882
##   2.353    126       1    0.824 0.03071        0.754        0.876
##   2.438    125       1    0.818 0.03116        0.747        0.870
##   2.548    124       1    0.811 0.03160        0.740        0.865
##   2.584    123       1    0.804 0.03203        0.732        0.859
##   2.668    122       1    0.798 0.03244        0.725        0.853
##   2.959    118       1    0.791 0.03286        0.718        0.847
##   3.192    115       1    0.784 0.03328        0.710        0.841
##   3.321    114       1    0.777 0.03370        0.703        0.836
##   3.334    111       1    0.770 0.03411        0.695        0.829
##   3.715    103       1    0.763 0.03459        0.687        0.823
##   3.871    101       1    0.755 0.03506        0.678        0.816
##   3.910     98       1    0.748 0.03554        0.670        0.810
##   3.956     95       1    0.740 0.03603        0.661        0.803
##   4.074     93       1    0.732 0.03651        0.652        0.796
##   4.208     91       1    0.724 0.03698        0.644        0.789
##   4.893     79       1    0.715 0.03763        0.633        0.781
##   5.060     76       1    0.705 0.03829        0.623        0.773
##   5.726     69       1    0.695 0.03908        0.611        0.764
##   6.627     56       1    0.683 0.04030        0.596        0.754
##   6.756     53       1    0.670 0.04155        0.581        0.744
##   6.858     51       1    0.657 0.04276        0.566        0.733
##   7.586     40       1    0.640 0.04473        0.545        0.720
##   7.660     38       1    0.623 0.04662        0.525        0.707
##   7.800     35       1    0.605 0.04857        0.503        0.693
##   8.466     32       1    0.587 0.05060        0.481        0.678
##   8.685     29       1    0.566 0.05275        0.457        0.662
##   8.888     28       1    0.546 0.05460        0.433        0.646
##   9.200     26       1    0.525 0.05640        0.409        0.628
##   9.301     24       1    0.503 0.05814        0.385        0.610
##   9.392     22       1    0.480 0.05983        0.360        0.591
##   9.438     21       1    0.457 0.06119        0.335        0.572
##  10.307     15       1    0.427 0.06427        0.300        0.548
##  10.518     13       1    0.394 0.06719        0.264        0.522
##  10.556     12       1    0.361 0.06916        0.230        0.494
## 
##                 drug=1 
##    time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   0.112    158       1    0.994 0.00631        0.956        0.999
##   0.195    157       1    0.987 0.00889        0.950        0.997
##   0.359    156       1    0.981 0.01086        0.942        0.994
##   0.384    155       1    0.975 0.01250        0.934        0.990
##   0.490    154       1    0.968 0.01393        0.926        0.987
##   0.542    153       1    0.962 0.01521        0.917        0.983
##   0.611    152       1    0.956 0.01637        0.909        0.979
##   0.915    151       1    0.949 0.01744        0.901        0.974
##   0.953    150       1    0.943 0.01844        0.893        0.970
##   1.063    149       1    0.937 0.01937        0.886        0.965
##   1.096    148       1    0.930 0.02025        0.878        0.961
##   1.411    147       1    0.924 0.02108        0.870        0.956
##   1.844    145       1    0.918 0.02187        0.862        0.951
##   1.901    144       1    0.911 0.02263        0.855        0.946
##   2.055    141       1    0.905 0.02337        0.847        0.942
##   2.088    140       1    0.898 0.02408        0.839        0.936
##   2.189    139       1    0.892 0.02476        0.832        0.931
##   2.258    138       1    0.885 0.02541        0.824        0.926
##   2.477    134       1    0.879 0.02607        0.817        0.921
##   2.660    132       1    0.872 0.02671        0.809        0.916
##   2.685    131       1    0.866 0.02732        0.801        0.910
##   2.737    130       1    0.859 0.02791        0.794        0.905
##   2.740    129       1    0.852 0.02848        0.786        0.899
##   2.773    128       1    0.846 0.02902        0.778        0.894
##   2.841    127       1    0.839 0.02955        0.771        0.888
##   2.951    126       1    0.832 0.03005        0.763        0.883
##   2.967    125       1    0.826 0.03054        0.756        0.877
##   3.156    124       1    0.819 0.03101        0.749        0.871
##   3.205    122       1    0.812 0.03148        0.741        0.866
##   3.263    121       2    0.799 0.03236        0.726        0.854
##   3.384    117       1    0.792 0.03279        0.719        0.848
##   3.553    114       1    0.785 0.03323        0.711        0.842
##   3.699    111       1    0.778 0.03368        0.703        0.836
##   3.726    110       1    0.771 0.03411        0.695        0.830
##   3.929    105       1    0.764 0.03456        0.687        0.823
##   4.088    100       1    0.756 0.03505        0.679        0.817
##   4.318     97       1    0.748 0.03554        0.670        0.810
##   4.540     93       1    0.740 0.03606        0.661        0.803
##   4.608     92       1    0.732 0.03655        0.653        0.796
##   4.630     91       2    0.716 0.03748        0.635        0.782
##   4.770     87       1    0.708 0.03794        0.626        0.775
##   5.005     82       1    0.699 0.03845        0.616        0.767
##   5.274     78       1    0.690 0.03899        0.607        0.759
##   5.630     72       1    0.681 0.03960        0.596        0.751
##   5.701     71       1    0.671 0.04019        0.585        0.743
##   5.767     70       1    0.661 0.04074        0.575        0.734
##   6.093     65       1    0.651 0.04137        0.564        0.725
##   6.181     63       1    0.641 0.04198        0.552        0.716
##   6.268     61       1    0.630 0.04259        0.541        0.707
##   6.293     60       1    0.620 0.04315        0.529        0.698
##   6.537     54       1    0.608 0.04385        0.517        0.688
##   6.575     53       1    0.597 0.04450        0.504        0.678
##   6.959     47       1    0.584 0.04533        0.490        0.667
##   7.077     42       1    0.570 0.04634        0.474        0.655
##   7.118     41       1    0.556 0.04725        0.459        0.643
##   7.367     38       1    0.542 0.04822        0.443        0.631
##   8.455     28       1    0.522 0.05023        0.420        0.615
##   8.827     24       1    0.501 0.05264        0.394        0.598
##   8.992     22       1    0.478 0.05495        0.367        0.580
##   9.792     18       1    0.451 0.05795        0.336        0.560
##   9.819     17       1    0.425 0.06032        0.306        0.539
##  11.175      8       1    0.372 0.07247        0.233        0.510
##  11.482      7       1    0.319 0.07922        0.173        0.474
#install.packages("survminer")
library(survminer)
## Loading required package: ggpubr
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
ggsurvplot(km.drug, risk.table=TRUE)

# log rank test for equality of survivor functions
survdiff(SurvObj ~ drug, data=pbcData)
## Call:
## survdiff(formula = SurvObj ~ drug, data = pbcData)
## 
##          N Observed Expected (O-E)^2/E (O-E)^2/V
## drug=0 154       60     61.8    0.0513     0.102
## drug=1 158       65     63.2    0.0502     0.102
## 
##  Chisq= 0.1  on 1 degrees of freedom, p= 0.7
# complimentary log-log plot - CLL plot
plot(km.drug, fun="cloglog", main="Complimentary Log-log Plot", ylab="log(-log(Survival Probability)",
     xlab="Analysis time (shown on log scale)", col=c("blue", "red"))
legend("topleft", legend=c("drug = 0", "drug = 1"), col=c("blue", "red"), lwd=1)

# for age group - categories
# distribution
Freq <- table(pbcData$agecat, useNA="ifany")
Percentage <- round((prop.table(Freq)*100),2)
cbind(Freq,Percentage)
##             Freq Percentage
## < 45 yrs     106      33.97
## >= 55 yrs    105      33.65
## 45 - 55 yrs  101      32.37
# estimate survival curves 
km.agecat = survfit(SurvObj ~ agecat, data = pbcData,
                  type="kaplan-meier", conf.type="log-log")
plot(km.agecat, main="Kaplan-Meier survival estimates", xlab="Time(years)", ylab="S(t)", col=c("blue", "red", "green"))
legend("topright", legend=c("< 45 yrs", "45 - 55 yrs", ">= 55 yrs"), col=c("blue", "red", "green"), lwd=1)

# complimentary log-log plot - CLL plot
plot(km.agecat, fun="cloglog", main="Complimentary Log-log Plot", ylab="log(-log(Survival Probability)",
     xlab="Analysis time (shown on log scale)", col=c("blue", "red", "green"))
legend("topleft", legend=c("< 45 yrs", "45 - 55 yrs", ">= 55 yrs"), col=c("blue", "red", "green"), lwd=1)

# log rank test for equality of survivor functions
survdiff(SurvObj ~ agecat, data=pbcData)
## Call:
## survdiff(formula = SurvObj ~ agecat, data = pbcData)
## 
##                      N Observed Expected (O-E)^2/E (O-E)^2/V
## agecat=< 45 yrs    106       27     45.0     7.171     11.22
## agecat=>= 55 yrs   105       50     37.4     4.259      6.11
## agecat=45 - 55 yrs 101       48     42.7     0.668      1.02
## 
##  Chisq= 12.1  on 2 degrees of freedom, p= 0.002
## sex
# distribution
Freq <- table(pbcData$sex, useNA="ifany")
Percentage <- round((prop.table(Freq)*100),2)
cbind(Freq,Percentage)
##        Freq Percentage
## Female  276      88.46
## Male     36      11.54
# estimate survival curves 
km.sex = survfit(SurvObj ~ sex, data = pbcData,
                  type="kaplan-meier", conf.type="log-log")
plot(km.sex, main="Kaplan-Meier survival estimates", xlab="Time(years)", ylab="S(t)", col=c("blue", "red"))
legend("topright", legend=c("Female", "Male"), col=c("blue", "red"), lwd=1)

# complimentary log-log plot - CLL plot
plot(km.sex, fun="cloglog", main="Complimentary Log-log Plot", ylab="log(-log(Survival Probability)",
     xlab="Analysis time (shown on log scale)", col=c("blue", "red"))
legend("topleft", legend=c("Female", "Male"), col=c("blue", "red"), lwd=1)

# log rank test for equality of survivor functions
survdiff(SurvObj ~ sex, data=pbcData)
## Call:
## survdiff(formula = SurvObj ~ sex, data = pbcData)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=Female 276      103    110.4     0.494      4.27
## sex=Male    36       22     14.6     3.728      4.27
## 
##  Chisq= 4.3  on 1 degrees of freedom, p= 0.04
## histo
# distribution
Freq <- table(pbcData$histo, useNA="ifany")
Percentage <- round((prop.table(Freq)*100),2)
cbind(Freq,Percentage)
##   Freq Percentage
## 1   16       5.13
## 2   67      21.47
## 3  120      38.46
## 4  109      34.94
# estimate survival curves 
km.histo = survfit(SurvObj ~ histo, data = pbcData,
                 type="kaplan-meier", conf.type="log-log")
plot(km.histo, main="Kaplan-Meier survival estimates", xlab="Time(years)", ylab="S(t)", col=c("blue", "red", "green", "pink"))
legend("topright", legend=c("Stage 1", "Stage 2", "Stage 3", "Stage 4"), col=c("blue", "red", "green", "pink"), lwd=1)

# complimentary log-log plot - CLL plot
plot(km.histo, fun="cloglog", main="Complimentary Log-log Plot", ylab="log(-log(Survival Probability)",
     xlab="Analysis time (shown on log scale)", col=c("blue", "red", "green", "pink"))
legend("topleft", legend=c("Stage 1", "Stage 2", "Stage 3", "Stage 4"), col=c("blue", "red", "green", "pink"), lwd=1)

# log rank test for equality of survivor functions
survdiff(SurvObj ~ histo, data=pbcData)
## Call:
## survdiff(formula = SurvObj ~ histo, data = pbcData)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## histo=1  16        1      9.9      8.00      8.78
## histo=2  67       16     32.3      8.22     11.17
## histo=3 120       43     51.2      1.30      2.22
## histo=4 109       65     31.6     35.17     47.84
## 
##  Chisq= 53.8  on 3 degrees of freedom, p= 1e-11
## bil
summary(pbcData$bil)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.300   0.800   1.350   3.256   3.425  28.000
sd(pbcData$bil)
## [1] 4.530315
# log rank test for equality of survivor functions
survdiff(SurvObj ~ bil, data=pbcData)
## Call:
## survdiff(formula = SurvObj ~ bil, data = pbcData)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## bil=0.30000001  3        2  1.64469  7.68e-02  7.79e-02
## bil=0.40000001  8        1  3.62971  1.91e+00  1.96e+00
## bil=0.5        20        0  8.54246  8.54e+00  9.22e+00
## bil=0.60000002 22        1 11.38216  9.47e+00  1.05e+01
## bil=0.69999999 20        2 12.89572  9.21e+00  1.05e+01
## bil=0.80000001 17        8  8.05251  3.42e-04  3.68e-04
## bil=0.89999998 14        2  7.24020  3.79e+00  4.04e+00
## bil=1          12        1  5.11018  3.31e+00  3.45e+00
## bil=1.1        17        5  7.90057  1.06e+00  1.14e+00
## bil=1.2         9        2  4.74131  1.58e+00  1.65e+00
## bil=1.3        14        4  6.67194  1.07e+00  1.13e+00
## bil=1.4         7        4  3.35475  1.24e-01  1.28e-01
## bil=1.5         3        1  1.34333  8.77e-02  8.89e-02
## bil=1.6         5        1  1.80598  3.60e-01  3.66e-01
## bil=1.7         1        0  0.16783  1.68e-01  1.68e-01
## bil=1.8         5        2  3.02130  3.45e-01  3.56e-01
## bil=1.9         4        1  2.34961  7.75e-01  7.93e-01
## bil=2           6        4  1.91347  2.28e+00  2.32e+00
## bil=2.0999999   7        4  4.24525  1.42e-02  1.48e-02
## bil=2.2         2        0  0.55111  5.51e-01  5.55e-01
## bil=2.3         6        4  2.17507  1.53e+00  1.56e+00
## bil=2.4000001   3        1  0.42842  7.63e-01  7.68e-01
## bil=2.5         3        3  0.69072  7.72e+00  7.78e+00
## bil=2.5999999   1        1  0.19163  3.41e+00  3.42e+00
## bil=2.7         1        1  0.12624  6.05e+00  6.07e+00
## bil=2.8         2        1  0.46000  6.34e-01  6.38e-01
## bil=2.9000001   2        1  0.36743  1.09e+00  1.10e+00
## bil=3           3        1  1.29212  6.60e-02  6.69e-02
## bil=3.0999999   2        0  0.42107  4.21e-01  4.24e-01
## bil=3.2         7        5  2.07074  4.14e+00  4.22e+00
## bil=3.3         3        2  1.74440  3.75e-02  3.81e-02
## bil=3.4000001   5        2  1.37899  2.80e-01  2.84e-01
## bil=3.5         3        1  0.98505  2.27e-04  2.29e-04
## bil=3.5999999   3        2  0.38689  6.73e+00  6.77e+00
## bil=3.7         1        0  0.42136  4.21e-01  4.24e-01
## bil=3.8         1        1  0.24627  2.31e+00  2.32e+00
## bil=3.9000001   2        2  0.49255  4.61e+00  4.64e+00
## bil=4           2        2  0.34489  7.94e+00  7.99e+00
## bil=4.4000001   1        0  0.14481  1.45e-01  1.45e-01
## bil=4.5         3        3  0.95998  4.34e+00  4.39e+00
## bil=4.6999998   3        1  1.78973  3.48e-01  3.54e-01
## bil=5           2        2  0.39477  6.53e+00  6.56e+00
## bil=5.0999999   2        2  0.43583  5.61e+00  5.65e+00
## bil=5.1999998   2        2  0.32715  8.55e+00  8.60e+00
## bil=5.5         1        0  0.42136  4.21e-01  4.24e-01
## bil=5.5999999   2        0  0.57549  5.75e-01  5.80e-01
## bil=5.6999998   2        1  0.91907  7.13e-03  7.20e-03
## bil=5.9000001   1        1  0.20794  3.02e+00  3.03e+00
## bil=6           1        1  0.43045  7.54e-01  7.57e-01
## bil=6.0999999   1        0  0.11168  1.12e-01  1.12e-01
## bil=6.3000002   1        1  0.32860  1.37e+00  1.38e+00
## bil=6.4000001   3        1  0.60500  2.58e-01  2.60e-01
## bil=6.5         1        1  0.21207  2.93e+00  2.94e+00
## bil=6.5999999   3        2  0.34612  7.90e+00  7.95e+00
## bil=6.6999998   1        1  0.18361  3.63e+00  3.64e+00
## bil=6.8000002   1        1  0.25544  2.17e+00  2.18e+00
## bil=7.0999999   2        1  0.38371  9.90e-01  9.95e-01
## bil=7.1999998   2        2  0.43470  5.64e+00  5.67e+00
## bil=7.3000002   1        1  0.25077  2.24e+00  2.25e+00
## bil=8           1        1  0.16783  4.13e+00  4.14e+00
## bil=8.3999996   1        1  0.34698  1.23e+00  1.23e+00
## bil=8.5         2        1  0.30670  1.57e+00  1.58e+00
## bil=8.6000004   1        0  0.25077  2.51e-01  2.52e-01
## bil=8.6999998   1        0  0.19972  2.00e-01  2.00e-01
## bil=10.8        1        1  0.13734  5.42e+00  5.44e+00
## bil=11          1        1  0.17964  3.75e+00  3.76e+00
## bil=11.4        2        2  0.15627  2.18e+01  2.19e+01
## bil=12.2        1        1  0.00965  1.02e+02  1.02e+02
## bil=12.6        1        1  0.00642  1.54e+02  1.54e+02
## bil=13          1        0  0.35339  3.53e-01  3.55e-01
## bil=14          1        1  0.11529  6.79e+00  6.81e+00
## bil=14.1        1        1  0.06955  1.24e+01  1.25e+01
## bil=14.4        1        1  0.47901  5.67e-01  5.70e-01
## bil=14.5        1        1  0.07990  1.06e+01  1.06e+01
## bil=16.200001   1        1  0.19972  3.21e+00  3.22e+00
## bil=17.1        1        1  0.23322  2.52e+00  2.53e+00
## bil=17.200001   1        1  0.09036  9.16e+00  9.19e+00
## bil=17.4        3        3  0.34502  2.04e+01  2.05e+01
## bil=17.9        1        1  0.00321  3.10e+02  3.11e+02
## bil=20          1        1  0.22471  2.67e+00  2.68e+00
## bil=21.6        1        1  0.01288  7.56e+01  7.59e+01
## bil=22.5        1        1  0.15622  4.56e+00  4.57e+00
## bil=24.5        1        1  0.04583  1.99e+01  1.99e+01
## bil=25.5        1        1  0.15240  4.71e+00  4.73e+00
## bil=28          1        1  0.17175  3.99e+00  4.01e+00
## 
##  Chisq= 963  on 84 degrees of freedom, p= <2e-16

4. Fit several Cox proportional hazards regression models to the ungrouped survival data

## Fit several Cox proportional hazards regression models to the ungrouped survival data
model1 = coxph(SurvObj ~ drug, data = pbcData)
summary(model1)
## Call:
## coxph(formula = SurvObj ~ drug, data = pbcData)
## 
##   n= 312, number of events= 125 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)
## drug 0.05722   1.05889  0.17916 0.319    0.749
## 
##      exp(coef) exp(-coef) lower .95 upper .95
## drug     1.059     0.9444    0.7453     1.504
## 
## Concordance= 0.499  (se = 0.025 )
## Likelihood ratio test= 0.1  on 1 df,   p=0.7
## Wald test            = 0.1  on 1 df,   p=0.7
## Score (logrank) test = 0.1  on 1 df,   p=0.7
model2 = coxph(SurvObj ~ drug + sex + bil + as.factor(histo), data = pbcData) 
summary(model2)
## Call:
## coxph(formula = SurvObj ~ drug + sex + bil + as.factor(histo), 
##     data = pbcData)
## 
##   n= 312, number of events= 125 
## 
##                       coef exp(coef) se(coef)      z Pr(>|z|)    
## drug               0.15742   1.17049  0.18168  0.866  0.38623    
## sexMale            0.64217   1.90061  0.23929  2.684  0.00728 ** 
## bil                0.15231   1.16452  0.01434 10.623  < 2e-16 ***
## as.factor(histo)2  1.67167   5.32107  1.03412  1.617  0.10598    
## as.factor(histo)3  2.06631   7.89564  1.01691  2.032  0.04216 *  
## as.factor(histo)4  2.92749  18.68059  1.01247  2.891  0.00384 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## drug                  1.170    0.85434    0.8198     1.671
## sexMale               1.901    0.52615    1.1891     3.038
## bil                   1.165    0.85872    1.1323     1.198
## as.factor(histo)2     5.321    0.18793    0.7011    40.387
## as.factor(histo)3     7.896    0.12665    1.0760    57.940
## as.factor(histo)4    18.681    0.05353    2.5679   135.897
## 
## Concordance= 0.814  (se = 0.016 )
## Likelihood ratio test= 133.9  on 6 df,   p=<2e-16
## Wald test            = 150.5  on 6 df,   p=<2e-16
## Score (logrank) test = 219.6  on 6 df,   p=<2e-16
model3 = coxph(SurvObj ~ drug + sex + bil + agecat + as.factor(histo), data = pbcData) 
summary(model3)
## Call:
## coxph(formula = SurvObj ~ drug + sex + bil + agecat + as.factor(histo), 
##     data = pbcData)
## 
##   n= 312, number of events= 125 
## 
##                       coef exp(coef) se(coef)      z Pr(>|z|)    
## drug               0.11006   1.11635  0.18358  0.600  0.54880    
## sexMale            0.53702   1.71091  0.24314  2.209  0.02720 *  
## bil                0.15091   1.16289  0.01411 10.692  < 2e-16 ***
## agecat>= 55 yrs    0.53710   1.71104  0.24820  2.164  0.03047 *  
## agecat45 - 55 yrs  0.39301   1.48143  0.24653  1.594  0.11090    
## as.factor(histo)2  1.51412   4.54543  1.03626  1.461  0.14398    
## as.factor(histo)3  1.89441   6.64863  1.01945  1.858  0.06313 .  
## as.factor(histo)4  2.70987  15.02735  1.01536  2.669  0.00761 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                   exp(coef) exp(-coef) lower .95 upper .95
## drug                  1.116    0.89578    0.7790     1.600
## sexMale               1.711    0.58449    1.0623     2.755
## bil                   1.163    0.85992    1.1312     1.196
## agecat>= 55 yrs       1.711    0.58444    1.0519     2.783
## agecat45 - 55 yrs     1.481    0.67502    0.9138     2.402
## as.factor(histo)2     4.545    0.22000    0.5964    34.645
## as.factor(histo)3     6.649    0.15041    0.9015    49.033
## as.factor(histo)4    15.027    0.06655    2.0540   109.940
## 
## Concordance= 0.82  (se = 0.017 )
## Likelihood ratio test= 139  on 8 df,   p=<2e-16
## Wald test            = 157.8  on 8 df,   p=<2e-16
## Score (logrank) test = 230.4  on 8 df,   p=<2e-16
##calculate aic 
aic_1 <- AIC(model1)
aic_1
## [1] 1281.831
aic_2 <- AIC(model2)
aic_2
## [1] 1157.989
aic_3 <- AIC(model3)
aic_3
## [1] 1156.934