## Warning: package 'carData' was built under R version 4.4.1

Exploratory Data Analysis

### checking the names of the variables suing the names function
names(Rossi)
##  [1] "week"   "arrest" "fin"    "age"    "race"   "wexp"   "mar"    "paro"  
##  [9] "prio"   "educ"   "emp1"   "emp2"   "emp3"   "emp4"   "emp5"   "emp6"  
## [17] "emp7"   "emp8"   "emp9"   "emp10"  "emp11"  "emp12"  "emp13"  "emp14" 
## [25] "emp15"  "emp16"  "emp17"  "emp18"  "emp19"  "emp20"  "emp21"  "emp22" 
## [33] "emp23"  "emp24"  "emp25"  "emp26"  "emp27"  "emp28"  "emp29"  "emp30" 
## [41] "emp31"  "emp32"  "emp33"  "emp34"  "emp35"  "emp36"  "emp37"  "emp38" 
## [49] "emp39"  "emp40"  "emp41"  "emp42"  "emp43"  "emp44"  "emp45"  "emp46" 
## [57] "emp47"  "emp48"  "emp49"  "emp50"  "emp51"  "emp52"
## then dim function tells you the dimension of the dataset
## here we have 62 variables and 432 rows
dim(Rossi)
## [1] 432  62
### the head functions tells you the first 10 data. Here we have asked for the first 6 
head(Rossi, 3)
##   week arrest fin age  race wexp         mar paro prio educ emp1 emp2 emp3 emp4
## 1   20      1  no  27 black   no not married  yes    3    3   no   no   no   no
## 2   17      1  no  18 black   no not married  yes    8    4   no   no   no   no
## 3   25      1  no  19 other  yes not married  yes   13    3   no   no   no   no
##   emp5 emp6 emp7 emp8 emp9 emp10 emp11 emp12 emp13 emp14 emp15 emp16 emp17
## 1   no   no   no   no   no    no    no    no    no    no    no    no    no
## 2   no   no   no   no   no   yes   yes   yes   yes   yes    no    no    no
## 3   no   no   no   no   no    no    no    no    no    no    no    no   yes
##   emp18 emp19 emp20 emp21 emp22 emp23 emp24 emp25 emp26 emp27 emp28 emp29 emp30
## 1    no    no    no  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 2  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 3    no    no    no    no    no    no    no    no  <NA>  <NA>  <NA>  <NA>  <NA>
##   emp31 emp32 emp33 emp34 emp35 emp36 emp37 emp38 emp39 emp40 emp41 emp42 emp43
## 1  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 2  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 3  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
##   emp44 emp45 emp46 emp47 emp48 emp49 emp50 emp51 emp52
## 1  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 2  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
## 3  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
##the table function looks at the numbers of each ... in a variable

### we have 379 black and 53 other
table(Rossi$race)
## 
## black other 
##   379    53
table(Rossi$mar)
## 
##     married not married 
##          53         379
## checking the distribution of age
hist(Rossi$age)

### Change the name to survival_data

survival_data <- Rossi

## race
survival_data$race <- ifelse(survival_data$race == "other", 0, 1)

table(survival_data$race)
## 
##   0   1 
##  53 379
### marriage
survival_data$mar <- ifelse(survival_data$mar == "not married", 0, 1)
table(survival_data$mar)
## 
##   0   1 
## 379  53
### wexp

survival_data$wexp <- ifelse(survival_data$wexp == "no", 0, 1)
table(survival_data$wexp)
## 
##   0   1 
## 185 247
### paro
survival_data$paro <- ifelse(survival_data$paro == "no", 0, 1)

table(survival_data$paro)
## 
##   0   1 
## 165 267
### age

survival_data$age <- ifelse(survival_data$age <= mean(survival_data$age), 0, 1)

table(survival_data$age)
## 
##   0   1 
## 278 154

Bar Chart of education by age.

It appears that most age group falls in the level 3 and 4 education

#### education 
table(survival_data$educ)
## 
##   2   3   4   5   6 
##  24 239 119  39  11
ggplot(survival_data, aes(x=educ, y=age, fill = educ)) +
  geom_bar(stat="identity")

Kaplan-Meier estimator

### Building the Kaplan-Meier estimator


fit_survival_data <- survfit(Surv(week, arrest) ~ 1, data = survival_data)



### summary
summary(fit_survival_data)
## Call: survfit(formula = Surv(week, arrest) ~ 1, data = survival_data)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    432       1    0.998 0.00231        0.993        1.000
##     2    431       1    0.995 0.00327        0.989        1.000
##     3    430       1    0.993 0.00400        0.985        1.000
##     4    429       1    0.991 0.00461        0.982        1.000
##     5    428       1    0.988 0.00515        0.978        0.999
##     6    427       1    0.986 0.00563        0.975        0.997
##     7    426       1    0.984 0.00607        0.972        0.996
##     8    425       5    0.972 0.00791        0.957        0.988
##     9    420       2    0.968 0.00852        0.951        0.984
##    10    418       1    0.965 0.00881        0.948        0.983
##    11    417       2    0.961 0.00935        0.942        0.979
##    12    415       2    0.956 0.00987        0.937        0.976
##    13    413       1    0.954 0.01011        0.934        0.974
##    14    412       3    0.947 0.01080        0.926        0.968
##    15    409       2    0.942 0.01123        0.920        0.964
##    16    407       2    0.937 0.01165        0.915        0.961
##    17    405       3    0.931 0.01223        0.907        0.955
##    18    402       3    0.924 0.01278        0.899        0.949
##    19    399       2    0.919 0.01313        0.894        0.945
##    20    397       5    0.907 0.01395        0.880        0.935
##    21    392       2    0.903 0.01425        0.875        0.931
##    22    390       1    0.900 0.01440        0.873        0.929
##    23    389       1    0.898 0.01455        0.870        0.927
##    24    388       4    0.889 0.01512        0.860        0.919
##    25    384       3    0.882 0.01552        0.852        0.913
##    26    381       3    0.875 0.01591        0.844        0.907
##    27    378       2    0.870 0.01616        0.839        0.903
##    28    376       2    0.866 0.01640        0.834        0.898
##    30    374       2    0.861 0.01664        0.829        0.894
##    31    372       1    0.859 0.01675        0.827        0.892
##    32    371       2    0.854 0.01698        0.822        0.888
##    33    369       2    0.850 0.01720        0.816        0.884
##    34    367       2    0.845 0.01742        0.811        0.880
##    35    365       4    0.836 0.01783        0.801        0.871
##    36    361       3    0.829 0.01813        0.794        0.865
##    37    358       4    0.819 0.01851        0.784        0.857
##    38    354       1    0.817 0.01860        0.781        0.854
##    39    353       2    0.812 0.01878        0.777        0.850
##    40    351       4    0.803 0.01913        0.767        0.842
##    42    347       2    0.799 0.01929        0.762        0.837
##    43    345       4    0.789 0.01962        0.752        0.829
##    44    341       2    0.785 0.01977        0.747        0.824
##    45    339       2    0.780 0.01993        0.742        0.820
##    46    337       4    0.771 0.02022        0.732        0.812
##    47    333       1    0.769 0.02029        0.730        0.809
##    48    332       2    0.764 0.02043        0.725        0.805
##    49    330       5    0.752 0.02077        0.713        0.794
##    50    325       3    0.745 0.02096        0.705        0.788
##    52    322       4    0.736 0.02121        0.696        0.779

Financial Aid survival function

financial_fit <- survfit(Surv(week, arrest) ~ fin, data = survival_data)

#summary(financial_fit)


### plot for financial aid

ggsurvplot(financial_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Financial",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("No Financial Aid", "Financial Aid"), data =survival_data, pval = TRUE)

Kaplan-Meier estimator for age

age_fit <- survfit(Surv(week, arrest) ~ age, data = survival_data)
summary(age_fit)
## Call: survfit(formula = Surv(week, arrest) ~ age, data = survival_data)
## 
##                 age=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1    278       1    0.996 0.00359        0.989        1.000
##     4    277       1    0.993 0.00507        0.983        1.000
##     5    276       1    0.989 0.00620        0.977        1.000
##     6    275       1    0.986 0.00714        0.972        1.000
##     7    274       1    0.982 0.00797        0.967        0.998
##     8    273       3    0.971 0.01003        0.952        0.991
##    10    270       1    0.968 0.01062        0.947        0.989
##    11    269       2    0.960 0.01169        0.938        0.984
##    12    267       1    0.957 0.01219        0.933        0.981
##    13    266       1    0.953 0.01266        0.929        0.978
##    14    265       3    0.942 0.01397        0.915        0.970
##    15    262       2    0.935 0.01476        0.907        0.965
##    17    260       3    0.924 0.01585        0.894        0.956
##    18    257       3    0.914 0.01684        0.881        0.947
##    19    254       2    0.906 0.01746        0.873        0.941
##    20    252       3    0.896 0.01833        0.860        0.932
##    22    249       1    0.892 0.01861        0.856        0.929
##    24    248       2    0.885 0.01914        0.848        0.923
##    25    246       2    0.878 0.01965        0.840        0.917
##    26    244       1    0.874 0.01990        0.836        0.914
##    27    243       1    0.871 0.02014        0.832        0.911
##    28    242       2    0.863 0.02060        0.824        0.905
##    30    240       2    0.856 0.02105        0.816        0.898
##    31    238       1    0.853 0.02127        0.812        0.895
##    32    237       2    0.845 0.02169        0.804        0.889
##    33    235       2    0.838 0.02209        0.796        0.883
##    34    233       1    0.835 0.02229        0.792        0.879
##    35    232       4    0.820 0.02303        0.776        0.867
##    36    228       3    0.809 0.02356        0.764        0.857
##    37    225       2    0.802 0.02389        0.757        0.850
##    38    223       1    0.799 0.02405        0.753        0.847
##    39    222       1    0.795 0.02421        0.749        0.844
##    40    221       4    0.781 0.02482        0.733        0.831
##    42    217       1    0.777 0.02497        0.730        0.827
##    43    216       4    0.763 0.02552        0.714        0.814
##    44    212       2    0.755 0.02578        0.707        0.808
##    45    210       2    0.748 0.02603        0.699        0.801
##    46    208       3    0.737 0.02639        0.687        0.791
##    47    205       1    0.734 0.02651        0.684        0.788
##    48    204       2    0.727 0.02673        0.676        0.781
##    49    202       4    0.712 0.02715        0.661        0.767
##    50    198       3    0.701 0.02745        0.650        0.757
##    52    195       2    0.694 0.02763        0.642        0.751
## 
##                 age=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     2    154       1    0.994 0.00647        0.981        1.000
##     3    153       1    0.987 0.00912        0.969        1.000
##     8    152       2    0.974 0.01282        0.949        0.999
##     9    150       2    0.961 0.01559        0.931        0.992
##    12    148       1    0.955 0.01679        0.922        0.988
##    16    147       2    0.942 0.01890        0.905        0.979
##    20    145       2    0.929 0.02075        0.889        0.970
##    21    143       2    0.916 0.02240        0.873        0.961
##    23    141       1    0.909 0.02317        0.865        0.956
##    24    140       2    0.896 0.02459        0.849        0.946
##    25    138       1    0.890 0.02525        0.841        0.941
##    26    137       2    0.877 0.02650        0.826        0.930
##    27    135       1    0.870 0.02709        0.819        0.925
##    34    134       1    0.864 0.02765        0.811        0.920
##    37    133       2    0.851 0.02872        0.796        0.909
##    39    131       1    0.844 0.02923        0.789        0.903
##    42    130       1    0.838 0.02972        0.781        0.898
##    46    129       1    0.831 0.03019        0.774        0.892
##    49    128       1    0.825 0.03064        0.767        0.887
##    52    127       2    0.812 0.03150        0.752        0.876
#ggsurvplot(age_fit, data = survival_data, pval = TRUE)

ggsurvplot(age_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Age",surv.median.line ="v",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("young", "Old"), data =survival_data, pval = TRUE)
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

Paro survival function

paro_fit <- survfit(Surv(week, arrest) ~ paro, data = survival_data)

#summary(paro_fit)

### plot for paro

ggsurvplot(paro_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "paro",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("Not Parolee", "Parolee"), data =survival_data, pval = TRUE)

Work Experience survival function

Wexp_fit <- survfit(Surv(week, arrest) ~ wexp, data = survival_data)

#summary(Wexp_fit)

### plot for Work Experience


ggsurvplot(Wexp_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "wexp",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("No Work Experience", "Work Experience"), data =survival_data, pval = TRUE)

Race survival function

race_fit <- survfit(Surv(week, arrest) ~ race, data = survival_data)

#summary(race_fit)


### plot for Race


ggsurvplot(race_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Race",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("White", "Black"), data =survival_data, pval = TRUE)

Marriage survival function

mar_fit <- survfit(Surv(week, arrest) ~ mar, data = survival_data)

#summary(mar_fit)


### plot for Marriage

ggsurvplot(mar_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Marriage",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("Not Married", "Married"), data =survival_data, pval = TRUE)

Education survival function

educ_fit <- survfit(Surv(week, arrest) ~ educ, data = survival_data)

#summary(educ_fit)

### plot for education

ggsurvplot(educ_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Level of Education",
           risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
           legend.labs=c("Level 2", "Level 3", "Level 4", "Level 5", "Level 6"), data =survival_data, pval = TRUE)

Fit a cox proportional hazards model

data_fit <- Surv(time = survival_data$week, event = survival_data$arrest)

hazard_fit <- coxph(data_fit ~ mar + fin + race + educ + wexp + paro + age, data = survival_data)
hazard_fit
## Call:
## coxph(formula = data_fit ~ mar + fin + race + educ + wexp + paro + 
##     age, data = survival_data)
## 
##           coef exp(coef) se(coef)      z      p
## mar    -0.4269    0.6525   0.3807 -1.122 0.2620
## finyes -0.3764    0.6863   0.1908 -1.973 0.0485
## race    0.2927    1.3401   0.3082  0.950 0.3422
## educ   -0.2389    0.7875   0.1278 -1.870 0.0615
## wexp   -0.3450    0.7082   0.2038 -1.693 0.0905
## paro   -0.1162    0.8903   0.1923 -0.604 0.5456
## age    -0.3685    0.6918   0.2286 -1.612 0.1070
## 
## Likelihood ratio test=22.6  on 7 df, p=0.002002
## n= 432, number of events= 114
summary(hazard_fit)
## Call:
## coxph(formula = data_fit ~ mar + fin + race + educ + wexp + paro + 
##     age, data = survival_data)
## 
##   n= 432, number of events= 114 
## 
##           coef exp(coef) se(coef)      z Pr(>|z|)  
## mar    -0.4269    0.6525   0.3807 -1.122   0.2620  
## finyes -0.3764    0.6863   0.1908 -1.973   0.0485 *
## race    0.2927    1.3401   0.3082  0.950   0.3422  
## educ   -0.2389    0.7875   0.1278 -1.870   0.0615 .
## wexp   -0.3450    0.7082   0.2038 -1.693   0.0905 .
## paro   -0.1162    0.8903   0.1923 -0.604   0.5456  
## age    -0.3685    0.6918   0.2286 -1.612   0.1070  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## mar       0.6525     1.5326    0.3094    1.3759
## finyes    0.6863     1.4571    0.4722    0.9976
## race      1.3401     0.7462    0.7324    2.4519
## educ      0.7875     1.2699    0.6130    1.0116
## wexp      0.7082     1.4121    0.4749    1.0560
## paro      0.8903     1.1232    0.6107    1.2978
## age       0.6918     1.4455    0.4420    1.0828
## 
## Concordance= 0.631  (se = 0.026 )
## Likelihood ratio test= 22.6  on 7 df,   p=0.002
## Wald test            = 20.92  on 7 df,   p=0.004
## Score (logrank) test = 21.6  on 7 df,   p=0.003

Hazard Risk

ggforest(hazard_fit, data = survival_data)