## Warning: package 'carData' was built under R version 4.4.1
Exploratory Data Analysis
### checking the names of the variables suing the names function
names(Rossi)
## [1] "week" "arrest" "fin" "age" "race" "wexp" "mar" "paro"
## [9] "prio" "educ" "emp1" "emp2" "emp3" "emp4" "emp5" "emp6"
## [17] "emp7" "emp8" "emp9" "emp10" "emp11" "emp12" "emp13" "emp14"
## [25] "emp15" "emp16" "emp17" "emp18" "emp19" "emp20" "emp21" "emp22"
## [33] "emp23" "emp24" "emp25" "emp26" "emp27" "emp28" "emp29" "emp30"
## [41] "emp31" "emp32" "emp33" "emp34" "emp35" "emp36" "emp37" "emp38"
## [49] "emp39" "emp40" "emp41" "emp42" "emp43" "emp44" "emp45" "emp46"
## [57] "emp47" "emp48" "emp49" "emp50" "emp51" "emp52"
## then dim function tells you the dimension of the dataset
## here we have 62 variables and 432 rows
dim(Rossi)
## [1] 432 62
### the head functions tells you the first 10 data. Here we have asked for the first 6
head(Rossi, 3)
## week arrest fin age race wexp mar paro prio educ emp1 emp2 emp3 emp4
## 1 20 1 no 27 black no not married yes 3 3 no no no no
## 2 17 1 no 18 black no not married yes 8 4 no no no no
## 3 25 1 no 19 other yes not married yes 13 3 no no no no
## emp5 emp6 emp7 emp8 emp9 emp10 emp11 emp12 emp13 emp14 emp15 emp16 emp17
## 1 no no no no no no no no no no no no no
## 2 no no no no no yes yes yes yes yes no no no
## 3 no no no no no no no no no no no no yes
## emp18 emp19 emp20 emp21 emp22 emp23 emp24 emp25 emp26 emp27 emp28 emp29 emp30
## 1 no no no <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 no no no no no no no no <NA> <NA> <NA> <NA> <NA>
## emp31 emp32 emp33 emp34 emp35 emp36 emp37 emp38 emp39 emp40 emp41 emp42 emp43
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## emp44 emp45 emp46 emp47 emp48 emp49 emp50 emp51 emp52
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
##the table function looks at the numbers of each ... in a variable
### we have 379 black and 53 other
table(Rossi$race)
##
## black other
## 379 53
table(Rossi$mar)
##
## married not married
## 53 379
## checking the distribution of age
hist(Rossi$age)

### Change the name to survival_data
survival_data <- Rossi
## race
survival_data$race <- ifelse(survival_data$race == "other", 0, 1)
table(survival_data$race)
##
## 0 1
## 53 379
### marriage
survival_data$mar <- ifelse(survival_data$mar == "not married", 0, 1)
table(survival_data$mar)
##
## 0 1
## 379 53
### wexp
survival_data$wexp <- ifelse(survival_data$wexp == "no", 0, 1)
table(survival_data$wexp)
##
## 0 1
## 185 247
### paro
survival_data$paro <- ifelse(survival_data$paro == "no", 0, 1)
table(survival_data$paro)
##
## 0 1
## 165 267
### age
survival_data$age <- ifelse(survival_data$age <= mean(survival_data$age), 0, 1)
table(survival_data$age)
##
## 0 1
## 278 154
Bar Chart of education by age.
It appears that most age group falls in the level 3 and 4
education
#### education
table(survival_data$educ)
##
## 2 3 4 5 6
## 24 239 119 39 11
ggplot(survival_data, aes(x=educ, y=age, fill = educ)) +
geom_bar(stat="identity")

Kaplan-Meier estimator
### Building the Kaplan-Meier estimator
fit_survival_data <- survfit(Surv(week, arrest) ~ 1, data = survival_data)
### summary
summary(fit_survival_data)
## Call: survfit(formula = Surv(week, arrest) ~ 1, data = survival_data)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 432 1 0.998 0.00231 0.993 1.000
## 2 431 1 0.995 0.00327 0.989 1.000
## 3 430 1 0.993 0.00400 0.985 1.000
## 4 429 1 0.991 0.00461 0.982 1.000
## 5 428 1 0.988 0.00515 0.978 0.999
## 6 427 1 0.986 0.00563 0.975 0.997
## 7 426 1 0.984 0.00607 0.972 0.996
## 8 425 5 0.972 0.00791 0.957 0.988
## 9 420 2 0.968 0.00852 0.951 0.984
## 10 418 1 0.965 0.00881 0.948 0.983
## 11 417 2 0.961 0.00935 0.942 0.979
## 12 415 2 0.956 0.00987 0.937 0.976
## 13 413 1 0.954 0.01011 0.934 0.974
## 14 412 3 0.947 0.01080 0.926 0.968
## 15 409 2 0.942 0.01123 0.920 0.964
## 16 407 2 0.937 0.01165 0.915 0.961
## 17 405 3 0.931 0.01223 0.907 0.955
## 18 402 3 0.924 0.01278 0.899 0.949
## 19 399 2 0.919 0.01313 0.894 0.945
## 20 397 5 0.907 0.01395 0.880 0.935
## 21 392 2 0.903 0.01425 0.875 0.931
## 22 390 1 0.900 0.01440 0.873 0.929
## 23 389 1 0.898 0.01455 0.870 0.927
## 24 388 4 0.889 0.01512 0.860 0.919
## 25 384 3 0.882 0.01552 0.852 0.913
## 26 381 3 0.875 0.01591 0.844 0.907
## 27 378 2 0.870 0.01616 0.839 0.903
## 28 376 2 0.866 0.01640 0.834 0.898
## 30 374 2 0.861 0.01664 0.829 0.894
## 31 372 1 0.859 0.01675 0.827 0.892
## 32 371 2 0.854 0.01698 0.822 0.888
## 33 369 2 0.850 0.01720 0.816 0.884
## 34 367 2 0.845 0.01742 0.811 0.880
## 35 365 4 0.836 0.01783 0.801 0.871
## 36 361 3 0.829 0.01813 0.794 0.865
## 37 358 4 0.819 0.01851 0.784 0.857
## 38 354 1 0.817 0.01860 0.781 0.854
## 39 353 2 0.812 0.01878 0.777 0.850
## 40 351 4 0.803 0.01913 0.767 0.842
## 42 347 2 0.799 0.01929 0.762 0.837
## 43 345 4 0.789 0.01962 0.752 0.829
## 44 341 2 0.785 0.01977 0.747 0.824
## 45 339 2 0.780 0.01993 0.742 0.820
## 46 337 4 0.771 0.02022 0.732 0.812
## 47 333 1 0.769 0.02029 0.730 0.809
## 48 332 2 0.764 0.02043 0.725 0.805
## 49 330 5 0.752 0.02077 0.713 0.794
## 50 325 3 0.745 0.02096 0.705 0.788
## 52 322 4 0.736 0.02121 0.696 0.779
Financial Aid survival function
financial_fit <- survfit(Surv(week, arrest) ~ fin, data = survival_data)
#summary(financial_fit)
### plot for financial aid
ggsurvplot(financial_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Financial",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("No Financial Aid", "Financial Aid"), data =survival_data, pval = TRUE)

- With Log-rank test p-value = 0.05, there is an indication that there
is no statistical significance within this group. However, one can argue
that additional data will be able to show otherwise.
Kaplan-Meier estimator for age
age_fit <- survfit(Surv(week, arrest) ~ age, data = survival_data)
summary(age_fit)
## Call: survfit(formula = Surv(week, arrest) ~ age, data = survival_data)
##
## age=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 278 1 0.996 0.00359 0.989 1.000
## 4 277 1 0.993 0.00507 0.983 1.000
## 5 276 1 0.989 0.00620 0.977 1.000
## 6 275 1 0.986 0.00714 0.972 1.000
## 7 274 1 0.982 0.00797 0.967 0.998
## 8 273 3 0.971 0.01003 0.952 0.991
## 10 270 1 0.968 0.01062 0.947 0.989
## 11 269 2 0.960 0.01169 0.938 0.984
## 12 267 1 0.957 0.01219 0.933 0.981
## 13 266 1 0.953 0.01266 0.929 0.978
## 14 265 3 0.942 0.01397 0.915 0.970
## 15 262 2 0.935 0.01476 0.907 0.965
## 17 260 3 0.924 0.01585 0.894 0.956
## 18 257 3 0.914 0.01684 0.881 0.947
## 19 254 2 0.906 0.01746 0.873 0.941
## 20 252 3 0.896 0.01833 0.860 0.932
## 22 249 1 0.892 0.01861 0.856 0.929
## 24 248 2 0.885 0.01914 0.848 0.923
## 25 246 2 0.878 0.01965 0.840 0.917
## 26 244 1 0.874 0.01990 0.836 0.914
## 27 243 1 0.871 0.02014 0.832 0.911
## 28 242 2 0.863 0.02060 0.824 0.905
## 30 240 2 0.856 0.02105 0.816 0.898
## 31 238 1 0.853 0.02127 0.812 0.895
## 32 237 2 0.845 0.02169 0.804 0.889
## 33 235 2 0.838 0.02209 0.796 0.883
## 34 233 1 0.835 0.02229 0.792 0.879
## 35 232 4 0.820 0.02303 0.776 0.867
## 36 228 3 0.809 0.02356 0.764 0.857
## 37 225 2 0.802 0.02389 0.757 0.850
## 38 223 1 0.799 0.02405 0.753 0.847
## 39 222 1 0.795 0.02421 0.749 0.844
## 40 221 4 0.781 0.02482 0.733 0.831
## 42 217 1 0.777 0.02497 0.730 0.827
## 43 216 4 0.763 0.02552 0.714 0.814
## 44 212 2 0.755 0.02578 0.707 0.808
## 45 210 2 0.748 0.02603 0.699 0.801
## 46 208 3 0.737 0.02639 0.687 0.791
## 47 205 1 0.734 0.02651 0.684 0.788
## 48 204 2 0.727 0.02673 0.676 0.781
## 49 202 4 0.712 0.02715 0.661 0.767
## 50 198 3 0.701 0.02745 0.650 0.757
## 52 195 2 0.694 0.02763 0.642 0.751
##
## age=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 154 1 0.994 0.00647 0.981 1.000
## 3 153 1 0.987 0.00912 0.969 1.000
## 8 152 2 0.974 0.01282 0.949 0.999
## 9 150 2 0.961 0.01559 0.931 0.992
## 12 148 1 0.955 0.01679 0.922 0.988
## 16 147 2 0.942 0.01890 0.905 0.979
## 20 145 2 0.929 0.02075 0.889 0.970
## 21 143 2 0.916 0.02240 0.873 0.961
## 23 141 1 0.909 0.02317 0.865 0.956
## 24 140 2 0.896 0.02459 0.849 0.946
## 25 138 1 0.890 0.02525 0.841 0.941
## 26 137 2 0.877 0.02650 0.826 0.930
## 27 135 1 0.870 0.02709 0.819 0.925
## 34 134 1 0.864 0.02765 0.811 0.920
## 37 133 2 0.851 0.02872 0.796 0.909
## 39 131 1 0.844 0.02923 0.789 0.903
## 42 130 1 0.838 0.02972 0.781 0.898
## 46 129 1 0.831 0.03019 0.774 0.892
## 49 128 1 0.825 0.03064 0.767 0.887
## 52 127 2 0.812 0.03150 0.752 0.876
#ggsurvplot(age_fit, data = survival_data, pval = TRUE)
ggsurvplot(age_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Age",surv.median.line ="v",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("young", "Old"), data =survival_data, pval = TRUE)
## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, :
## Median survival not reached.

- With Log-rank test p-value < 0.05, there is a statistically
significant difference between the young and old prisoners in terms of
rearrest, with the young prisoner having a higher likelihood of arrest
after releasing.
Paro survival function
paro_fit <- survfit(Surv(week, arrest) ~ paro, data = survival_data)
#summary(paro_fit)
### plot for paro
ggsurvplot(paro_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "paro",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("Not Parolee", "Parolee"), data =survival_data, pval = TRUE)

- This indicates that there is no difference between the parolee and
non parolee within the group.
Work Experience survival function
Wexp_fit <- survfit(Surv(week, arrest) ~ wexp, data = survival_data)
#summary(Wexp_fit)
### plot for Work Experience
ggsurvplot(Wexp_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "wexp",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("No Work Experience", "Work Experience"), data =survival_data, pval = TRUE)

- Log-rank test p-value < 0.05, there is a statistically
significant difference between the prisoner with no work experience and
prisoners with work experience in terms of rearrest, with prisoner with
no work experience having a higher likelihood of arrest after
releasing.
Race survival function
race_fit <- survfit(Surv(week, arrest) ~ race, data = survival_data)
#summary(race_fit)
### plot for Race
ggsurvplot(race_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Race",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("White", "Black"), data =survival_data, pval = TRUE)

- The p-value indicates that there is no statistical significance
difference within the race group.
Marriage survival function
mar_fit <- survfit(Surv(week, arrest) ~ mar, data = survival_data)
#summary(mar_fit)
### plot for Marriage
ggsurvplot(mar_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Marriage",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("Not Married", "Married"), data =survival_data, pval = TRUE)

- The Log-rank test p-value indicates a statistically significant
difference between the married and non-married groups. The result
highlights that not married increases the likelihood of arrest after
releasing.
Education survival function
educ_fit <- survfit(Surv(week, arrest) ~ educ, data = survival_data)
#summary(educ_fit)
### plot for education
ggsurvplot(educ_fit, risk.table = TRUE, conf.int=TRUE,legend.title = "Level of Education",
risk.table.y.text.col=TRUE, risk.table.y.text = FALSE, ggtheme =theme_bw(),
legend.labs=c("Level 2", "Level 3", "Level 4", "Level 5", "Level 6"), data =survival_data, pval = TRUE)

- Another intriguing finding is that the level of education plays a
vital factor in being rearrested. The Log-rank test p-value shows that
there is a statistically significant difference within this group,
indicating that prisoners with a higher level of education are better
off in this group.
Fit a cox proportional hazards model
data_fit <- Surv(time = survival_data$week, event = survival_data$arrest)
hazard_fit <- coxph(data_fit ~ mar + fin + race + educ + wexp + paro + age, data = survival_data)
hazard_fit
## Call:
## coxph(formula = data_fit ~ mar + fin + race + educ + wexp + paro +
## age, data = survival_data)
##
## coef exp(coef) se(coef) z p
## mar -0.4269 0.6525 0.3807 -1.122 0.2620
## finyes -0.3764 0.6863 0.1908 -1.973 0.0485
## race 0.2927 1.3401 0.3082 0.950 0.3422
## educ -0.2389 0.7875 0.1278 -1.870 0.0615
## wexp -0.3450 0.7082 0.2038 -1.693 0.0905
## paro -0.1162 0.8903 0.1923 -0.604 0.5456
## age -0.3685 0.6918 0.2286 -1.612 0.1070
##
## Likelihood ratio test=22.6 on 7 df, p=0.002002
## n= 432, number of events= 114
summary(hazard_fit)
## Call:
## coxph(formula = data_fit ~ mar + fin + race + educ + wexp + paro +
## age, data = survival_data)
##
## n= 432, number of events= 114
##
## coef exp(coef) se(coef) z Pr(>|z|)
## mar -0.4269 0.6525 0.3807 -1.122 0.2620
## finyes -0.3764 0.6863 0.1908 -1.973 0.0485 *
## race 0.2927 1.3401 0.3082 0.950 0.3422
## educ -0.2389 0.7875 0.1278 -1.870 0.0615 .
## wexp -0.3450 0.7082 0.2038 -1.693 0.0905 .
## paro -0.1162 0.8903 0.1923 -0.604 0.5456
## age -0.3685 0.6918 0.2286 -1.612 0.1070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## mar 0.6525 1.5326 0.3094 1.3759
## finyes 0.6863 1.4571 0.4722 0.9976
## race 1.3401 0.7462 0.7324 2.4519
## educ 0.7875 1.2699 0.6130 1.0116
## wexp 0.7082 1.4121 0.4749 1.0560
## paro 0.8903 1.1232 0.6107 1.2978
## age 0.6918 1.4455 0.4420 1.0828
##
## Concordance= 0.631 (se = 0.026 )
## Likelihood ratio test= 22.6 on 7 df, p=0.002
## Wald test = 20.92 on 7 df, p=0.004
## Score (logrank) test = 21.6 on 7 df, p=0.003
Hazard Risk
ggforest(hazard_fit, data = survival_data)

- Combining all the covariates, using the Cox proportions hazard and
hazard ratio analysis, the prisoners with financial aid are better
off.The hazard ratio of 0.69 for the financial aid group indicates that
prisoners who get financial aid have a lower risk of rearrest by 31%.The
forest plot shows that the 95% confidence interval is 0.47 -1.0, and
it is statistically significant. Using this model, only financial aid
influences the likelihood of re-arrest in this study. This is quite
different from what we previously uncovered with the Kaplan-Meier
estimator and the Log-rank test.