arr = read.csv("C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\Arrest data.csv", header = TRUE)
dim(arr)
## [1] 432 11
head(arr)
## id week arrest finance age race work married parole prior educ
## 1 1 20 1 no 27 black no not married yes 3 3
## 2 2 17 1 no 18 black no not married yes 8 4
## 3 3 25 1 no 19 other yes not married yes 13 3
## 4 4 52 0 yes 23 black yes married yes 1 5
## 5 5 52 0 no 19 other yes not married yes 3 3
## 6 6 52 0 no 24 black yes not married no 2 4
library(compareGroups)
arr$educ = as.factor(arr$educ)
createTable(compareGroups(arrest ~ age + race + educ + prior + work + married + parole + finance, data = arr))
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
##
## --------Summary descriptives table by 'arrest'---------
##
## _________________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 25.3 (6.31) 22.8 (5.12) <0.001
## race: 0.621
## black 277 (87.1%) 102 (89.5%)
## other 41 (12.9%) 12 (10.5%)
## educ: 0.023
## 2 20 (6.29%) 4 (3.51%)
## 3 162 (50.9%) 77 (67.5%)
## 4 92 (28.9%) 27 (23.7%)
## 5 34 (10.7%) 5 (4.39%)
## 6 10 (3.14%) 1 (0.88%)
## prior 2.70 (2.55) 3.77 (3.59) 0.004
## work: 0.005
## no 123 (38.7%) 62 (54.4%)
## yes 195 (61.3%) 52 (45.6%)
## married: 0.068
## married 45 (14.2%) 8 (7.02%)
## not married 273 (85.8%) 106 (93.0%)
## parole: 0.660
## no 119 (37.4%) 46 (40.4%)
## yes 199 (62.6%) 68 (59.6%)
## finance: 0.063
## no 150 (47.2%) 66 (57.9%)
## yes 168 (52.8%) 48 (42.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
library(ggplot2)
ggplot(arr, aes(group = arrest, x = arrest, y = age, fill = arrest, color = arrest)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = arrest)) + theme(legend.position = "none") + labs(x = "Arrest", y = "Age (years)")
m1 = glm(arrest ~ age, family = binomial, data = arr)
summary(m1)
##
## Call:
## glm(formula = arrest ~ age, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9909 -0.8362 -0.7246 1.4131 2.3509
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.95611 0.54357 1.759 0.078586 .
## age -0.08305 0.02296 -3.617 0.000298 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 482.69 on 430 degrees of freedom
## AIC: 486.69
##
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(m1)
##
## Logistic regression predicting arrest
##
## OR(95%CI) P(Wald's test) P(LR-test)
## age (cont. var.) 0.92 (0.88,0.96) < 0.001 < 0.001
##
## Log-likelihood = -241.3436
## No. of observations = 432
## AIC value = 486.6872
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
m1.v = lrm(arrest ~ age, data = arr, x = TRUE, y = TRUE)
validate = validate(m1.v, B = 100)
validate
## index.orig training test optimism index.corrected n
## Dxy 0.2807 0.2906 0.2807 0.0099 0.2708 100
## R2 0.0528 0.0593 0.0528 0.0065 0.0463 100
## Intercept 0.0000 0.0000 0.0254 -0.0254 0.0254 100
## Slope 1.0000 1.0000 1.0117 -0.0117 1.0117 100
## Emax 0.0000 0.0000 0.0074 0.0074 0.0074 100
## D 0.0345 0.0392 0.0345 0.0047 0.0298 100
## U -0.0046 -0.0046 0.0006 -0.0052 0.0006 100
## Q 0.0392 0.0439 0.0339 0.0099 0.0292 100
## B 0.1869 0.1847 0.1877 -0.0029 0.1899 100
## g 0.5278 0.5626 0.5278 0.0348 0.4930 100
## gp 0.0890 0.0923 0.0890 0.0034 0.0856 100
Dxy = 2(AUC - 0.5)
Emax = maximum difference between raw predicted probabilities and recalibrated probabilities
D (Discrimination index) = the difference in quality between the best constant predictor (one that on average predicts the overall prevalence of an event) and the best calibrated predictor.
U (Unreliable index) = unitless index of how far the logit calibration curve intercept and slope are from (0, 1)
Q (Overall quality = D - U) = The difference between the best constant predictor and the quality of predictions as they stand (no calibration).
B (Brier index) = The mean square error between predictions and observations.
g (Gini’s mean difference calculated on the log-OR scale) = model’s predictive discrimination performance based only on the fitted values.
gp (Gini’s mean difference calculated on the probability scale)
m2 = glm(arrest ~ finance, family = binomial, data = arr)
summary(m2)
##
## Call:
## glm(formula = arrest ~ finance, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.854 -0.854 -0.709 1.540 1.734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8210 0.1477 -5.558 2.73e-08 ***
## financeyes -0.4318 0.2205 -1.959 0.0502 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 494.73 on 430 degrees of freedom
## AIC: 498.73
##
## Number of Fisher Scoring iterations: 4
ggplot(arr, aes(group = finance, x = finance, y = age, fill = finance, color = finance)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = arrest)) + theme(legend.position = "none") + labs(x = "Finance", y = "Age (years)")
Người được hỗ trợ tài chánh có khuynh hướng lớn tuổi hơn so với người
không nhận được hỗ trợ tài chánh.
m3 = glm(arrest ~ finance + age, family = binomial, data = arr)
summary(m3)
##
## Call:
## glm(formula = arrest ~ finance + age, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0633 -0.8193 -0.7107 1.3320 2.2949
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.11913 0.55462 2.018 0.043608 *
## financeyes -0.39905 0.22408 -1.781 0.074935 .
## age -0.08197 0.02309 -3.550 0.000385 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 479.49 on 429 degrees of freedom
## AIC: 485.49
##
## Number of Fisher Scoring iterations: 4
library(rms)
m3.v = lrm(arrest ~ finance + age, data = arr, x = TRUE, y = TRUE)
validate = validate(m3.v, B = 100)
validate
## index.orig training test optimism index.corrected n
## Dxy 0.2753 0.2863 0.2677 0.0185 0.2568 100
## R2 0.0632 0.0707 0.0599 0.0108 0.0524 100
## Intercept 0.0000 0.0000 -0.0028 0.0028 -0.0028 100
## Slope 1.0000 1.0000 0.9998 0.0002 0.9998 100
## Emax 0.0000 0.0000 0.0007 0.0007 0.0007 100
## D 0.0419 0.0476 0.0396 0.0080 0.0340 100
## U -0.0046 -0.0046 0.0006 -0.0052 0.0006 100
## Q 0.0466 0.0522 0.0390 0.0132 0.0334 100
## B 0.1860 0.1850 0.1873 -0.0022 0.1882 100
## g 0.5899 0.6196 0.5693 0.0503 0.5396 100
## gp 0.1007 0.1045 0.0982 0.0062 0.0944 100
m4 = glm(arrest ~ age + finance + race + work + married + parole + educ, family = binomial, data = arr)
summary(m4)
##
## Call:
## glm(formula = arrest ~ age + finance + race + work + married +
## parole + educ, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1947 -0.8215 -0.6322 1.1923 2.4775
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03062 0.99560 0.031 0.9755
## age -0.06042 0.02483 -2.433 0.0150 *
## financeyes -0.46681 0.23074 -2.023 0.0431 *
## raceother -0.39425 0.36908 -1.068 0.2854
## workyes -0.21147 0.24787 -0.853 0.3936
## marriednot married 0.44496 0.42376 1.050 0.2937
## paroleyes -0.15446 0.23449 -0.659 0.5101
## educ3 0.71294 0.59084 1.207 0.2276
## educ4 0.23309 0.61778 0.377 0.7060
## educ5 -0.24754 0.74909 -0.330 0.7411
## educ6 -0.49929 1.20272 -0.415 0.6780
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 466.61 on 421 degrees of freedom
## AIC: 488.61
##
## Number of Fisher Scoring iterations: 4
library(rms)
m4.v = lrm(arrest ~ age + finance + race + work + married + parole + educ, data = arr, x = TRUE, y = TRUE)
validate = validate(m4.v, B = 100)
validate
## index.orig training test optimism index.corrected n
## Dxy 0.3721 0.4011 0.3281 0.0730 0.2991 100
## R2 0.1043 0.1358 0.0766 0.0593 0.0450 100
## Intercept 0.0000 0.0000 -0.2725 0.2725 -0.2725 100
## Slope 1.0000 1.0000 0.7034 0.2966 0.7034 100
## Emax 0.0000 0.0000 0.1314 0.1314 0.1314 100
## D 0.0718 0.0955 0.0516 0.0439 0.0279 100
## U -0.0046 -0.0046 0.0118 -0.0165 0.0118 100
## Q 0.0764 0.1001 0.0397 0.0604 0.0160 100
## B 0.1795 0.1751 0.1835 -0.0084 0.1879 100
## g 0.7887 1.0213 0.6759 0.3455 0.4432 100
## gp 0.1341 0.1496 0.1107 0.0388 0.0953 100
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
sample = createDataPartition(arr$arrest, p = 0.6, list = FALSE)
train = arr[sample,]
test = arr[-sample,]
m5 = train(factor(arrest) ~ age + finance + race + work + married + parole + educ, data = train, method = "glm", family = "binomial")
summary(m5)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3047 -0.7942 -0.5433 -0.1653 2.5060
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.38353 1.54624 -0.248 0.80410
## age -0.09677 0.03538 -2.735 0.00623 **
## financeyes -0.37128 0.31091 -1.194 0.23241
## raceother -0.47629 0.48545 -0.981 0.32653
## workyes -0.37817 0.33718 -1.122 0.26204
## `marriednot married` 0.75207 0.65967 1.140 0.25426
## paroleyes -0.32942 0.31538 -1.045 0.29625
## educ3 1.76447 1.08937 1.620 0.10529
## educ4 1.32948 1.11647 1.191 0.23374
## educ5 1.00789 1.33236 0.756 0.44937
## educ6 0.88281 1.53809 0.574 0.56599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 290.20 on 259 degrees of freedom
## Residual deviance: 259.65 on 249 degrees of freedom
## AIC: 281.65
##
## Number of Fisher Scoring iterations: 5
arrest.pred = predict(m5, newdata = test, type = "raw")
confusionMatrix(data = arrest.pred, reference = factor(test$arrest), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 118 46
## 1 4 4
##
## Accuracy : 0.7093
## 95% CI : (0.6353, 0.7759)
## No Information Rate : 0.7093
## P-Value [Acc > NIR] : 0.5381
##
## Kappa : 0.0628
##
## Mcnemar's Test P-Value : 6.7e-09
##
## Sensitivity : 0.08000
## Specificity : 0.96721
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.71951
## Prevalence : 0.29070
## Detection Rate : 0.02326
## Detection Prevalence : 0.04651
## Balanced Accuracy : 0.52361
##
## 'Positive' Class : 1
##
Test/Disease A B / C D Sen = A / (A + C) Spe = D / (B + D) PPV = A / (A + B) NPV = D / (C + D) Detection rate = A / N Detection Prevalence = (A + B)/ N Balanced Accuracy = (Sen + Spe) / 2
prob = predict(m5, newdata = test, type = "prob")
pred = data.frame(prob, test$arrest)
head(pred)
## X0 X1 test.arrest
## 3 0.7087968 0.29120316 1
## 4 0.9358528 0.06414719 0
## 7 0.8985259 0.10147412 1
## 8 0.7267284 0.27327155 0
## 9 0.4989971 0.50100288 0
## 10 0.7185329 0.28146712 0
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(pred$test.arrest, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$test.arrest, predictor = pred$X1)
##
## Data: pred$X1 in 122 controls (pred$test.arrest 0) < 50 cases (pred$test.arrest 1).
## Area under the curve: 0.6143
plot(smooth(roc(pred$test.arrest, pred$X1)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases