arr = read.csv("C:\\VN trips\\VN trip 4 (Dec 2022)\\VLU\\Regression analysis\\Datasets\\Arrest data.csv", header = TRUE)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ age + race + educ + prior + work + married + parole + finance | factor(arrest), data = arr)
| | 0 (N=318) | 1 (N=114) | Overall (N=432) |
|---|---|---|---|
| age | | | |
| Mean (SD) | 25.3 (6.31) | 22.8 (5.12) | 24.6 (6.11) |
| Median [Min, Max] | 23.0 [17.0, 44.0] | 21.0 [17.0, 44.0] | 23.0 [17.0, 44.0] |
| race | | | |
| black | 277 (87.1%) | 102 (89.5%) | 379 (87.7%) |
| other | 41 (12.9%) | 12 (10.5%) | 53 (12.3%) |
| educ | | | |
| Mean (SD) | 3.53 (0.883) | 3.32 (0.656) | 3.48 (0.834) |
| Median [Min, Max] | 3.00 [2.00, 6.00] | 3.00 [2.00, 6.00] | 3.00 [2.00, 6.00] |
| prior | | | |
| Mean (SD) | 2.70 (2.55) | 3.77 (3.59) | 2.98 (2.90) |
| Median [Min, Max] | 2.00 [0, 15.0] | 3.00 [0, 18.0] | 2.00 [0, 18.0] |
| work | | | |
| no | 123 (38.7%) | 62 (54.4%) | 185 (42.8%) |
| yes | 195 (61.3%) | 52 (45.6%) | 247 (57.2%) |
| married | | | |
| married | 45 (14.2%) | 8 (7.0%) | 53 (12.3%) |
| not married | 273 (85.8%) | 106 (93.0%) | 379 (87.7%) |
| parole | | | |
| no | 119 (37.4%) | 46 (40.4%) | 165 (38.2%) |
| yes | 199 (62.6%) | 68 (59.6%) | 267 (61.8%) |
| finance | | | |
| no | 150 (47.2%) | 66 (57.9%) | 216 (50.0%) |
| yes | 168 (52.8%) | 48 (42.1%) | 216 (50.0%) |
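table1 prints the raw column names as row labels. Assuming table1's label<- helper (the same labelling convention Hmisc uses), more readable labels can be attached to the data before the call; a minimal sketch with hypothetical label text:

# Attach readable variable labels before rebuilding the table (label text is illustrative)
label(arr$age) = "Age at release (years)"
label(arr$prior) = "Number of prior convictions"
table1(~ age + race + educ + prior + work + married + parole + finance | factor(arrest), data = arr)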
library(compareGroups)
createTable(compareGroups(arrest ~ age + race + educ + prior + work + married + parole + finance, data = arr))
##
## --------Summary descriptives table by 'arrest'---------
##
## _________________________________________________
## 0 1 p.overall
## N=318 N=114
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 25.3 (6.31) 22.8 (5.12) <0.001
## race: 0.621
## black 277 (87.1%) 102 (89.5%)
## other 41 (12.9%) 12 (10.5%)
## educ 3.53 (0.88) 3.32 (0.66) 0.006
## prior 2.70 (2.55) 3.77 (3.59) 0.004
## work: 0.005
## no 123 (38.7%) 62 (54.4%)
## yes 195 (61.3%) 52 (45.6%)
## married: 0.068
## married 45 (14.2%) 8 (7.02%)
## not married 273 (85.8%) 106 (93.0%)
## parole: 0.660
## no 119 (37.4%) 46 (40.4%)
## yes 199 (62.6%) 68 (59.6%)
## finance: 0.063
## no 150 (47.2%) 66 (57.9%)
## yes 168 (52.8%) 48 (42.1%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
library(ggplot2)
ggplot(arr, aes(group = arrest, x = arrest, y = age, fill = arrest, color = arrest)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = arrest)) + theme(legend.position = "none") + labs(x = "Arrest", y = "Age (years)")
### (4.3.2) Using Student's t-test to assess the association
t.test(age ~ arrest, data = arr)
##
## Welch Two Sample t-test
##
## data: age by arrest
## t = 4.1789, df = 243.6, p-value = 4.086e-05
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 1.317143 3.665975
## sample estimates:
## mean in group 0 mean in group 1
## 25.25472 22.76316
m1 = glm(arrest ~ age, family = binomial, data = arr)
summary(m1)
##
## Call:
## glm(formula = arrest ~ age, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9909 -0.8362 -0.7246 1.4131 2.3509
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.95611 0.54357 1.759 0.078586 .
## age -0.08305 0.02296 -3.617 0.000298 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 482.69 on 430 degrees of freedom
## AIC: 486.69
##
## Number of Fisher Scoring iterations: 4
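The age coefficient is on the log-odds scale; exponentiating it (with a Wald 95% CI) reproduces the odds ratio that logistic.display() reports below. A minimal sketch:

# Odds ratio for age with a Wald 95% CI, computed from the glm coefficients
b = unname(coef(m1)["age"])
se = summary(m1)$coefficients["age", "Std. Error"]
exp(c(OR = b, lower = b - 1.96 * se, upper = b + 1.96 * se))   # ~0.92 (0.88, 0.96)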
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(m1)
##
## Logistic regression predicting arrest
##
## OR(95%CI) P(Wald's test) P(LR-test)
## age (cont. var.) 0.92 (0.88,0.96) < 0.001 < 0.001
##
## Log-likelihood = -241.3436
## No. of observations = 432
## AIC value = 486.6872
library(ggplot2)
ggplot(data = arr, aes(x = age, y = arrest)) + geom_point(alpha = 0.15) + geom_smooth(method = "glm", method.args = list(family = "binomial"))
## `geom_smooth()` using formula 'y ~ x'
m2 = glm(arrest ~ finance, family = binomial, data = arr)
summary(m2)
##
## Call:
## glm(formula = arrest ~ finance, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.854 -0.854 -0.709 1.540 1.734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8210 0.1477 -5.558 2.73e-08 ***
## financeyes -0.4318 0.2205 -1.959 0.0502 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 494.73 on 430 degrees of freedom
## AIC: 498.73
##
## Number of Fisher Scoring iterations: 4
logistic.display(m2)
##
## Logistic regression predicting arrest
##
## OR(95%CI) P(Wald's test) P(LR-test)
## finance (cont. var.) 0.65 (0.42,1) 0.05 0.049
##
## Log-likelihood = -247.3642
## No. of observations = 432
## AIC value = 498.7283
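The P(LR-test) column is a likelihood-ratio test of the model with finance against the null model; the same test can be run directly with anova(). A minimal sketch:

# Likelihood-ratio (deviance) test for finance: 498.60 - 494.73 = 3.87 on 1 df
anova(m2, test = "Chisq")   # p ~ 0.049, matching P(LR-test) above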
# Box plot of age by financial aid (finance) group
ggplot(arr, aes(group = finance, x = finance, y = age, fill = finance, color = finance)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = finance)) + theme(legend.position = "none") + labs(x = "Finance", y = "Age (years)")
t.test(age ~ finance, data = arr)
##
## Welch Two Sample t-test
##
## data: age by finance
## t = -1.2759, df = 423.8, p-value = 0.2027
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## -1.9054289 0.4054289
## sample estimates:
## mean in group no mean in group yes
## 24.22222 24.97222
m3 = glm(arrest ~ finance + age, family = binomial, data = arr)
summary(m3)
##
## Call:
## glm(formula = arrest ~ finance + age, family = binomial, data = arr)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0633 -0.8193 -0.7107 1.3320 2.2949
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.11913 0.55462 2.018 0.043608 *
## financeyes -0.39905 0.22408 -1.781 0.074935 .
## age -0.08197 0.02309 -3.550 0.000385 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 498.60 on 431 degrees of freedom
## Residual deviance: 479.49 on 429 degrees of freedom
## AIC: 485.49
##
## Number of Fisher Scoring iterations: 4
library(Publish)
## Loading required package: prodlim
publish(m3)
## Variable Units OddsRatio CI.95 p-value
## finance no Ref
## yes 0.67 [0.43;1.04] 0.0749350
## age 0.92 [0.88;0.96] 0.0003852
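The adjusted odds ratios reported by publish() can be reproduced from the glm fit itself; note that confint() returns profile-likelihood intervals, so the bounds may differ slightly from the Wald intervals above. A minimal sketch:

# Adjusted odds ratios with profile-likelihood 95% CIs
exp(cbind(OR = coef(m3), confint(m3)))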
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)
yvar = arr[, "arrest"]
xvars = arr[, c("finance", "age", "race", "work", "married", "parole", "prior", "educ")]
bma = bic.glm(xvars, yvar, glm.family = "binomial")
summary(bma)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial")
##
##
## 12 models were selected
## Best 5 models (cumulative posterior probability = 0.7985 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 0.817350 0.81303 5.022e-01 9.561e-01 1.466e+00
## finance 14.6 -0.058569 0.16599 . . .
## age 100.0 -0.078012 0.02328 -7.796e-02 -8.305e-02 -7.692e-02
## race 3.1 -0.010614 0.08693 . . .
## work 4.9 -0.014130 0.08348 . . .
## married 5.0 0.027326 0.15084 . . .
## parole 2.1 -0.002227 0.03709 . . .
## prior 75.0 0.077046 0.05457 1.046e-01 . 9.433e-02
## educ 20.8 -0.062332 0.14083 . . -2.802e-01
##
## nVar 2 1 3
## BIC -2.129e+03 -2.127e+03 -2.126e+03
## post prob 0.395 0.122 0.107
## model 4 model 5
## Intercept 6.565e-01 2.054e+00
## finance -4.057e-01 .
## age -7.648e-02 -8.108e-02
## race . .
## work . .
## married . .
## parole . .
## prior 1.056e-01 .
## educ . -3.351e-01
##
## nVar 3 2
## BIC -2.126e+03 -2.126e+03
## post prob 0.095 0.079
imageplot.bma(bma)
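Beyond the image plot, the numbers summarised above are stored on the fitted object; assuming the standard bic.glm components, the posterior inclusion probabilities and model weights can be read off directly:

# Posterior inclusion probability (%) per predictor, i.e. the 'p!=0' column
bma$probne0
# Posterior probabilities of the selected models, i.e. the 'post prob' row
bma$postprob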
## (4.6) Model evaluation
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
sample = createDataPartition(arr$arrest, p = 0.6, list = FALSE)
train = arr[sample,]
test = arr[-sample,]
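createDataPartition() draws a random 60/40 split, so the training results and confusion matrix below change from run to run; fixing the RNG seed first makes the split reproducible. A minimal sketch (the seed value is arbitrary):

# Reproducible 60/40 split: fix the seed before partitioning
set.seed(2022)
sample = createDataPartition(arr$arrest, p = 0.6, list = FALSE)
train = arr[sample, ]
test = arr[-sample, ]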
m4 = train(factor(arrest) ~ age + prior, data = train, method = "glm", family = "binomial")
summary(m4)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2836 -0.8265 -0.6817 1.2243 2.4613
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.85081 0.75747 1.123 0.26135
## age -0.09227 0.03146 -2.933 0.00336 **
## prior 0.11486 0.05335 2.153 0.03131 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 302.90 on 259 degrees of freedom
## Residual deviance: 286.31 on 257 degrees of freedom
## AIC: 292.31
##
## Number of Fisher Scoring iterations: 4
arrest.pred = predict(m4, newdata = test, type = "raw")
confusionMatrix(data = arrest.pred, reference = factor(test$arrest))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 123 37
## 1 5 7
##
## Accuracy : 0.7558
## 95% CI : (0.6846, 0.818)
## No Information Rate : 0.7442
## P-Value [Acc > NIR] : 0.4018
##
## Kappa : 0.1576
##
## Mcnemar's Test P-Value : 1.724e-06
##
## Sensitivity : 0.9609
## Specificity : 0.1591
## Pos Pred Value : 0.7687
## Neg Pred Value : 0.5833
## Prevalence : 0.7442
## Detection Rate : 0.7151
## Detection Prevalence : 0.9302
## Balanced Accuracy : 0.5600
##
## 'Positive' Class : 0
##
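caret takes the first factor level ("0", not arrested) as the positive class here, so the reported sensitivity of 0.96 refers to identifying non-arrests. If re-arrest is the event of interest, the positive class can be set explicitly:

# Treat arrest (level "1") as the positive class when summarising the matrix
confusionMatrix(data = arrest.pred, reference = factor(test$arrest), positive = "1")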
library(rms)
## Loading required package: Hmisc
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
m5 = lrm(arrest ~ age + prior, data = arr, x = TRUE, y = TRUE)
val = validate(m5, B = 100)
val
## index.orig training test optimism index.corrected n
## Dxy 0.3132 0.3140 0.3076 0.0065 0.3068 100
## R2 0.0800 0.0867 0.0761 0.0106 0.0694 100
## Intercept 0.0000 0.0000 -0.0555 0.0555 -0.0555 100
## Slope 1.0000 1.0000 0.9598 0.0402 0.9598 100
## Emax 0.0000 0.0000 0.0192 0.0192 0.0192 100
## D 0.0540 0.0593 0.0512 0.0081 0.0460 100
## U -0.0046 -0.0046 -0.0002 -0.0044 -0.0002 100
## Q 0.0586 0.0639 0.0514 0.0125 0.0461 100
## B 0.1825 0.1832 0.1837 -0.0006 0.1831 100
## g 0.6416 0.6634 0.6196 0.0438 0.5978 100
## gp 0.1135 0.1163 0.1096 0.0067 0.1067 100
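Dxy is Somers' rank correlation, related to the c-statistic (AUC) by Dxy = 2(c - 0.5), so the optimism-corrected discrimination can be read off from the corrected Dxy:

# Optimism-corrected c-statistic from the bias-corrected Dxy
0.5 + 0.3068 / 2   # ~0.65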
prob = predict(m4, newdata = test, type = "prob")
pred = data.frame(prob, test$arrest)
head(pred)
## X0 X1 test.arrest
## 1 0.7851532 0.21484677 1
## 9 0.6201066 0.37989342 0
## 10 0.7300032 0.26999682 0
## 11 0.7691790 0.23082095 0
## 14 0.9116171 0.08838293 0
## 19 0.5193990 0.48060100 0
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(pred$test.arrest, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$test.arrest, predictor = pred$X1)
##
## Data: pred$X1 in 128 controls (pred$test.arrest 0) < 44 cases (pred$test.arrest 1).
## Area under the curve: 0.626
plot(smooth(roc(pred$test.arrest, pred$X1)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
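With only 44 cases in the test set, the AUC of 0.626 is estimated imprecisely; pROC's ci.auc() (DeLong interval by default) quantifies that uncertainty. A minimal sketch:

# 95% confidence interval for the test-set AUC
ci.auc(roc(pred$test.arrest, pred$X1))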
ddist = datadist(arr)
options(datadist = "ddist")
m6 = lrm(arrest ~ age + prior, data = arr)
p = nomogram(m6, fun = function(x) 1/(1 + exp(-x)), fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999), funlabel = "Probability of arrest")
plot(p, cex.axis = 0.6, lmgp = 0.2)
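The nomogram can be cross-checked numerically: predict() on the lrm fit returns the same probability for a given covariate pattern. A minimal sketch for a hypothetical subject aged 25 with 3 prior convictions:

# Predicted probability of arrest for age = 25, prior = 3 (hypothetical values)
predict(m6, newdata = data.frame(age = 25, prior = 3), type = "fitted")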