t="C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\Hip fracture data.csv"
#bao cho R biet co bien missing na
hip=read.csv(t,na.strings = "")
dim(hip)
## [1] 2788 20
table(hip$hipfx)
##
## 0 1
## 2599 189
table(hip$hipfx, hip$gender)
##
## Female Male
## 0 1512 1087
## 1 142 47
# kiem tra xem co su khac nhau ve ti le gay co xuong dui giua nam va nu.
# mo ta so bo
library(DescTools)
Desc(hip$hipfx~hip$gender)
## -------------------------------------------------------------------------
## hip$hipfx ~ hip$gender
##
##
## Summary:
## n: 2'788, rows: 2, columns: 2
##
## Pearson's Chi-squared test (cont. adj):
## X-squared = 20.296, df = 1, p-value = 6.635e-06
## Fisher's exact test p-value = 3.505e-06
## McNemar's chi-squared = 725.09, df = 1, p-value < 2.2e-16
##
## estimate lwr.ci upr.ci'
##
## odds ratio 0.460 0.328 0.646
## rel. risk (col1) 0.774 0.709 0.846
## rel. risk (col2) 1.682 1.307 2.164
##
##
## Phi-Coefficient 0.087
## Contingency Coeff. 0.086
## Cramer's V 0.087
##
##
## hip$gender Female Male Sum
## hip$hipfx
##
## 0 freq 1'512 1'087 2'599
## perc 54.2% 39.0% 93.2%
## p.row 58.2% 41.8% .
## p.col 91.4% 95.9% .
##
## 1 freq 142 47 189
## perc 5.1% 1.7% 6.8%
## p.row 75.1% 24.9% .
## p.col 8.6% 4.1% .
##
## Sum freq 1'654 1'134 2'788
## perc 59.3% 40.7% 100.0%
## p.row . . .
## p.col . . .
##
##
## ----------
## ' 95% conf. level
m1= glm(hipfx~gender, family=binomial, data=hip)
summary(m1)
##
## Call:
## glm(formula = hipfx ~ gender, family = binomial, data = hip)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4237 -0.4237 -0.4237 -0.2910 2.5232
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.36536 0.08777 -26.949 < 2e-16 ***
## genderMale -0.77567 0.17289 -4.486 7.24e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1382.2 on 2787 degrees of freedom
## Residual deviance: 1360.0 on 2786 degrees of freedom
## AIC: 1364
##
## Number of Fisher Scoring iterations: 5
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
logistic.display(m1)
##
## Logistic regression predicting hipfx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## gender: Male vs Female 0.46 (0.33,0.65) < 0.001 < 0.001
##
## Log-likelihood = -679.98
## No. of observations = 2788
## AIC value = 1363.96
#ve bieu do tuong quan bmi va gay co xuong dui
boxplot(hip$hipfx~hip$bmi)
m2= glm(hipfx~bmi, family=binomial, data=hip)
logistic.display(m2)
##
## Logistic regression predicting hipfx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## bmi (cont. var.) 0.84 (0.81,0.88) < 0.001 < 0.001
##
## Log-likelihood = -615.9691
## No. of observations = 2754
## AIC value = 1235.9381
#tao bien moi, dim la liet ke so hang ngang
hh=subset(hip,v2<2)
dim(hh)
## [1] 2722 20
dim(hip)
## [1] 2788 20
boxplot(hh$v2~hh$hipfx, col=c("blue", "red"))
#tao ra bien moi
hh$v2.n=hh$v2/0.1
m3=glm(hipfx~v2.n, family=binomial, data=hh)
logistic.display(m3)
##
## Logistic regression predicting hipfx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## v2.n (cont. var.) 0.41 (0.36,0.47) < 0.001 < 0.001
##
## Log-likelihood = -525.8766
## No. of observations = 2722
## AIC value = 1055.7531
library(pROC)
## Warning: package 'pROC' was built under R version 3.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#tao bien moi
hh$pred=predict(m3, type = "response")
head(hh)
## id dov gender age dob visit v1 v2 v3 v4 wt bmi ht v5
## 1 3 15/6/89 Male 73 8/6/16 1 0.98 0.88 1.079 1.458 98 32 175 NA
## 2 8 17/4/89 Female 67 11/12/21 1 0.85 0.85 0.966 1.325 72 26 166 18
## 3 9 12/6/90 Male 68 8/1/22 1 0.87 0.84 1.013 1.494 87 26 184 36
## 4 10 4/6/90 Female 62 15/5/28 1 0.62 0.71 0.839 1.214 72 24 173 NA
## 5 23 8/8/89 Male 61 22/9/28 1 0.87 0.60 0.811 1.144 72 24 173 44
## 6 24 3/5/89 Female 76 1/8/13 1 0.76 0.58 0.743 0.980 67 28 156 15
## v6 v7 v8 v9 hipfx timehip v2.n pred
## 1 39.9 1 0 0 0 0.55 8.8 0.005907462
## 2 31.0 0 0 0 0 19.68 8.5 0.007714900
## 3 28.6 0 0 0 0 5.05 8.4 0.008431852
## 4 28.2 1 0 0 0 18.55 7.1 0.026528278
## 5 28.9 1 0 0 0 19.37 6.0 0.068038669
## 6 33.3 0 0 0 0 12.30 5.8 0.080317293
mm=roc(hh$hipfx, hh$pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
summary(mm)
## Length Class Mode
## percent 1 -none- logical
## sensitivities 166 -none- numeric
## specificities 166 -none- numeric
## thresholds 166 -none- numeric
## direction 1 -none- character
## cases 174 -none- numeric
## controls 2548 -none- numeric
## fun.sesp 1 -none- function
## auc 1 auc numeric
## call 3 -none- call
## original.predictor 2722 -none- numeric
## original.response 2722 -none- numeric
## predictor 2722 -none- numeric
## response 2722 -none- numeric
## levels 2 -none- character
auc(mm)
## Area under the curve: 0.8249
plot(mm)
ci(mm)
## 95% CI: 0.7944-0.8554 (DeLong)
summary(hh$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.00 24.00 26.00 26.64 29.00 57.00 7
#co 7 du lieu missing nen tao ra dataset la bb b missing data
bb=subset(hh, bmi>0)
dim(bb)
## [1] 2715 22
m4=glm(hipfx~bmi, family=binomial, data=bb)
bb$pred.bmi=predict(m4, type="response")
head(bb,3)
## id dov gender age dob visit v1 v2 v3 v4 wt bmi ht v5
## 1 3 15/6/89 Male 73 8/6/16 1 0.98 0.88 1.079 1.458 98 32 175 NA
## 2 8 17/4/89 Female 67 11/12/21 1 0.85 0.85 0.966 1.325 72 26 166 18
## 3 9 12/6/90 Male 68 8/1/22 1 0.87 0.84 1.013 1.494 87 26 184 36
## v6 v7 v8 v9 hipfx timehip v2.n pred pred.bmi
## 1 39.9 1 0 0 0 0.55 8.8 0.005907462 0.02058579
## 2 31.0 0 0 0 0 19.68 8.5 0.007714900 0.05601439
## 3 28.6 0 0 0 0 5.05 8.4 0.008431852 0.05601439
# dung ham roc để đưa ra giá trị auc hay c-index
mm1=roc(bb$hipfx,bb$pred.bmi)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(mm1)
## Area under the curve: 0.6809
plot(mm1)
m3=glm(hipfx~v2.n, family=binomial, data=hh)
logistic.display(m2)
##
## Logistic regression predicting hipfx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## bmi (cont. var.) 0.84 (0.81,0.88) < 0.001 < 0.001
##
## Log-likelihood = -615.9691
## No. of observations = 2754
## AIC value = 1235.9381
t.test(hh$v2~hh$gender)
##
## Welch Two Sample t-test
##
## data: hh$v2 by hh$gender
## t = -11.844, df = 2360.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08835424 -0.06325396
## sample estimates:
## mean in group Female mean in group Male
## 0.6639450 0.7397491
# phan tich them gender+v2.n
m5=glm(hipfx~v2.n+gender, family=binomial, data=hh)
logistic.display(m5)
##
## Logistic regression predicting hipfx
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## v2.n (cont. var.) 0.41 (0.36,0.47) 0.41 (0.36,0.47) < 0.001
##
## gender: Male vs Female 0.49 (0.35,0.69) 0.85 (0.58,1.23) 0.376
##
## P(LR-test)
## v2.n (cont. var.) < 0.001
##
## gender: Male vs Female 0.373
##
## Log-likelihood = -525.4792
## No. of observations = 2722
## AIC value = 1056.9584
m6=glm(hipfx~bmi+gender, family=binomial, data=hh)
logistic.display
## function (logistic.model, alpha = 0.05, crude = TRUE, crude.p.value = FALSE,
## decimal = 2, simplified = FALSE)
## {
## model <- logistic.model
## if (length(grep("[$]", attr(model$term, "term.labels"))) >
## 0 | length(grep(")", attr(model$term, "term.labels"))) >
## 0 | length(model$call) < 3) {
## simplified <- TRUE
## crude <- TRUE
## }
## else {
## factor.with.colon <- NULL
## for (i in 1:(length(attr(model$term, "term.labels")))) {
## factor.with.colon <- c(factor.with.colon, any(grep(":",
## model$xlevels[i])))
## }
## factor.with.colon <- any(factor.with.colon)
## if ((length(grep(":", attr(model$terms, "term.labels"))) >
## 1) | factor.with.colon) {
## simplified <- TRUE
## crude <- TRUE
## }
## }
## if (simplified) {
## coeff <- summary(model)$coefficients[-1, ]
## table1 <- cbind(exp(coeff[, 1]), exp(coeff[, 1] - qnorm(1 -
## alpha/2) * coeff[, 2]), exp(coeff[, 1] + qnorm(1 -
## alpha/2) * coeff[, 2]), coeff[, 4])
## colnames(table1) <- c("OR", paste("lower", 100 - 100 *
## alpha, "ci", sep = ""), paste("upper", 100 - 100 *
## alpha, "ci", sep = ""), "Pr(>|Z|)")
## }
## else {
## if (length(class(model)) == 1) {
## stop("Model not from logistic regression")
## }
## if (class(model)[1] != "glm" | class(model)[2] != "lm" |
## model$family$family != "binomial") {
## stop("Model not from logistic regression")
## }
## var.names <- attr(model$terms, "term.labels")
## if (length(var.names) == 1) {
## crude <- FALSE
## }
## if (crude) {
## orci0 <- NULL
## for (i in 1:length(var.names)) {
## formula0 <- as.formula(paste(names(model$model)[1],
## "~", var.names[i]))
## if (names(model$coefficient)[1] != "(Intercept)") {
## formula0 <- as.formula(paste(names(model$model)[1],
## "~", var.names[i], "- 1"))
## }
## if (length(grep("cbind", names(model$model)[1])) >
## 0) {
## model0 <- glm(formula0, family = binomial,
## data = get(as.character(model$call)[4]))
## }
## else {
## model0 <- glm(formula0, weights = model$prior.weights,
## family = binomial, data = model$model)
## }
## coeff.matrix <- (summary(model0))$coefficients[-1,
## , drop = FALSE]
## if (names(model$coefficient)[1] != "(Intercept)") {
## coeff.matrix <- (summary(model0))$coefficients[,
## , drop = FALSE]
## }
## if (length(grep(":", var.names[i])) > 0) {
## var.name.interact <- unlist(strsplit(var.names[i],
## ":"))
## if (any(names(model$xlevels) == var.name.interact[1])) {
## level1 <- length(unlist(model$xlevels[var.name.interact[1]])) -
## 1
## }
## else {
## level1 <- 1
## }
## if (any(names(model$xlevels) == var.name.interact[2])) {
## level2 <- length(unlist(model$xlevels[var.name.interact[2]])) -
## 1
## }
## else {
## level2 <- 1
## }
## dim.coeff <- dim((summary(model0))$coefficients[-1,
## , drop = FALSE])
## coeff.matrix <- matrix(rep(NA, dim.coeff[1] *
## dim.coeff[2]), dim.coeff[1], dim.coeff[2])
## coeff.matrix <- coeff.matrix[1:(level1 * level2),
## , drop = FALSE]
## }
## orci0 <- rbind(orci0, coeff.matrix)
## }
## if (names(model$coefficient)[1] == "(Intercept)") {
## orci0 <- rbind(matrix(rep(0, 4), 1, 4), orci0)
## }
## colnames(orci0) <- c("crudeOR", paste("lower0", 100 -
## 100 * alpha, "ci", sep = ""), paste("upper0",
## 100 - 100 * alpha, "ci", sep = ""), "crude P value")
## orci0[, 3] <- exp(orci0[, 1] + qnorm(1 - alpha/2) *
## orci0[, 2])
## orci0[, 2] <- exp(orci0[, 1] - qnorm(1 - alpha/2) *
## orci0[, 2])
## orci0[, 1] <- exp(orci0[, 1])
## }
## s1 <- summary(model)
## orci <- s1$coefficients
## colnames(orci) <- c("OR", paste("lower", 100 - 100 *
## alpha, "ci", sep = ""), paste("upper", 100 - 100 *
## alpha, "ci", sep = ""), "P(Wald's test)")
## orci[, 3] <- exp(orci[, 1] + qnorm(1 - alpha/2) * orci[,
## 2])
## orci[, 2] <- exp(orci[, 1] - qnorm(1 - alpha/2) * orci[,
## 2])
## orci[, 1] <- exp(orci[, 1])
## decimal1 <- ifelse(abs(orci[, 1] - 1) < 0.01, 4, decimal)
## a <- cbind(paste(round(orci[, 1], decimal1), " (", round(orci[,
## 2], decimal1), ",", round(orci[, 3], decimal1), ") ",
## sep = ""), ifelse(orci[, 4] < 0.001, "< 0.001", round(orci[,
## 4], decimal + 1)))
## colnames(a) <- c(paste("adj. OR(", 100 - 100 * alpha,
## "%CI)", sep = ""), "P(Wald's test)")
## if (length(var.names) == 1) {
## colnames(a) <- c(paste("OR(", 100 - 100 * alpha,
## "%CI)", sep = ""), "P(Wald's test)")
## }
## rownames.a <- rownames(a)
## if (crude) {
## decimal0 <- ifelse(abs(orci0[, 1] - 1) < 0.01, 4,
## decimal)
## if (crude.p.value) {
## a0 <- cbind(paste(round(orci0[, 1], decimal0),
## " (", round(orci0[, 2], decimal0), ",", round(orci0[,
## 3], decimal0), ") ", sep = ""), ifelse(orci0[,
## 4, drop = FALSE] < 0.001, "< 0.001", round(orci0[,
## 4, drop = FALSE], decimal + 1)))
## a <- cbind(a0, a)
## rownames(a) <- rownames.a
## colnames(a) <- c(paste("crude OR(", 100 - 100 *
## alpha, "%CI)", sep = ""), "crude P value",
## paste("adj. OR(", 100 - 100 * alpha, "%CI)",
## sep = ""), "P(Wald's test)")
## a[grep(":", rownames(a)), 1:2] <- "-"
## }
## else {
## a <- cbind(paste(round(orci0[, 1], decimal1),
## " (", round(orci0[, 2], decimal1), ",", round(orci0[,
## 3], decimal1), ") ", sep = ""), a)
## colnames(a) <- c(paste("crude OR(", 100 - 100 *
## alpha, "%CI)", sep = ""), paste("adj. OR(",
## 100 - 100 * alpha, "%CI)", sep = ""), "P(Wald's test)")
## a[grep(":", rownames(a)), 1] <- "-"
## }
## }
## modified.coeff.array <- a
## table1 <- tableGlm(model, modified.coeff.array, decimal)
## }
## if (simplified) {
## first.line <- NULL
## last.lines <- NULL
## }
## else {
## outcome.name <- names(model$model)[1]
## if (!is.null(attr(model$data, "var.labels"))) {
## outcome.name <- attr(model$data, "var.labels")[attr(model$data,
## "names") == names(model$model)[1]]
## }
## else {
## outcome.name <- names(model$model)[1]
## }
## outcome.name <- ifelse(outcome.name == "", names(model$model)[1],
## outcome.name)
## if (crude) {
## if (attr(model0$term, "dataClasses")[1] == "logical") {
## outcome.lab <- paste(outcome.name, "\n")
## }
## else {
## if ((attr(model$term, "dataClasses")[1] == "numeric") |
## (attr(model$term, "dataClasses")[1] == "integer")) {
## outcome.levels <- levels(factor(model$model[,
## 1]))
## outcome.lab <- paste(outcome.name, outcome.levels[2],
## "vs", outcome.levels[1], "\n")
## }
## }
## }
## if (attr(model$term, "dataClasses")[1] == "factor") {
## outcome.lab <- paste(names(model$model)[1], ":",
## levels(model$model[, 1])[2], "vs", levels(model$model[,
## 1])[1], "\n")
## }
## else {
## outcome.lab <- paste(outcome.name, "\n")
## }
## first.line <- paste("\n", "Logistic regression predicting ",
## outcome.lab, sep = "")
## last.lines <- paste("Log-likelihood = ", round(logLik(model),
## decimal + 2), "\n", "No. of observations = ", length(model$y),
## "\n", "AIC value = ", round(s1$aic, decimal + 2),
## "\n", "\n", sep = "")
## }
## results <- list(first.line = first.line, table = table1,
## last.lines = last.lines)
## class(results) <- c("display", "list")
## results
## }
## <bytecode: 0x000000000f42c2d8>
## <environment: namespace:epiDisplay>
# analysis xem bien nao co y nghia bang BMA
hh=na.omit(hh)
dim(hh)
## [1] 2315 22
xvars=hh[,c("gender", "age", "v1", "v2", "v3","v4","v5","v6","v7","v8","v9","bmi")]
yvar=hh[, c("hipfx")]
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-4)
##
## Attaching package: 'rrcov'
## The following object is masked from 'package:DescTools':
##
## Cov
m7=bic.glm(xvars, yvar, strict=F, OR=20, glm.family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m7)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = F, OR = 20)
##
##
## 7 models were selected
## Best 5 models (cumulative posterior probability = 0.9371 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -1.1497190 1.368386 -1.021e+00 -1.200e+00
## gender.x 3.0
## .Male 0.0153228 0.096747 . .
## age.x 100.0 0.0771649 0.013818 7.658e-02 7.963e-02
## v1.x 24.1 -0.6892902 1.357276 . -2.528e+00
## v2.x 13.0 -0.5054618 1.467008 . .
## v3.x 95.4 -7.9421840 2.286455 -9.064e+00 -6.775e+00
## v4.x 0.0 0.0000000 0.000000 . .
## v5.x 5.6 -0.0009949 0.004752 . .
## v6.x 3.3 0.0011420 0.007505 . .
## v7.x 0.0 0.0000000 0.000000 . .
## v8.x 0.0 0.0000000 0.000000 . .
## v9.x 0.0 0.0000000 0.000000 . .
## bmi.x 0.0 0.0000000 0.000000 . .
##
## nVar 2 3
## BIC -1.707e+04 -1.707e+04
## post prob 0.586 0.166
## model 3 model 4 model 5
## Intercept -1.223e+00 -5.573e-01 -2.551e+00
## gender.x
## .Male . . .
## age.x 7.476e-02 7.129e-02 8.253e-02
## v1.x . . -3.621e+00
## v2.x -3.033e+00 . -5.500e+00
## v3.x -6.269e+00 -8.635e+00 .
## v4.x . . .
## v5.x . -1.773e-02 .
## v6.x . . .
## v7.x . . .
## v8.x . . .
## v9.x . . .
## bmi.x . . .
##
## nVar 3 3 3
## BIC -1.707e+04 -1.707e+04 -1.707e+04
## post prob 0.084 0.056 0.046
#mo hinh sau cung
#dung 2 biên age va v3
fm=glm(hipfx~age+v3, family = binomial, data=hh)
#tinh giá trị tiên lượng
hh$pred1=predict(fm,type="response")
mm4=roc(hh$hipfx,hh$pred1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(mm4)
## Area under the curve: 0.8645
ci(mm4)
## 95% CI: 0.8359-0.893 (DeLong)
plot(smooth(mm4))