#reading data
t <- file.choose()
t
## [1] "C:\\Users\\Q-Anh\\Desktop\\Prediction Model - NVT Prof\\R markdown\\Exercise_19_06_16.Rmd"
hip <- read.csv("C:\\Users\\Q-Anh\\Desktop\\Prediction Model - NVT Prof\\Dataset\\Hip fracture data.csv",
header = TRUE)
head(hip)
## id dov gender age dob visit v1 v2 v3 v4 wt bmi ht v5
## 1 3 15/6/89 Male 73 8/6/16 1 0.98 0.88 1.079 1.458 98 32 175 NA
## 2 8 17/4/89 Female 67 11/12/21 1 0.85 0.85 0.966 1.325 72 26 166 18
## 3 9 12/6/90 Male 68 8/1/22 1 0.87 0.84 1.013 1.494 87 26 184 36
## 4 10 4/6/90 Female 62 15/5/28 1 0.62 0.71 0.839 1.214 72 24 173 NA
## 5 23 8/8/89 Male 61 22/9/28 1 0.87 0.60 0.811 1.144 72 24 173 44
## 6 24 3/5/89 Female 76 1/8/13 1 0.76 0.58 0.743 0.980 67 28 156 15
## v6 v7 v8 v9 hipfx timehip
## 1 39.9 1 0 0 0 0.55
## 2 31.0 0 0 0 0 19.68
## 3 28.6 0 0 0 0 5.05
## 4 28.2 1 0 0 0 18.55
## 5 28.9 1 0 0 0 19.37
## 6 33.3 0 0 0 0 12.30
#Display the exact number
options(scipen = 999)
#the difference between male and female
table(hip$gender, hip$hipfx)
##
## 0 1
## Female 1512 142
## Male 1087 47
require(DescTools)
## Loading required package: DescTools
Desc(hip$gender, hip$hipfx)
## -------------------------------------------------------------------------
##
##
## length n NAs unique
## 2'788 2'788 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## Female 1'654 59.3% 57.5% 61.1%
## Male 1'134 40.7% 38.9% 42.5%
##
## ' 95%-CI Wilson
#fit logistic model
m <- glm(hipfx ~ age, family = binomial, data = hip)
summary(m)
##
## Call:
## glm(formula = hipfx ~ age, family = binomial, data = hip)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1780 -0.3790 -0.2843 -0.2254 2.8400
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.102684 0.745925 -14.88 <0.0000000000000002 ***
## age 0.118129 0.009981 11.84 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1382.2 on 2787 degrees of freedom
## Residual deviance: 1242.5 on 2786 degrees of freedom
## AIC: 1246.5
##
## Number of Fisher Scoring iterations: 6
require(epiDisplay)
## Loading required package: epiDisplay
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
m1 <- glm(hipfx ~ bmi, family = binomial, data = hip)
summary(m1)
##
## Call:
## glm(formula = hipfx ~ bmi, family = binomial, data = hip)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8205 -0.4038 -0.3144 -0.2441 3.0143
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.66748 0.51819 3.218 0.00129 **
## bmi -0.17221 0.02125 -8.105 0.000000000000000528 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1308.6 on 2753 degrees of freedom
## Residual deviance: 1231.9 on 2752 degrees of freedom
## (34 observations deleted due to missingness)
## AIC: 1235.9
##
## Number of Fisher Scoring iterations: 6
#Plotting boxplot
require(ggplot2)
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
##
## alpha
p <- ggplot(data = hip, aes(x=factor(hipfx), y= bmi, col = factor(hipfx))) +
geom_jitter(alpha=0.5) +
xlab("Hip Fracture") +
ylab("Body Mass Index") +
theme(legend.position = "top")+
theme(legend.title=element_blank())
scale_fill_discrete(labels=c("no fracture", "fracture"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
## aesthetics: fill
## axis_order: function
## break_info: function
## break_positions: function
## breaks: waiver
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: no fracture fracture
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: grey50
## name: waiver
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range, gg>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range, gg>
## reset: function
## scale_name: hue
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale, gg>
p
## Warning: Removed 34 rows containing missing values (geom_point).
#plotting ROC curve
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
hh <- subset(hip, hip$bmi > 0)
hh$pre.bmi <- predict(m1, type="response")
roccurve <- roc(hh$hipfx ~ hh$pre.bmi)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roccurve)
## Area under the curve: 0.6803
ggroc(roccurve, col="red", alpha = 0.5)
library(epiDisplay)
m3 <- glm(hipfx ~ v2 + bmi, family=binomial, data = hh)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m3)
##
## Call:
## glm(formula = hipfx ~ v2 + bmi, family = binomial, data = hh)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5435 -0.3624 -0.2280 -0.1341 3.5651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.8723 0.5565 6.959 0.00000000000344 ***
## v2 -8.2338 0.7086 -11.620 < 0.0000000000000002 ***
## bmi -0.0616 0.0225 -2.737 0.0062 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1282.1 on 2715 degrees of freedom
## Residual deviance: 1034.1 on 2713 degrees of freedom
## (38 observations deleted due to missingness)
## AIC: 1040.1
##
## Number of Fisher Scoring iterations: 7
logistic.display(m3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Logistic regression predicting hipfx
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## v2 (cont. var.) 0 (0,0) 0 (0,0) < 0.001
##
## bmi (cont. var.) 0.84 (0.81,0.88) 0.94 (0.9,0.98) 0.006
##
## P(LR-test)
## v2 (cont. var.) < 0.001
##
## bmi (cont. var.) 0.005
##
## Log-likelihood = -517.0407
## No. of observations = 2716
## AIC value = 1040.0815
#BMA
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
##
## Attaching package: 'rrcov'
## The following object is masked from 'package:DescTools':
##
## Cov
xvars <- hh[, c("age", "gender", "v1", "v2", "v3", "v4",
"v5", "v6", "v7", "v8", "v9", "bmi")]
yvar <- hh[, c("hipfx")]
m <- bic.glm(xvars, yvar, strict=FALSE, OR=20, glm.family="binomial")
## Warning in bic.glm.data.frame(xvars, yvar, strict = FALSE, OR = 20,
## glm.family = "binomial"): There were 438 records deleted due to NA's
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = FALSE, OR = 20)
##
##
## 6 models were selected
## Best 5 models (cumulative posterior probability = 0.9658 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -1.160848 1.370080 -1.02179 -1.20027
## age.x 100.0 0.077098 0.013821 0.07660 0.07966
## gender.x 0.0
## .Male 0.000000 0.000000 . .
## v1.x 21.2 -0.582100 1.244628 . -2.48784
## v2.x 13.6 -0.530365 1.498308 . .
## v3.x 95.2 -7.995824 2.303874 -9.06475 -6.81512
## v4.x 0.0 0.000000 0.000000 . .
## v5.x 5.8 -0.001032 0.004837 . .
## v6.x 3.4 0.001185 0.007641 . .
## v7.x 0.0 0.000000 0.000000 . .
## v8.x 0.0 0.000000 0.000000 . .
## v9.x 0.0 0.000000 0.000000 . .
## bmi.x 0.0 0.000000 0.000000 . .
##
## nVar 2 3
## BIC -17081.29230 -17078.67937
## post prob 0.607 0.164
## model 3 model 4 model 5
## Intercept -1.22289 -0.55783 -2.55134
## age.x 0.07476 0.07130 0.08253
## gender.x
## .Male . . .
## v1.x . . -3.62078
## v2.x -3.03265 . -5.49960
## v3.x -6.26859 -8.63587 .
## v4.x . . .
## v5.x . -0.01774 .
## v6.x . . .
## v7.x . . .
## v8.x . . .
## v9.x . . .
## bmi.x . . .
##
## nVar 3 3 3
## BIC -17077.43400 -17076.60201 -17076.20840
## post prob 0.088 0.058 0.048
imageplot.bma(m)
#Calculate final model (age, v3)
hx <- subset(hip, hip$v3 > 0)
mf <- glm(data=hx, hipfx ~ age + v3)
hx$pred <- predict(mf, type="response")
rocfinal <- roc(hx$hipfx ~ hx$pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(rocfinal)
## Area under the curve: 0.8452
plot(smooth(rocfinal))
ggroc(rocfinal, col="red", alpha=0.5, size= 1.2, linetype=4,
legacy.axes=TRUE)