#reading data

t <- file.choose()
t
## [1] "C:\\Users\\Q-Anh\\Desktop\\Prediction Model - NVT Prof\\R markdown\\Exercise_19_06_16.Rmd"
hip <- read.csv("C:\\Users\\Q-Anh\\Desktop\\Prediction Model - NVT Prof\\Dataset\\Hip fracture data.csv",
header = TRUE)
head(hip)
##   id     dov gender age      dob visit   v1   v2    v3    v4 wt bmi  ht v5
## 1  3 15/6/89   Male  73   8/6/16     1 0.98 0.88 1.079 1.458 98  32 175 NA
## 2  8 17/4/89 Female  67 11/12/21     1 0.85 0.85 0.966 1.325 72  26 166 18
## 3  9 12/6/90   Male  68   8/1/22     1 0.87 0.84 1.013 1.494 87  26 184 36
## 4 10  4/6/90 Female  62  15/5/28     1 0.62 0.71 0.839 1.214 72  24 173 NA
## 5 23  8/8/89   Male  61  22/9/28     1 0.87 0.60 0.811 1.144 72  24 173 44
## 6 24  3/5/89 Female  76   1/8/13     1 0.76 0.58 0.743 0.980 67  28 156 15
##     v6 v7 v8 v9 hipfx timehip
## 1 39.9  1  0  0     0    0.55
## 2 31.0  0  0  0     0   19.68
## 3 28.6  0  0  0     0    5.05
## 4 28.2  1  0  0     0   18.55
## 5 28.9  1  0  0     0   19.37
## 6 33.3  0  0  0     0   12.30

#Display the exact number

options(scipen = 999)

#the difference between male and female

table(hip$gender, hip$hipfx)
##         
##             0    1
##   Female 1512  142
##   Male   1087   47
require(DescTools)
## Loading required package: DescTools
Desc(hip$gender, hip$hipfx)
## ------------------------------------------------------------------------- 
## 
## 
##   length      n    NAs unique
##    2'788  2'788      0      2
##          100.0%   0.0%       
## 
##          freq   perc  lci.95  uci.95'
## Female  1'654  59.3%   57.5%   61.1%
## Male    1'134  40.7%   38.9%   42.5%
## 
## ' 95%-CI Wilson

#fit logistic model

m <- glm(hipfx ~ age, family = binomial, data = hip)
summary(m)
## 
## Call:
## glm(formula = hipfx ~ age, family = binomial, data = hip)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1780  -0.3790  -0.2843  -0.2254   2.8400  
## 
## Coefficients:
##               Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -11.102684   0.745925  -14.88 <0.0000000000000002 ***
## age           0.118129   0.009981   11.84 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1382.2  on 2787  degrees of freedom
## Residual deviance: 1242.5  on 2786  degrees of freedom
## AIC: 1246.5
## 
## Number of Fisher Scoring iterations: 6
require(epiDisplay)
## Loading required package: epiDisplay
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
m1 <- glm(hipfx ~ bmi, family = binomial, data = hip)
summary(m1)
## 
## Call:
## glm(formula = hipfx ~ bmi, family = binomial, data = hip)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8205  -0.4038  -0.3144  -0.2441   3.0143  
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  1.66748    0.51819   3.218              0.00129 ** 
## bmi         -0.17221    0.02125  -8.105 0.000000000000000528 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1308.6  on 2753  degrees of freedom
## Residual deviance: 1231.9  on 2752  degrees of freedom
##   (34 observations deleted due to missingness)
## AIC: 1235.9
## 
## Number of Fisher Scoring iterations: 6

#Plotting boxplot

require(ggplot2)
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
## 
##     alpha
p <- ggplot(data = hip, aes(x=factor(hipfx), y= bmi, col = factor(hipfx))) +
geom_jitter(alpha=0.5) +
xlab("Hip Fracture") +
ylab("Body Mass Index") +
theme(legend.position = "top")+
theme(legend.title=element_blank())
scale_fill_discrete(labels=c("no fracture", "fracture"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: fill
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: no fracture fracture
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range, gg>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range, gg>
##     reset: function
##     scale_name: hue
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>
p
## Warning: Removed 34 rows containing missing values (geom_point).

#plotting ROC curve

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)
hh <- subset(hip, hip$bmi > 0)
hh$pre.bmi <- predict(m1, type="response")
roccurve <- roc(hh$hipfx ~ hh$pre.bmi)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roccurve)
## Area under the curve: 0.6803
ggroc(roccurve, col="red", alpha = 0.5)

library(epiDisplay)
m3 <- glm(hipfx ~ v2 + bmi, family=binomial, data = hh)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m3)
## 
## Call:
## glm(formula = hipfx ~ v2 + bmi, family = binomial, data = hh)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5435  -0.3624  -0.2280  -0.1341   3.5651  
## 
## Coefficients:
##             Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)   3.8723     0.5565   6.959     0.00000000000344 ***
## v2           -8.2338     0.7086 -11.620 < 0.0000000000000002 ***
## bmi          -0.0616     0.0225  -2.737               0.0062 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1282.1  on 2715  degrees of freedom
## Residual deviance: 1034.1  on 2713  degrees of freedom
##   (38 observations deleted due to missingness)
## AIC: 1040.1
## 
## Number of Fisher Scoring iterations: 7
logistic.display(m3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Logistic regression predicting hipfx 
##  
##                  crude OR(95%CI)   adj. OR(95%CI)   P(Wald's test)
## v2 (cont. var.)  0 (0,0)           0 (0,0)          < 0.001       
##                                                                   
## bmi (cont. var.) 0.84 (0.81,0.88)  0.94 (0.9,0.98)  0.006         
##                                                                   
##                  P(LR-test)
## v2 (cont. var.)  < 0.001   
##                            
## bmi (cont. var.) 0.005     
##                            
## Log-likelihood = -517.0407
## No. of observations = 2716
## AIC value = 1040.0815

#BMA

library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
## 
## Attaching package: 'rrcov'
## The following object is masked from 'package:DescTools':
## 
##     Cov
xvars <- hh[, c("age", "gender", "v1", "v2", "v3", "v4", 
"v5", "v6", "v7", "v8", "v9", "bmi")]
yvar <- hh[, c("hipfx")]
m <- bic.glm(xvars, yvar, strict=FALSE, OR=20, glm.family="binomial")
## Warning in bic.glm.data.frame(xvars, yvar, strict = FALSE, OR = 20,
## glm.family = "binomial"): There were 438 records deleted due to NA's
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = FALSE, OR = 20)
## 
## 
##   6  models were selected
##  Best  5  models (cumulative posterior probability =  0.9658 ): 
## 
##                p!=0    EV        SD        model 1       model 2     
## Intercept      100    -1.160848  1.370080      -1.02179      -1.20027
## age.x          100.0   0.077098  0.013821       0.07660       0.07966
## gender.x         0.0                                                 
##         .Male          0.000000  0.000000        .             .     
## v1.x            21.2  -0.582100  1.244628        .           -2.48784
## v2.x            13.6  -0.530365  1.498308        .             .     
## v3.x            95.2  -7.995824  2.303874      -9.06475      -6.81512
## v4.x             0.0   0.000000  0.000000        .             .     
## v5.x             5.8  -0.001032  0.004837        .             .     
## v6.x             3.4   0.001185  0.007641        .             .     
## v7.x             0.0   0.000000  0.000000        .             .     
## v8.x             0.0   0.000000  0.000000        .             .     
## v9.x             0.0   0.000000  0.000000        .             .     
## bmi.x            0.0   0.000000  0.000000        .             .     
##                                                                      
## nVar                                              2             3    
## BIC                                        -17081.29230  -17078.67937
## post prob                                       0.607         0.164  
##                model 3       model 4       model 5     
## Intercept          -1.22289      -0.55783      -2.55134
## age.x               0.07476       0.07130       0.08253
## gender.x                                               
##         .Male        .             .             .     
## v1.x                 .             .           -3.62078
## v2.x               -3.03265        .           -5.49960
## v3.x               -6.26859      -8.63587        .     
## v4.x                 .             .             .     
## v5.x                 .           -0.01774        .     
## v6.x                 .             .             .     
## v7.x                 .             .             .     
## v8.x                 .             .             .     
## v9.x                 .             .             .     
## bmi.x                .             .             .     
##                                                        
## nVar                  3             3             3    
## BIC            -17077.43400  -17076.60201  -17076.20840
## post prob           0.088         0.058         0.048
imageplot.bma(m)

#Calculate final model (age, v3)

hx <- subset(hip, hip$v3 > 0)
mf <- glm(data=hx, hipfx ~ age + v3)
hx$pred <- predict(mf, type="response")
rocfinal <- roc(hx$hipfx ~ hx$pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(rocfinal)
## Area under the curve: 0.8452
plot(smooth(rocfinal))

ggroc(rocfinal, col="red", alpha=0.5, size= 1.2, linetype=4,
legacy.axes=TRUE)