os = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Bone data.csv")
head(os)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
dim(os)
## [1] 2162 9
head(os)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
m.1 = glm(fx ~ smoking, family = binomial, data = os)
summary(m.1)
##
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = os)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03006 0.06442 -15.990 <2e-16 ***
## smoking -0.13796 0.10081 -1.368 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2439.5 on 2160 degrees of freedom
## AIC: 2443.5
##
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Warning: package 'epiDisplay' was built under R version 4.3.2
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
logistic.display(m.1)
##
## Logistic regression predicting fx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## smoking: 1 vs 0 0.87 (0.71,1.06) 0.171 0.17
##
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938
Giới tính là yếu tố gây nhiễu
library(compareGroups)
createTable(compareGroups(sex ~ smoking + fx, data = os))
##
## --------Summary descriptives table by 'sex'---------
##
## _________________________________________
## Female Male p.overall
## N=1317 N=845
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## smoking 0.30 (0.46) 0.62 (0.49) <0.001
## fx 0.30 (0.46) 0.17 (0.38) <0.001
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
Hiệu chỉnh nhiễu
os$gender = ifelse(os$sex == "Male", 1, 0)
m.2 = glm(fx ~ smoking + gender, family = binomial, data = os)
logistic.display(m.2)
##
## Logistic regression predicting fx
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
## smoking: 1 vs 0 0.87 (0.71,1.06) 1.1 (0.89,1.36) 0.357 0.358
##
## gender: 1 vs 0 0.47 (0.38,0.58) 0.45 (0.36,0.57) < 0.001 < 0.001
##
## Log-likelihood = -1194.7997
## No. of observations = 2162
## AIC value = 2395.5994
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
##
## alpha
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
m.rel = glm(fx ~ gender + age + fnbmd + weight, family = binomial, data = os)
varImp(m.rel, scale = FALSE)
## Overall
## gender 2.3022178
## age 2.5112611
## fnbmd 8.7193666
## weight 0.8239425
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
os.2 = na.omit(os)
xvars = os.2[, c("sex", "age", "weight", "height", "fnbmd", "smoking")]
yvar = os.2[, c("fx")]
m.bma = bic.glm(xvars, yvar, strict = FALSE, OR = 20, glm.family = "binomial")
summary(m.bma)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = FALSE, OR = 20)
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 2.406528 0.791302 2.780e+00 1.271e+00 2.633e+00
## sex 13.8 -0.033579 0.095516 . . -2.344e-01
## age 23.0 0.004096 0.008381 . 1.753e-02 .
## weight 0.0 0.000000 0.000000 . . .
## height 0.0 0.000000 0.000000 . . .
## fnbmd 100.0 -4.700740 0.436561 -4.818e+00 -4.489e+00 -4.536e+00
## smoking 0.0 0.000000 0.000000 . . .
##
## nVar 1 2 2
## BIC -1.402e+04 -1.401e+04 -1.401e+04
## post prob 0.675 0.187 0.095
## model 4
## Intercept 9.605e-01
## sex -2.640e-01
## age 1.919e-02
## weight .
## height .
## fnbmd -4.137e+00
## smoking .
##
## nVar 3
## BIC -1.401e+04
## post prob 0.042
imageplot.bma(m.bma)
os.2$fnbmd.sd = os.2$fnbmd/0.15
m.3 = glm(fx ~ fnbmd.sd, family = binomial, data = os.2)
logistic.display(m.3)
##
## Logistic regression predicting fx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## fnbmd.sd (cont. var.) 0.49 (0.43,0.54) < 0.001 < 0.001
##
## Log-likelihood = -1106.8757
## No. of observations = 2121
## AIC value = 2217.7514
library(readxl)
df = read_excel("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Pima Indian Diabetes Dta.xlsx")
dim(df)
## [1] 768 10
head(df)
## # A tibble: 6 × 10
## id Pregnancies Glucose BloodPressure SkinThickness Insulin bmi
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 148 72 35 0 33.6
## 2 2 1 85 66 29 0 26.6
## 3 3 8 183 64 0 0 23.3
## 4 4 1 89 66 23 94 28.1
## 5 5 0 137 40 35 168 43.1
## 6 6 5 116 74 0 0 25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
df$y = ifelse(df$Outcome == 1, "Yes", "No")
library(caret)
set.seed(1234)
index = createDataPartition(df$y, p =.7, list = FALSE)
training = df[index, ]
dim(training)
## [1] 538 11
testing = df[-index, ]
dim(testing)
## [1] 230 11
fit = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = training, method = "glm", family = "binomial")
summary(fit)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.084008 0.824117 -9.809 < 2e-16 ***
## Age 0.015382 0.010763 1.429 0.1530
## Pregnancies 0.094875 0.037380 2.538 0.0111 *
## Glucose 0.032732 0.004010 8.162 3.30e-16 ***
## BloodPressure -0.007063 0.006114 -1.155 0.2480
## bmi 0.090121 0.017343 5.196 2.03e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 528.11 on 532 degrees of freedom
## AIC: 540.11
##
## Number of Fisher Scoring iterations: 5
testing$y.2 = factor(testing$y, levels = c("Yes", "No"))
pred = predict(fit, newdata = testing, type = "raw")
pred.2 = factor(pred, levels = c("Yes", "No"))
confusionMatrix(testing$y.2, pred.2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 49 31
## No 18 132
##
## Accuracy : 0.787
## 95% CI : (0.7283, 0.838)
## No Information Rate : 0.7087
## P-Value [Acc > NIR] : 0.004594
##
## Kappa : 0.5119
##
## Mcnemar's Test P-Value : 0.086476
##
## Sensitivity : 0.7313
## Specificity : 0.8098
## Pos Pred Value : 0.6125
## Neg Pred Value : 0.8800
## Prevalence : 0.2913
## Detection Rate : 0.2130
## Detection Prevalence : 0.3478
## Balanced Accuracy : 0.7706
##
## 'Positive' Class : Yes
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
prob = predict(fit, newdata = testing, type = "prob")
pred = data.frame(prob, testing$Outcome)
head(pred)
## No Yes testing.Outcome
## 1 0.9426214 0.05737863 0
## 2 0.8273269 0.17267313 0
## 3 0.1647211 0.83527888 1
## 4 0.9560975 0.04390247 1
## 5 0.7117197 0.28828032 0
## 6 0.3953255 0.60467451 1
auc = roc(pred$testing.Outcome, pred$Yes)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc
##
## Call:
## roc.default(response = pred$testing.Outcome, predictor = pred$Yes)
##
## Data: pred$Yes in 150 controls (pred$testing.Outcome 0) < 80 cases (pred$testing.Outcome 1).
## Area under the curve: 0.8517
plot(auc)
fit.cv = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "cv", number = 10, summaryFunction = twoClassSummary, classProbs = TRUE))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
fit.cv
## Generalized Linear Model
##
## 768 samples
## 5 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 691, 692, 691, 691, 691, 692, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8215698 0.874 0.574359
fit.b = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "boot", number = 100, summaryFunction = twoClassSummary, classProbs = TRUE))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
fit.b
## Generalized Linear Model
##
## 768 samples
## 5 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (100 reps)
## Summary of sample sizes: 768, 768, 768, 768, 768, 768, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8254318 0.8732031 0.5683258