ob <- read.csv("D:/1. Dr Hung 108/NCKH R/obesity data.csv")
dim(ob)
## [1] 1217 13
head(ob,10)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat hypertension
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3 0
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8 1
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0 1
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8 1
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8 0
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2 1
## 7 7 F 155 58 24.1 66 1546 0.96 20233 35599 35.3 1
## 8 8 M 167 65 23.3 50 2276 1.11 17749 43301 28.0 1
## 9 9 M 165 54 19.8 61 1778 0.96 10795 38613 21.1 0
## 10 10 F 158 60 24.0 58 1404 0.86 21365 35534 36.6 1
## diabetes
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 1
## 8 1
## 9 0
## 10 0
tail(ob,10)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat
## 1208 1218 F 147 48 22.2 50 1457 0.93 16820 29212 35.4
## 1209 1219 M 155 54 22.5 70 2046 1.13 14005 37044 26.4
## 1210 1220 F 144 53 25.6 72 1408 0.93 20069 31510 37.9
## 1211 1221 F 155 54 22.5 75 1667 0.97 18321 33715 34.1
## 1212 1222 F 153 50 21.4 59 1309 0.87 18328 29147 37.6
## 1213 1223 F 150 44 19.6 44 1474 0.95 12906 28534 30.1
## 1214 1224 F 148 51 23.3 58 1522 0.97 14938 33931 29.6
## 1215 1225 F 149 50 22.5 57 1409 0.93 16777 30598 34.4
## 1216 1226 F 144 49 23.6 67 1266 0.90 20094 27272 41.3
## 1217 1227 F 141 45 22.6 58 1228 0.91 14567 28111 33.2
## hypertension diabetes
## 1208 1 0
## 1209 0 0
## 1210 1 0
## 1211 1 0
## 1212 1 0
## 1213 0 1
## 1214 0 0
## 1215 1 0
## 1216 1 0
## 1217 0 0
summary(ob)
## id gender height weight
## Min. : 1.0 Length:1217 Min. :136.0 Min. :34.00
## 1st Qu.: 309.0 Class :character 1st Qu.:151.0 1st Qu.:49.00
## Median : 615.0 Mode :character Median :155.0 Median :54.00
## Mean : 614.5 Mean :156.7 Mean :55.14
## 3rd Qu.: 921.0 3rd Qu.:162.0 3rd Qu.:61.00
## Max. :1227.0 Max. :185.0 Max. :95.00
## bmi age WBBMC wbbmd fat
## Min. :14.5 Min. :13.00 Min. : 695 Min. :0.650 Min. : 4277
## 1st Qu.:20.2 1st Qu.:35.00 1st Qu.:1481 1st Qu.:0.930 1st Qu.:13768
## Median :22.2 Median :48.00 Median :1707 Median :1.010 Median :16955
## Mean :22.4 Mean :47.15 Mean :1725 Mean :1.009 Mean :17288
## 3rd Qu.:24.3 3rd Qu.:58.00 3rd Qu.:1945 3rd Qu.:1.090 3rd Qu.:20325
## Max. :37.1 Max. :88.00 Max. :3040 Max. :1.350 Max. :40825
## lean pcfat hypertension diabetes
## Min. :19136 Min. : 9.2 Min. :0.000 Min. :0.0000
## 1st Qu.:30325 1st Qu.:27.0 1st Qu.:0.000 1st Qu.:0.0000
## Median :33577 Median :32.4 Median :1.000 Median :0.0000
## Mean :35463 Mean :31.6 Mean :0.507 Mean :0.1109
## 3rd Qu.:39761 3rd Qu.:36.8 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :63059 Max. :48.4 Max. :1.000 Max. :1.0000
ob$sex[ob$gender == "F"] = 1
ob$sex[ob$gender == "M"] = 0
ob$sex.b = ifelse(ob$gender== "F",1,0)
table(ob$sex, ob$sex.b)
##
## 0 1
## 0 355 0
## 1 0 862
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
ob = ob %>%
mutate(sex.2 = case_when(gender == "F" ~ 1,
gender == "M" ~ 0))
table(ob$sex, ob$sex.2)
##
## 0 1
## 0 355 0
## 1 0 862
# Cách đơn giản:
ob$obese[ob$bmi< 18.5] = "Underweight"
ob$obese[ob$bmi>= 18.5 & ob$bmi< 25] = "Normal"
ob$obese[ob$bmi>= 25 & ob$bmi< 30] = "Overweight"
ob$obese[ob$bmi>= 30] = "Obese"
# Sử dụng tidyverse:
ob = ob %>%
mutate(obese.2 = case_when(bmi< 18.5 ~ "Underweight",
bmi>= 18.5 & bmi< 25 ~ "Normal",
bmi>= 25 & bmi< 30 ~ "Overweight",
bmi>= 30 ~ "Obese"))
table(ob$obese, ob$obese.2)
##
## Normal Obese Overweight Underweight
## Normal 865 0 0 0
## Obese 0 15 0 0
## Overweight 0 0 230 0
## Underweight 0 0 0 107
# Cách đơn giản:
ob$lean.kg = ob$lean*1000
ob$fat.kg = ob$fat*1000
# Sử dụng tidyverse:
ob = ob %>%
mutate(lean.kg.2 = lean/1000,
fat.kg.2 = fat/1000)
# Cách đơn giản:
men.overweight = subset(ob, gender == "M" & bmi>= 25)
dim(men.overweight)
## [1] 85 22
table(men.overweight$obese)
##
## Obese Overweight
## 4 81
# Sử dụng tidyverse:
men.overweight.2 = ob %>% filter(gender == "M", bmi>= 25)
dim(men.overweight.2)
## [1] 85 22
table(men.overweight.2$obese)
##
## Obese Overweight
## 4 81
# Cách đơn giản:
Subset = subset(ob, select = c(id, age, gender, bmi, lean, fat))
head(Subset)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904
Subset.b = ob[, c("id", "age", "gender", "bmi", "lean", "fat")]
head(Subset.b)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904
# Sử dụng tidyverse:
Subset.2 = ob %>% select(id, age, gender, bmi, lean, fat)
head(Subset.2)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904
ob <- read.csv("D:/1. Dr Hung 108/NCKH R/obesity data.csv")
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ age + gender + bmi + WBBMC + wbbmd + fat + lean + hypertension + diabetes, data = ob)
| Overall (N=1217) |
|
|---|---|
| age | |
| Mean (SD) | 47.2 (17.3) |
| Median [Min, Max] | 48.0 [13.0, 88.0] |
| gender | |
| F | 862 (70.8%) |
| M | 355 (29.2%) |
| bmi | |
| Mean (SD) | 22.4 (3.06) |
| Median [Min, Max] | 22.2 [14.5, 37.1] |
| WBBMC | |
| Mean (SD) | 1720 (363) |
| Median [Min, Max] | 1710 [695, 3040] |
| wbbmd | |
| Mean (SD) | 1.01 (0.113) |
| Median [Min, Max] | 1.01 [0.650, 1.35] |
| fat | |
| Mean (SD) | 17300 (5210) |
| Median [Min, Max] | 17000 [4280, 40800] |
| lean | |
| Mean (SD) | 35500 (7030) |
| Median [Min, Max] | 33600 [19100, 63100] |
| hypertension | |
| Mean (SD) | 0.507 (0.500) |
| Median [Min, Max] | 1.00 [0, 1.00] |
| diabetes | |
| Mean (SD) | 0.111 (0.314) |
| Median [Min, Max] | 0 [0, 1.00] |
table1(~ age + bmi + WBBMC + wbbmd + fat + lean + as.factor(hypertension) + as.factor(diabetes) | gender, data = ob)
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 48.6 (16.4) | 43.7 (18.8) | 47.2 (17.3) |
| Median [Min, Max] | 49.0 [14.0, 85.0] | 44.0 [13.0, 88.0] | 48.0 [13.0, 88.0] |
| bmi | |||
| Mean (SD) | 22.3 (3.05) | 22.7 (3.04) | 22.4 (3.06) |
| Median [Min, Max] | 22.1 [15.2, 37.1] | 22.5 [14.5, 34.7] | 22.2 [14.5, 37.1] |
| WBBMC | |||
| Mean (SD) | 1600 (293) | 2030 (336) | 1720 (363) |
| Median [Min, Max] | 1610 [695, 2660] | 2030 [1190, 3040] | 1710 [695, 3040] |
| wbbmd | |||
| Mean (SD) | 0.988 (0.111) | 1.06 (0.101) | 1.01 (0.113) |
| Median [Min, Max] | 0.990 [0.650, 1.35] | 1.06 [0.780, 1.34] | 1.01 [0.650, 1.35] |
| fat | |||
| Mean (SD) | 18200 (4950) | 15000 (5110) | 17300 (5210) |
| Median [Min, Max] | 17700 [6220, 40800] | 15100 [4280, 29900] | 17000 [4280, 40800] |
| lean | |||
| Mean (SD) | 32000 (3970) | 43800 (5820) | 35500 (7030) |
| Median [Min, Max] | 31500 [19100, 53400] | 43400 [28600, 63100] | 33600 [19100, 63100] |
| as.factor(hypertension) | |||
| 0 | 430 (49.9%) | 170 (47.9%) | 600 (49.3%) |
| 1 | 432 (50.1%) | 185 (52.1%) | 617 (50.7%) |
| as.factor(diabetes) | |||
| 0 | 760 (88.2%) | 322 (90.7%) | 1082 (88.9%) |
| 1 | 102 (11.8%) | 33 (9.3%) | 135 (11.1%) |
table1(~ age + bmi + WBBMC + wbbmd + fat + lean + as.factor(hypertension) + as.factor(diabetes) | gender, data = ob, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 48.6 (16.4) | 43.7 (18.8) | 47.2 (17.3) |
| Median [Q1, Q3] | 49.0 [39.0, 59.0] | 44.0 [24.0, 56.0] | 48.0 [35.0, 58.0] |
| bmi | |||
| Mean (SD) | 22.3 (3.05) | 22.7 (3.04) | 22.4 (3.06) |
| Median [Q1, Q3] | 22.1 [20.1, 24.1] | 22.5 [20.8, 24.9] | 22.2 [20.2, 24.3] |
| WBBMC | |||
| Mean (SD) | 1600 (293) | 2030 (336) | 1720 (363) |
| Median [Q1, Q3] | 1610 [1410, 1800] | 2030 [1810, 2250] | 1710 [1480, 1950] |
| wbbmd | |||
| Mean (SD) | 0.988 (0.111) | 1.06 (0.101) | 1.01 (0.113) |
| Median [Q1, Q3] | 0.990 [0.910, 1.07] | 1.06 [0.990, 1.13] | 1.01 [0.930, 1.09] |
| fat | |||
| Mean (SD) | 18200 (4950) | 15000 (5110) | 17300 (5210) |
| Median [Q1, Q3] | 17700 [14800, 21100] | 15100 [11400, 18200] | 17000 [13800, 20300] |
| lean | |||
| Mean (SD) | 32000 (3970) | 43800 (5820) | 35500 (7030) |
| Median [Q1, Q3] | 31500 [29300, 34500] | 43400 [40300, 47600] | 33600 [30300, 39800] |
| as.factor(hypertension) | |||
| 0 | 430 (49.9%) | 170 (47.9%) | 600 (49.3%) |
| 1 | 432 (50.1%) | 185 (52.1%) | 617 (50.7%) |
| as.factor(diabetes) | |||
| 0 | 760 (88.2%) | 322 (90.7%) | 1082 (88.9%) |
| 1 | 102 (11.8%) | 33 (9.3%) | 135 (11.1%) |
library(compareGroups)
createTable(compareGroups(gender ~ age + bmi + WBBMC + wbbmd + fat + lean + hypertension + diabetes, data = ob))
##
## --------Summary descriptives table by 'gender'---------
##
## ________________________________________________
## F M p.overall
## N=862 N=355
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 48.6 (16.4) 43.7 (18.8) <0.001
## bmi 22.3 (3.05) 22.7 (3.04) 0.013
## WBBMC 1599 (293) 2030 (336) <0.001
## wbbmd 0.99 (0.11) 1.06 (0.10) <0.001
## fat 18240 (4954) 14978 (5113) <0.001
## lean 32045 (3966) 43762 (5819) <0.001
## hypertension 0.50 (0.50) 0.52 (0.50) 0.527
## diabetes 0.12 (0.32) 0.09 (0.29) 0.181
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
ob$hypert = as.factor(ob$hypertension)
ob$dm = as.factor(ob$diabetes)
createTable(compareGroups(gender ~ age + bmi + WBBMC + wbbmd + fat + lean + hypert + dm, data = ob))
##
## --------Summary descriptives table by 'gender'---------
##
## ___________________________________________
## F M p.overall
## N=862 N=355
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## age 48.6 (16.4) 43.7 (18.8) <0.001
## bmi 22.3 (3.05) 22.7 (3.04) 0.013
## WBBMC 1599 (293) 2030 (336) <0.001
## wbbmd 0.99 (0.11) 1.06 (0.10) <0.001
## fat 18240 (4954) 14978 (5113) <0.001
## lean 32045 (3966) 43762 (5819) <0.001
## hypert: 0.569
## 0 430 (49.9%) 170 (47.9%)
## 1 432 (50.1%) 185 (52.1%)
## dm: 0.238
## 0 760 (88.2%) 322 (90.7%)
## 1 102 (11.8%) 33 (9.30%)
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p = ggplot(data = ob, aes(x = pcfat))
p1 = p + geom_histogram()
p2 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tỉ trọng mỡ (%)", y = "Số người", title = "Phân bố tỉ trọng mỡ")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = ob, aes(x = pcfat, fill = gender))
p1 = p + geom_histogram(col="white") + labs(x = "Tỉ trọng mỡ", y = "Số người", title = "Phân bố tỉ trọng mỡ")
p2 = p + geom_density(alpha = 0.5) + labs(x = "Tỉ trọng mỡ", y = "Tỷ lệ (%)", title = "Phân bố tỉ trọng mỡ")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ob$OB[ob$bmi< 18.5] = "Underweight"
ob$OB[ob$bmi>= 18.5 & ob$bmi< 25] = "Normal"
ob$OB[ob$bmi>= 25 & ob$bmi< 30] = "Overweight"
ob$OB[ob$bmi>= 30] = "Obese"
p = ggplot(data = ob, aes(x = OB, fill = OB, col = OB))
p + geom_bar(position = "dodge")
p = ggplot(data = ob, aes(x = OB, y = pcfat, fill = gender, group = gender))
p + geom_bar(stat = "identity", position = "dodge")
women = subset(ob, gender == "F")
dim(women)
## [1] 862 16
p = ggplot(data = women, aes(x = OB, y = pcfat, fill = OB, col = OB))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Tình trạng béo phì", y = "Tỉ trọng mỡ (%)") + ggtitle("Tỉ trọng mỡ theo tình trạng béo phì ở nữ")
p1
women$OB.n = factor(women$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = women, aes(x = OB.n, y = pcfat, fill = OB.n, col = OB.n))
p2 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Tình trạng béo phì", y = "Tỉ trọng mỡ (%)") + ggtitle("Tỉ trọng mỡ theo tình trạng béo phì ở nữ")
p2
ob$OB.n = factor(ob$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = ob, aes(x = OB.n, y = pcfat, fill = gender, col = gender))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Tình trạng béo phì", y = "Tỉ trọng mỡ (%)") + ggtitle("Tỉ trọng mỡ theo tình trạng béo phì và giới tính")
p1
p = ggplot(data = ob, aes(x = bmi, y = pcfat))
p1 = p + geom_point()
p2 = p + geom_point() + geom_smooth()
grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
p = ggplot(data = ob, aes(x = bmi, y = pcfat, fill = gender, col = gender))
p1 = p + geom_point() + geom_smooth() + labs(x = "Chỉ số khối cơ thể (kg/m2)", y = "Tỉ trọng mỡ (%)") + ggtitle("Liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính")
p1
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
p = ggplot(data = ob, aes(x = bmi, y = pcfat, fill = gender, col = gender))
p2 = p + geom_point() + geom_smooth(method = "lm", formula = y ~ x + I(x^2)) + labs(x = "Chỉ số khối cơ thể (kg/m2)", y = "Tỉ trọng mỡ (%)") + ggtitle("Liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính")
p2
ob <- read.csv("D:/1. Dr Hung 108/NCKH R/obesity data.csv")
library(ggplot2)
p = ggplot(data = ob, aes(x = bmi))
p + geom_histogram(fill = "blue", col = "white") + labs(x = "Chỉ số khối cơ thể (kg/m2)", y = "Số người", title = "Phân bố chỉ số khối cơ thể")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Giả thuyết vô hiệu: Không có khác biệt về chỉ số khối cơ thể giữa nam và nữ Giả tuyết thay thế: Có sự khác biệt về chỉ số khối cơ thể giữa nam và nữ
library(table1)
table1(~bmi | gender, data = ob)
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| bmi | |||
| Mean (SD) | 22.3 (3.05) | 22.7 (3.04) | 22.4 (3.06) |
| Median [Min, Max] | 22.1 [15.2, 37.1] | 22.5 [14.5, 34.7] | 22.2 [14.5, 37.1] |
t.test(bmi~ gender, data = ob)
##
## Welch Two Sample t-test
##
## data: bmi by gender
## t = -2.4841, df = 662.09, p-value = 0.01324
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -0.85400278 -0.09994709
## sample estimates:
## mean in group F mean in group M
## 22.25626 22.73324
p = ggplot(data = ob, aes(x = wbbmd))
p + geom_histogram(fill = "blue", col = "white") + labs(x = "Mật độ xương toàn cơ thể (g/cm2)", y = "Số người", title = "Phân bố mật độ xương toàn cơ thể")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table1(~wbbmd | gender, data = ob)
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| wbbmd | |||
| Mean (SD) | 0.988 (0.111) | 1.06 (0.101) | 1.01 (0.113) |
| Median [Min, Max] | 0.990 [0.650, 1.35] | 1.06 [0.780, 1.34] | 1.01 [0.650, 1.35] |
obese = subset(ob, bmi>= 30)
table(obese$gender)
##
## F M
## 11 4
p = ggplot(data = ob, aes(x = age))
p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tuổi (năm)", y = "Số người", title = "Phân bố tuổi")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table1(~ age | gender, data = obese)
| F (N=11) |
M (N=4) |
Overall (N=15) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 54.4 (13.0) | 37.0 (13.3) | 49.7 (14.9) |
| Median [Min, Max] | 56.0 [21.0, 70.0] | 40.5 [18.0, 49.0] | 54.0 [18.0, 70.0] |
table1(~ age | gender, data = obese, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))
| F (N=11) |
M (N=4) |
Overall (N=15) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 54.4 (13.0) | 37.0 (13.3) | 49.7 (14.9) |
| Median [Q1, Q3] | 56.0 [53.0, 59.0] | 40.5 [34.5, 43.0] | 54.0 [44.0, 57.0] |
library(simpleboot)
## Simple Bootstrap Routines (1.1-8)
men = subset(ob, gender == "M" & bmi>= 30)
women = subset(ob, gender == "F" & bmi >= 30)
# men = ob %>% filter(gender == "M")
# women = ob %>% filter(gender == "F")
set.seed(1234)
b.means = two.boot(men$age, women$age, mean, R = 1000)
hist (b.means$t, breaks = 20)
quantile(b.means$t, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## -31.662500 -17.261364 -5.153409
ob$obese[ob$bmi< 18.5] = "Underweight"
ob$obese[ob$bmi>= 18.5 & ob$bmi< 25] = "Normal"
ob$obese[ob$bmi>= 25 & ob$bmi< 30] = "Overweight"
ob$obese[ob$bmi>= 30] = "Obese"
table(ob$obese, ob$gender)
##
## F M
## Normal 626 239
## Obese 11 4
## Overweight 149 81
## Underweight 76 31
ob$obese.n = factor(ob$obese, levels = c("Underweight", "Normal", "Overweight", "Obese"))
table1(~obese.n | gender, data = ob)
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| obese.n | |||
| Underweight | 76 (8.8%) | 31 (8.7%) | 107 (8.8%) |
| Normal | 626 (72.6%) | 239 (67.3%) | 865 (71.1%) |
| Overweight | 149 (17.3%) | 81 (22.8%) | 230 (18.9%) |
| Obese | 11 (1.3%) | 4 (1.1%) | 15 (1.2%) |
chisq.test(ob$obese.n, ob$gender, correct = FALSE)
## Warning in chisq.test(ob$obese.n, ob$gender, correct = FALSE): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: ob$obese.n and ob$gender
## X-squared = 5.1114, df = 3, p-value = 0.1638
ob$hypert = as.factor(ob$hypertension)
table1(~ hypert | gender, data = ob)
| F (N=862) |
M (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| hypert | |||
| 0 | 430 (49.9%) | 170 (47.9%) | 600 (49.3%) |
| 1 | 432 (50.1%) | 185 (52.1%) | 617 (50.7%) |
chisq.test(ob$hypert, ob$gender, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: ob$hypert and ob$gender
## X-squared = 0.40105, df = 1, p-value = 0.5265
p = ggplot(data = ob, aes(x = wbbmd))
p + geom_histogram(fill = "blue", col = "white") + labs(x = "Mật độ xương toàn thân (g/cm2)", y = "Số người", title = "Phân bố mật độ xương toàn thân")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = ob, aes(x = obese.n, y = wbbmd, fill = obese.n, col = obese.n))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Tình trạng béo phì", y = "Mật độ xương toàn thân (g/cm2)") + ggtitle("Mật độ xương toàn thân theo tình trạng béo phì")
p1
table1(~ wbbmd | obese.n, data = ob)
| Underweight (N=107) |
Normal (N=865) |
Overweight (N=230) |
Obese (N=15) |
Overall (N=1217) |
|
|---|---|---|---|---|---|
| wbbmd | |||||
| Mean (SD) | 0.971 (0.0998) | 1.01 (0.112) | 1.03 (0.120) | 1.03 (0.114) | 1.01 (0.113) |
| Median [Min, Max] | 0.980 [0.740, 1.17] | 1.02 [0.650, 1.35] | 1.03 [0.700, 1.29] | 1.06 [0.790, 1.19] | 1.01 [0.650, 1.35] |
anova = aov(wbbmd ~ obese.n, data = ob)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## obese.n 3 0.24 0.07991 6.326 0.000295 ***
## Residuals 1213 15.32 0.01263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tukey.anova= TukeyHSD(anova)
tukey.anova
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = wbbmd ~ obese.n, data = ob)
##
## $obese.n
## diff lwr upr p adj
## Normal-Underweight 0.037682027 0.008052906 0.06731115 0.0060418
## Overweight-Underweight 0.056395774 0.022562445 0.09022910 0.0001143
## Obese-Underweight 0.060105919 -0.019606866 0.13981870 0.2119646
## Overweight-Normal 0.018713747 -0.002735926 0.04016342 0.1120163
## Obese-Normal 0.022423892 -0.052872339 0.09772012 0.8696949
## Obese-Overweight 0.003710145 -0.073337448 0.08075774 0.9993207
ob <- read.csv("D:/1. Dr Hung 108/NCKH R/obesity data.csv")
library(ggplot2)
library(gridExtra)
p = ggplot(data = ob, aes(x = age))
p1 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tuổi (năm)", y = "Số người", title = "Phân bố tuổi")
p = ggplot(data = ob, aes(x = wbbmd))
p2 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Mật độ xương toàn thân (g/cm2)", y = "Số người", title = "Phân bố MĐX toàn thân")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = ob, aes(x = age, y = wbbmd))
p + geom_point() + geom_smooth(method = "lm", formula = y~ x)
cor.test(ob$age, ob$wbbmd, method= "pearson")
##
## Pearson's product-moment correlation
##
## data: ob$age and ob$wbbmd
## t = -17.154, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4856972 -0.3951677
## sample estimates:
## cor
## -0.4415556
m.1 = lm(wbbmd ~ age, data = ob)
summary(m.1)
##
## Call:
## lm(formula = wbbmd ~ age, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32749 -0.07268 -0.00533 0.06793 0.33178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1450766 0.0084638 135.29 <2e-16 ***
## age -0.0028914 0.0001686 -17.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1015 on 1215 degrees of freedom
## Multiple R-squared: 0.195, Adjusted R-squared: 0.1943
## F-statistic: 294.3 on 1 and 1215 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.1)
library(ggfortify)
autoplot(m.1)
p = ggplot(data = ob, aes(x = age, y = wbbmd, fill = gender, col = gender))
p1 = p + geom_point() + geom_smooth() + labs(x = "Tuổi (năm)", y = "Mật độ xương toàn thân (g/cm2)") + ggtitle("Liên quan giữa tuổi và MĐX theo giới tính")
p1
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
m.2 = lm(wbbmd ~ age + gender, data = ob)
summary(m.2)
##
## Call:
## lm(formula = wbbmd ~ age + gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36272 -0.06658 -0.00411 0.06549 0.34473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.118288 0.008636 129.485 <2e-16 ***
## age -0.002691 0.000164 -16.408 <2e-16 ***
## genderM 0.059417 0.006230 9.537 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09798 on 1214 degrees of freedom
## Multiple R-squared: 0.2511, Adjusted R-squared: 0.2498
## F-statistic: 203.5 on 2 and 1214 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.2)
summary(m.2)
##
## Call:
## lm(formula = wbbmd ~ age + gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36272 -0.06658 -0.00411 0.06549 0.34473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.118288 0.008636 129.485 <2e-16 ***
## age -0.002691 0.000164 -16.408 <2e-16 ***
## genderM 0.059417 0.006230 9.537 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09798 on 1214 degrees of freedom
## Multiple R-squared: 0.2511, Adjusted R-squared: 0.2498
## F-statistic: 203.5 on 2 and 1214 DF, p-value: < 2.2e-16
library(readxl)
fx = as.data.frame(read_excel(("D:/1. Dr Hung 108/NCKH R/Osteo-data.xlsx")))
dim(fx)
## [1] 2216 17
summary(fx)
## id sex age weight
## Min. : 1.0 Length:2216 Min. :57.00 Min. : 34.00
## 1st Qu.: 554.8 Class :character 1st Qu.:65.00 1st Qu.: 60.00
## Median :1108.5 Mode :character Median :70.00 Median : 69.00
## Mean :1108.5 Mean :70.89 Mean : 70.14
## 3rd Qu.:1662.2 3rd Qu.:76.00 3rd Qu.: 79.00
## Max. :2216.0 Max. :96.00 Max. :133.00
## NA's :53
## height prior_fx fnbmd smoking
## Min. :136.0 Min. :0.0000 Min. :0.2800 Min. :0.0000
## 1st Qu.:158.0 1st Qu.:0.0000 1st Qu.:0.7300 1st Qu.:0.0000
## Median :164.0 Median :0.0000 Median :0.8200 Median :0.0000
## Mean :164.9 Mean :0.1611 Mean :0.8287 Mean :0.4176
## 3rd Qu.:171.0 3rd Qu.:0.0000 3rd Qu.:0.9300 3rd Qu.:1.0000
## Max. :196.0 Max. :1.0000 Max. :1.5100 Max. :1.0000
## NA's :54 NA's :89 NA's :1
## parkinson rheum hypertension diabetes
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.00000 Median :0.00000 Median :1.0000 Median :0.000
## Mean :0.06498 Mean :0.03881 Mean :0.5063 Mean :0.111
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.000
##
## copd cancer cvd falls_n
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.111 Mean :0.08529 Mean :0.3872 Mean :0.2843
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :2.0000
##
## fx
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2595
## 3rd Qu.:1.0000
## Max. :1.0000
##
library(table1)
table1(~ age + weight + height + fnbmd + as.factor(prior_fx) + as.factor(falls_n) + as.factor(smoking) + as.factor(parkinson) + as.factor(rheum) + as.factor(hypertension) + as.factor(diabetes) + as.factor(copd) + as.factor(cancer) + as.factor(cvd) + as.factor(fx) | sex, data = fx)
| Female (N=1358) |
Male (N=858) |
Overall (N=2216) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 71.2 (7.59) | 70.4 (6.44) | 70.9 (7.17) |
| Median [Min, Max] | 70.0 [57.0, 96.0] | 69.0 [59.0, 92.0] | 70.0 [57.0, 96.0] |
| weight | |||
| Mean (SD) | 64.9 (12.5) | 78.2 (12.7) | 70.1 (14.2) |
| Median [Min, Max] | 64.0 [34.0, 115] | 78.0 [45.0, 133] | 69.0 [34.0, 133] |
| Missing | 42 (3.1%) | 11 (1.3%) | 53 (2.4%) |
| height | |||
| Mean (SD) | 160 (6.37) | 173 (6.86) | 165 (9.35) |
| Median [Min, Max] | 160 [136, 181] | 173 [151, 196] | 164 [136, 196] |
| Missing | 41 (3.0%) | 13 (1.5%) | 54 (2.4%) |
| fnbmd | |||
| Mean (SD) | 0.777 (0.132) | 0.909 (0.153) | 0.829 (0.155) |
| Median [Min, Max] | 0.770 [0.280, 1.31] | 0.900 [0.340, 1.51] | 0.820 [0.280, 1.51] |
| Missing | 57 (4.2%) | 32 (3.7%) | 89 (4.0%) |
| as.factor(prior_fx) | |||
| 0 | 1125 (82.8%) | 734 (85.5%) | 1859 (83.9%) |
| 1 | 233 (17.2%) | 124 (14.5%) | 357 (16.1%) |
| as.factor(falls_n) | |||
| 0 | 1063 (78.3%) | 671 (78.2%) | 1734 (78.2%) |
| 1 | 206 (15.2%) | 128 (14.9%) | 334 (15.1%) |
| 2 | 89 (6.6%) | 59 (6.9%) | 148 (6.7%) |
| as.factor(smoking) | |||
| 0 | 962 (70.8%) | 328 (38.2%) | 1290 (58.2%) |
| 1 | 395 (29.1%) | 530 (61.8%) | 925 (41.7%) |
| Missing | 1 (0.1%) | 0 (0%) | 1 (0.0%) |
| as.factor(parkinson) | |||
| 0 | 1268 (93.4%) | 804 (93.7%) | 2072 (93.5%) |
| 1 | 90 (6.6%) | 54 (6.3%) | 144 (6.5%) |
| as.factor(rheum) | |||
| 0 | 1306 (96.2%) | 824 (96.0%) | 2130 (96.1%) |
| 1 | 52 (3.8%) | 34 (4.0%) | 86 (3.9%) |
| as.factor(hypertension) | |||
| 0 | 695 (51.2%) | 399 (46.5%) | 1094 (49.4%) |
| 1 | 663 (48.8%) | 459 (53.5%) | 1122 (50.6%) |
| as.factor(diabetes) | |||
| 0 | 1213 (89.3%) | 757 (88.2%) | 1970 (88.9%) |
| 1 | 145 (10.7%) | 101 (11.8%) | 246 (11.1%) |
| as.factor(copd) | |||
| 0 | 1211 (89.2%) | 759 (88.5%) | 1970 (88.9%) |
| 1 | 147 (10.8%) | 99 (11.5%) | 246 (11.1%) |
| as.factor(cancer) | |||
| 0 | 1235 (90.9%) | 792 (92.3%) | 2027 (91.5%) |
| 1 | 123 (9.1%) | 66 (7.7%) | 189 (8.5%) |
| as.factor(cvd) | |||
| 0 | 843 (62.1%) | 515 (60.0%) | 1358 (61.3%) |
| 1 | 515 (37.9%) | 343 (40.0%) | 858 (38.7%) |
| as.factor(fx) | |||
| 0 | 932 (68.6%) | 709 (82.6%) | 1641 (74.1%) |
| 1 | 426 (31.4%) | 149 (17.4%) | 575 (25.9%) |
fx.2 = na.omit(fx)
m.step = lm(fnbmd ~ age + sex + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd, data = fx.2)
## Warning in model.matrix.default(mt, mf, contrasts): the response appeared on
## the right-hand side and was dropped
## Warning in model.matrix.default(mt, mf, contrasts): problem with term 5 in
## model.matrix: no columns are assigned
step = step(m.step)
## Start: AIC=-9100.52
## fnbmd ~ age + sex + weight + height + fnbmd + prior_fx + falls_n +
## smoking + parkinson + rheum + hypertension + diabetes + copd +
## cancer + cvd
## Warning in model.matrix.default(object, data = structure(list(fnbmd = c(1.08, :
## the response appeared on the right-hand side and was dropped
## Warning in model.matrix.default(object, data = structure(list(fnbmd = c(1.08, :
## problem with term 5 in model.matrix: no columns are assigned
##
## Step: AIC=-9100.52
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum + hypertension + diabetes + copd + cancer +
## cvd
##
## Df Sum of Sq RSS AIC
## - copd 1 0.0000 28.641 -9102.5
## - cvd 1 0.0029 28.643 -9102.3
## - hypertension 1 0.0037 28.644 -9102.2
## - cancer 1 0.0056 28.646 -9102.1
## - parkinson 1 0.0074 28.648 -9102.0
## - diabetes 1 0.0086 28.649 -9101.9
## - falls_n 1 0.0134 28.654 -9101.5
## <none> 28.641 -9100.5
## - rheum 1 0.0433 28.684 -9099.3
## - height 1 0.1607 28.801 -9090.7
## - prior_fx 1 0.3350 28.976 -9077.9
## - smoking 1 0.3807 29.021 -9074.5
## - sex 1 0.8234 29.464 -9042.4
## - age 1 2.1037 30.744 -8952.2
## - weight 1 4.9616 33.602 -8763.7
##
## Step: AIC=-9102.52
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum + hypertension + diabetes + cancer + cvd
##
## Df Sum of Sq RSS AIC
## - cvd 1 0.0029 28.643 -9104.3
## - hypertension 1 0.0037 28.644 -9104.2
## - cancer 1 0.0056 28.646 -9104.1
## - parkinson 1 0.0074 28.648 -9104.0
## - diabetes 1 0.0086 28.649 -9103.9
## - falls_n 1 0.0134 28.654 -9103.5
## <none> 28.641 -9102.5
## - rheum 1 0.0433 28.684 -9101.3
## - height 1 0.1607 28.801 -9092.7
## - prior_fx 1 0.3351 28.976 -9079.9
## - smoking 1 0.3810 29.022 -9076.5
## - sex 1 0.8236 29.464 -9044.4
## - age 1 2.1053 30.746 -8954.1
## - weight 1 4.9628 33.603 -8765.6
##
## Step: AIC=-9104.3
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum + hypertension + diabetes + cancer
##
## Df Sum of Sq RSS AIC
## - hypertension 1 0.0045 28.648 -9106.0
## - cancer 1 0.0057 28.649 -9105.9
## - parkinson 1 0.0074 28.651 -9105.8
## - diabetes 1 0.0076 28.651 -9105.7
## - falls_n 1 0.0134 28.657 -9105.3
## <none> 28.643 -9104.3
## - rheum 1 0.0426 28.686 -9103.2
## - height 1 0.1604 28.804 -9094.5
## - prior_fx 1 0.3346 28.978 -9081.7
## - smoking 1 0.3826 29.026 -9078.2
## - sex 1 0.8236 29.467 -9046.2
## - age 1 2.1048 30.748 -8955.9
## - weight 1 4.9620 33.605 -8767.4
##
## Step: AIC=-9105.97
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum + diabetes + cancer
##
## Df Sum of Sq RSS AIC
## - cancer 1 0.0057 28.654 -9107.5
## - diabetes 1 0.0066 28.655 -9107.5
## - parkinson 1 0.0077 28.656 -9107.4
## - falls_n 1 0.0130 28.661 -9107.0
## <none> 28.648 -9106.0
## - rheum 1 0.0428 28.691 -9104.8
## - height 1 0.1613 28.809 -9096.1
## - prior_fx 1 0.3334 28.981 -9083.4
## - smoking 1 0.3807 29.029 -9080.0
## - sex 1 0.8201 29.468 -9048.1
## - age 1 2.1035 30.751 -8957.7
## - weight 1 4.9596 33.608 -8769.3
##
## Step: AIC=-9107.55
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum + diabetes
##
## Df Sum of Sq RSS AIC
## - diabetes 1 0.0069 28.661 -9109.0
## - parkinson 1 0.0076 28.661 -9109.0
## - falls_n 1 0.0135 28.667 -9108.5
## <none> 28.654 -9107.5
## - rheum 1 0.0442 28.698 -9106.3
## - height 1 0.1628 28.817 -9097.5
## - prior_fx 1 0.3351 28.989 -9084.9
## - smoking 1 0.3854 29.039 -9081.2
## - sex 1 0.8219 29.476 -9049.6
## - age 1 2.0984 30.752 -8959.6
## - weight 1 4.9596 33.613 -8771.0
##
## Step: AIC=-9109.04
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## parkinson + rheum
##
## Df Sum of Sq RSS AIC
## - parkinson 1 0.0080 28.669 -9110.4
## - falls_n 1 0.0135 28.674 -9110.0
## <none> 28.661 -9109.0
## - rheum 1 0.0450 28.706 -9107.7
## - height 1 0.1614 28.822 -9099.1
## - prior_fx 1 0.3366 28.997 -9086.3
## - smoking 1 0.3844 29.045 -9082.8
## - sex 1 0.8262 29.487 -9050.8
## - age 1 2.1055 30.766 -8960.7
## - weight 1 4.9622 33.623 -8772.4
##
## Step: AIC=-9110.44
## fnbmd ~ age + sex + weight + height + prior_fx + falls_n + smoking +
## rheum
##
## Df Sum of Sq RSS AIC
## - falls_n 1 0.0132 28.682 -9111.5
## <none> 28.669 -9110.4
## - rheum 1 0.0454 28.714 -9109.1
## - height 1 0.1614 28.830 -9100.5
## - prior_fx 1 0.3345 29.003 -9087.8
## - smoking 1 0.3848 29.053 -9084.2
## - sex 1 0.8261 29.495 -9052.2
## - age 1 2.1123 30.781 -8961.7
## - weight 1 4.9672 33.636 -8773.5
##
## Step: AIC=-9111.47
## fnbmd ~ age + sex + weight + height + prior_fx + smoking + rheum
##
## Df Sum of Sq RSS AIC
## <none> 28.682 -9111.5
## - rheum 1 0.0467 28.729 -9110.0
## - height 1 0.1639 28.846 -9101.4
## - prior_fx 1 0.3383 29.020 -9088.6
## - smoking 1 0.3900 29.072 -9084.8
## - sex 1 0.8244 29.506 -9053.4
## - age 1 2.1100 30.792 -8962.9
## - weight 1 4.9608 33.643 -8775.1
summary(step)
##
## Call:
## lm(formula = fnbmd ~ age + sex + weight + height + prior_fx +
## smoking + rheum, data = fx.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37479 -0.07541 -0.00655 0.06870 0.56423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6097744 0.0757999 8.045 1.43e-15 ***
## age -0.0047778 0.0003832 -12.468 < 2e-16 ***
## sexMale 0.0603920 0.0077495 7.793 1.02e-14 ***
## weight 0.0043022 0.0002250 19.117 < 2e-16 ***
## height 0.0015035 0.0004327 3.475 0.000521 ***
## prior_fx -0.0353546 0.0070820 -4.992 6.46e-07 ***
## smoking -0.0289828 0.0054069 -5.360 9.21e-08 ***
## rheum 0.0242102 0.0130528 1.855 0.063765 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1165 on 2113 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.4332
## F-statistic: 232.4 on 7 and 2113 DF, p-value: < 2.2e-16
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-6)
xvars = fx.2[, c("age", "sex", "weight", "height", "prior_fx", "falls_n", "smoking", "parkinson", "rheum", "hypertension", "diabetes", "copd", "cancer", "cvd")]
m.bma = bicreg(xvars, fx.2$fnbmd, strict = FALSE, OR = 20)
summary(m.bma)
##
## Call:
## bicreg(x = xvars, y = fx.2$fnbmd, strict = FALSE, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 0.626343 0.0977859 6.071e-01 6.098e-01 8.466e-01
## age 100.0 -0.004784 0.0003880 -4.765e-03 -4.778e-03 -4.995e-03
## sexMale 100.0 0.061554 0.0088666 6.021e-02 6.039e-02 7.690e-02
## weight 100.0 0.004329 0.0002379 4.306e-03 4.302e-03 4.603e-03
## height 92.1 0.001397 0.0005836 1.519e-03 1.503e-03 .
## prior_fx 100.0 -0.035389 0.0070901 -3.532e-02 -3.535e-02 -3.611e-02
## falls_n 0.0 0.000000 0.0000000 . . .
## smoking 100.0 -0.029091 0.0054107 -2.910e-02 -2.898e-02 -2.918e-02
## parkinson 0.0 0.000000 0.0000000 . . .
## rheum 10.0 0.002423 0.0083564 . 2.421e-02 .
## hypertension 0.0 0.000000 0.0000000 . . .
## diabetes 0.0 0.000000 0.0000000 . . .
## copd 0.0 0.000000 0.0000000 . . .
## cancer 0.0 0.000000 0.0000000 . . .
## cvd 0.0 0.000000 0.0000000 . . .
##
## nVar 6 7 5
## r2 0.434 0.435 0.431
## BIC -1.162e+03 -1.157e+03 -1.157e+03
## post prob 0.821 0.100 0.079
imageplot.bma(m.bma)
m.bmd = lm(fnbmd ~ age + sex + weight + height + prior_fx + smoking, data = fx.2)
summary(m.bmd)
##
## Call:
## lm(formula = fnbmd ~ age + sex + weight + height + prior_fx +
## smoking, data = fx.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37425 -0.07568 -0.00663 0.07184 0.58750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6070710 0.0758296 8.006 1.94e-15 ***
## age -0.0047646 0.0003834 -12.428 < 2e-16 ***
## sexMale 0.0602131 0.0077533 7.766 1.25e-14 ***
## weight 0.0043063 0.0002252 19.126 < 2e-16 ***
## height 0.0015189 0.0004328 3.509 0.000459 ***
## prior_fx -0.0353237 0.0070861 -4.985 6.70e-07 ***
## smoking -0.0290965 0.0054097 -5.379 8.34e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1166 on 2114 degrees of freedom
## Multiple R-squared: 0.4341, Adjusted R-squared: 0.4325
## F-statistic: 270.3 on 6 and 2114 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.bmd)
summary(m.bmd)
##
## Call:
## lm(formula = fnbmd ~ age + sex + weight + height + prior_fx +
## smoking, data = fx.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37425 -0.07568 -0.00663 0.07184 0.58750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6070710 0.0758296 8.006 1.94e-15 ***
## age -0.0047646 0.0003834 -12.428 < 2e-16 ***
## sexMale 0.0602131 0.0077533 7.766 1.25e-14 ***
## weight 0.0043063 0.0002252 19.126 < 2e-16 ***
## height 0.0015189 0.0004328 3.509 0.000459 ***
## prior_fx -0.0353237 0.0070861 -4.985 6.70e-07 ***
## smoking -0.0290965 0.0054097 -5.379 8.34e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1166 on 2114 degrees of freedom
## Multiple R-squared: 0.4341, Adjusted R-squared: 0.4325
## F-statistic: 270.3 on 6 and 2114 DF, p-value: < 2.2e-16
library(readxl)
fx = as.data.frame(read_excel(("D:/1. Dr Hung 108/NCKH R/Osteo-data.xlsx")))
dim(fx)
## [1] 2216 17
summary(fx)
## id sex age weight
## Min. : 1.0 Length:2216 Min. :57.00 Min. : 34.00
## 1st Qu.: 554.8 Class :character 1st Qu.:65.00 1st Qu.: 60.00
## Median :1108.5 Mode :character Median :70.00 Median : 69.00
## Mean :1108.5 Mean :70.89 Mean : 70.14
## 3rd Qu.:1662.2 3rd Qu.:76.00 3rd Qu.: 79.00
## Max. :2216.0 Max. :96.00 Max. :133.00
## NA's :53
## height prior_fx fnbmd smoking
## Min. :136.0 Min. :0.0000 Min. :0.2800 Min. :0.0000
## 1st Qu.:158.0 1st Qu.:0.0000 1st Qu.:0.7300 1st Qu.:0.0000
## Median :164.0 Median :0.0000 Median :0.8200 Median :0.0000
## Mean :164.9 Mean :0.1611 Mean :0.8287 Mean :0.4176
## 3rd Qu.:171.0 3rd Qu.:0.0000 3rd Qu.:0.9300 3rd Qu.:1.0000
## Max. :196.0 Max. :1.0000 Max. :1.5100 Max. :1.0000
## NA's :54 NA's :89 NA's :1
## parkinson rheum hypertension diabetes
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.00000 Median :0.00000 Median :1.0000 Median :0.000
## Mean :0.06498 Mean :0.03881 Mean :0.5063 Mean :0.111
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.000
##
## copd cancer cvd falls_n
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.111 Mean :0.08529 Mean :0.3872 Mean :0.2843
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :2.0000
##
## fx
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2595
## 3rd Qu.:1.0000
## Max. :1.0000
##
library(tidyverse)
library(tidyverse)
fx = fx %>% mutate(
prior_fx = as.factor(prior_fx),
falls_n = as.factor(falls_n),
smoking = as.factor(smoking),
parkinson = as.factor(parkinson),
rheum = as.factor(rheum),
hypertension = as.factor(hypertension),
diabetes = as.factor(diabetes),
copd = as.factor(copd),
cancer = as.factor(cancer),
cvd = as.factor(cvd),
fx = as.factor(fx)
)
library(table1)
table1(~ sex + age + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd | fx, data = fx)
| 0 (N=1641) |
1 (N=575) |
Overall (N=2216) |
|
|---|---|---|---|
| sex | |||
| Female | 932 (56.8%) | 426 (74.1%) | 1358 (61.3%) |
| Male | 709 (43.2%) | 149 (25.9%) | 858 (38.7%) |
| age | |||
| Mean (SD) | 70.2 (6.89) | 72.9 (7.58) | 70.9 (7.17) |
| Median [Min, Max] | 69.0 [57.0, 94.0] | 72.0 [60.0, 96.0] | 70.0 [57.0, 96.0] |
| weight | |||
| Mean (SD) | 71.5 (14.2) | 66.0 (13.3) | 70.1 (14.2) |
| Median [Min, Max] | 70.0 [36.0, 133] | 65.0 [34.0, 102] | 69.0 [34.0, 133] |
| Missing | 23 (1.4%) | 30 (5.2%) | 53 (2.4%) |
| height | |||
| Mean (SD) | 166 (9.40) | 162 (8.74) | 165 (9.35) |
| Median [Min, Max] | 165 [139, 192] | 162 [136, 196] | 164 [136, 196] |
| Missing | 24 (1.5%) | 30 (5.2%) | 54 (2.4%) |
| fnbmd | |||
| Mean (SD) | 0.854 (0.151) | 0.754 (0.140) | 0.829 (0.155) |
| Median [Min, Max] | 0.850 [0.380, 1.51] | 0.750 [0.280, 1.25] | 0.820 [0.280, 1.51] |
| Missing | 51 (3.1%) | 38 (6.6%) | 89 (4.0%) |
| prior_fx | |||
| 0 | 1419 (86.5%) | 440 (76.5%) | 1859 (83.9%) |
| 1 | 222 (13.5%) | 135 (23.5%) | 357 (16.1%) |
| falls_n | |||
| 0 | 1275 (77.7%) | 459 (79.8%) | 1734 (78.2%) |
| 1 | 252 (15.4%) | 82 (14.3%) | 334 (15.1%) |
| 2 | 114 (6.9%) | 34 (5.9%) | 148 (6.7%) |
| smoking | |||
| 0 | 935 (57.0%) | 355 (61.7%) | 1290 (58.2%) |
| 1 | 705 (43.0%) | 220 (38.3%) | 925 (41.7%) |
| Missing | 1 (0.1%) | 0 (0%) | 1 (0.0%) |
| parkinson | |||
| 0 | 1537 (93.7%) | 535 (93.0%) | 2072 (93.5%) |
| 1 | 104 (6.3%) | 40 (7.0%) | 144 (6.5%) |
| rheum | |||
| 0 | 1576 (96.0%) | 554 (96.3%) | 2130 (96.1%) |
| 1 | 65 (4.0%) | 21 (3.7%) | 86 (3.9%) |
| hypertension | |||
| 0 | 805 (49.1%) | 289 (50.3%) | 1094 (49.4%) |
| 1 | 836 (50.9%) | 286 (49.7%) | 1122 (50.6%) |
| diabetes | |||
| 0 | 1456 (88.7%) | 514 (89.4%) | 1970 (88.9%) |
| 1 | 185 (11.3%) | 61 (10.6%) | 246 (11.1%) |
| copd | |||
| 0 | 1452 (88.5%) | 518 (90.1%) | 1970 (88.9%) |
| 1 | 189 (11.5%) | 57 (9.9%) | 246 (11.1%) |
| cancer | |||
| 0 | 1503 (91.6%) | 524 (91.1%) | 2027 (91.5%) |
| 1 | 138 (8.4%) | 51 (8.9%) | 189 (8.5%) |
| cvd | |||
| 0 | 993 (60.5%) | 365 (63.5%) | 1358 (61.3%) |
| 1 | 648 (39.5%) | 210 (36.5%) | 858 (38.7%) |
p = ggplot(data = fx, aes(x = fx, y = fnbmd, fill = fx, col = fx))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Gãy xương", y = "MĐX cổ xương đùi (g/cm2)") + ggtitle("Liên quan giữa MĐX và khả năng bị gãy xương")
p1
## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 89 rows containing missing values or values outside the scale range
## (`geom_point()`).
### 3.2 Mô hình hồi qui logistic đạnh giá mối liên quan giữa MĐX và nguy
cơ gãy xương
m.bmd = glm(fx ~ fnbmd, family = "binomial", data = fx)
summary(m.bmd)
##
## Call:
## glm(formula = fx ~ fnbmd, family = "binomial", data = fx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7609 0.3055 9.036 <2e-16 ***
## fnbmd -4.7917 0.3856 -12.427 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2403.6 on 2126 degrees of freedom
## Residual deviance: 2222.8 on 2125 degrees of freedom
## (89 observations deleted due to missingness)
## AIC: 2226.8
##
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Loading required package: foreign
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(m.bmd)
##
## Logistic regression predicting fx : 1 vs 0
##
## OR(95%CI) P(Wald's test) P(LR-test)
## fnbmd (cont. var.) 0.01 (0,0.02) < 0.001 < 0.001
##
## Log-likelihood = -1111.4092
## No. of observations = 2127
## AIC value = 2226.8184
library(Publish)
## Loading required package: prodlim
publish(m.bmd)
## Variable Units OddsRatio CI.95 p-value
## fnbmd 0.01 [0.00;0.02] < 1e-04
fx$fnbmd.sd = fx$fnbmd/(-0.155)
m.bmd.sd = glm(fx ~ fnbmd.sd, family = "binomial", data = fx)
summary(m.bmd.sd)
##
## Call:
## glm(formula = fx ~ fnbmd.sd, family = "binomial", data = fx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.76088 0.30554 9.036 <2e-16 ***
## fnbmd.sd 0.74271 0.05976 12.427 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2403.6 on 2126 degrees of freedom
## Residual deviance: 2222.8 on 2125 degrees of freedom
## (89 observations deleted due to missingness)
## AIC: 2226.8
##
## Number of Fisher Scoring iterations: 4
publish(m.bmd.sd)
## Variable Units OddsRatio CI.95 p-value
## fnbmd.sd 2.10 [1.87;2.36] < 1e-04
m.sex = glm(fx ~ sex, family = "binomial", data = fx)
summary(m.sex)
##
## Call:
## glm(formula = fx ~ sex, family = "binomial", data = fx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.78289 0.05848 -13.386 < 2e-16 ***
## sexMale -0.77702 0.10743 -7.232 4.74e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2537.4 on 2215 degrees of freedom
## Residual deviance: 2481.6 on 2214 degrees of freedom
## AIC: 2485.6
##
## Number of Fisher Scoring iterations: 4
publish(m.sex)
## Variable Units OddsRatio CI.95 p-value
## sex Female Ref
## Male 0.46 [0.37;0.57] <1e-04
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars = fx[, c("fnbmd", "sex", "fx")]
ggpairs(data = vars, mapping = aes(color = fx))
## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Removed 89 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 89 rows containing non-finite outside the scale range
## (`stat_bin()`).
m.multi = glm(fx ~ fnbmd.sd + sex, family = "binomial", data = fx)
summary(m.multi)
##
## Call:
## glm(formula = fx ~ fnbmd.sd + sex, family = "binomial", data = fx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.62064 0.31432 8.337 <2e-16 ***
## fnbmd.sd 0.70112 0.06367 11.012 <2e-16 ***
## sexMale -0.22252 0.12112 -1.837 0.0662 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2403.6 on 2126 degrees of freedom
## Residual deviance: 2219.4 on 2124 degrees of freedom
## (89 observations deleted due to missingness)
## AIC: 2225.4
##
## Number of Fisher Scoring iterations: 4
publish(m.multi)
## Variable Units OddsRatio CI.95 p-value
## fnbmd.sd 2.02 [1.78;2.28] < 1e-04
## sex Female Ref
## Male 0.80 [0.63;1.01] 0.06618
library(BMA)
fx.2 = na.omit(fx)
bma.fx = bic.glm(fx ~ sex + age + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd, data = fx.2, strict = F, OR = 20, glm.family = "binomial")
summary(bma.fx)
##
## Call:
## bic.glm.formula(f = fx ~ sex + age + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd, data = fx.2, glm.family = "binomial", strict = F, OR = 20)
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 2.249962 0.674258 2.509e+00 2.340e+00 1.140e+00
## sexMale 18.2 -0.048048 0.114551 . -2.578e-01 .
## age 16.2 0.002663 0.006838 . . 1.602e-02
## weight 0.0 0.000000 0.000000 . . .
## height 0.0 0.000000 0.000000 . . .
## fnbmd 100.0 -4.487636 0.436674 -4.595e+00 -4.279e+00 -4.302e+00
## prior_fx 100.0
## .1 0.530729 0.134289 5.306e-01 5.445e-01 5.156e-01
## falls_n 0.0
## .1 0.000000 0.000000 . . .
## .2 0.000000 0.000000 . . .
## smoking 0.0
## .1 0.000000 0.000000 . . .
## parkinson 0.0
## .1 0.000000 0.000000 . . .
## rheum 0.0
## .1 0.000000 0.000000 . . .
## hypertension 0.0
## .1 0.000000 0.000000 . . .
## diabetes 0.0
## .1 0.000000 0.000000 . . .
## copd 0.0
## .1 0.000000 0.000000 . . .
## cancer 0.0
## .1 0.000000 0.000000 . . .
## cvd 0.0
## .1 0.000000 0.000000 . . .
##
## nVar 2 3 3
## BIC -1.402e+04 -1.402e+04 -1.402e+04
## post prob 0.696 0.142 0.122
## model 4
## Intercept 7.978e-01
## sexMale -2.851e-01
## age 1.780e-02
## weight .
## height .
## fnbmd -3.918e+00
## prior_fx
## .1 5.296e-01
## falls_n
## .1 .
## .2 .
## smoking
## .1 .
## parkinson
## .1 .
## rheum
## .1 .
## hypertension
## .1 .
## diabetes
## .1 .
## copd
## .1 .
## cancer
## .1 .
## cvd
## .1 .
##
## nVar 4
## BIC -1.402e+04
## post prob 0.040
##
## 1 observations deleted due to missingness.
imageplot.bma(bma.fx)
m.final = glm(fx ~ fnbmd.sd + prior_fx, family = "binomial", data = fx)
summary(m.final)
##
## Call:
## glm(formula = fx ~ fnbmd.sd + prior_fx, family = "binomial",
## data = fx)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.49096 0.31301 7.958 1.75e-15 ***
## fnbmd.sd 0.70829 0.06033 11.740 < 2e-16 ***
## prior_fx1 0.52983 0.13386 3.958 7.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2403.6 on 2126 degrees of freedom
## Residual deviance: 2207.6 on 2124 degrees of freedom
## (89 observations deleted due to missingness)
## AIC: 2213.6
##
## Number of Fisher Scoring iterations: 4
publish(m.final)
## Variable Units OddsRatio CI.95 p-value
## fnbmd.sd 2.03 [1.80;2.29] <1e-04
## prior_fx 0 Ref
## 1 1.70 [1.31;2.21] <1e-04