#install.packages(c("readxl", "tidyverse", "table1", "compareGroups", "ggplot2", "gridExtra", "tidyverse", "GGally", "ggfortify", "DescTools", "relaimpo", "carData", "rms", "caret", "BMA", "pROC", "epiDisplay"), dependencies = T)
# Đọc trực tiếp:
# Tìm đường dẫn với file.choose()
# t = file.choose()
# t
bw = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\birthwt.csv")
# Đọc từ "Import Dataset"
dim(bw)
## [1] 189 11
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 1 85 0 19 182 2 0 0 0 1 0 2523
## 2 86 0 33 155 3 0 0 0 0 3 2551
## 3 87 0 20 105 1 1 0 0 0 1 2557
## 4 88 0 21 108 1 1 0 0 1 2 2594
## 5 89 0 18 107 1 1 0 0 1 0 2600
## 6 91 0 21 124 3 0 0 0 0 0 2622
tail(bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 184 78 1 14 101 3 1 1 0 0 0 2466
## 185 79 1 28 95 1 1 0 0 0 2 2466
## 186 81 1 14 100 3 0 0 0 0 2 2495
## 187 82 1 23 94 3 1 0 0 0 0 2495
## 188 83 1 17 142 2 0 0 1 0 0 2495
## 189 84 1 21 130 1 1 0 1 0 3 2495
bw$mwt = bw$lwt*0.453592
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541
# Dùng gói tidyverse:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
bw = bw %>% mutate(mwt.2 = lwt*0.453592)
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374 82.55374
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676 70.30676
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716 47.62716
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794 48.98794
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434 48.53434
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541 56.24541
bw$ethnicity[bw$race == 1] = "White"
bw$ethnicity[bw$race == 2] = "Black"
bw$ethnicity[bw$race == 3] = "Other"
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374 82.55374 Black
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676 70.30676 Other
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716 47.62716 White
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794 48.98794 White
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434 48.53434 White
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541 56.24541 Other
# Dùng gói tidyverse:
bw = bw %>%
mutate(ethnicity.2 = case_when(race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Other"))
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374 82.55374 Black
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676 70.30676 Other
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716 47.62716 White
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794 48.98794 White
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434 48.53434 White
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541 56.24541 Other
## ethnicity.2
## 1 Black
## 2 Other
## 3 White
## 4 White
## 5 White
## 6 Other
table(bw$ethnicity, bw$ethnicity.2)
##
## Black Other White
## Black 26 0 0
## Other 0 67 0
## White 0 0 96
bw$smokig = ifelse(bw$smoke == "Yes", 1, 0)
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 1 85 0 19 182 2 0 0 0 1 0 2523 82.55374 82.55374 Black
## 2 86 0 33 155 3 0 0 0 0 3 2551 70.30676 70.30676 Other
## 3 87 0 20 105 1 1 0 0 0 1 2557 47.62716 47.62716 White
## 4 88 0 21 108 1 1 0 0 1 2 2594 48.98794 48.98794 White
## 5 89 0 18 107 1 1 0 0 1 0 2600 48.53434 48.53434 White
## 6 91 0 21 124 3 0 0 0 0 0 2622 56.24541 56.24541 Other
## ethnicity.2 smokig
## 1 Black 0
## 2 Other 0
## 3 White 0
## 4 White 0
## 5 White 0
## 6 Other 0
bw1 = bw[, c("id", "low", "bwt")]
dim(bw1)
## [1] 189 3
head(bw1)
## id low bwt
## 1 85 0 2523
## 2 86 0 2551
## 3 87 0 2557
## 4 88 0 2594
## 5 89 0 2600
## 6 91 0 2622
# Dùng gói tidyverse:
bw1.2 = bw %>% select(id, low, bwt)
dim(bw1.2)
## [1] 189 3
head(bw1.2)
## id low bwt
## 1 85 0 2523
## 2 86 0 2551
## 3 87 0 2557
## 4 88 0 2594
## 5 89 0 2600
## 6 91 0 2622
bw2 = subset(bw, low == 1)
dim(bw2)
## [1] 59 16
head(bw2)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 131 4 1 28 120 3 1 1 0 1 0 709 54.43104 54.43104 Other
## 132 10 1 29 130 1 0 0 0 1 2 1021 58.96696 58.96696 White
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.82170 84.82170 Black
## 134 13 1 25 105 3 0 1 1 0 0 1330 47.62716 47.62716 Other
## 135 15 1 25 85 3 0 0 0 1 0 1474 38.55532 38.55532 Other
## 136 16 1 27 150 3 0 0 0 0 0 1588 68.03880 68.03880 Other
## ethnicity.2 smokig
## 131 Other 0
## 132 White 0
## 133 Black 0
## 134 Other 0
## 135 Other 0
## 136 Other 0
# Dùng gói tidyverse:
bw2.2 = bw %>% filter(low == 1)
dim(bw2.2)
## [1] 59 16
head(bw2.2)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 1 4 1 28 120 3 1 1 0 1 0 709 54.43104 54.43104 Other
## 2 10 1 29 130 1 0 0 0 1 2 1021 58.96696 58.96696 White
## 3 11 1 34 187 2 1 0 1 0 0 1135 84.82170 84.82170 Black
## 4 13 1 25 105 3 0 1 1 0 0 1330 47.62716 47.62716 Other
## 5 15 1 25 85 3 0 0 0 1 0 1474 38.55532 38.55532 Other
## 6 16 1 27 150 3 0 0 0 0 0 1588 68.03880 68.03880 Other
## ethnicity.2 smokig
## 1 Other 0
## 2 White 0
## 3 Black 0
## 4 Other 0
## 5 Other 0
## 6 Other 0
bw3 = subset(bw, low == 1 & smoke == 1)
dim(bw3)
## [1] 30 16
head(bw3)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 131 4 1 28 120 3 1 1 0 1 0 709 54.43104 54.43104 Other
## 133 11 1 34 187 2 1 0 1 0 0 1135 84.82170 84.82170 Black
## 140 20 1 21 165 1 1 0 1 0 1 1790 74.84268 74.84268 White
## 141 22 1 32 105 1 1 0 0 0 0 1818 47.62716 47.62716 White
## 142 23 1 19 91 1 1 2 0 1 0 1885 41.27687 41.27687 White
## 145 26 1 25 92 1 1 0 0 0 0 1928 41.73046 41.73046 White
## ethnicity.2 smokig
## 131 Other 0
## 133 Black 0
## 140 White 0
## 141 White 0
## 142 White 0
## 145 White 0
# Dùng gói tidyverse:
bw3.2 = bw %>% filter(low == 1, smoke == 1)
dim(bw3.2)
## [1] 30 16
head(bw3.2)
## id low age lwt race smoke ptl ht ui ftv bwt mwt mwt.2 ethnicity
## 1 4 1 28 120 3 1 1 0 1 0 709 54.43104 54.43104 Other
## 2 11 1 34 187 2 1 0 1 0 0 1135 84.82170 84.82170 Black
## 3 20 1 21 165 1 1 0 1 0 1 1790 74.84268 74.84268 White
## 4 22 1 32 105 1 1 0 0 0 0 1818 47.62716 47.62716 White
## 5 23 1 19 91 1 1 2 0 1 0 1885 41.27687 41.27687 White
## 6 26 1 25 92 1 1 0 0 0 0 1928 41.73046 41.73046 White
## ethnicity.2 smokig
## 1 Other 0
## 2 Black 0
## 3 White 0
## 4 White 0
## 5 White 0
## 6 White 0
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ age + lwt + bwt, data = bw)
Overall (N=189) |
|
---|---|
age | |
Mean (SD) | 23.2 (5.30) |
Median [Min, Max] | 23.0 [14.0, 45.0] |
lwt | |
Mean (SD) | 130 (30.6) |
Median [Min, Max] | 121 [80.0, 250] |
bwt | |
Mean (SD) | 2940 (729) |
Median [Min, Max] | 2980 [709, 4990] |
table1(~ age + lwt + bwt, data = bw, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))
Overall (N=189) |
|
---|---|
age | |
Mean (SD) | 23.2 (5.30) |
Median [Q1, Q3] | 23.0 [19.0, 26.0] |
lwt | |
Mean (SD) | 130 (30.6) |
Median [Q1, Q3] | 121 [110, 140] |
bwt | |
Mean (SD) | 2940 (729) |
Median [Q1, Q3] | 2980 [2410, 3490] |
table1(~ age + lwt + bwt | low, data = bw)
## Warning in table1.formula(~age + lwt + bwt | low, data = bw): Terms to the
## right of '|' in formula 'x' define table columns and are expected to be factors
## with meaningful labels.
0 (N=130) |
1 (N=59) |
Overall (N=189) |
|
---|---|---|---|
age | |||
Mean (SD) | 23.7 (5.58) | 22.3 (4.51) | 23.2 (5.30) |
Median [Min, Max] | 23.0 [14.0, 45.0] | 22.0 [14.0, 34.0] | 23.0 [14.0, 45.0] |
lwt | |||
Mean (SD) | 133 (31.7) | 122 (26.6) | 130 (30.6) |
Median [Min, Max] | 124 [85.0, 250] | 120 [80.0, 200] | 121 [80.0, 250] |
bwt | |||
Mean (SD) | 3330 (478) | 2100 (391) | 2940 (729) |
Median [Min, Max] | 3270 [2520, 4990] | 2210 [709, 2500] | 2980 [709, 4990] |
ob = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Obesity data.csv")
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p = ggplot(data = ob, aes(x = pcfat))
p1 = p + geom_histogram()
p2 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tỉ trọng mỡ (%)", y = "Số người", title = "Phân bố tỉ trọng mỡ")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = ob, aes(x = pcfat, fill = gender))
p1 = p + geom_histogram(col="white") + labs(x = "Tỉ trọng mỡ", y = "Số người", title = "Phân bố tỉ trọng mỡ")
p2 = p + geom_density(alpha = 0.5) + labs(x = "Tỉ trọng mỡ", y = "Phân bố (%)", title = "Phân bố tỉ trọng mỡ")
grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ob$OB[ob$bmi< 18.5] = "Underweight"
ob$OB[ob$bmi>= 18.5 & ob$bmi< 25] = "Normal"
ob$OB[ob$bmi>= 25 & ob$bmi< 30] = "Overweight"
ob$OB[ob$bmi>= 30] = "Obese"
p = ggplot(data = ob, aes(x = OB, fill = OB, col = OB))
p1 = p + geom_bar(position = "dodge")
ob$OB.2 = factor(ob$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = ob, aes(x = OB.2, fill = OB.2, col = OB.2))
p2 = p + geom_bar(position = "dodge")
grid.arrange(p1, p2, ncol = 2)
Tính tỉ lệ %:
library(tidyverse)
temp = ob %>% group_by(gender) %>% count(OB) %>% mutate(pct = n/sum(n))
temp$pct = round(temp$pct*100, 1)
Thêm % vào biểu đồ:
p = ggplot(data = temp, aes(x = OB, y = pct, fill = gender, group = gender))
p + geom_bar(stat = "identity", position = "dodge") + geom_text(aes(x = OB, y = pct, label = pct, group = gender), position = position_dodge(width = 1), vjust = -0.5, col = "blue") + labs(x = "Obesity status", y = "Percent") + theme(legend.position = "none")
p = ggplot(data = ob, aes(x = gender, y = pcfat, col = gender))
p1 = p + geom_boxplot()
p1
p2 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Giới tính", y = "Tỉ trọng mỡ (%)") + ggtitle("Tỉ trọng mỡ theo giới tính")
p2
grid.arrange(p1, p2, ncol = 2)
p = ggplot(data = ob, aes(x = bmi, y = pcfat, fill = gender, col = gender))
p + geom_point() + geom_smooth(method = "lm", formula = y ~ x + I(x^2)) + labs(x = "Chỉ số khối cơ thể (kg/m2)", y = "Tỉ trọng mỡ (%)") + ggtitle("Liên quan giữa chỉ số khối cơ thể và tỉ trọng mỡ theo giới tính")