#install.packages(c("readxl", "tidyverse", "dplyr", "ggplot2", "gridExtra", "GGally", "DescTools", "table1", "compareGroups", "simpleboot", "epiDisplay", "Publish"), dependencies = T)
# Load the obesity dataset from a local CSV file into the data frame `ob`.
# NOTE(review): the path is absolute and machine-specific -- adjust it before running elsewhere.
ob <- read.csv("C:\\Thach\\VN trips\\2024_2Aug\\Data Analysis workshop (Hospital 108)\\Datasets\\obesity data.csv")
# Tip: locate the file path interactively with file.choose()
#t = file.choose()
# Show the dimensions of the data frame (number of rows, number of columns).
dim(ob)
## [1] 1217 13
# Print the first rows of the data frame to check column names and values.
head(ob)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat hypertension
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3 0
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8 1
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0 1
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8 1
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8 0
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2 1
## diabetes
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Print the last rows of the data frame to check that the file was read to the end.
tail(ob)
## id gender height weight bmi age WBBMC wbbmd fat lean pcfat
## 1212 1222 F 153 50 21.4 59 1309 0.87 18328 29147 37.6
## 1213 1223 F 150 44 19.6 44 1474 0.95 12906 28534 30.1
## 1214 1224 F 148 51 23.3 58 1522 0.97 14938 33931 29.6
## 1215 1225 F 149 50 22.5 57 1409 0.93 16777 30598 34.4
## 1216 1226 F 144 49 23.6 67 1266 0.90 20094 27272 41.3
## 1217 1227 F 141 45 22.6 58 1228 0.91 14567 28111 33.2
## hypertension diabetes
## 1212 1 0
## 1213 0 1
## 1214 0 0
## 1215 1 0
## 1216 1 0
## 1217 0 0
# Per-column summary: quartiles and mean for numeric columns, length/class for character columns.
summary(ob)
## id gender height weight
## Min. : 1.0 Length:1217 Min. :136.0 Min. :34.00
## 1st Qu.: 309.0 Class :character 1st Qu.:151.0 1st Qu.:49.00
## Median : 615.0 Mode :character Median :155.0 Median :54.00
## Mean : 614.5 Mean :156.7 Mean :55.14
## 3rd Qu.: 921.0 3rd Qu.:162.0 3rd Qu.:61.00
## Max. :1227.0 Max. :185.0 Max. :95.00
## bmi age WBBMC wbbmd fat
## Min. :14.5 Min. :13.00 Min. : 695 Min. :0.650 Min. : 4277
## 1st Qu.:20.2 1st Qu.:35.00 1st Qu.:1481 1st Qu.:0.930 1st Qu.:13768
## Median :22.2 Median :48.00 Median :1707 Median :1.010 Median :16955
## Mean :22.4 Mean :47.15 Mean :1725 Mean :1.009 Mean :17288
## 3rd Qu.:24.3 3rd Qu.:58.00 3rd Qu.:1945 3rd Qu.:1.090 3rd Qu.:20325
## Max. :37.1 Max. :88.00 Max. :3040 Max. :1.350 Max. :40825
## lean pcfat hypertension diabetes
## Min. :19136 Min. : 9.2 Min. :0.000 Min. :0.0000
## 1st Qu.:30325 1st Qu.:27.0 1st Qu.:0.000 1st Qu.:0.0000
## Median :33577 Median :32.4 Median :1.000 Median :0.0000
## Mean :35463 Mean :31.6 Mean :0.507 Mean :0.1109
## 3rd Qu.:39761 3rd Qu.:36.8 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :63059 Max. :48.4 Max. :1.000 Max. :1.0000
# Simple approach (base R): recode gender as a numeric indicator (1 = "F", 0 = "M").
# Variant 1: look each gender label up in a named vector; any other label becomes NA.
ob$sex <- unname(c(F = 1, M = 0)[as.character(ob$gender)])
# Variant 2: vectorised if/else on the same condition.
ob$sex.b <- ifelse(test = ob$gender == "F", yes = 1, no = 0)
# Cross-tabulate the two variants; all counts should sit on the diagonal.
table(ob[["sex"]], ob[["sex.b"]])
##
## 0 1
## 0 355 0
## 1 0 862
# Using tidyverse:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Same gender recode with dplyr: case_when() maps "F" to 1 and "M" to 0;
# rows matching neither condition get NA.
ob <- mutate(
  ob,
  sex.2 = case_when(
    gender == "F" ~ 1,
    gender == "M" ~ 0
  )
)
# Cross-tabulate against the base R version; all counts should sit on the diagonal.
table(ob[["sex"]], ob[["sex.2"]])
##
## 0 1
## 0 355 0
## 1 0 862
# Simple approach (base R): classify BMI into four categories.
# findInterval() returns 0..3 for the left-closed bands
# (< 18.5, [18.5, 25), [25, 30), >= 30); adding 1 turns that into an index
# into the label vector. Missing BMI values stay NA.
ob$obese <- c("Underweight", "Normal", "Overweight", "Obese")[
  findInterval(ob$bmi, c(18.5, 25, 30)) + 1
]
# Using tidyverse: same BMI classification with dplyr::case_when().
# Conditions are tried top to bottom and the first TRUE one wins, so each
# band only needs to state its upper bound.
ob <- mutate(
  ob,
  obese.2 = case_when(
    bmi < 18.5 ~ "Underweight",
    bmi < 25 ~ "Normal",
    bmi < 30 ~ "Overweight",
    bmi >= 30 ~ "Obese"
  )
)
# Cross-tabulate both versions; all counts should sit on the diagonal.
table(ob[["obese"]], ob[["obese.2"]])
##
## Normal Obese Overweight Underweight
## Normal 865 0 0 0
## Obese 0 15 0 0
## Overweight 0 0 230 0
## Underweight 0 0 0 107
# Simple approach (base R): convert lean and fat mass to kilograms.
# The raw columns are in grams (e.g. lean = 28600 and fat = 17802 for a
# subject weighing 49 kg), so the conversion divides by 1000.
ob$lean.kg <- ob$lean / 1000
ob$fat.kg <- ob$fat / 1000
# Using tidyverse: convert lean and fat mass to kilograms with dplyr::mutate().
# The raw columns are in grams (e.g. lean = 28600 and fat = 17802 for a
# subject weighing 49 kg), so the conversion divides by 1000.
ob <- ob %>%
  mutate(
    lean.kg.2 = lean / 1000,
    fat.kg.2 = fat / 1000
  )
# Simple approach (base R): keep only men with BMI >= 25.
# which() discards rows where the condition is NA, matching subset().
men.overweight <- ob[which(ob$gender == "M" & ob$bmi >= 25), ]
dim(men.overweight)
## [1] 85 22
# Distribution of BMI categories within the selected rows.
table(men.overweight[["obese"]])
##
## Obese Overweight
## 4 81
# Using tidyverse: same row selection with dplyr::filter().
men.overweight.2 <- filter(ob, gender == "M" & bmi >= 25)
dim(men.overweight.2)
## [1] 85 22
# Distribution of BMI categories within the selected rows.
table(men.overweight.2[["obese"]])
##
## Obese Overweight
## 4 81
# Simple approach (base R): keep a handful of columns with subset().
Subset <- subset(
  x = ob,
  select = c(id, age, gender, bmi, lean, fat)
)
head(Subset, n = 6L)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904
# Alternative base R form: select the same columns by name with `[`.
Subset.b <- ob[c("id", "age", "gender", "bmi", "lean", "fat")]
head(Subset.b, n = 6L)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904
# Using tidyverse: select the same columns with dplyr::select().
Subset.2 <- select(ob, id, age, gender, bmi, lean, fat)
head(Subset.2, n = 6L)
## id age gender bmi lean fat
## 1 1 53 F 21.8 28600 17802
## 2 2 65 M 19.1 40229 8381
## 3 3 64 F 23.1 36057 19221
## 4 4 56 F 21.8 33094 17472
## 5 5 54 M 19.9 40621 7336
## 6 6 52 F 20.1 30068 14904