Open data

Summary data

# Dimensions of `ob` (number of rows, number of columns)
dim(ob)
## [1] 1217   13
# Show the first 10 rows of `ob`
head(ob, n = 10)
##    id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat hypertension
## 1   1      F    150     49 21.8  53  1312  0.88 17802 28600  37.3            0
## 2   2      M    165     52 19.1  65  1309  0.84  8381 40229  16.8            1
## 3   3      F    157     57 23.1  64  1230  0.84 19221 36057  34.0            1
## 4   4      F    156     53 21.8  56  1171  0.80 17472 33094  33.8            1
## 5   5      M    160     51 19.9  54  1681  0.98  7336 40621  14.8            0
## 6   6      F    153     47 20.1  52  1358  0.91 14904 30068  32.2            1
## 7   7      F    155     58 24.1  66  1546  0.96 20233 35599  35.3            1
## 8   8      M    167     65 23.3  50  2276  1.11 17749 43301  28.0            1
## 9   9      M    165     54 19.8  61  1778  0.96 10795 38613  21.1            0
## 10 10      F    158     60 24.0  58  1404  0.86 21365 35534  36.6            1
##    diabetes
## 1         1
## 2         0
## 3         0
## 4         0
## 5         0
## 6         0
## 7         1
## 8         1
## 9         0
## 10        0
# Show the last rows of `ob` (default number of rows)
tail(ob)
##        id gender height weight  bmi age WBBMC wbbmd   fat  lean pcfat
## 1212 1222      F    153     50 21.4  59  1309  0.87 18328 29147  37.6
## 1213 1223      F    150     44 19.6  44  1474  0.95 12906 28534  30.1
## 1214 1224      F    148     51 23.3  58  1522  0.97 14938 33931  29.6
## 1215 1225      F    149     50 22.5  57  1409  0.93 16777 30598  34.4
## 1216 1226      F    144     49 23.6  67  1266  0.90 20094 27272  41.3
## 1217 1227      F    141     45 22.6  58  1228  0.91 14567 28111  33.2
##      hypertension diabetes
## 1212            1        0
## 1213            0        1
## 1214            0        0
## 1215            1        0
## 1216            1        0
## 1217            0        0
# Per-column summary of `ob`
summary(ob)
##        id            gender              height          weight     
##  Min.   :   1.0   Length:1217        Min.   :136.0   Min.   :34.00  
##  1st Qu.: 309.0   Class :character   1st Qu.:151.0   1st Qu.:49.00  
##  Median : 615.0   Mode  :character   Median :155.0   Median :54.00  
##  Mean   : 614.5                      Mean   :156.7   Mean   :55.14  
##  3rd Qu.: 921.0                      3rd Qu.:162.0   3rd Qu.:61.00  
##  Max.   :1227.0                      Max.   :185.0   Max.   :95.00  
##       bmi            age            WBBMC          wbbmd            fat       
##  Min.   :14.5   Min.   :13.00   Min.   : 695   Min.   :0.650   Min.   : 4277  
##  1st Qu.:20.2   1st Qu.:35.00   1st Qu.:1481   1st Qu.:0.930   1st Qu.:13768  
##  Median :22.2   Median :48.00   Median :1707   Median :1.010   Median :16955  
##  Mean   :22.4   Mean   :47.15   Mean   :1725   Mean   :1.009   Mean   :17288  
##  3rd Qu.:24.3   3rd Qu.:58.00   3rd Qu.:1945   3rd Qu.:1.090   3rd Qu.:20325  
##  Max.   :37.1   Max.   :88.00   Max.   :3040   Max.   :1.350   Max.   :40825  
##       lean           pcfat       hypertension      diabetes     
##  Min.   :19136   Min.   : 9.2   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:30325   1st Qu.:27.0   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :33577   Median :32.4   Median :1.000   Median :0.0000  
##  Mean   :35463   Mean   :31.6   Mean   :0.507   Mean   :0.1109  
##  3rd Qu.:39761   3rd Qu.:36.8   3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :63059   Max.   :48.4   Max.   :1.000   Max.   :1.0000

Coding variable

# Simple way 1: fill a character column `sex` by logical indexing
# ("1" for gender "F", "0" for gender "M")
ob$sex[ob$gender == "F"] <- "1"
ob$sex[ob$gender == "M"] <- "0"
# Simple way 2: numeric column `sex.b` built in one step with ifelse()
# (1 for gender "F", 0 otherwise)
ob$sex.b <- ifelse(ob$gender == "F", 1, 0)
# Cross-tabulate the two codings to check that they agree
table(ob$sex, ob$sex.b)
##    
##       0   1
##   0 355   0
##   1   0 862
# Using the tidyverse
# (original note: for many variables)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Trinh bay so lieu

## View Mean, Median (Min-Max) of the variables
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Overall descriptive table; every listed column is passed as-is
table1(
  ~ height + weight + bmi + age + WBBMC + wbbmd + fat + lean + pcfat +
    hypertension + diabetes,
  data = ob
)
Overall
(N=1217)
height
Mean (SD) 157 (7.98)
Median [Min, Max] 155 [136, 185]
weight
Mean (SD) 55.1 (9.40)
Median [Min, Max] 54.0 [34.0, 95.0]
bmi
Mean (SD) 22.4 (3.06)
Median [Min, Max] 22.2 [14.5, 37.1]
age
Mean (SD) 47.2 (17.3)
Median [Min, Max] 48.0 [13.0, 88.0]
WBBMC
Mean (SD) 1720 (363)
Median [Min, Max] 1710 [695, 3040]
wbbmd
Mean (SD) 1.01 (0.113)
Median [Min, Max] 1.01 [0.650, 1.35]
fat
Mean (SD) 17300 (5210)
Median [Min, Max] 17000 [4280, 40800]
lean
Mean (SD) 35500 (7030)
Median [Min, Max] 33600 [19100, 63100]
pcfat
Mean (SD) 31.6 (7.18)
Median [Min, Max] 32.4 [9.20, 48.4]
hypertension
Mean (SD) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00]
diabetes
Mean (SD) 0.111 (0.314)
Median [Min, Max] 0 [0, 1.00]
## Mean, Median (Min-Max) of the variables, with one variable (diabetes)
## converted to a factor so it is tabulated as categories
table1(
  ~ height + weight + bmi + age + WBBMC + wbbmd + fat + lean + pcfat +
    hypertension + as.factor(diabetes),
  data = ob
)
Overall
(N=1217)
height
Mean (SD) 157 (7.98)
Median [Min, Max] 155 [136, 185]
weight
Mean (SD) 55.1 (9.40)
Median [Min, Max] 54.0 [34.0, 95.0]
bmi
Mean (SD) 22.4 (3.06)
Median [Min, Max] 22.2 [14.5, 37.1]
age
Mean (SD) 47.2 (17.3)
Median [Min, Max] 48.0 [13.0, 88.0]
WBBMC
Mean (SD) 1720 (363)
Median [Min, Max] 1710 [695, 3040]
wbbmd
Mean (SD) 1.01 (0.113)
Median [Min, Max] 1.01 [0.650, 1.35]
fat
Mean (SD) 17300 (5210)
Median [Min, Max] 17000 [4280, 40800]
lean
Mean (SD) 35500 (7030)
Median [Min, Max] 33600 [19100, 63100]
pcfat
Mean (SD) 31.6 (7.18)
Median [Min, Max] 32.4 [9.20, 48.4]
hypertension
Mean (SD) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00]
as.factor(diabetes)
0 1082 (88.9%)
1 135 (11.1%)
## Show the interquartile range (Q1, Q3) instead of Min/Max
## hypertension and diabetes are both converted to factors here
table1(
  ~ height + weight + bmi + age + WBBMC + wbbmd + fat + lean + pcfat +
    as.factor(hypertension) + as.factor(diabetes),
  data = ob,
  render.continuous = c(. = "Mean(SD)", . = "Median[Q1, Q3]")
)
Overall
(N=1217)
height
Mean(SD) 157(7.98)
Median[Q1, Q3] 155[151, 162]
weight
Mean(SD) 55.1(9.40)
Median[Q1, Q3] 54.0[49.0, 61.0]
bmi
Mean(SD) 22.4(3.06)
Median[Q1, Q3] 22.2[20.2, 24.3]
age
Mean(SD) 47.2(17.3)
Median[Q1, Q3] 48.0[35.0, 58.0]
WBBMC
Mean(SD) 1720(363)
Median[Q1, Q3] 1710[1480, 1950]
wbbmd
Mean(SD) 1.01(0.113)
Median[Q1, Q3] 1.01[0.930, 1.09]
fat
Mean(SD) 17300(5210)
Median[Q1, Q3] 17000[13800, 20300]
lean
Mean(SD) 35500(7030)
Median[Q1, Q3] 33600[30300, 39800]
pcfat
Mean(SD) 31.6(7.18)
Median[Q1, Q3] 32.4[27.0, 36.8]
as.factor(hypertension)
0 600 (49.3%)
1 617 (50.7%)
as.factor(diabetes)
0 1082 (88.9%)
1 135 (11.1%)
## Stratify by gender (the `| gender` part of the formula)
table1(
  ~ height + weight + bmi + age + WBBMC + wbbmd + fat + lean + pcfat +
    as.factor(hypertension) + as.factor(diabetes) | gender,
  data = ob,
  render.continuous = c(. = "Mean(SD)", . = "Median[Q1, Q3]")
)
F
(N=862)
M
(N=355)
Overall
(N=1217)
height
Mean(SD) 153(5.55) 165(6.73) 157(7.98)
Median[Q1, Q3] 153[150, 157] 165[160, 169] 155[151, 162]
weight
Mean(SD) 52.3(7.72) 62.0(9.59) 55.1(9.40)
Median[Q1, Q3] 51.0[47.0, 57.0] 62.0[55.0, 68.0] 54.0[49.0, 61.0]
bmi
Mean(SD) 22.3(3.05) 22.7(3.04) 22.4(3.06)
Median[Q1, Q3] 22.1[20.1, 24.1] 22.5[20.8, 24.9] 22.2[20.2, 24.3]
age
Mean(SD) 48.6(16.4) 43.7(18.8) 47.2(17.3)
Median[Q1, Q3] 49.0[39.0, 59.0] 44.0[24.0, 56.0] 48.0[35.0, 58.0]
WBBMC
Mean(SD) 1600(293) 2030(336) 1720(363)
Median[Q1, Q3] 1610[1410, 1800] 2030[1810, 2250] 1710[1480, 1950]
wbbmd
Mean(SD) 0.988(0.111) 1.06(0.101) 1.01(0.113)
Median[Q1, Q3] 0.990[0.910, 1.07] 1.06[0.990, 1.13] 1.01[0.930, 1.09]
fat
Mean(SD) 18200(4950) 15000(5110) 17300(5210)
Median[Q1, Q3] 17700[14800, 21100] 15100[11400, 18200] 17000[13800, 20300]
lean
Mean(SD) 32000(3970) 43800(5820) 35500(7030)
Median[Q1, Q3] 31500[29300, 34500] 43400[40300, 47600] 33600[30300, 39800]
pcfat
Mean(SD) 34.7(5.19) 24.2(5.76) 31.6(7.18)
Median[Q1, Q3] 34.7[31.5, 38.3] 24.6[20.4, 28.0] 32.4[27.0, 36.8]
as.factor(hypertension)
0 430 (49.9%) 170 (47.9%) 600 (49.3%)
1 432 (50.1%) 185 (52.1%) 617 (50.7%)
as.factor(diabetes)
0 760 (88.2%) 322 (90.7%) 1082 (88.9%)
1 102 (11.8%) 33 (9.3%) 135 (11.1%)
## Compare the two groups
library(compareGroups)
# Descriptive table by gender with an overall p-value per variable.
# Each variable appears once in the formula so the table has one row per
# variable.
# NOTE(review): diabetes is a 0/1 column and is summarised below as
# mean (SD); consider as.factor(diabetes) if counts (%) are wanted - confirm.
createTable(
  compareGroups(
    gender ~ age + bmi + WBBMC + wbbmd + fat + lean + pcfat + diabetes,
    data = ob
  )
)
## 
## --------Summary descriptives table by 'gender'---------
## 
## ____________________________________________ 
##               F            M       p.overall 
##             N=862        N=355               
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age      48.6 (16.4)  43.7 (18.8)   <0.001   
## bmi      22.3 (3.05)  22.7 (3.04)    0.013   
## WBBMC     1599 (293)   2030 (336)   <0.001   
## wbbmd    0.99 (0.11)  1.06 (0.10)   <0.001   
## fat      18240 (4954) 14978 (5113)  <0.001   
## lean     32045 (3966) 43762 (5819)  <0.001   
## pcfat    34.7 (5.19)  24.2 (5.76)   <0.001   
## diabetes 0.12 (0.32)  0.09 (0.29)    0.181   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Quantitative variables only
### New variable list (original note: "Lap bien moi")
# Same comparison by gender without diabetes; each variable appears once in
# the formula so the table has one row per variable.
createTable(
  compareGroups(
    gender ~ age + bmi + WBBMC + wbbmd + fat + lean + pcfat,
    data = ob
  )
)
## 
## --------Summary descriptives table by 'gender'---------
## 
## _________________________________________ 
##            F            M       p.overall 
##          N=862        N=355               
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age   48.6 (16.4)  43.7 (18.8)   <0.001   
## bmi   22.3 (3.05)  22.7 (3.04)    0.013   
## WBBMC  1599 (293)   2030 (336)   <0.001   
## wbbmd 0.99 (0.11)  1.06 (0.10)   <0.001   
## fat   18240 (4954) 14978 (5113)  <0.001   
## lean  32045 (3966) 43762 (5819)  <0.001   
## pcfat 34.7 (5.19)  24.2 (5.76)   <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

#Ve bieu do

##histogram
library(ggplot2)
# Base plot object: pcfat on the x axis
p <- ggplot(data = ob, aes(x = pcfat))
## Add colour, axis labels and a title
# The x label spelling is corrected ("Percent").
p +
  geom_histogram(fill = "blue", col = "white") +
  labs(
    x = "Percent body fat",
    y = "Number of people",
    title = "Phân bố tỉ trọng mỡ"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Split by gender: bars are filled according to `gender`
# `p` is reassigned here and reused by the density plots that follow.
p <- ggplot(data = ob, aes(x = pcfat, fill = gender))
# The x label spelling is corrected ("Percent").
p1 <- p +
  geom_histogram(col = "white") +
  labs(
    x = "Percent body fat",
    y = "Number of people",
    title = "Phân bố tỉ trọng mỡ"
  )
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## y axis as density
# geom_density() draws a density estimate, not counts, so the y label is
# "Density"; the x label spelling is also corrected.
p +
  geom_density() +
  labs(
    x = "Percent body fat",
    y = "Density",
    title = "Phân bố tỉ trọng mỡ"
  )

## Make the fill translucent so the overlapping region is visible
# geom_density() draws a density estimate, not counts, so the y label is
# "Density"; the x label spelling is also corrected.
p +
  geom_density(alpha = 0.5) +
  labs(
    x = "Percent body fat",
    y = "Density",
    title = "Phân bố tỉ trọng mỡ"
  )

# Scatter plot base: bmi on x, pcfat on y
p <- ggplot(data = ob, aes(x = bmi, y = pcfat))
# p1: points only
p1 <- p + geom_point()
# p2: points plus geom_smooth() with its default settings
p2 <- p + geom_point() + geom_smooth()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Draw p1 and p2 side by side in two columns
grid.arrange(p1, p2, ncol=2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

#Theo gioi tinh

# Same scatter, with point colour and fill mapped to gender
p <- ggplot(
  data = ob,
  aes(x = bmi, y = pcfat, col = gender, fill = gender)
)
# p1: geom_smooth() with its default settings
# NOTE(review): p1 is assigned but not printed in the lines shown here
p1 <- p + geom_point() + geom_smooth()
# p2: linear model with a cubic polynomial in x
p2 <- p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3))
p2