ob = read.csv("//Users//nguyenthicuc//Downloads//obesity data.csv")
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ age + gender + bmi + WBBMC + wbbmd + fat + lean + hypertension + diabetes, data = ob)
Overall
(N=1217)
age
Mean (SD) 47.2 (17.3)
Median [Min, Max] 48.0 [13.0, 88.0]
gender
F 862 (70.8%)
M 355 (29.2%)
bmi
Mean (SD) 22.4 (3.06)
Median [Min, Max] 22.2 [14.5, 37.1]
WBBMC
Mean (SD) 1720 (363)
Median [Min, Max] 1710 [695, 3040]
wbbmd
Mean (SD) 1.01 (0.113)
Median [Min, Max] 1.01 [0.650, 1.35]
fat
Mean (SD) 17300 (5210)
Median [Min, Max] 17000 [4280, 40800]
lean
Mean (SD) 35500 (7030)
Median [Min, Max] 33600 [19100, 63100]
hypertension
Mean (SD) 0.507 (0.500)
Median [Min, Max] 1.00 [0, 1.00]
diabetes
Mean (SD) 0.111 (0.314)
Median [Min, Max] 0 [0, 1.00]
table1(~ age + bmi + WBBMC + wbbmd + fat + lean + as.factor(hypertension) + as.factor(diabetes) | gender, data = ob)
F
(N=862)
M
(N=355)
Overall
(N=1217)
age
Mean (SD) 48.6 (16.4) 43.7 (18.8) 47.2 (17.3)
Median [Min, Max] 49.0 [14.0, 85.0] 44.0 [13.0, 88.0] 48.0 [13.0, 88.0]
bmi
Mean (SD) 22.3 (3.05) 22.7 (3.04) 22.4 (3.06)
Median [Min, Max] 22.1 [15.2, 37.1] 22.5 [14.5, 34.7] 22.2 [14.5, 37.1]
WBBMC
Mean (SD) 1600 (293) 2030 (336) 1720 (363)
Median [Min, Max] 1610 [695, 2660] 2030 [1190, 3040] 1710 [695, 3040]
wbbmd
Mean (SD) 0.988 (0.111) 1.06 (0.101) 1.01 (0.113)
Median [Min, Max] 0.990 [0.650, 1.35] 1.06 [0.780, 1.34] 1.01 [0.650, 1.35]
fat
Mean (SD) 18200 (4950) 15000 (5110) 17300 (5210)
Median [Min, Max] 17700 [6220, 40800] 15100 [4280, 29900] 17000 [4280, 40800]
lean
Mean (SD) 32000 (3970) 43800 (5820) 35500 (7030)
Median [Min, Max] 31500 [19100, 53400] 43400 [28600, 63100] 33600 [19100, 63100]
as.factor(hypertension)
0 430 (49.9%) 170 (47.9%) 600 (49.3%)
1 432 (50.1%) 185 (52.1%) 617 (50.7%)
as.factor(diabetes)
0 760 (88.2%) 322 (90.7%) 1082 (88.9%)
1 102 (11.8%) 33 (9.3%) 135 (11.1%)
library(compareGroups)
createTable(compareGroups(gender ~ age + bmi + WBBMC + wbbmd + fat + lean + hypertension + diabetes, data = ob))
## 
## --------Summary descriptives table by 'gender'---------
## 
## ________________________________________________ 
##                   F            M       p.overall 
##                 N=862        N=355               
## ÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻ 
## age          48.6 (16.4)  43.7 (18.8)   <0.001   
## bmi          22.3 (3.05)  22.7 (3.04)    0.013   
## WBBMC         1599 (293)   2030 (336)   <0.001   
## wbbmd        0.99 (0.11)  1.06 (0.10)   <0.001   
## fat          18240 (4954) 14978 (5113)  <0.001   
## lean         32045 (3966) 43762 (5819)  <0.001   
## hypertension 0.50 (0.50)  0.52 (0.50)    0.527   
## diabetes     0.12 (0.32)  0.09 (0.29)    0.181   
## ÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻ
ob$hypert = as.factor(ob$hypertension)
ob$dm = as.factor(ob$diabetes)
createTable(compareGroups(gender ~ age + bmi + WBBMC + wbbmd + fat + lean + hypert + dm, data = ob))
## 
## --------Summary descriptives table by 'gender'---------
## 
## ___________________________________________ 
##              F            M       p.overall 
##            N=862        N=355               
## ÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻ 
## age     48.6 (16.4)  43.7 (18.8)   <0.001   
## bmi     22.3 (3.05)  22.7 (3.04)    0.013   
## WBBMC    1599 (293)   2030 (336)   <0.001   
## wbbmd   0.99 (0.11)  1.06 (0.10)   <0.001   
## fat     18240 (4954) 14978 (5113)  <0.001   
## lean    32045 (3966) 43762 (5819)  <0.001   
## hypert:                             0.569   
##     0   430 (49.9%)  170 (47.9%)            
##     1   432 (50.1%)  185 (52.1%)            
## dm:                                 0.238   
##     0   760 (88.2%)  322 (90.7%)            
##     1   102 (11.8%)   33 (9.30%)            
## ÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻÂŻ
library(ggplot2)
library(gridExtra) 

p = ggplot(data = ob, aes(x = pcfat))
p1 = p + geom_histogram()
p2 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tỉ trọng mụ (%)", y = "Số người", title = "PhĂąn bố tỉ trọng mụ")

grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p = ggplot(data = ob, aes(x = pcfat, fill = gender))
p1 = p + geom_histogram(col="white") + labs(x = "Tỉ trọng mụ", y = "Số người", title = "PhĂąn bố tỉ trọng mụ")
p2 = p + geom_density(alpha = 0.5) + labs(x = "Tỉ trọng mụ", y = "Số người", title = "PhĂąn bố tỉ trọng mụ")

grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ob$OB[ob$bmi< 18.5] = "Underweight"
ob$OB[ob$bmi>= 18.5 & ob$bmi< 25] = "Normal"
ob$OB[ob$bmi>= 25 & ob$bmi< 30] = "Overweight"
ob$OB[ob$bmi>= 30] = "Obese"

p = ggplot(data = ob, aes(x = OB, fill = OB, col = OB))
p + geom_bar(position = "dodge")

p = ggplot(data = ob, aes(x = OB, y = pcfat, fill = gender, group = gender))
p + geom_bar(stat = "identity", position = "dodge")

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## â„č Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
temp = ob %>% group_by(gender) %>% count(OB) %>% mutate(pct = n/sum(n))
temp$pct = round(temp$pct*100, 1)

p = ggplot(data = temp, aes(x = OB, y = pct, fill = gender, group = gender))
p1 = p + geom_bar(stat = "identity", position = "dodge") + geom_text(aes(x = OB, y = pct, label = pct, group = gender), position = position_dodge(width = 1), vjust = -0.5, col = "blue")
p1

p = ggplot(data = temp, aes(x = OB, y = pct, fill = gender, label = pct))
p2 = p + geom_bar(stat = "identity") + geom_text(size = 3, position = position_stack(vjust = 0.5))
p2

grid.arrange(p1, p2, ncol = 2)

women = subset(ob, gender == "F")
dim(women)
## [1] 862  16
p = ggplot(data = women, aes(x = OB, y = pcfat, fill = OB, col = OB))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "TĂŹnh tráșĄng bĂ©o phĂŹ", y = "Tỉ trọng mụ (%)") + ggtitle("Tỉ trọng mụ theo tĂŹnh tráșĄng bĂ©o phĂŹ ở nữ")
p1

women$OB.n = factor(women$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = women, aes(x = OB.n, y = pcfat, fill = OB.n, col = OB.n))
p2 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "TĂŹnh tráșĄng bĂ©o phĂŹ", y = "Tỉ trọng mụ (%)") + ggtitle("Tỉ trọng mụ theo tĂŹnh tráșĄng bĂ©o phĂŹ ở nữ")
p2

grid.arrange(p1, p2, ncol = 2)

men = subset(ob, gender == "M")
dim(men)
## [1] 355  16
p = ggplot(data = men, aes(x = OB, y = pcfat, fill = OB, col = OB))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "TĂŹnh tráșĄng bĂ©o phĂŹ", y = "Tỉ trọng mụ (%)") + ggtitle("Tỉ trọng mụ theo tĂŹnh tráșĄng bĂ©o phĂŹ ở nam")
p1

men$OB.n = factor(men$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = men, aes(x = OB.n, y = pcfat, fill = OB.n, col = OB.n))
p2 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "TĂŹnh tráșĄng bĂ©o phĂŹ", y = "Tỉ trọng mụ (%)") + ggtitle("Tỉ trọng mụ theo tĂŹnh tráșĄng bĂ©o phĂŹ ở nam")
p2

grid.arrange(p1, p2, ncol = 2)

ob$OB.n = factor(ob$OB, levels = c("Underweight", "Normal", "Overweight", "Obese"))
p = ggplot(data = ob, aes(x = OB.n, y = pcfat, fill = gender, col = gender))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "TĂŹnh tráșĄng bĂ©o phĂŹ", y = "Tỉ trọng mụ (%)") + ggtitle("Tỉ trọng mụ theo tĂŹnh tráșĄng bĂ©o phĂŹ vĂ  giới tĂ­nh")
p1

p = ggplot(data = ob, aes(x = bmi, y = pcfat))
p1 = p + geom_point()
p2 = p + geom_point() + geom_smooth()

grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

p = ggplot(data = ob, aes(x = bmi, y = pcfat, fill = gender, col = gender))
p1 = p + geom_point() + geom_smooth() + labs(x = "Chỉ số khối cÆĄ thể (kg/m2)", y = "Tỉ trọng mụ (%)") + ggtitle("LiĂȘn quan giữa chỉ số khối cÆĄ thể vĂ  tỉ trọng mụ theo giới tĂ­nh")
p1
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

p = ggplot(data = ob, aes(x = bmi, y = pcfat, fill = gender, col = gender))
p2 = p + geom_point() + geom_smooth(method = "lm", formula = y ~ x + I(x^2)) + labs(x = "Chỉ số khối cÆĄ thể (kg/m2)", y = "Tỉ trọng mụ (%)") + ggtitle("LiĂȘn quan giữa chỉ số khối cÆĄ thể vĂ  tỉ trọng mụ theo giới tĂ­nh")
p2

grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

library(ggplot2)
library(gridExtra) 

p = ggplot(data = ob, aes(x = age))
p1 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Tuổi (năm)", y = "Số người", title = "PhĂąn bố tuổi")

p = ggplot(data = ob, aes(x = wbbmd))
p2 = p + geom_histogram(fill = "blue", col = "white") + labs(x = "Máș­t độ xÆ°ÆĄng toĂ n thĂąn (g/cm2)", y = "Số người", title = "PhĂąn bố MĐX toĂ n thĂąn")

p = ggplot(data = ob, aes(x = age, y = wbbmd))
p + geom_point() + geom_smooth(method = "lm", formula = y~ x)

grid.arrange(p1, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cor.test(ob$age, ob$wbbmd, method= "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  ob$age and ob$wbbmd
## t = -17.154, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4856972 -0.3951677
## sample estimates:
##        cor 
## -0.4415556
m.1 = lm(wbbmd ~ age, data = ob)
summary(m.1)
## 
## Call:
## lm(formula = wbbmd ~ age, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32749 -0.07268 -0.00533  0.06793  0.33178 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.1450766  0.0084638  135.29   <2e-16 ***
## age         -0.0028914  0.0001686  -17.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1015 on 1215 degrees of freedom
## Multiple R-squared:  0.195,  Adjusted R-squared:  0.1943 
## F-statistic: 294.3 on 1 and 1215 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.1)

library(ggfortify)
autoplot(m.1)

p = ggplot(data = ob, aes(x = age, y = wbbmd, fill = gender, col = gender))
p1 = p + geom_point() + geom_smooth() + labs(x = "Tuổi (năm)", y = "Máș­t độ xÆ°ÆĄng toĂ n thĂąn (g/cm2)") + ggtitle("LiĂȘn quan giữa tuổi vĂ  MĐX theo giới tĂ­nh")

m.2=lm(wbbmd ~ age + gender, data = ob)
summary(m.2)
## 
## Call:
## lm(formula = wbbmd ~ age + gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36272 -0.06658 -0.00411  0.06549  0.34473 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.118288   0.008636 129.485   <2e-16 ***
## age         -0.002691   0.000164 -16.408   <2e-16 ***
## genderM      0.059417   0.006230   9.537   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09798 on 1214 degrees of freedom
## Multiple R-squared:  0.2511, Adjusted R-squared:  0.2498 
## F-statistic: 203.5 on 2 and 1214 DF,  p-value: < 2.2e-16
par(mfrow = c(2,2))

plot(m.2)

autoplot(m.2)

summary(m.2)
## 
## Call:
## lm(formula = wbbmd ~ age + gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36272 -0.06658 -0.00411  0.06549  0.34473 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.118288   0.008636 129.485   <2e-16 ***
## age         -0.002691   0.000164 -16.408   <2e-16 ***
## genderM      0.059417   0.006230   9.537   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09798 on 1214 degrees of freedom
## Multiple R-squared:  0.2511, Adjusted R-squared:  0.2498 
## F-statistic: 203.5 on 2 and 1214 DF,  p-value: < 2.2e-16

R Markdown