# Path to the diabetes practice dataset (CSV file on the local D: drive).
t <- "D:\\Datasets for practice\\Diabetes data.csv"

# Read the CSV into a data frame and preview its first three rows.
db <- read.csv(file = t)
head(db, n = 3)
##   id age gender height weight waist hip sysbp diabp active hypertension
## 1  1  76 Female    163     53    90  93   160    90      0            1
## 2  1  40 Female    149     51    74  94   100    60      0            0
## 3  1  51 Female    151     55    91 100   120    80      0            0
##     bmi  whr diabetes
## 1 19.95 0.97      IFG
## 2 22.97 0.79   Normal
## 3 24.12 0.91   Normal
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary table with one column per gender plus an overall column.
# `active` and `hypertension` are passed as-is here, so table1 reports them
# with mean/SD and median rather than as category counts.
table1(
  ~ age + height + weight + waist + hip + whr + active + hypertension +
    diabetes | gender,
  data = db
)

Female
(n=2161)
Male
(n=1004)
Overall
(n=3165)
age
Mean (SD) 52.0 (11.7) 53.3 (12.5) 52.4 (12.0)
Median [Min, Max] 51.0 [30.0, 93.0] 52.0 [30.0, 89.0] 51.0 [30.0, 93.0]
height
Mean (SD) 153 (5.52) 163 (6.11) 157 (7.33)
Median [Min, Max] 153 [132, 173] 163 [140, 180] 156 [132, 180]
Missing 6 (0.3%) 1 (0.1%) 7 (0.2%)
weight
Mean (SD) 56.5 (9.34) 63.8 (10.5) 58.8 (10.3)
Median [Min, Max] 55.0 [29.0, 98.0] 63.0 [38.0, 115] 58.0 [29.0, 115]
Missing 2 (0.1%) 0 (0%) 2 (0.1%)
waist
Mean (SD) 80.1 (10.2) 86.0 (9.70) 82.0 (10.4)
Median [Min, Max] 79.0 [54.0, 120] 87.0 [59.0, 120] 82.0 [54.0, 120]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
hip
Mean (SD) 93.9 (7.66) 95.4 (7.64) 94.4 (7.68)
Median [Min, Max] 93.0 [64.0, 124] 95.0 [56.0, 130] 94.0 [56.0, 130]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
whr
Mean (SD) 0.852 (0.0761) 0.901 (0.0660) 0.868 (0.0765)
Median [Min, Max] 0.850 [0.610, 1.21] 0.900 [0.680, 1.36] 0.870 [0.610, 1.36]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
active
Mean (SD) 0.569 (0.495) 0.450 (0.498) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 0.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.533 (0.499) 0.682 (0.466) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
diabetes
IFG 157 (7.3%) 86 (8.6%) 243 (7.7%)
Normal 1857 (85.9%) 823 (82.0%) 2680 (84.7%)
Yes 147 (6.8%) 95 (9.5%) 242 (7.6%)
##

# Descriptive summary table with one column per gender plus an overall column.
# `active` and `hypertension` are passed as-is here, so table1 reports them
# with mean/SD and median rather than as category counts.
table1(
  ~ age + height + weight + waist + hip + whr + active + hypertension +
    diabetes | gender,
  data = db
)
Female
(n=2161)
Male
(n=1004)
Overall
(n=3165)
age
Mean (SD) 52.0 (11.7) 53.3 (12.5) 52.4 (12.0)
Median [Min, Max] 51.0 [30.0, 93.0] 52.0 [30.0, 89.0] 51.0 [30.0, 93.0]
height
Mean (SD) 153 (5.52) 163 (6.11) 157 (7.33)
Median [Min, Max] 153 [132, 173] 163 [140, 180] 156 [132, 180]
Missing 6 (0.3%) 1 (0.1%) 7 (0.2%)
weight
Mean (SD) 56.5 (9.34) 63.8 (10.5) 58.8 (10.3)
Median [Min, Max] 55.0 [29.0, 98.0] 63.0 [38.0, 115] 58.0 [29.0, 115]
Missing 2 (0.1%) 0 (0%) 2 (0.1%)
waist
Mean (SD) 80.1 (10.2) 86.0 (9.70) 82.0 (10.4)
Median [Min, Max] 79.0 [54.0, 120] 87.0 [59.0, 120] 82.0 [54.0, 120]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
hip
Mean (SD) 93.9 (7.66) 95.4 (7.64) 94.4 (7.68)
Median [Min, Max] 93.0 [64.0, 124] 95.0 [56.0, 130] 94.0 [56.0, 130]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
whr
Mean (SD) 0.852 (0.0761) 0.901 (0.0660) 0.868 (0.0765)
Median [Min, Max] 0.850 [0.610, 1.21] 0.900 [0.680, 1.36] 0.870 [0.610, 1.36]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
active
Mean (SD) 0.569 (0.495) 0.450 (0.498) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 0.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.533 (0.499) 0.682 (0.466) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
diabetes
IFG 157 (7.3%) 86 (8.6%) 243 (7.7%)
Normal 1857 (85.9%) 823 (82.0%) 2680 (84.7%)
Yes 147 (6.8%) 95 (9.5%) 242 (7.6%)
# Descriptive summary table with one column per diabetes status (IFG, Normal,
# Yes) plus an overall column. The redundant unary plus in front of `bmi` is
# dropped; the term is still `bmi`, so the resulting table is the same.
table1(
  ~ gender + age + height + weight + bmi + waist + hip + whr + active +
    hypertension | diabetes,
  data = db
)
IFG
(n=243)
Normal
(n=2680)
Yes
(n=242)
Overall
(n=3165)
gender
Female 157 (64.6%) 1857 (69.3%) 147 (60.7%) 2161 (68.3%)
Male 86 (35.4%) 823 (30.7%) 95 (39.3%) 1004 (31.7%)
age
Mean (SD) 55.8 (11.1) 52.0 (12.1) 52.7 (11.6) 52.4 (12.0)
Median [Min, Max] 55.0 [30.0, 86.0] 51.0 [30.0, 93.0] 51.0 [30.0, 82.0] 51.0 [30.0, 93.0]
height
Mean (SD) 157 (6.98) 156 (7.33) 158 (7.52) 157 (7.33)
Median [Min, Max] 157 [132, 176] 156 [133, 180] 157 [139, 178] 156 [132, 180]
Missing 0 (0%) 6 (0.2%) 1 (0.4%) 7 (0.2%)
weight
Mean (SD) 61.1 (10.4) 58.5 (10.2) 59.6 (10.5) 58.8 (10.3)
Median [Min, Max] 60.0 [39.0, 96.0] 58.0 [29.0, 113] 58.8 [36.0, 115] 58.0 [29.0, 115]
Missing 1 (0.4%) 1 (0.0%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.7 (3.48) 23.9 (3.62) 23.9 (3.27) 24.0 (3.59)
Median [Min, Max] 24.6 [17.6, 37.5] 23.6 [13.3, 43.8] 23.7 [15.8, 39.8] 23.7 [13.3, 43.8]
Missing 1 (0.4%) 7 (0.3%) 1 (0.4%) 9 (0.3%)
waist
Mean (SD) 84.9 (10.6) 81.6 (10.4) 83.7 (9.51) 82.0 (10.4)
Median [Min, Max] 85.0 [57.0, 119] 81.0 [54.0, 120] 83.0 [63.0, 120] 82.0 [54.0, 120]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
hip
Mean (SD) 95.2 (7.98) 94.4 (7.66) 93.9 (7.58) 94.4 (7.68)
Median [Min, Max] 95.0 [77.0, 119] 94.0 [56.0, 130] 94.0 [69.0, 115] 94.0 [56.0, 130]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
whr
Mean (SD) 0.891 (0.0679) 0.863 (0.0772) 0.892 (0.0687) 0.868 (0.0765)
Median [Min, Max] 0.890 [0.730, 1.12] 0.860 [0.610, 1.36] 0.880 [0.680, 1.17] 0.870 [0.610, 1.36]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
active
Mean (SD) 0.523 (0.501) 0.533 (0.499) 0.517 (0.501) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.737 (0.441) 0.563 (0.496) 0.612 (0.488) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
# Same table stratified by diabetes status, but with `gender`, `active` and
# `hypertension` wrapped in factor() so that table1 reports them as counts and
# percentages per level instead of mean/SD.
table1(
  ~ factor(gender) + age + height + weight + bmi + waist + hip + whr +
    factor(active) + factor(hypertension) | diabetes,
  data = db
)

IFG
(n=243)
Normal
(n=2680)
Yes
(n=242)
Overall
(n=3165)
factor(gender)
Female 157 (64.6%) 1857 (69.3%) 147 (60.7%) 2161 (68.3%)
Male 86 (35.4%) 823 (30.7%) 95 (39.3%) 1004 (31.7%)
age
Mean (SD) 55.8 (11.1) 52.0 (12.1) 52.7 (11.6) 52.4 (12.0)
Median [Min, Max] 55.0 [30.0, 86.0] 51.0 [30.0, 93.0] 51.0 [30.0, 82.0] 51.0 [30.0, 93.0]
height
Mean (SD) 157 (6.98) 156 (7.33) 158 (7.52) 157 (7.33)
Median [Min, Max] 157 [132, 176] 156 [133, 180] 157 [139, 178] 156 [132, 180]
Missing 0 (0%) 6 (0.2%) 1 (0.4%) 7 (0.2%)
weight
Mean (SD) 61.1 (10.4) 58.5 (10.2) 59.6 (10.5) 58.8 (10.3)
Median [Min, Max] 60.0 [39.0, 96.0] 58.0 [29.0, 113] 58.8 [36.0, 115] 58.0 [29.0, 115]
Missing 1 (0.4%) 1 (0.0%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.7 (3.48) 23.9 (3.62) 23.9 (3.27) 24.0 (3.59)
Median [Min, Max] 24.6 [17.6, 37.5] 23.6 [13.3, 43.8] 23.7 [15.8, 39.8] 23.7 [13.3, 43.8]
Missing 1 (0.4%) 7 (0.3%) 1 (0.4%) 9 (0.3%)
waist
Mean (SD) 84.9 (10.6) 81.6 (10.4) 83.7 (9.51) 82.0 (10.4)
Median [Min, Max] 85.0 [57.0, 119] 81.0 [54.0, 120] 83.0 [63.0, 120] 82.0 [54.0, 120]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
hip
Mean (SD) 95.2 (7.98) 94.4 (7.66) 93.9 (7.58) 94.4 (7.68)
Median [Min, Max] 95.0 [77.0, 119] 94.0 [56.0, 130] 94.0 [69.0, 115] 94.0 [56.0, 130]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
whr
Mean (SD) 0.891 (0.0679) 0.863 (0.0772) 0.892 (0.0687) 0.868 (0.0765)
Median [Min, Max] 0.890 [0.730, 1.12] 0.860 [0.610, 1.36] 0.880 [0.680, 1.17] 0.870 [0.610, 1.36]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
factor(active)
0 116 (47.7%) 1251 (46.7%) 117 (48.3%) 1484 (46.9%)
1 127 (52.3%) 1429 (53.3%) 125 (51.7%) 1681 (53.1%)
factor(hypertension)
0 64 (26.3%) 1171 (43.7%) 94 (38.8%) 1329 (42.0%)
1 179 (73.7%) 1509 (56.3%) 148 (61.2%) 1836 (58.0%)
### Is there a difference in BMI between men and women? Use t.test
### (run with its defaults, which gives the Welch two-sample test).

t.test(db$bmi ~ db$gender)
## 
##  Welch Two Sample t-test
## 
## data:  db$bmi by db$gender
## t = 0.66677, df = 2034.2, p-value = 0.505
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1749957  0.3552877
## sample estimates:
## mean in group Female   mean in group Male 
##             23.98515             23.89500

### Does the proportion of people with diabetes differ between men and women?
# Create a new two-valued variable: "Yes" when `diabetes` is "Yes",
# otherwise "No" (so the IFG and Normal groups are both coded "No").

db$diab <- ifelse(db$diabetes == "Yes", "Yes", "No")

# Cross-tabulate gender (rows) against the binary diabetes indicator (columns).
tab <- table(db$gender, db$diab)
tab
##         
##            No  Yes
##   Female 2014  147
##   Male    909   95

# z-test for the difference in the proportion of diabetes between women and men.
# prop.test() treats the FIRST column of a two-column table as the "success"
# count. `tab` has "No" as its first column, so passing it unchanged reports
# the proportions WITHOUT diabetes. The columns are reordered here so that the
# estimates and the confidence interval refer to the proportion WITH diabetes.
# The test statistic and p-value do not depend on the column order.

prop.test(tab[, c("Yes", "No")])
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab[, c("Yes", "No")]
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.048314356 -0.004880546
## sample estimates:
##     prop 1     prop 2 
## 0.06802406 0.09462151
# The same comparison computed with chisq.test on the gender-by-diabetes table.
chisq.test(x = tab)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081

# z-test for the difference in the proportion of diabetes between women and men.
# prop.test() treats the FIRST column of a two-column table as the "success"
# count. `tab` has "No" as its first column, so passing it unchanged reports
# the proportions WITHOUT diabetes. The columns are reordered here so that the
# estimates and the confidence interval refer to the proportion WITH diabetes.
# The test statistic and p-value do not depend on the column order.

prop.test(tab[, c("Yes", "No")])
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab[, c("Yes", "No")]
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.048314356 -0.004880546
## sample estimates:
##     prop 1     prop 2 
## 0.06802406 0.09462151

tính theo Chisq.test

# Pearson chi-squared test on the gender-by-diabetes 2x2 table.
chisq.test(x = tab)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081

Có sự khác biệt về BMI giữa 3 nhóm (IFG, Normal, Yes) không?

# Method: analysis of variance (aov) of BMI across the diabetes groups.
m <- aov(bmi ~ diabetes, data = db)

# Print the ANOVA table (F test for the `diabetes` term).
summary(m)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## diabetes       2    151   75.27   5.851 0.00291 **
## Residuals   3153  40567   12.87                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9 observations deleted due to missingness
# Tukey HSD post-hoc test: pairwise differences in mean BMI between the
# diabetes groups, using the fitted model stored in `m`.
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = db)
## 
## $diabetes
##                  diff        lwr          upr     p adj
## Normal-IFG -0.8236127 -1.3882150 -0.259010503 0.0018313
## Yes-IFG    -0.7643882 -1.5297871  0.001010667 0.0503925
## Yes-Normal  0.0592245 -0.5064508  0.624899835 0.9673231
# Analysis for the female group.
# Create a separate data frame holding only the rows where gender is "Female".
women <- subset(db, subset = gender == "Female")

# Check the number of rows and columns of the subset.
dim(women)
## [1] 2161   15
# Refit the BMI-by-diabetes ANOVA on the female subset (this overwrites `m`),
# then run the Tukey HSD pairwise comparisons on that fit.
m <- aov(bmi ~ diabetes, data = women)
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = women)
## 
## $diabetes
##                  diff       lwr       upr     p adj
## Normal-IFG -0.5165746 -1.228157 0.1950074 0.2044133
## Yes-IFG    -0.8256674 -1.808504 0.1571697 0.1198536
## Yes-Normal -0.3090928 -1.042806 0.4246202 0.5845178
# Analysis for the male group: keep only the rows where gender is "Male".
men <- subset(db, subset = gender == "Male")

# Check the number of rows and columns of the subset.
dim(men)
## [1] 1004   15
# Refit the BMI-by-diabetes ANOVA on the male subset (this overwrites `m`),
# then run the Tukey HSD pairwise comparisons on that fit.
m <- aov(bmi ~ diabetes, data = men)
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = men)
## 
## $diabetes
##                  diff        lwr        upr     p adj
## Normal-IFG -1.4155325 -2.3370305 -0.4940344 0.0009525
## Yes-IFG    -0.7331665 -1.9433880  0.4770551 0.3299207
## Yes-Normal  0.6823660 -0.1987307  1.5634627 0.1642512

Vẽ biểu đồ tương quan đa chiều giữa các biến liên tục trong dataset

Tạo ra một dataset mới gồm các biến liên tục

# Build a data frame that keeps only the continuous variables of `db`.
dat <- db[
  ,
  c("age", "weight", "height", "bmi", "waist", "hip", "whr")
]

# pairs.panels() comes from the psych package; draw the pairwise panel plot
# for the selected columns.
library(psych)
pairs.panels(dat)

## Ý nghĩa: hệ số tương quan dao động từ -1 đến 1; giá trị tuyệt đối càng gần 0 thì mối liên quan càng yếu (0: không có liên quan), càng gần 1 thì mối liên quan càng chặt chẽ. Hệ số tương quan âm thể hiện mối liên quan nghịch. Ví dụ: age & height là tương quan nghịch (tuổi càng cao, chiều cao càng giảm).