# Location of the diabetes practice data set (CSV) on the local machine.
t <- "/Users/locnguyen/Documents/R Console/Datasets for practice/Diabetes data.csv"

# Read the CSV into a data frame and preview its first six rows.
db <- read.csv(file = t)
head(db, n = 6L)
##   id age gender height weight waist hip sysbp diabp active hypertension
## 1  1  76 Female    163     53    90  93   160    90      0            1
## 2  1  40 Female    149     51    74  94   100    60      0            0
## 3  1  51 Female    151     55    91 100   120    80      0            0
## 4  1  43 Female    158     62    78  96   120    80      1            0
## 5  2  72 Female    148     47    91  95   130    60      1            0
## 6  2  44   Male    155     48    69  86   120    80      0            0
##     bmi  whr diabetes
## 1 19.95 0.97      IFG
## 2 22.97 0.79   Normal
## 3 24.12 0.91   Normal
## 4 24.84 0.81   Normal
## 5 21.46 0.96      IFG
## 6 19.98 0.80   Normal

Use table1 to summarise the data by gender.

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary of the covariates, stratified by gender.
table1(
  ~ age + height + weight + bmi + waist + hip + whr +
    active + hypertension + diabetes | gender,
  data = db
)

Female
(n=2161)
Male
(n=1004)
Overall
(n=3165)
age
Mean (SD) 52.0 (11.7) 53.3 (12.5) 52.4 (12.0)
Median [Min, Max] 51.0 [30.0, 93.0] 52.0 [30.0, 89.0] 51.0 [30.0, 93.0]
height
Mean (SD) 153 (5.52) 163 (6.11) 157 (7.33)
Median [Min, Max] 153 [132, 173] 163 [140, 180] 156 [132, 180]
Missing 6 (0.3%) 1 (0.1%) 7 (0.2%)
weight
Mean (SD) 56.5 (9.34) 63.8 (10.5) 58.8 (10.3)
Median [Min, Max] 55.0 [29.0, 98.0] 63.0 [38.0, 115] 58.0 [29.0, 115]
Missing 2 (0.1%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.0 (3.64) 23.9 (3.49) 24.0 (3.59)
Median [Min, Max] 23.6 [13.3, 43.8] 23.7 [14.0, 39.8] 23.7 [13.3, 43.8]
Missing 8 (0.4%) 1 (0.1%) 9 (0.3%)
waist
Mean (SD) 80.1 (10.2) 86.0 (9.70) 82.0 (10.4)
Median [Min, Max] 79.0 [54.0, 120] 87.0 [59.0, 120] 82.0 [54.0, 120]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
hip
Mean (SD) 93.9 (7.66) 95.4 (7.64) 94.4 (7.68)
Median [Min, Max] 93.0 [64.0, 124] 95.0 [56.0, 130] 94.0 [56.0, 130]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
whr
Mean (SD) 0.852 (0.0761) 0.901 (0.0660) 0.868 (0.0765)
Median [Min, Max] 0.850 [0.610, 1.21] 0.900 [0.680, 1.36] 0.870 [0.610, 1.36]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
active
Mean (SD) 0.569 (0.495) 0.450 (0.498) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 0.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.533 (0.499) 0.682 (0.466) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
diabetes
IFG 157 (7.3%) 86 (8.6%) 243 (7.7%)
Normal 1857 (85.9%) 823 (82.0%) 2680 (84.7%)
Yes 147 (6.8%) 95 (9.5%) 242 (7.6%)
# Use table1 to summarise the data by diabetes status.
# active and hypertension are passed as plain 0/1 numerics here, so they are
# reported as Mean (SD) and Median [Min, Max].

table1(
  ~ gender + age + height + weight + bmi + waist + hip + whr +
    active + hypertension | diabetes,
  data = db
)
IFG
(n=243)
Normal
(n=2680)
Yes
(n=242)
Overall
(n=3165)
gender
Female 157 (64.6%) 1857 (69.3%) 147 (60.7%) 2161 (68.3%)
Male 86 (35.4%) 823 (30.7%) 95 (39.3%) 1004 (31.7%)
age
Mean (SD) 55.8 (11.1) 52.0 (12.1) 52.7 (11.6) 52.4 (12.0)
Median [Min, Max] 55.0 [30.0, 86.0] 51.0 [30.0, 93.0] 51.0 [30.0, 82.0] 51.0 [30.0, 93.0]
height
Mean (SD) 157 (6.98) 156 (7.33) 158 (7.52) 157 (7.33)
Median [Min, Max] 157 [132, 176] 156 [133, 180] 157 [139, 178] 156 [132, 180]
Missing 0 (0%) 6 (0.2%) 1 (0.4%) 7 (0.2%)
weight
Mean (SD) 61.1 (10.4) 58.5 (10.2) 59.6 (10.5) 58.8 (10.3)
Median [Min, Max] 60.0 [39.0, 96.0] 58.0 [29.0, 113] 58.8 [36.0, 115] 58.0 [29.0, 115]
Missing 1 (0.4%) 1 (0.0%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.7 (3.48) 23.9 (3.62) 23.9 (3.27) 24.0 (3.59)
Median [Min, Max] 24.6 [17.6, 37.5] 23.6 [13.3, 43.8] 23.7 [15.8, 39.8] 23.7 [13.3, 43.8]
Missing 1 (0.4%) 7 (0.3%) 1 (0.4%) 9 (0.3%)
waist
Mean (SD) 84.9 (10.6) 81.6 (10.4) 83.7 (9.51) 82.0 (10.4)
Median [Min, Max] 85.0 [57.0, 119] 81.0 [54.0, 120] 83.0 [63.0, 120] 82.0 [54.0, 120]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
hip
Mean (SD) 95.2 (7.98) 94.4 (7.66) 93.9 (7.58) 94.4 (7.68)
Median [Min, Max] 95.0 [77.0, 119] 94.0 [56.0, 130] 94.0 [69.0, 115] 94.0 [56.0, 130]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
whr
Mean (SD) 0.891 (0.0679) 0.863 (0.0772) 0.892 (0.0687) 0.868 (0.0765)
Median [Min, Max] 0.890 [0.730, 1.12] 0.860 [0.610, 1.36] 0.880 [0.680, 1.17] 0.870 [0.610, 1.36]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
active
Mean (SD) 0.523 (0.501) 0.533 (0.499) 0.517 (0.501) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.737 (0.441) 0.563 (0.496) 0.612 (0.488) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
# Same stratified summary, with the 0/1 indicators wrapped in factor() so that
# they are tabulated as counts and percentages per level.
table1(
  ~ gender + age + height + weight + bmi + waist + hip + whr +
    factor(active) + factor(hypertension) | diabetes,
  data = db
)

IFG
(n=243)
Normal
(n=2680)
Yes
(n=242)
Overall
(n=3165)
gender
Female 157 (64.6%) 1857 (69.3%) 147 (60.7%) 2161 (68.3%)
Male 86 (35.4%) 823 (30.7%) 95 (39.3%) 1004 (31.7%)
age
Mean (SD) 55.8 (11.1) 52.0 (12.1) 52.7 (11.6) 52.4 (12.0)
Median [Min, Max] 55.0 [30.0, 86.0] 51.0 [30.0, 93.0] 51.0 [30.0, 82.0] 51.0 [30.0, 93.0]
height
Mean (SD) 157 (6.98) 156 (7.33) 158 (7.52) 157 (7.33)
Median [Min, Max] 157 [132, 176] 156 [133, 180] 157 [139, 178] 156 [132, 180]
Missing 0 (0%) 6 (0.2%) 1 (0.4%) 7 (0.2%)
weight
Mean (SD) 61.1 (10.4) 58.5 (10.2) 59.6 (10.5) 58.8 (10.3)
Median [Min, Max] 60.0 [39.0, 96.0] 58.0 [29.0, 113] 58.8 [36.0, 115] 58.0 [29.0, 115]
Missing 1 (0.4%) 1 (0.0%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.7 (3.48) 23.9 (3.62) 23.9 (3.27) 24.0 (3.59)
Median [Min, Max] 24.6 [17.6, 37.5] 23.6 [13.3, 43.8] 23.7 [15.8, 39.8] 23.7 [13.3, 43.8]
Missing 1 (0.4%) 7 (0.3%) 1 (0.4%) 9 (0.3%)
waist
Mean (SD) 84.9 (10.6) 81.6 (10.4) 83.7 (9.51) 82.0 (10.4)
Median [Min, Max] 85.0 [57.0, 119] 81.0 [54.0, 120] 83.0 [63.0, 120] 82.0 [54.0, 120]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
hip
Mean (SD) 95.2 (7.98) 94.4 (7.66) 93.9 (7.58) 94.4 (7.68)
Median [Min, Max] 95.0 [77.0, 119] 94.0 [56.0, 130] 94.0 [69.0, 115] 94.0 [56.0, 130]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
whr
Mean (SD) 0.891 (0.0679) 0.863 (0.0772) 0.892 (0.0687) 0.868 (0.0765)
Median [Min, Max] 0.890 [0.730, 1.12] 0.860 [0.610, 1.36] 0.880 [0.680, 1.17] 0.870 [0.610, 1.36]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
factor(active)
0 116 (47.7%) 1251 (46.7%) 117 (48.3%) 1484 (46.9%)
1 127 (52.3%) 1429 (53.3%) 125 (51.7%) 1681 (53.1%)
factor(hypertension)
0 64 (26.3%) 1171 (43.7%) 94 (38.8%) 1329 (42.0%)
1 179 (73.7%) 1509 (56.3%) 148 (61.2%) 1836 (58.0%)
# Diabetes prevalence in men versus women.
# Step 1: create a new variable with only two values (Yes/No); every diabetes
# status other than "Yes" is coded as "No".

db[["diab"]] <- ifelse(
  test = db[["diabetes"]] == "Yes",
  yes = "Yes",
  no = "No"
)
head(db, n = 6L)
##   id age gender height weight waist hip sysbp diabp active hypertension
## 1  1  76 Female    163     53    90  93   160    90      0            1
## 2  1  40 Female    149     51    74  94   100    60      0            0
## 3  1  51 Female    151     55    91 100   120    80      0            0
## 4  1  43 Female    158     62    78  96   120    80      1            0
## 5  2  72 Female    148     47    91  95   130    60      1            0
## 6  2  44   Male    155     48    69  86   120    80      0            0
##     bmi  whr diabetes diab
## 1 19.95 0.97      IFG   No
## 2 22.97 0.79   Normal   No
## 3 24.12 0.91   Normal   No
## 4 24.84 0.81   Normal   No
## 5 21.46 0.96      IFG   No
## 6 19.98 0.80   Normal   No
# Cross-tabulate gender (rows) against the Yes/No diabetes indicator (columns).
tab <- table(db[["gender"]], db[["diab"]])
tab
##         
##            No  Yes
##   Female 2014  147
##   Male    909   95
# Two-sample test for equality of proportions between the two rows of tab;
# the reported proportions are those of the first column ("No").
prop.test(x = tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.004880546 0.048314356
## sample estimates:
##    prop 1    prop 2 
## 0.9319759 0.9053785
# Z-test for the difference in diabetes proportions between men and women
# (p-value = 0.01081): the proportions differ between the two genders.
# Note: prop 1 and prop 2 are the proportions WITHOUT diabetes (the "No"
# column) for Female and Male respectively.

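Question 6: BMI in men versus women (from the results, is BMI lower in women or in men?). Note: the models below compare BMI across the diabetes groups (IFG / Normal / Yes), first overall and then separately within women and within men.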

Method: analysis of variance (aov)

# One-way ANOVA of BMI across the diabetes groups, using all participants.
m <- aov(formula = bmi ~ diabetes, data = db)
summary(object = m)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## diabetes       2    151   75.27   5.851 0.00291 **
## Residuals   3153  40567   12.87                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9 observations deleted due to missingness
# Tukey pairwise comparisons between the diabetes groups for the fitted model.
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = db)
## 
## $diabetes
##                  diff        lwr          upr     p adj
## Normal-IFG -0.8236127 -1.3882150 -0.259010503 0.0018313
## Yes-IFG    -0.7643882 -1.5297871  0.001010667 0.0503925
## Yes-Normal  0.0592245 -0.5064508  0.624899835 0.9673231

Analysis of the female group

Create a data set containing only the women

# Keep only the rows whose gender is "Female", then report rows x columns.
women <- db[which(db[["gender"]] == "Female"), , drop = FALSE]
dim(women)
## [1] 2161   15
# Refit the one-way ANOVA within women (this reuses the name m) and run the
# Tukey pairwise comparisons on it.
m <- aov(formula = bmi ~ diabetes, data = women)
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = women)
## 
## $diabetes
##                  diff       lwr       upr     p adj
## Normal-IFG -0.5165746 -1.228157 0.1950074 0.2044133
## Yes-IFG    -0.8256674 -1.808504 0.1571697 0.1198536
## Yes-Normal -0.3090928 -1.042806 0.4246202 0.5845178

# Create a data set containing only the men.

# Keep only the rows whose gender is "Male", then report rows x columns.
man <- db[which(db[["gender"]] == "Male"), , drop = FALSE]
dim(man)
## [1] 1004   15
# One-way ANOVA of BMI across the diabetes groups within men, followed by the
# Tukey pairwise comparisons.
ma <- aov(formula = bmi ~ diabetes, data = man)
TukeyHSD(x = ma)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = man)
## 
## $diabetes
##                  diff        lwr        upr     p adj
## Normal-IFG -1.4155325 -2.3370305 -0.4940344 0.0009525
## Yes-IFG    -0.7331665 -1.9433880  0.4770551 0.3299207
## Yes-Normal  0.6823660 -0.1987307  1.5634627 0.1642512

# Draw a multi-panel correlation plot of the continuous variables in the data.

# Preview the first three rows to recall the available columns.
head(db, n = 3L)
##   id age gender height weight waist hip sysbp diabp active hypertension
## 1  1  76 Female    163     53    90  93   160    90      0            1
## 2  1  40 Female    149     51    74  94   100    60      0            0
## 3  1  51 Female    151     55    91 100   120    80      0            0
##     bmi  whr diabetes diab
## 1 19.95 0.97      IFG   No
## 2 22.97 0.79   Normal   No
## 3 24.12 0.91   Normal   No
# Create a new data frame holding only the continuous variables. Correlation
# coefficients lie between -1 and 1: values near -1 or 1 indicate a strong
# relationship, values near 0 a weak one.
dat <- db[c("age", "height", "weight", "bmi", "waist", "hip", "whr")]
library(psych)
# Plot all pairwise relationships between the selected variables.
pairs.panels(dat)