# Absolute path to the diabetes CSV file on the author's Windows machine.
t <- "C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\Diabetes data.csv"
# Load the data set into `db` and print its first rows as a quick sanity check.
db <- read.csv(file = t)
head(x = db)
##   id age gender height weight waist hip sysbp diabp active hypertension
## 1  1  76 Female    163     53    90  93   160    90      0            1
## 2  1  40 Female    149     51    74  94   100    60      0            0
## 3  1  51 Female    151     55    91 100   120    80      0            0
## 4  1  43 Female    158     62    78  96   120    80      1            0
## 5  2  72 Female    148     47    91  95   130    60      1            0
## 6  2  44   Male    155     48    69  86   120    80      0            0
##     bmi  whr diabetes
## 1 19.95 0.97      IFG
## 2 22.97 0.79   Normal
## 3 24.12 0.91   Normal
## 4 24.84 0.81   Normal
## 5 21.46 0.96      IFG
## 6 19.98 0.80   Normal

# Data analysis: describe the variables by gender

library(table1)
## Warning: package 'table1' was built under R version 3.5.3
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Summarise every variable on the left of `|`, stratified by gender.
# `active` and `hypertension` are passed as-is here, so they are summarised
# as numeric variables (mean/SD) rather than as counts.
table1(
  ~ age + height + weight + bmi + waist + hip + whr + active + hypertension +
    diabetes | gender,
  data = db
)
Female
(n=2161)
Male
(n=1004)
Overall
(n=3165)
age
Mean (SD) 52.0 (11.7) 53.3 (12.5) 52.4 (12.0)
Median [Min, Max] 51.0 [30.0, 93.0] 52.0 [30.0, 89.0] 51.0 [30.0, 93.0]
height
Mean (SD) 153 (5.52) 163 (6.11) 157 (7.33)
Median [Min, Max] 153 [132, 173] 163 [140, 180] 156 [132, 180]
Missing 6 (0.3%) 1 (0.1%) 7 (0.2%)
weight
Mean (SD) 56.5 (9.34) 63.8 (10.5) 58.8 (10.3)
Median [Min, Max] 55.0 [29.0, 98.0] 63.0 [38.0, 115] 58.0 [29.0, 115]
Missing 2 (0.1%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.0 (3.64) 23.9 (3.49) 24.0 (3.59)
Median [Min, Max] 23.6 [13.3, 43.8] 23.7 [14.0, 39.8] 23.7 [13.3, 43.8]
Missing 8 (0.4%) 1 (0.1%) 9 (0.3%)
waist
Mean (SD) 80.1 (10.2) 86.0 (9.70) 82.0 (10.4)
Median [Min, Max] 79.0 [54.0, 120] 87.0 [59.0, 120] 82.0 [54.0, 120]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
hip
Mean (SD) 93.9 (7.66) 95.4 (7.64) 94.4 (7.68)
Median [Min, Max] 93.0 [64.0, 124] 95.0 [56.0, 130] 94.0 [56.0, 130]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
whr
Mean (SD) 0.852 (0.0761) 0.901 (0.0660) 0.868 (0.0765)
Median [Min, Max] 0.850 [0.610, 1.21] 0.900 [0.680, 1.36] 0.870 [0.610, 1.36]
Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
active
Mean (SD) 0.569 (0.495) 0.450 (0.498) 0.531 (0.499)
Median [Min, Max] 1.00 [0.00, 1.00] 0.00 [0.00, 1.00] 1.00 [0.00, 1.00]
hypertension
Mean (SD) 0.533 (0.499) 0.682 (0.466) 0.580 (0.494)
Median [Min, Max] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00] 1.00 [0.00, 1.00]
diabetes
IFG 157 (7.3%) 86 (8.6%) 243 (7.7%)
Normal 1857 (85.9%) 823 (82.0%) 2680 (84.7%)
Yes 147 (6.8%) 95 (9.5%) 242 (7.6%)

# `active` and `hypertension` are categorical variables, so wrap them in
# factor() first so that they are treated as binary variables.

table1(
  ~ age + height + weight + bmi + waist + hip + whr + factor(active) +
    factor(hypertension) | diabetes,
  data = db
)

IFG
(n=243)
Normal
(n=2680)
Yes
(n=242)
Overall
(n=3165)
age
Mean (SD) 55.8 (11.1) 52.0 (12.1) 52.7 (11.6) 52.4 (12.0)
Median [Min, Max] 55.0 [30.0, 86.0] 51.0 [30.0, 93.0] 51.0 [30.0, 82.0] 51.0 [30.0, 93.0]
height
Mean (SD) 157 (6.98) 156 (7.33) 158 (7.52) 157 (7.33)
Median [Min, Max] 157 [132, 176] 156 [133, 180] 157 [139, 178] 156 [132, 180]
Missing 0 (0%) 6 (0.2%) 1 (0.4%) 7 (0.2%)
weight
Mean (SD) 61.1 (10.4) 58.5 (10.2) 59.6 (10.5) 58.8 (10.3)
Median [Min, Max] 60.0 [39.0, 96.0] 58.0 [29.0, 113] 58.8 [36.0, 115] 58.0 [29.0, 115]
Missing 1 (0.4%) 1 (0.0%) 0 (0%) 2 (0.1%)
bmi
Mean (SD) 24.7 (3.48) 23.9 (3.62) 23.9 (3.27) 24.0 (3.59)
Median [Min, Max] 24.6 [17.6, 37.5] 23.6 [13.3, 43.8] 23.7 [15.8, 39.8] 23.7 [13.3, 43.8]
Missing 1 (0.4%) 7 (0.3%) 1 (0.4%) 9 (0.3%)
waist
Mean (SD) 84.9 (10.6) 81.6 (10.4) 83.7 (9.51) 82.0 (10.4)
Median [Min, Max] 85.0 [57.0, 119] 81.0 [54.0, 120] 83.0 [63.0, 120] 82.0 [54.0, 120]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
hip
Mean (SD) 95.2 (7.98) 94.4 (7.66) 93.9 (7.58) 94.4 (7.68)
Median [Min, Max] 95.0 [77.0, 119] 94.0 [56.0, 130] 94.0 [69.0, 115] 94.0 [56.0, 130]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
whr
Mean (SD) 0.891 (0.0679) 0.863 (0.0772) 0.892 (0.0687) 0.868 (0.0765)
Median [Min, Max] 0.890 [0.730, 1.12] 0.860 [0.610, 1.36] 0.880 [0.680, 1.17] 0.870 [0.610, 1.36]
Missing 1 (0.4%) 9 (0.3%) 1 (0.4%) 11 (0.3%)
factor(active)
0 116 (47.7%) 1251 (46.7%) 117 (48.3%) 1484 (46.9%)
1 127 (52.3%) 1429 (53.3%) 125 (51.7%) 1681 (53.1%)
factor(hypertension)
0 64 (26.3%) 1171 (43.7%) 94 (38.8%) 1329 (42.0%)
1 179 (73.7%) 1509 (56.3%) 148 (61.2%) 1836 (58.0%)
# Difference in BMI between men and women (two-sample t-test).

t.test(db$bmi ~ db$gender)
## 
##  Welch Two Sample t-test
## 
## data:  db$bmi by db$gender
## t = 0.66677, df = 2034.2, p-value = 0.505
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1749957  0.3552877
## sample estimates:
## mean in group Female   mean in group Male 
##             23.98515             23.89500
# Conclusion: there is no difference in BMI between the two sexes.
# Next: is there a difference in diabetes between men and women? `diabetes`
# has three values, so a t-test is not appropriate; create a new variable
# with only two values, "Yes" and "No".
db[["diab"]] <- ifelse(
  test = db[["diabetes"]] == "Yes",
  yes = "Yes",
  no = "No"
)
# Count the diabetes cases in each gender group
# (rows = diab status, columns = gender).
tab <- table(db[["diab"]], db[["gender"]])
tab
##      
##       Female Male
##   No    2014  909
##   Yes    147   95
# Test the difference between the two diabetes proportions in men and women
# with prop.test() (a chi-squared test). The contingency table must be built
# with table() first and is then passed to prop.test().
# NOTE(review): `tab` has diab status in the rows here, so the two proportions
# reported are the share of the first column (Female) within the "No" and
# "Yes" rows, not the diabetes rate per gender.
prop.test(x = tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.01557132 0.14758891
## sample estimates:
##    prop 1    prop 2 
## 0.6890181 0.6074380
# Conclusion: there is a difference in diabetes between men and women.
# Rebuild the table with gender in the rows and diab status in the columns.
tab <- table(db[["gender"]], db[["diab"]])
tab
##         
##            No  Yes
##   Female 2014  147
##   Male    909   95
# With gender in the rows, the proportions compared are those of the first
# column ("No") within Female and within Male.
prop.test(x = tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.004880546 0.048314356
## sample estimates:
##    prop 1    prop 2 
## 0.9319759 0.9053785

# Difference in BMI between the three diabetes groups: `diabetes` has three
# groups, so use analysis of variance (ANOVA).

# aov(): compare BMI across the three groups.
m <- aov(formula = bmi ~ diabetes, data = db)
summary(object = m)
##               Df Sum Sq Mean Sq F value  Pr(>F)   
## diabetes       2    151   75.27   5.851 0.00291 **
## Residuals   3153  40567   12.87                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 9 observations deleted due to missingness
# TukeyHSD(): shows the difference between each pair of groups.
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = db)
## 
## $diabetes
##                  diff        lwr          upr     p adj
## Normal-IFG -0.8236127 -1.3882150 -0.259010503 0.0018313
## Yes-IFG    -0.7643882 -1.5297871  0.001010667 0.0503925
## Yes-Normal  0.0592245 -0.5064508  0.624899835 0.9673231
# Create a data set containing only the women.
# which() drops rows where the comparison is NA, matching subset().
women <- db[which(db$gender == "Female"), ]
dim(women)
# Repeat the ANOVA of BMI by diabetes group on the women only.
m <- aov(formula = bmi ~ diabetes, data = women)
summary(object = m)
##               Df Sum Sq Mean Sq F value Pr(>F)
## diabetes       2     55   27.50   2.076  0.126
## Residuals   2150  28475   13.24               
## 8 observations deleted due to missingness
# Pairwise group comparisons for the model fitted on `women`.
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = women)
## 
## $diabetes
##                  diff       lwr       upr     p adj
## Normal-IFG -0.5165746 -1.228157 0.1950074 0.2044133
## Yes-IFG    -0.8256674 -1.808504 0.1571697 0.1198536
## Yes-Normal -0.3090928 -1.042806 0.4246202 0.5845178
# Draw a multi-panel correlation plot of the continuous variables in the
# data set.
# Create a new data frame holding only the continuous variables.
dat <- db[c("age", "height", "weight", "bmi", "waist", "hip", "whr")]
library(psych)
# pairs.panels() draws the correlations between several variables.
pairs.panels(dat)