# Load the diabetes dataset used for the descriptive table and tests below
# (categorical variables are wrapped in factor() when tabulated).
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another machine.
db <- read.csv(file = "D:\\data analysis\\dataset\\Diabetes data.csv")
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive statistics of the cohort, stratified by gender (plus an overall
# column). `hypertension` is coded 0/1, so it is wrapped in factor() to be
# summarised as counts/percentages instead of mean/SD. Each variable is listed
# exactly once on the right-hand side of the formula.
table1(
  ~ age + height + hip + weight + waist + factor(hypertension) + bmi + whr +
    diabetes | gender,
  data = db
)
## Female
## (n=2161)
## Male
## (n=1004)
## Overall
## (n=3165)
## age
## Mean (SD) 52.0 (11.7) 53.3 (12.5) 52.4 (12.0)
## Median [Min, Max] 51.0 [30.0, 93.0] 52.0 [30.0, 89.0] 51.0 [30.0, 93.0]
## height
## Mean (SD) 153 (5.52) 163 (6.11) 157 (7.33)
## Median [Min, Max] 153 [132, 173] 163 [140, 180] 156 [132, 180]
## Missing 6 (0.3%) 1 (0.1%) 7 (0.2%)
## hip
## Mean (SD) 93.9 (7.66) 95.4 (7.64) 94.4 (7.68)
## Median [Min, Max] 93.0 [64.0, 124] 95.0 [56.0, 130] 94.0 [56.0, 130]
## Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
## weight
## Mean (SD) 56.5 (9.34) 63.8 (10.5) 58.8 (10.3)
## Median [Min, Max] 55.0 [29.0, 98.0] 63.0 [38.0, 115] 58.0 [29.0, 115]
## Missing 2 (0.1%) 0 (0%) 2 (0.1%)
## waist
## Mean (SD) 80.1 (10.2) 86.0 (9.70) 82.0 (10.4)
## Median [Min, Max] 79.0 [54.0, 120] 87.0 [59.0, 120] 82.0 [54.0, 120]
## Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
## factor(hypertension)
## 0 1010 (46.7%) 319 (31.8%) 1329 (42.0%)
## 1 1151 (53.3%) 685 (68.2%) 1836 (58.0%)
## bmi
## Mean (SD) 24.0 (3.64) 23.9 (3.49) 24.0 (3.59)
## Median [Min, Max] 23.6 [13.3, 43.8] 23.7 [14.0, 39.8] 23.7 [13.3, 43.8]
## Missing 8 (0.4%) 1 (0.1%) 9 (0.3%)
## whr
## Mean (SD) 0.852 (0.0761) 0.901 (0.0660) 0.868 (0.0765)
## Median [Min, Max] 0.850 [0.610, 1.21] 0.900 [0.680, 1.36] 0.870 [0.610, 1.36]
## Missing 9 (0.4%) 2 (0.2%) 11 (0.3%)
## diabetes
## IFG 157 (7.3%) 86 (8.6%) 243 (7.7%)
## Normal 1857 (85.9%) 823 (82.0%) 2680 (84.7%)
## Yes 147 (6.8%) 95 (9.5%) 242 (7.6%)
# Chi-square test via prop.test: first collapse the three-level `diabetes`
# status into a binary indicator ("Yes" vs. everything else recoded as "No").
db$diab <- ifelse(db[["diabetes"]] == "Yes", "Yes", "No")
# Cross-tabulate the binary indicator (rows) against gender (columns).
table(db[["diab"]], db[["gender"]])
##      
##       Female Male
##   No    2014  909
##   Yes    147   95
# Gender-by-diabetes contingency table: rows = gender, columns = "No" / "Yes".
tab <- table(db[["gender"]], db[["diab"]])
# Two-sample test for equality of proportions. prop.test() treats the first
# column of a two-column table as the success count; here that is "No", so the
# reported estimates are the proportions WITHOUT diabetes in each gender.
prop.test(x = tab)
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  tab
## X-squared = 6.496, df = 1, p-value = 0.01081
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.004880546 0.048314356
## sample estimates:
##    prop 1    prop 2 
## 0.9319759 0.9053785
# Use DescTools::Desc() to run the chi-square test of diabetes status (binary
# `diab`) by gender; it also prints odds ratio, relative risks and the
# frequency table. The formula is left as db$diab ~ db$gender because the
# printed header and table labels are taken from the expression as written.
# NOTE(review): the output also includes McNemar's test, which is meant for
# paired data and does not appear to apply to gender vs. diabetes -- confirm
# before reporting that line.
library(DescTools)
Desc(db$diab~db$gender)
## ------------------------------------------------------------------------- 
## db$diab ~ db$gender
## 
## 
## Summary: 
## n: 3e+03, rows: 2e+00, columns: 2e+00
## 
## Pearson's Chi-squared test (cont. adj):
##   X-squared = 6.496, df = 1, p-value = 0.01081
## Fisher's exact test p-value = 0.009682
## McNemar's chi-squared = 548.41, df = 1, p-value < 2.2e-16
## 
##                     estimate lwr.ci upr.ci'
##                                           
## odds ratio             1.432  1.093  1.875
## rel. risk (col1)       1.134  1.022  1.259
## rel. risk (col2)       0.792  0.671  0.935
## 
## 
## Phi-Coefficient        0.047
## Contingency Coeff.     0.047
## Cramer's V             0.047
## 
##                                           
##           db$gender   Female   Male    Sum
## db$diab                                   
##                                           
## No        freq         2e+03  9e+02  3e+03
##           perc         63.6%  28.7%  92.4%
##           p.row        68.9%  31.1%      .
##           p.col        93.2%  90.5%      .
##                                           
## Yes       freq         1e+02  1e+02  2e+02
##           perc          4.6%   3.0%   7.6%
##           p.row        60.7%  39.3%      .
##           p.col         6.8%   9.5%      .
##                                           
## Sum       freq         2e+03  1e+03  3e+03
##           perc         68.3%  31.7% 100.0%
##           p.row            .      .      .
##           p.col            .      .      .
##                                           
## 
## ----------
## ' 95% conf. level

# One-way ANOVA of BMI across the three diabetes groups (IFG, Normal, Yes) in
# the whole sample, followed by Tukey's HSD for all pairwise differences
# (95% family-wise confidence level by default).
m <- aov(formula = db$bmi ~ db$diabetes)
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = db$bmi ~ db$diabetes)
## 
## $`db$diabetes`
##                  diff        lwr          upr     p adj
## Normal-IFG -0.8236127 -1.3882150 -0.259010503 0.0018313
## Yes-IFG    -0.7643882 -1.5297871  0.001010667 0.0503925
## Yes-Normal  0.0592245 -0.5064508  0.624899835 0.9673231
# Build a data frame restricted to the female participants and check its size.
# which() drops rows where the comparison is NA, matching subset() semantics.
women <- db[which(db$gender == "Female"), , drop = FALSE]
dim(women)
## [1] 2161   15
# Repeat the BMI-by-diabetes-status ANOVA and Tukey HSD pairwise comparisons
# among women only. This reassigns `m`, replacing any earlier fit stored there.
m <- aov(formula = bmi ~ diabetes, data = women)
TukeyHSD(x = m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = bmi ~ diabetes, data = women)
## 
## $diabetes
##                  diff       lwr       upr     p adj
## Normal-IFG -0.5165746 -1.228157 0.1950074 0.2044133
## Yes-IFG    -0.8256674 -1.808504 0.1571697 0.1198536
## Yes-Normal -0.3090928 -1.042806 0.4246202 0.5845178
# Multi-panel correlation plot between continuous variables in the dataset:
# start by keeping only the columns that will be plotted.
dat <- db[c("age", "height", "weight", "bmi", "hip", "whr")]
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:DescTools':
## 
##     AUC, ICC, SD
# Draw the pairwise panel plot (psych::pairs.panels) for the selected columns.
pairs.panels(x = dat)