t=file.choose()
p <- read.csv(t)
names(p)
## [1] "id" "gender" "height" "weight" "bmi" "age" "bmc" "bmd"
## [9] "fat" "lean" "pcfat"
str(p)
## 'data.frame': 1217 obs. of 11 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender: chr "F" "M" "F" "F" ...
## $ height: int 150 165 157 156 160 153 155 167 165 158 ...
## $ weight: int 49 52 57 53 51 47 58 65 54 60 ...
## $ bmi : num 21.8 19.1 23.1 21.8 19.9 20.1 24.1 23.3 19.8 24 ...
## $ age : int 53 65 64 56 54 52 66 50 61 58 ...
## $ bmc : int 1312 1309 1230 1171 1681 1358 1546 2276 1778 1404 ...
## $ bmd : num 0.88 0.84 0.84 0.8 0.98 0.91 0.96 1.11 0.96 0.86 ...
## $ fat : int 17802 8381 19221 17472 7336 14904 20233 17749 10795 21365 ...
## $ lean : int 28600 40229 36057 33094 40621 30068 35599 43301 38613 35534 ...
## $ pcfat : num 37.3 16.8 34 33.8 14.8 32.2 35.3 28 21.1 36.6 ...
summary(p)
## id gender height weight
## Min. : 1.0 Length:1217 Min. :136.0 Min. :34.00
## 1st Qu.: 309.0 Class :character 1st Qu.:151.0 1st Qu.:49.00
## Median : 615.0 Mode :character Median :155.0 Median :54.00
## Mean : 614.5 Mean :156.7 Mean :55.14
## 3rd Qu.: 921.0 3rd Qu.:162.0 3rd Qu.:61.00
## Max. :1227.0 Max. :185.0 Max. :95.00
## bmi age bmc bmd fat
## Min. :14.5 Min. :13.00 Min. : 695 Min. :0.650 Min. : 4277
## 1st Qu.:20.2 1st Qu.:35.00 1st Qu.:1481 1st Qu.:0.930 1st Qu.:13768
## Median :22.2 Median :48.00 Median :1707 Median :1.010 Median :16955
## Mean :22.4 Mean :47.15 Mean :1725 Mean :1.009 Mean :17288
## 3rd Qu.:24.3 3rd Qu.:58.00 3rd Qu.:1945 3rd Qu.:1.090 3rd Qu.:20325
## Max. :37.1 Max. :88.00 Max. :3040 Max. :1.350 Max. :40825
## lean pcfat
## Min. :19136 Min. : 9.2
## 1st Qu.:30325 1st Qu.:27.0
## Median :33577 Median :32.4
## Mean :35463 Mean :31.6
## 3rd Qu.:39761 3rd Qu.:36.8
## Max. :63059 Max. :48.4
head(p)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
head(p,3)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
p$gender[p$gender == "F"] = "female"
p$gender[p$gender == "M"] = "male"
p$bmigrp[p$bmi < 18.5 ] = "underweight" #(Thiếu cân)
p$bmigrp[p$bmi >= 18.5 & p$bmi < 24.9 ] = "Normal" #(Bình thường)
p$bmigrp[p$bmi >= 24.9 & p$bmi < 29.9 ] = "Overweight" #(Thừa cân)
p$bmigrp[p$bmi >= 29.9 & p$bmi <= 34.9 ] = "Obese" #(Béo phì)
p$bmigrp[p$bmi >= 35 ] = "Extramily Obese" #(Béo phì nặng)
p$bmigrp = factor(p$bmigrp, levels = c("underweight", "Normal", "Overweight", "Obese", "Extramily Obese"))
p %>% count(age)
## age n
## 1 13 1
## 2 14 4
## 3 16 3
## 4 17 3
## 5 18 21
## 6 19 47
## 7 20 24
## 8 21 23
## 9 22 23
## 10 23 28
## 11 24 21
## 12 25 11
## 13 26 10
## 14 27 7
## 15 28 9
## 16 29 6
## 17 30 14
## 18 31 8
## 19 32 7
## 20 33 12
## 21 34 16
## 22 35 15
## 23 36 17
## 24 37 19
## 25 38 13
## 26 39 20
## 27 40 20
## 28 41 25
## 29 42 24
## 30 43 28
## 31 44 25
## 32 45 24
## 33 46 33
## 34 47 24
## 35 48 27
## 36 49 33
## 37 50 33
## 38 51 38
## 39 52 35
## 40 53 25
## 41 54 33
## 42 55 37
## 43 56 20
## 44 57 25
## 45 58 24
## 46 59 16
## 47 60 16
## 48 61 14
## 49 62 13
## 50 63 16
## 51 64 14
## 52 65 15
## 53 66 14
## 54 67 13
## 55 68 11
## 56 69 15
## 57 70 19
## 58 71 14
## 59 72 14
## 60 73 13
## 61 74 7
## 62 75 10
## 63 76 13
## 64 77 11
## 65 78 9
## 66 79 6
## 67 80 10
## 68 81 2
## 69 82 5
## 70 83 2
## 71 84 5
## 72 85 3
## 73 87 1
## 74 88 1
table(p$age)
##
## 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
## 1 4 3 3 21 47 24 23 23 28 21 11 10 7 9 6 14 8 7 12 16 15 17 19 13 20
## 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 20 25 24 28 25 24 33 24 27 33 33 38 35 25 33 37 20 25 24 16 16 14 13 16 14 15
## 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 87 88
## 14 13 11 15 19 14 14 13 7 10 13 11 9 6 10 2 5 2 5 3 1 1
p$age_grp = cut(p$age, breaks = seq(13,88,5))
c = p %>% count(age_grp)
table(p$age_grp, p$gender)
##
## female male
## (13,18] 16 15
## (18,23] 74 71
## (23,28] 41 17
## (28,33] 32 15
## (33,38] 51 29
## (38,43] 93 24
## (43,48] 107 26
## (48,53] 114 50
## (53,58] 109 30
## (58,63] 56 19
## (63,68] 48 19
## (68,73] 63 12
## (73,78] 36 14
## (78,83] 18 7
## (83,88] 4 6
c
## age_grp n
## 1 (13,18] 31
## 2 (18,23] 145
## 3 (23,28] 58
## 4 (28,33] 47
## 5 (33,38] 80
## 6 (38,43] 117
## 7 (43,48] 133
## 8 (48,53] 164
## 9 (53,58] 139
## 10 (58,63] 75
## 11 (63,68] 67
## 12 (68,73] 75
## 13 (73,78] 50
## 14 (78,83] 25
## 15 (83,88] 10
## 16 <NA> 1
p %>% count(bmigrp)
## bmigrp n
## 1 underweight 107
## 2 Normal 857
## 3 Overweight 238
## 4 Obese 12
## 5 Extramily Obese 3
table(p$bmigrp, p$gender)
##
## female male
## underweight 76 31
## Normal 622 235
## Overweight 153 85
## Obese 8 4
## Extramily Obese 3 0
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
c1= p %>% count(age_grp,gender)
c1
## age_grp gender n
## 1 (13,18] female 16
## 2 (13,18] male 15
## 3 (18,23] female 74
## 4 (18,23] male 71
## 5 (23,28] female 41
## 6 (23,28] male 17
## 7 (28,33] female 32
## 8 (28,33] male 15
## 9 (33,38] female 51
## 10 (33,38] male 29
## 11 (38,43] female 93
## 12 (38,43] male 24
## 13 (43,48] female 107
## 14 (43,48] male 26
## 15 (48,53] female 114
## 16 (48,53] male 50
## 17 (53,58] female 109
## 18 (53,58] male 30
## 19 (58,63] female 56
## 20 (58,63] male 19
## 21 (63,68] female 48
## 22 (63,68] male 19
## 23 (68,73] female 63
## 24 (68,73] male 12
## 25 (73,78] female 36
## 26 (73,78] male 14
## 27 (78,83] female 18
## 28 (78,83] male 7
## 29 (83,88] female 4
## 30 (83,88] male 6
## 31 <NA> male 1
c1 %>%
ggplot(aes(x=age_grp, y= n)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.5) +
xlab("Age group") + ylab("No.of people") + ggtitle("No.in age group") +
theme_economist()
shapiro.test(p$age)
##
## Shapiro-Wilk normality test
##
## data: p$age
## W = 0.97366, p-value = 4.281e-14
shapiro.test(p$height)
##
## Shapiro-Wilk normality test
##
## data: p$height
## W = 0.9827, p-value = 7.179e-11
shapiro.test(p$pcfat)
##
## Shapiro-Wilk normality test
##
## data: p$pcfat
## W = 0.98284, p-value = 8.193e-11
shapiro.test(p$bmi)
##
## Shapiro-Wilk normality test
##
## data: p$bmi
## W = 0.98352, p-value = 1.581e-10
# p nhỏ hơn 0,05 rất nhiều nên ko có phân phối chuẩn
ggplot(data = p, mapping = aes(x=age))+
geom_histogram(color = "red", fill = "#7A9B57")+
ggtitle("ABC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p %>%
ggplot(aes(age)) +
geom_histogram(color = "red", fill = "#7A9B57")+
ggtitle("ABC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggthemes)
c %>%
ggplot(aes(x=age_grp, y= n)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.5) +
xlab("Age group") + ylab("No.of people") + ggtitle("No.in age group") +
theme_economist()
ggplot(data = p, mapping = aes(x=pcfat))+
geom_histogram(color = "red", fill = "#7A9B57")+
ggtitle("Pcfat")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p$pcfat_grp = cut(p$pcfat, breaks = seq(9.2,48.4,5))
pc = p %>% count(pcfat_grp)
pc
## pcfat_grp n
## 1 (9.2,14.2] 18
## 2 (14.2,19.2] 52
## 3 (19.2,24.2] 110
## 4 (24.2,29.2] 223
## 5 (29.2,34.2] 334
## 6 (34.2,39.2] 318
## 7 (39.2,44.2] 141
## 8 <NA> 21
library(ggthemes)
pc %>%
ggplot(aes(x=pcfat_grp, y= n)) +
geom_bar(stat="identity", fill="#f68060", alpha=.5, width=.6) +
xlab("pcfat group") + ylab("No.people") + ggtitle("No.in pcfat group") +
theme_economist()
pcg = table(p$pcfat_grp,p$gender)
pcg
##
## female male
## (9.2,14.2] 0 18
## (14.2,19.2] 4 48
## (19.2,24.2] 17 93
## (24.2,29.2] 94 129
## (29.2,34.2] 280 54
## (34.2,39.2] 306 12
## (39.2,44.2] 141 0
library(ggplot2); library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.5
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
m = ggplot(data=p, aes(x=pcfat))
m1 = m +
geom_histogram(color="white",
fill="blue")
m1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
m = m +
geom_histogram(aes(y=..density..),color="white", fill="blue")
m2 = m + geom_density(col="red")
grid.arrange(m1, m2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
h = ggplot(data=p, aes(x=pcfat,
fill = gender))
h1 = h +
geom_histogram(position="dodge")
h1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
h2 = ggplot(data=p, aes(x=pcfat,
fill=gender, color=gender)) +
geom_density(alpha = 0.1)
h2
grid.arrange(h1, h2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
v = ggplot(data=p, aes(x=bmi,
fill = gender))
v1 = v +
geom_histogram(position="dodge")
v2 = ggplot(data=p, aes(x=bmi,
fill=gender, color=gender)) +
geom_density(alpha = 0.1)
grid.arrange(v1, v2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggthemes)
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.0.5
## Registered S3 method overwritten by 'parameters':
## method from
## format.parameters_distribution datawizard
library(sjmisc)
## Warning: package 'sjmisc' was built under R version 4.0.5
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:purrr':
##
## is_empty
## The following object is masked from 'package:tidyr':
##
## replace_na
## The following object is masked from 'package:tibble':
##
## add_case
t= table(p$bmigrp)
t
##
## underweight Normal Overweight Obese Extramily Obese
## 107 857 238 12 3
barplot(t,col="light blue")
sjt.xtab(p$bmigrp, p$gender, show.row.prc = T, show.col.prc = T)
| bmigrp | gender | Total | |
|---|---|---|---|
| female | male | ||
| underweight |
76 71 % 8.8 % |
31 29 % 8.7 % |
107 100 % 8.8 % |
| Normal |
622 72.6 % 72.2 % |
235 27.4 % 66.2 % |
857 100 % 70.4 % |
| Overweight |
153 64.3 % 17.7 % |
85 35.7 % 23.9 % |
238 100 % 19.6 % |
| Obese |
8 66.7 % 0.9 % |
4 33.3 % 1.1 % |
12 100 % 1 % |
| Extramily Obese |
3 100 % 0.3 % |
0 0 % 0 % |
3 100 % 0.2 % |
| Total |
862 70.8 % 100 % |
355 29.2 % 100 % |
1217 100 % 100 % |
χ2=7.540 · df=4 · Cramer’s V=0.079 · Fisher’s p=0.134 |
p %>% count(bmigrp) %>%
ggplot(aes(x=bmigrp, y=n))+
geom_bar(stat="identity", fill="#f68060", alpha=.5, width=.6) +
xlab("bmi group") + ylab("No.people") + ggtitle("No.in bmi group") +
theme_economist()
r = ggplot(data = p, aes(x=bmigrp, fill = gender))
r = r + geom_bar(aes(y = (..count..)))
r = r + xlab("Tình trạng") + ylab("số người")
r
r = ggplot(data = p, aes(x=bmigrp, fill = gender))
r = r + geom_bar(aes(y = (..count..)/sum(..count..)))
r = r + xlab("Tình trạng") + ylab("tỉ lệ")
r
d = ggplot(data=p, aes(x=gender,
y=pcfat, fill=gender))
d + geom_boxplot()
boxplot(p$pcfat ~ p$bmigrp, col="light green", border="red")
boxplot(p$age ~ p$bmigrp, col="light green", border="red")
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.0.5
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:sjmisc':
##
## %nin%
## The following object is masked from 'package:rrcov':
##
## Cov
Desc(p$bmi)
## ------------------------------------------------------------------------------
## p$bmi (numeric)
##
## length n NAs unique 0s mean meanCI'
## 1'217 1'217 0 149 0 22.40 22.22
## 100.0% 0.0% 0.0% 22.57
##
## .05 .10 .25 median .75 .90 .95
## 17.70 18.66 20.20 22.20 24.30 26.30 27.50
##
## range sd vcoef mad IQR skew kurt
## 22.60 3.06 0.14 2.97 4.10 0.53 0.95
##
## lowest : 14.5, 15.2, 15.4 (2), 15.6, 15.8
## highest: 32.5, 34.7 (2), 35.6, 36.3, 37.1
##
## ' 95%-CI (classic)
Desc(p$bmigrp)
## ------------------------------------------------------------------------------
## p$bmigrp (factor)
##
## length n NAs unique levels dupes
## 1'217 1'217 0 5 5 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 Normal 857 70.4% 857 70.4%
## 2 Overweight 238 19.6% 1'095 90.0%
## 3 underweight 107 8.8% 1'202 98.8%
## 4 Obese 12 1.0% 1'214 99.8%
## 5 Extramily Obese 3 0.2% 1'217 100.0%
Desc(p$age ~ p$bmigrp)
## ------------------------------------------------------------------------------
## p$age ~ p$bmigrp
##
## Summary:
## n pairs: 1'217, valid: 1'217 (100.0%), missings: 0 (0.0%), groups: 5
##
##
## underweight Normal Overweight Obese
## mean 40.888 46.256 53.034 47.833
## median 35.000 48.000 52.000 50.500
## sd 20.432 16.963 15.374 16.140
## IQR 36.000 23.000 22.750 15.500
## n 107 857 238 12
## np 8.792% 70.419% 19.556% 0.986%
## NAs 0 0 0 0
## 0s 0 0 0 0
##
## Extramily Obese
## mean 57.333
## median 57.000
## sd 3.512
## IQR 3.500
## n 3
## np 0.247%
## NAs 0
## 0s 0
##
## Kruskal-Wallis rank sum test:
## Kruskal-Wallis chi-squared = 42.913, df = 4, p-value = 1.078e-08
p$gender=ifelse(p$gender == "male", 1, 0)
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following objects are masked from 'package:DescTools':
##
## AUC, ICC, SD
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
p1 = p[,c("gender", "height", "weight", "bmi", "age", "bmc", "bmd", "fat", "lean", "pcfat")]
pairs.panels(p1)
cor(p1)
## gender height weight bmi age bmc
## gender 1.00000000 0.67077713 0.46955838 0.07096352 -0.12823407 0.539203649
## height 0.67077713 1.00000000 0.59766675 -0.01640275 -0.37311646 0.691945350
## weight 0.46955838 0.59766675 1.00000000 0.78691712 -0.04819844 0.610006100
## bmi 0.07096352 -0.01640275 0.78691712 1.00000000 0.22862793 0.225461272
## age -0.12823407 -0.37311646 -0.04819844 0.22862793 1.00000000 -0.462907393
## bmc 0.53920365 0.69194535 0.61000610 0.22546127 -0.46290739 1.000000000
## bmd 0.29154032 0.38160279 0.34056024 0.12863581 -0.44155558 0.883668572
## fat -0.28441678 -0.06349351 0.57584040 0.76921338 0.21754740 0.007271522
## lean 0.75818630 0.76816704 0.84047786 0.45385450 -0.19438829 0.739428246
## pcfat -0.66576817 -0.47982065 0.05655573 0.44091833 0.30710026 -0.403526172
## bmd fat lean pcfat
## gender 0.29154032 -0.284416777 0.7581863 -0.66576817
## height 0.38160279 -0.063493511 0.7681670 -0.47982065
## weight 0.34056024 0.575840396 0.8404779 0.05655573
## bmi 0.12863581 0.769213384 0.4538545 0.44091833
## age -0.44155558 0.217547400 -0.1943883 0.30710026
## bmc 0.88366857 0.007271522 0.7394282 -0.40352617
## bmd 1.00000000 -0.060155111 0.4348068 -0.30440674
## fat -0.06015511 1.000000000 0.1150510 0.82416190
## lean 0.43480680 0.115051048 1.0000000 -0.44557451
## pcfat -0.30440674 0.824161905 -0.4455745 1.00000000
corrplot(cor(p1), type = "upper", method = "number")
corrplot(cor(p1), method = "circle")
p$gender=ifelse(p$gender == 1, "male", "female")
z = ggplot(data=p, aes(x=age,
y=pcfat))
z1 = z + geom_point()
z2 = z1 +
geom_smooth()
grid.arrange(z1, z2, ncol=2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
k = ggplot(data=p, aes(x=age, y=pcfat, col=gender, fill=gender))
k1 = k + geom_point() + geom_smooth()
k2 = k + geom_point() + geom_smooth(method="lm", formula=y~x+I(x^2)+I(x^3))
grid.arrange(k1, k2, ncol=2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
library(table1)
## Warning: package 'table1' was built under R version 4.0.5
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~age + height + weight + bmi + pcfat | gender,
data=p)
| female (N=862) |
male (N=355) |
Overall (N=1217) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 48.6 (16.4) | 43.7 (18.8) | 47.2 (17.3) |
| Median [Min, Max] | 49.0 [14.0, 85.0] | 44.0 [13.0, 88.0] | 48.0 [13.0, 88.0] |
| height | |||
| Mean (SD) | 153 (5.55) | 165 (6.73) | 157 (7.98) |
| Median [Min, Max] | 153 [136, 170] | 165 [146, 185] | 155 [136, 185] |
| weight | |||
| Mean (SD) | 52.3 (7.72) | 62.0 (9.59) | 55.1 (9.40) |
| Median [Min, Max] | 51.0 [34.0, 95.0] | 62.0 [38.0, 95.0] | 54.0 [34.0, 95.0] |
| bmi | |||
| Mean (SD) | 22.3 (3.05) | 22.7 (3.04) | 22.4 (3.06) |
| Median [Min, Max] | 22.1 [15.2, 37.1] | 22.5 [14.5, 34.7] | 22.2 [14.5, 37.1] |
| pcfat | |||
| Mean (SD) | 34.7 (5.19) | 24.2 (5.76) | 31.6 (7.18) |
| Median [Min, Max] | 34.7 [14.6, 48.4] | 24.6 [9.20, 39.0] | 32.4 [9.20, 48.4] |
#Ảnh hưởng của tuổi đến tỉ trọng mỡ ?
l1 = lm(pcfat ~ age, data=p)
summary(l1)
##
## Call:
## lm(formula = pcfat ~ age, data = p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.7114 -4.0069 0.8378 4.9514 18.2192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.58408 0.57003 44.88 <2e-16 ***
## age 0.12769 0.01135 11.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.839 on 1215 degrees of freedom
## Multiple R-squared: 0.09431, Adjusted R-squared: 0.09357
## F-statistic: 126.5 on 1 and 1215 DF, p-value: < 2.2e-16
#Phương trình: pcfat = 25.6 + 0.13*age
#Diễn giải: Mỗi năm tăng độ tuổi, tỉ trọng mỡ tăng 0.13% (SE 0.011), và mối liên quan này có ý nghĩa thống kê (P < 0.0001)
l1 = lm(pcfat ~ age, data=p)
plot(p$pcfat ~ p$age, pch=16, col="blue")
abline(l1, col="red")
#Tỉ trọng mỡ khác nhau giữa nam và nữ
l2 = lm(pcfat ~ gender, data=p)
summary(l2)
##
## Call:
## lm(formula = pcfat ~ gender, data = p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <2e-16 ***
## gendermale -10.5163 0.3381 -31.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 2.2e-16
#Phương trình pcfat = 34.7 – 10.5*gender(M)
#Diễn giải: Nam có tỉ trọng mỡ thấp hơn nữ 10.5% (SE 0.34%), và sự khác biệt này có ý nghĩa thống kê (P < 0.0001).
#Khác biệt giữa nam và nữ giải thích 44% những khác biệt về phương sai của tỉ trọng mỡ.
yvar = p[, ("pcfat")]
xvars = p[, c("gender", "height", "weight", "bmi", "age")]
bma = bicreg(xvars, yvar, strict=FALSE, OR=20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.958e+00 -7.928e-01 8.137e+00
## gendermale 100.0 -11.25139 0.429659 -1.144e+01 -1.143e+01 -1.081e+01
## height 31.4 0.01759 0.028494 . 5.598e-02 .
## weight 39.2 0.03102 0.042611 7.921e-02 . .
## bmi 100.0 1.01265 0.111625 8.942e-01 1.089e+00 1.089e+00
## age 100.0 0.05259 0.008048 5.497e-02 5.473e-02 4.715e-02
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1.423e+03 -1.423e+03 -1.422e+03
## post prob 0.392 0.314 0.294
yvar = p[, ("bmi")]
xvars = p[, c("gender", "height", "weight", "pcfat", "age")]
bma = bicreg(xvars, yvar, strict=FALSE, OR=20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 45.1299146 0.4137278 4.498e+01 4.565e+01 4.477e+01
## gendermale 0.0 0.0000000 0.0000000 . . .
## height 100.0 -0.2870415 0.0026074 -2.862e-01 -2.901e-01 -2.849e-01
## weight 100.0 0.4011038 0.0017466 4.006e-01 4.028e-01 4.001e-01
## pcfat 72.1 0.0039043 0.0028516 5.446e-03 . 5.278e-03
## age 17.6 0.0002116 0.0005273 . . 1.179e-03
##
## nVar 3 2 4
## r2 0.988 0.988 0.988
## BIC -5.348e+03 -5.346e+03 -5.345e+03
## post prob 0.594 0.231 0.127
## model 4
## Intercept 4.539e+01
## gendermale .
## height -2.886e-01
## weight 4.022e-01
## pcfat .
## age 1.272e-03
##
## nVar 3
## r2 0.988
## BIC -5.343e+03
## post prob 0.049
#Với biên outcome là biến bmi, dùng 5 biến cung cấp (gender, height, weight, pcfat và age) BMA tìm được 4 mô hình 'tối ưu'.
#"p!=0" có nghĩa là xác suất biến số có 'ảnh hưởng'. Vd: xác suất mà pcfat có ảnh hưởng đến bmi là 72.1%, age là 17.6%, và height, weight là 100%
bm = lm(bmi ~ height + weight + pcfat, data = p)
summary(bm)
##
## Call:
## lm(formula = bmi ~ height + weight + pcfat, data = p)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.44844 -0.11445 0.01667 0.13200 2.05001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.984673 0.294981 152.500 < 2e-16 ***
## height -0.286178 0.001970 -145.261 < 2e-16 ***
## weight 0.400581 0.001469 272.776 < 2e-16 ***
## pcfat 0.005446 0.001757 3.099 0.00198 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.337 on 1213 degrees of freedom
## Multiple R-squared: 0.9879, Adjusted R-squared: 0.9878
## F-statistic: 3.293e+04 on 3 and 1213 DF, p-value: < 2.2e-16
#phương trình hồi quy: bmi = -0,29*height + 0,4*weight + 0,005*pcfat + 44,98
#Các biến height, weight, pcfat giải thích 98,8% phương sai của bmi
w = select(p, c("id", "gender", "height", "weight", "age", "bmi", "bmigrp"))
head(w)
## id gender height weight age bmi bmigrp
## 1 1 female 150 49 53 21.8 Normal
## 2 2 male 165 52 65 19.1 Normal
## 3 3 female 157 57 64 23.1 Normal
## 4 4 female 156 53 56 21.8 Normal
## 5 5 male 160 51 54 19.9 Normal
## 6 6 female 153 47 52 20.1 Normal