setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t=file.choose()
ob=read.csv(t,header=T)
names(ob)
## [1] "id" "gender" "height" "weight" "bmi" "age" "bmc"
## [8] "bmd" "fat" "lean" "pcfat"
head(ob)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
dat=ob[,c("gender","age","height","weight","bmi","pcfat")]
names(dat)
## [1] "gender" "age" "height" "weight" "bmi" "pcfat"
library(GGally);library(ggplot2);library(visreg);library(table1);library(BMA)
## Warning: package 'GGally' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'visreg' was built under R version 3.5.3
## Warning: package 'table1' was built under R version 3.5.3
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
## Warning: package 'BMA' was built under R version 3.5.3
## Loading required package: survival
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.5.3
## Loading required package: robustbase
## Warning: package 'robustbase' was built under R version 3.5.3
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Warning: package 'inline' was built under R version 3.5.3
## Loading required package: rrcov
## Warning: package 'rrcov' was built under R version 3.5.3
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
table1(~age+height+weight+bmi+pcfat|gender,data=ob)
| F (n=862) |
M (n=355) |
Overall (n=1217) |
|
|---|---|---|---|
| age | |||
| Mean (SD) | 48.6 (16.4) | 43.7 (18.8) | 47.2 (17.3) |
| Median [Min, Max] | 49.0 [14.0, 85.0] | 44.0 [13.0, 88.0] | 48.0 [13.0, 88.0] |
| height | |||
| Mean (SD) | 153 (5.55) | 165 (6.73) | 157 (7.98) |
| Median [Min, Max] | 153 [136, 170] | 165 [146, 185] | 155 [136, 185] |
| weight | |||
| Mean (SD) | 52.3 (7.72) | 62.0 (9.59) | 55.1 (9.40) |
| Median [Min, Max] | 51.0 [34.0, 95.0] | 62.0 [38.0, 95.0] | 54.0 [34.0, 95.0] |
| bmi | |||
| Mean (SD) | 22.3 (3.05) | 22.7 (3.04) | 22.4 (3.06) |
| Median [Min, Max] | 22.1 [15.2, 37.1] | 22.5 [14.5, 34.7] | 22.2 [14.5, 37.1] |
| pcfat | |||
| Mean (SD) | 34.7 (5.19) | 24.2 (5.76) | 31.6 (7.18) |
| Median [Min, Max] | 34.7 [14.6, 48.4] | 24.6 [9.20, 39.0] | 32.4 [9.20, 48.4] |
library(ggplot2)
ggpairs(dat,mapping=aes(color=gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Nhận xét: Các biến có tương quan trung bình gồm: weight và height (r=0.59); bmi và weight (r=0.79);pcfat và bmi (r=0.44). Phân bố pcfat của nam và nữ có khác biệt nhiều nhất, nữ có pcfat cao hơn nam. height của nam cao hơn nữ; weight của nam cao hơn nữ. # Task 2: Mô hình hồi quy tuyến tính pcfat và gender Khác biệt giữa nam và nữ
t=file.choose()
ob=read.csv(t,header=T)
m=lm(pcfat~gender,data=ob)
summary(m)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <2e-16 ***
## genderM -10.5163 0.3381 -31.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 2.2e-16
Phân tích hồi quy tuyến tính cho thấy nam có tỉ trọng mỡ thấp hơn nữ 10.5% (sai số chuẩn 0.34). Khác biệt này có ý nghĩa thống kê (p<0.0001). Yếu tố giới tính giải thích khoảng 44% những khác biệt về tỉ trọng mỡ giữa các cá nhân.
setwd("C:/Users\\HP\\Documents\\R\\Thuc hanh\\Training CR")
t=file.choose()
ob=read.csv(t,header=T)
m1=lm(pcfat~bmi,data=ob)
m2=lm(pcfat~bmi+I(bmi^2),data=ob)
m3=lm(pcfat~bmi+I(bmi^2)+I(bmi^3),data=ob)
anova(m1,m2,m3)
## Analysis of Variance Table
##
## Model 1: pcfat ~ bmi
## Model 2: pcfat ~ bmi + I(bmi^2)
## Model 3: pcfat ~ bmi + I(bmi^2) + I(bmi^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1215 50541
## 2 1214 49921 1 620.14 15.1175 0.0001065 ***
## 3 1213 49758 1 162.30 3.9565 0.0469163 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m2)
##
## Call:
## lm(formula = pcfat ~ bmi + I(bmi^2), data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.503 -4.290 1.512 4.633 18.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16.31919 6.50871 -2.507 0.012296 *
## bmi 3.20632 0.56205 5.705 1.46e-08 ***
## I(bmi^2) -0.04675 0.01204 -3.883 0.000109 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.413 on 1214 degrees of freedom
## Multiple R-squared: 0.2043, Adjusted R-squared: 0.203
## F-statistic: 155.8 on 2 and 1214 DF, p-value: < 2.2e-16
summary(m3)
##
## Call:
## lm(formula = pcfat ~ bmi + I(bmi^2) + I(bmi^3), data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.686 -4.122 1.386 4.561 17.751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -68.046558 26.805839 -2.538 0.01126 *
## bmi 9.799399 3.361833 2.915 0.00362 **
## I(bmi^2) -0.320765 0.138284 -2.320 0.02053 *
## I(bmi^3) 0.003711 0.001866 1.989 0.04692 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.405 on 1213 degrees of freedom
## Multiple R-squared: 0.2069, Adjusted R-squared: 0.2049
## F-statistic: 105.5 on 3 and 1213 DF, p-value: < 2.2e-16
par(mfrow=c(1,3))
p1=visreg(m1)
p2=visreg(m2)
p3=visreg(m3)
mh1=lm(pcfat~bmi+gender,data=ob)
visreg(mh1,"gender","bmi")
t=file.choose()
ob=read.csv(t,header=T)
mh3=lm(pcfat~age+gender+bmi+I(bmi^2),data=ob)
visreg2d(mh3,"gender","bmi")
library(BMA)
t=file.choose()
ob=read.csv(t,header=T)
y=ob$pcfat
x=ob[,c("gender","height","weight","bmi","age")]
bma=bicreg(x,y,strict=F,OR=20)
summary(bma)
##
## Call:
## bicreg(x = x, y = y, strict = F, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.958e+00 -7.928e-01 8.137e+00
## genderM 100.0 -11.25139 0.429659 -1.144e+01 -1.143e+01 -1.081e+01
## height 31.4 0.01759 0.028494 . 5.598e-02 .
## weight 39.2 0.03102 0.042611 7.921e-02 . .
## bmi 100.0 1.01265 0.111625 8.942e-01 1.089e+00 1.089e+00
## age 100.0 0.05259 0.008048 5.497e-02 5.473e-02 4.715e-02
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1.423e+03 -1.423e+03 -1.422e+03
## post prob 0.392 0.314 0.294
imageplot.bma(bma)
Nhận xét: Chọn mô hình 1 gồm 4 biến độc lập: gender, weight, age và bmi.69.7 % Sự khác biệt về pcfat giữa các cá nhân được giải thích 4 biến gender và bmi. Xác suất hậu định của mô hình 1 là 39.2%, nghĩa là xác suất lặp lại của mô hình là 39.2%.
Task 5.1: Chia dữ liệu thành 2 nhóm nhỏ
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
t=file.choose()
ob=read.csv(t,header=T)
sa=createDataPartition(ob$pcfat,p=0.6,list=F)
dev=ob[sa, ]
val=ob[-sa, ]
t=file.choose()
ob=read.csv(t,header=T)
control=trainControl(method="cv",number = 10)
training=train(pcfat~gender+age+bmi+I(bmi^2),data=dev,method="lm",trControl=control,metric="Rsquared")
training=train(pcfat~gender+age+bmi+I(bmi^2),data=dev,method="lm",trControl=control,metric="Rsquared")
summary(training)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.943 -2.546 0.091 2.505 13.989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.419297 4.752476 -3.665 0.000265 ***
## genderM -10.658553 0.317557 -33.564 < 2e-16 ***
## age 0.041042 0.008753 4.689 3.28e-06 ***
## bmi 3.362531 0.410314 8.195 1.13e-15 ***
## `I(bmi^2)` -0.048864 0.008738 -5.592 3.17e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.908 on 726 degrees of freedom
## Multiple R-squared: 0.7091, Adjusted R-squared: 0.7075
## F-statistic: 442.3 on 4 and 726 DF, p-value: < 2.2e-16
t=file.choose()
ob=read.csv(t,header=T)
require(relaimpo)
## Loading required package: relaimpo
## Warning: package 'relaimpo' was built under R version 3.5.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.5.3
## Loading required package: boot
## Warning: package 'boot' was built under R version 3.5.3
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## Warning: package 'mitools' was built under R version 3.5.3
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
mh3=lm(pcfat~age+gender+bmi+I(bmi^2),data=ob)
calc.relimp(mh3,type="lmg",rela=T,rank=T)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 4 Regressors:
## age gender bmi I(bmi^2)
## Proportion of variance explained by model: 70.65%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## age 0.06186434
## gender 0.64634359
## bmi 0.15442688
## I(bmi^2) 0.13736518
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## age 0.12768705 0.092389959 0.06189251 0.04405818
## gender -10.51634414 -10.717838213 -10.88911021 -10.86319562
## bmi 1.03619023 1.759526891 2.52740016 3.47197075
## I(bmi^2) 0.02152954 -0.001260437 -0.02421181 -0.05122595
pred=predict(training,newdata=val)
plot(pred~val$pcfat,pch=16)
validation=data.frame(obs=val$pcfat,pred)
defaultSummary(validation)
## RMSE Rsquared MAE
## 3.8944611 0.7046689 3.0446712