t="C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\obesity data.csv"
ob=read.csv(t)
head(ob)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
#analysis
#chia thanh 02 nhom voi ti le 7:3; upper la diem chia bo du lieu.
rows=nrow(ob)
prop=0.7
upper=floor(prop*rows)
permutation=ob[sample(rows),]
dev=permutation[1:upper,]
val=permutation[(upper+1):rows,]
# dim la ham cho biet so hang hay so id (bn)
dim(dev)
## [1] 851 11
dim(val)
## [1] 366 11
#xay dung mo hinh dung dev: theo mo hinh BMA da chon ra mo hinh 4 tham so voi r2 =0.69 va xac suat hau dinh pros prop la cao nhat 0.39.
m=lm(pcfat~gender+age+bmi+weight, data=dev)
summary(m)
##
## Call:
## lm(formula = pcfat ~ gender + age + bmi + weight, data = dev)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.3541 -2.3794 0.0871 2.6034 15.5951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.652717 0.977583 7.828 1.48e-14 ***
## genderM -11.928480 0.404907 -29.460 < 2e-16 ***
## age 0.056284 0.008736 6.443 1.97e-10 ***
## bmi 0.793492 0.092505 8.578 < 2e-16 ***
## weight 0.126718 0.033854 3.743 0.000194 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.909 on 846 degrees of freedom
## Multiple R-squared: 0.7071, Adjusted R-squared: 0.7057
## F-statistic: 510.6 on 4 and 846 DF, p-value: < 2.2e-16
# tinh gia tri tien luong cho val (dung tham so cua m)
val$pred=predict(m,newdata=val)
head(val,3)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1037 1047 F 149 53 23.9 59 1589 1.03 18296 32902 34.7
## 133 134 F 150 48 21.3 80 695 0.65 13823 26787 33.5
## 391 395 F 151 50 21.9 41 1936 1.13 14589 32963 29.5
## pred
## 1037 36.65399
## 133 35.13929
## 391 33.67374
r2=cor(val$pcfat, val$pred)^2
r2
## [1] 0.6697813
# ve do thi moi tuong quan giua pct va bien moi pred
plot(val$pcfat~val$pred, pch=16)
#danh gia tam quan trong cua bien tien luong, dung du lieu cho 1217 doi tuong.
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 3.5.3
## Loading required package: MASS
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
m=lm(pcfat~gender+age+weight+height+bmi, data=ob)
calc.relimp(m, type="lmg", rela=T, rank=T)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 5 Regressors:
## gender age weight height bmi
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## gender 0.50802351
## age 0.05465515
## weight 0.09339118
## height 0.15217127
## bmi 0.19175889
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## gender -10.51634414 -11.25778000 -11.37601905 -11.34809670 -11.44105012
## age 0.12768705 0.09380448 0.04849443 0.04545609 0.05493292
## weight 0.04319324 0.06043494 0.08062526 0.10196648 0.09394895
## height -0.43201351 -0.39537155 -0.28180720 -0.13427073 -0.01099062
## bmi 1.03619023 1.38422758 1.37018157 1.10104878 0.85803534
# chu thich: them rela=T, rank=T : la lay 100% du lieu, neu khong co chi lay 25% du lieu , tuy nhien thu tu anh huong cua cac bien van theo dung thu tu
calc.relimp(m, type = "lmg")
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 5 Regressors:
## gender age weight height bmi
## Proportion of variance explained by model: 69.66%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## gender 0.35388610
## age 0.03807245
## weight 0.06505573
## height 0.10600159
## bmi 0.13357808
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## gender -10.51634414 -11.25778000 -11.37601905 -11.34809670 -11.44105012
## age 0.12768705 0.09380448 0.04849443 0.04545609 0.05493292
## weight 0.04319324 0.06043494 0.08062526 0.10196648 0.09394895
## height -0.43201351 -0.39537155 -0.28180720 -0.13427073 -0.01099062
## bmi 1.03619023 1.38422758 1.37018157 1.10104878 0.85803534
# dung BMA de kiem tra bien x nao anh huong den y
#nhap du lieu
t="C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\birthwt.csv"
bw=read.csv(t)
head(bw)
## id low age lwt race smoke ptl ht ui ftv bwt
## 1 85 0 19 182 2 0 0 0 1 0 2523
## 2 86 0 33 155 3 0 0 0 0 3 2551
## 3 87 0 20 105 1 1 0 0 0 1 2557
## 4 88 0 21 108 1 1 0 0 1 2 2594
## 5 89 0 18 107 1 1 0 0 1 0 2600
## 6 91 0 21 124 3 0 0 0 0 0 2622
# bien y la bwt(gram); bien x =age, lwt, race, smoke, ptl,ht, ui, ftv,)
# bien y la bwt(gram); bien x =age, lwt, race, smoke, ptl,ht, ui, ftv)
# dung BMA tim bien lien quan den bwt
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## The following object is masked from 'package:boot':
##
## salinity
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-4)
# dinh nghia bien x va y
yvar=bw[, c("bwt")]
xvars=bw[, c("age", "lwt", "race", "smoke", "ptl", "ht", "ui", "ftv")]
bma=bicreg(xvars, yvar, strict = FALSE, OR =20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 10 models were selected
## Best 5 models (cumulative posterior probability = 0.8884 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 3439.59745 293.382 3600.166 3104.438 3575.199
## age 2.7 0.03797 1.540 . . .
## lwt 32.3 1.06174 1.813 . 3.434 .
## race 100.0 -204.84945 57.309 -210.768 -187.849 -215.686
## smoke 100.0 -383.45469 105.883 -389.385 -366.135 -397.582
## ptl 7.4 -4.88165 32.782 . . .
## ht 73.5 -393.12802 294.446 -497.117 -595.820 .
## ui 100.0 -534.07331 138.284 -555.943 -523.419 -517.511
## ftv 4.5 -0.35508 10.015 . . .
##
## nVar 4 5 3
## r2 0.203 0.221 0.175
## BIC -21.891 -21.080 -20.743
## post prob 0.365 0.243 0.205
## model 4 model 5
## Intercept 3241.527 3601.038
## age . .
## lwt 2.288 .
## race -201.062 -208.353
## smoke -383.172 -375.269
## ptl . -71.592
## ht . -496.568
## ui -490.751 -534.874
## ftv . .
##
## nVar 4 5
## r2 0.184 0.205
## BIC -17.502 -17.157
## post prob 0.041 0.034
# xem bien nao anh huong dua vao r2 cua tung bien: phai goi ham hoi quy tuyen tinh ra (ham lm) roi moi kiem tra duoc bang ham calc.relimp.
m1=lm(bwt~race+smoke+ht+ui, data=bw)
calc.relimp(m1,type="lmg")
## Response variable: bwt
## Total response variance: 531753.5
## Analysis based on 189 observations
##
## 4 Regressors:
## race smoke ht ui
## Proportion of variance explained by model: 20.29%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## race 0.05116788
## smoke 0.04937125
## ht 0.02479817
## ui 0.07754788
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## race -154.6132 -175.9747 -194.8922 -210.7681
## smoke -283.7767 -323.7193 -359.3030 -389.3848
## ht -435.3983 -461.8318 -483.3517 -497.1166
## ui -581.2733 -580.5885 -572.3122 -555.9427