data

t="C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\obesity data.csv"
ob=read.csv(t)
head(ob)
##   id gender height weight  bmi age  bmc  bmd   fat  lean pcfat
## 1  1      F    150     49 21.8  53 1312 0.88 17802 28600  37.3
## 2  2      M    165     52 19.1  65 1309 0.84  8381 40229  16.8
## 3  3      F    157     57 23.1  64 1230 0.84 19221 36057  34.0
## 4  4      F    156     53 21.8  56 1171 0.80 17472 33094  33.8
## 5  5      M    160     51 19.9  54 1681 0.98  7336 40621  14.8
## 6  6      F    153     47 20.1  52 1358 0.91 14904 30068  32.2

#analysis

#chia thanh 02 nhom voi ti le 7:3; upper la diem chia bo du lieu. 
rows=nrow(ob)
prop=0.7
upper=floor(prop*rows)
permutation=ob[sample(rows),]
dev=permutation[1:upper,]
val=permutation[(upper+1):rows,]
# dim la ham cho biet so hang hay so id (bn)
dim(dev)
## [1] 851  11
dim(val)
## [1] 366  11
#xay dung mo hinh dung dev: theo mo hinh BMA da chon ra mo hinh 4 tham so voi r2 =0.69 va xac suat hau dinh pros prop la cao nhat 0.39. 
m=lm(pcfat~gender+age+bmi+weight, data=dev)
summary(m)
## 
## Call:
## lm(formula = pcfat ~ gender + age + bmi + weight, data = dev)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3541  -2.3794   0.0871   2.6034  15.5951 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.652717   0.977583   7.828 1.48e-14 ***
## genderM     -11.928480   0.404907 -29.460  < 2e-16 ***
## age           0.056284   0.008736   6.443 1.97e-10 ***
## bmi           0.793492   0.092505   8.578  < 2e-16 ***
## weight        0.126718   0.033854   3.743 0.000194 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.909 on 846 degrees of freedom
## Multiple R-squared:  0.7071, Adjusted R-squared:  0.7057 
## F-statistic: 510.6 on 4 and 846 DF,  p-value: < 2.2e-16

kiem dinh cho quan the val

# tinh gia tri tien luong cho val (dung tham so cua m)
val$pred=predict(m,newdata=val)
head(val,3)
##        id gender height weight  bmi age  bmc  bmd   fat  lean pcfat
## 1037 1047      F    149     53 23.9  59 1589 1.03 18296 32902  34.7
## 133   134      F    150     48 21.3  80  695 0.65 13823 26787  33.5
## 391   395      F    151     50 21.9  41 1936 1.13 14589 32963  29.5
##          pred
## 1037 36.65399
## 133  35.13929
## 391  33.67374
r2=cor(val$pcfat, val$pred)^2
r2
## [1] 0.6697813
# ve do thi moi tuong quan giua pct va bien moi pred
plot(val$pcfat~val$pred, pch=16)
#danh gia tam quan trong cua bien tien luong, dung du lieu cho 1217 doi tuong.
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 3.5.3
## Loading required package: MASS
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.

m=lm(pcfat~gender+age+weight+height+bmi, data=ob)
calc.relimp(m, type="lmg", rela=T, rank=T)
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 5 Regressors: 
## gender age weight height bmi 
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##               lmg
## gender 0.50802351
## age    0.05465515
## weight 0.09339118
## height 0.15217127
## bmi    0.19175889
## 
## Average coefficients for different model sizes: 
## 
##                  1X          2Xs          3Xs          4Xs          5Xs
## gender -10.51634414 -11.25778000 -11.37601905 -11.34809670 -11.44105012
## age      0.12768705   0.09380448   0.04849443   0.04545609   0.05493292
## weight   0.04319324   0.06043494   0.08062526   0.10196648   0.09394895
## height  -0.43201351  -0.39537155  -0.28180720  -0.13427073  -0.01099062
## bmi      1.03619023   1.38422758   1.37018157   1.10104878   0.85803534
# chu thich: them rela=T, rank=T : la lay 100% du lieu, neu khong co chi lay 25% du lieu , tuy nhien thu tu anh huong cua cac bien van theo dung thu tu
calc.relimp(m, type = "lmg")
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 5 Regressors: 
## gender age weight height bmi 
## Proportion of variance explained by model: 69.66%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##               lmg
## gender 0.35388610
## age    0.03807245
## weight 0.06505573
## height 0.10600159
## bmi    0.13357808
## 
## Average coefficients for different model sizes: 
## 
##                  1X          2Xs          3Xs          4Xs          5Xs
## gender -10.51634414 -11.25778000 -11.37601905 -11.34809670 -11.44105012
## age      0.12768705   0.09380448   0.04849443   0.04545609   0.05493292
## weight   0.04319324   0.06043494   0.08062526   0.10196648   0.09394895
## height  -0.43201351  -0.39537155  -0.28180720  -0.13427073  -0.01099062
## bmi      1.03619023   1.38422758   1.37018157   1.10104878   0.85803534

# dung BMA de kiem tra bien x nao anh huong den y

 #nhap du lieu
t="C:\\Users\\pc\\Downloads\\CAC NGHIEN CUU\\BAI GIANG CAC MON\\GS TUAN\\BG 12.6.19\\birthwt.csv"
bw=read.csv(t)
head(bw)
##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

analysis of data

# bien y la bwt(gram); bien x =age, lwt, race, smoke, ptl,ht, ui, ftv,)
# bien y la bwt(gram); bien x =age, lwt, race, smoke, ptl,ht, ui, ftv)
# dung BMA tim bien lien quan den bwt
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## The following object is masked from 'package:boot':
## 
##     salinity
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-4)
# dinh nghia bien x va y
yvar=bw[, c("bwt")]
xvars=bw[, c("age", "lwt", "race", "smoke", "ptl", "ht", "ui", "ftv")]
bma=bicreg(xvars, yvar, strict = FALSE, OR =20)
summary(bma)
## 
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   10  models were selected
##  Best  5  models (cumulative posterior probability =  0.8884 ): 
## 
##            p!=0    EV         SD       model 1   model 2   model 3 
## Intercept  100.0  3439.59745  293.382  3600.166  3104.438  3575.199
## age          2.7     0.03797    1.540      .         .         .   
## lwt         32.3     1.06174    1.813      .        3.434      .   
## race       100.0  -204.84945   57.309  -210.768  -187.849  -215.686
## smoke      100.0  -383.45469  105.883  -389.385  -366.135  -397.582
## ptl          7.4    -4.88165   32.782      .         .         .   
## ht          73.5  -393.12802  294.446  -497.117  -595.820      .   
## ui         100.0  -534.07331  138.284  -555.943  -523.419  -517.511
## ftv          4.5    -0.35508   10.015      .         .         .   
##                                                                    
## nVar                                        4         5         3  
## r2                                        0.203     0.221     0.175
## BIC                                     -21.891   -21.080   -20.743
## post prob                                 0.365     0.243     0.205
##            model 4   model 5 
## Intercept  3241.527  3601.038
## age            .         .   
## lwt           2.288      .   
## race       -201.062  -208.353
## smoke      -383.172  -375.269
## ptl            .      -71.592
## ht             .     -496.568
## ui         -490.751  -534.874
## ftv            .         .   
##                              
## nVar            4         5  
## r2            0.184     0.205
## BIC         -17.502   -17.157
## post prob     0.041     0.034
# xem bien nao anh huong dua vao r2 cua tung bien: phai goi ham hoi quy tuyen tinh ra (ham lm) roi moi kiem tra duoc bang ham calc.relimp. 
m1=lm(bwt~race+smoke+ht+ui, data=bw)
calc.relimp(m1,type="lmg")
## Response variable: bwt 
## Total response variance: 531753.5 
## Analysis based on 189 observations 
## 
## 4 Regressors: 
## race smoke ht ui 
## Proportion of variance explained by model: 20.29%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##              lmg
## race  0.05116788
## smoke 0.04937125
## ht    0.02479817
## ui    0.07754788
## 
## Average coefficients for different model sizes: 
## 
##              1X       2Xs       3Xs       4Xs
## race  -154.6132 -175.9747 -194.8922 -210.7681
## smoke -283.7767 -323.7193 -359.3030 -389.3848
## ht    -435.3983 -461.8318 -483.3517 -497.1166
## ui    -581.2733 -580.5885 -572.3122 -555.9427