# Location of the obesity CSV file (absolute path on the author's machine).
t <- "D:\\Datasets for practice\\obesity data.csv"
# Load the file into a data frame and preview its first rows.
ob <- read.csv(file = t)
head(ob)
##   id gender height weight  bmi age  bmc  bmd   fat  lean pcfat
## 1  1      F    150     49 21.8  53 1312 0.88 17802 28600  37.3
## 2  2      M    165     52 19.1  65 1309 0.84  8381 40229  16.8
## 3  3      F    157     57 23.1  64 1230 0.84 19221 36057  34.0
## 4  4      F    156     53 21.8  56 1171 0.80 17472 33094  33.8
## 5  5      M    160     51 19.9  54 1681 0.98  7336 40621  14.8
## 6  6      F    153     47 20.1  52 1358 0.91 14904 30068  32.2
# Randomly split `ob` into a development set (`dev`) and a validation set
# (`val`).
#
# The seed is fixed so that the shuffle, and every result derived from the
# split, is identical on each run.
# NOTE(review): the outputs recorded further down in this file came from an
# unseeded run, so refitting will give slightly different numbers.
set.seed(123)
rows <- nrow(ob)
prop <- 0.7                         # share of rows assigned to `dev`
upper <- floor(prop * rows)         # number of rows that go to `dev`
permutation <- ob[sample(rows), ]   # every row of `ob`, in random order
# seq_len() yields an empty index when `upper` is 0, whereas `1:upper` would
# yield c(1, 0) and silently select row 1.
dev <- permutation[seq_len(upper), ]
# The rows after the first `upper`. Written with seq_len() because
# `(upper + 1):rows` counts downwards when upper == rows and would return an
# NA row plus the last row instead of an empty validation set.
val <- permutation[upper + seq_len(rows - upper), ]
dim(dev)
## [1] 851  11
dim(val)
## [1] 366  11

Xây dựng mô hình dùng dev

# Regress `pcfat` on gender, age, bmi and weight using only the development
# rows, then print the fitted model's summary.
m <- lm(
  pcfat ~ gender + age + bmi + weight,
  data = dev
)
summary(m)
## 
## Call:
## lm(formula = pcfat ~ gender + age + bmi + weight, data = dev)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.1087  -2.5076  -0.0144   2.6404  15.4466 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.858198   1.004864   8.815  < 2e-16 ***
## genderM     -11.430503   0.410916 -27.817  < 2e-16 ***
## age           0.051942   0.008752   5.935 4.28e-09 ***
## bmi           0.881255   0.094960   9.280  < 2e-16 ***
## weight        0.067372   0.033397   2.017    0.044 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.922 on 846 degrees of freedom
## Multiple R-squared:  0.6918, Adjusted R-squared:  0.6903 
## F-statistic: 474.7 on 4 and 846 DF,  p-value: < 2.2e-16

Tính giá trị tiên lượng cho val (dùng tham số của m)

# Apply the model `m` to the validation rows and keep the predictions in a
# new `pred` column, next to the observed values.
val[["pred"]] <- predict(m, newdata = val)
# Show the first three validation rows, including the new column.
head(val, n = 3)
##      id gender height weight  bmi age  bmc  bmd   fat  lean pcfat     pred
## 715 721      M    175     88 28.7  36 2525 1.15 25704 57455  30.0 30.51840
## 86   87      F    151     51 22.4  78 1655 1.05 18166 30180  36.3 36.08580
## 56   56      F    156     54 22.2  64 1287 0.84 21264 30774  39.9 35.38447
# Squared correlation between the observed and predicted `pcfat` on the
# validation set.
r2 <- with(val, cor(pcfat, pred))^2
r2
## [1] 0.7103249
# Scatter plot of observed (y axis) against predicted (x axis) values.
plot(val$pcfat ~ val$pred, pch = 16, col = "blue")

# Đánh giá tầm quan trọng của các biến số tiên lượng (dùng toàn bộ dữ liệu: 1217 đối tượng)

library(relaimpo)# relative importance
## Loading required package: MASS
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
# Refit on the complete data set `ob`, this time with height as an extra
# predictor. Note that this replaces the earlier `m` fitted on `dev`.
m <- lm(
  pcfat ~ gender + age + weight + height + bmi,
  data = ob
)
# Report each regressor's relative importance using the "lmg" metric.
calc.relimp(m, type = "lmg")
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 5 Regressors: 
## gender age weight height bmi 
## Proportion of variance explained by model: 69.66%
## Metrics are not normalized (rela=FALSE). 
## 
## Relative importance metrics: 
## 
##               lmg
## gender 0.35388610
## age    0.03807245
## weight 0.06505573
## height 0.10600159
## bmi    0.13357808
## 
## Average coefficients for different model sizes: 
## 
##                  1X          2Xs          3Xs          4Xs          5Xs
## gender -10.51634414 -11.25778000 -11.37601905 -11.34809670 -11.44105012
## age      0.12768705   0.09380448   0.04849443   0.04545609   0.05493292
## weight   0.04319324   0.06043494   0.08062526   0.10196648   0.09394895
## height  -0.43201351  -0.39537155  -0.28180720  -0.13427073  -0.01099062
## bmi      1.03619023   1.38422758   1.37018157   1.10104878   0.85803534

Mô tả: theo bảng trên, gender có chỉ số lmg (phần đóng góp vào R²) cao nhất, khoảng 0,354 trong tổng R² là 0,697, nên đây là biến có giá trị tiên lượng quan trọng nhất.