Doc du lieu

t = "C:\\Users\\SONY\\Google Drive\\HOC TAP\\CH 22\\Phan tich du lieu\\Workshop GS Tuan\\Phan tich du lieu va ung dung\\Datasets for practice\\obesity data.csv"
ob = read.csv(t)

Bước 1: Phân tích mô tả / khai thác

## Loading required package: ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bước 2: Dùng BMA tìm biến liên quan

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
yvar = ob[, ("pcfat")]
xvars = ob[, c("gender", "height", "weight", "bmi", "age")]
bma = bicreg(xvars, yvar, strict=FALSE, OR=20)
summary(bma)
## 
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   3  models were selected
##  Best  3  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100.0    5.26146  4.582901   7.958e+00  -7.928e-01   8.137e+00
## genderM    100.0  -11.25139  0.429659  -1.144e+01  -1.143e+01  -1.081e+01
## height      31.4    0.01759  0.028494       .       5.598e-02       .    
## weight      39.2    0.03102  0.042611   7.921e-02       .           .    
## bmi        100.0    1.01265  0.111625   8.942e-01   1.089e+00   1.089e+00
## age        100.0    0.05259  0.008048   5.497e-02   5.473e-02   4.715e-02
##                                                                          
## nVar                                      4           4           3      
## r2                                      0.697       0.696       0.695    
## BIC                                    -1.423e+03  -1.423e+03  -1.422e+03
## post prob                               0.392       0.314       0.294

Bước 3: Chia dữ liệu thành 2 nhóm: development (60%) và validation (40%)

rows = nrow(ob)
prop = 0.6
upper = floor(prop*rows)
permutation = ob[sample(rows), ]
dev = permutation[1:upper, ]
val = permutation[(upper+1):rows, ]

Bước 4: Xây dựng mô hình dùng dữ liệu của dev

m = lm(pcfat ~ gender + age + bmi + weight, data=dev)
summary(m)
## 
## Call:
## lm(formula = pcfat ~ gender + age + bmi + weight, data = dev)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4126  -2.2632   0.0179   2.4536  15.5918 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.29732    1.06561   7.786 2.38e-14 ***
## genderM     -11.50467    0.43735 -26.305  < 2e-16 ***
## age           0.05962    0.00935   6.377 3.21e-10 ***
## bmi           0.83290    0.10185   8.177 1.30e-15 ***
## weight        0.09556    0.03613   2.645  0.00834 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.848 on 725 degrees of freedom
## Multiple R-squared:  0.7064, Adjusted R-squared:  0.7048 
## F-statistic: 436.2 on 4 and 725 DF,  p-value: < 2.2e-16

Bước 5: Kiểm tra mô hình (validation) # Kiểm tra mô hình dùng dữ liệu của val # Dùng hàm predict

m = lm(pcfat ~ gender + age + bmi + weight, data = dev)
val$pred = predict(m, newdata = val)
val$resid = val$pred-val$pcfat

Vẽ residuals vs giá trị tiên lượng

plot(val$resid ~ val$pred)

# Tính RMSE – residual mean square error

sum(val$resid^2) / (nrow(val)-5)
## [1] 17.25179

Tính R-square

cor(val$pred, val$pcfat)^2
## [1] 0.6822016

Training và testing mô hình qua “caret”

Chia mẫu thành development và validation

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
sample = createDataPartition(ob$pcfat, p=0.6, list=F)
dev = ob[sample, ]
val = ob[-sample, ]

Huấn luyện mô hình: dùng hàm “train”

control = trainControl(method="cv", number=10)
training = train(pcfat ~ gender + age + bmi + weight, data=dev,
method="lm", trControl=control, metric="Rsquared")
summary(training)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3448  -2.6180   0.2357   2.5550  15.6455 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.353905   1.063678   7.854 1.45e-14 ***
## genderM     -11.816194   0.446131 -26.486  < 2e-16 ***
## age           0.057895   0.009489   6.101 1.71e-09 ***
## bmi           0.802709   0.101419   7.915 9.27e-15 ***
## weight        0.107050   0.036585   2.926  0.00354 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.873 on 726 degrees of freedom
## Multiple R-squared:  0.7172, Adjusted R-squared:  0.7157 
## F-statistic: 460.4 on 4 and 726 DF,  p-value: < 2.2e-16

Đánh giá tầm quan trọng của các biến

m = lm(pcfat ~ gender + age + bmi + weight, data=ob)
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
calc.relimp(m, type="lmg", rela=T, rank=T)
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 4 Regressors: 
## gender age bmi weight 
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##               lmg
## gender 0.59317775
## age    0.06893066
## bmi    0.24613695
## weight 0.09175463
## 
## Average coefficients for different model sizes: 
## 
##                  1X          2Xs          3Xs          4Xs
## gender -10.51634414 -11.71834412 -11.80453842 -11.44430262
## age      0.12768705   0.10445197   0.05168496   0.05496623
## bmi      1.03619023   1.50631405   1.54278433   0.89419395
## weight   0.04319324  -0.05539405  -0.06907993   0.07920690