Doc du lieu
t = "C:\\Users\\SONY\\Google Drive\\HOC TAP\\CH 22\\Phan tich du lieu\\Workshop GS Tuan\\Phan tich du lieu va ung dung\\Datasets for practice\\obesity data.csv"
ob = read.csv(t)
Bước 1: Phân tích mô tả / khai thác
## Loading required package: ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Bước 2: Dùng BMA tìm biến liên quan
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
yvar = ob[, ("pcfat")]
xvars = ob[, c("gender", "height", "weight", "bmi", "age")]
bma = bicreg(xvars, yvar, strict=FALSE, OR=20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.958e+00 -7.928e-01 8.137e+00
## genderM 100.0 -11.25139 0.429659 -1.144e+01 -1.143e+01 -1.081e+01
## height 31.4 0.01759 0.028494 . 5.598e-02 .
## weight 39.2 0.03102 0.042611 7.921e-02 . .
## bmi 100.0 1.01265 0.111625 8.942e-01 1.089e+00 1.089e+00
## age 100.0 0.05259 0.008048 5.497e-02 5.473e-02 4.715e-02
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1.423e+03 -1.423e+03 -1.422e+03
## post prob 0.392 0.314 0.294
Bước 3: Chia dữ liệu thành 2 nhóm: development (60%) và validation (40%)
rows = nrow(ob)
prop = 0.6
upper = floor(prop*rows)
permutation = ob[sample(rows), ]
dev = permutation[1:upper, ]
val = permutation[(upper+1):rows, ]
Bước 4: Xây dựng mô hình dùng dữ liệu của dev
m = lm(pcfat ~ gender + age + bmi + weight, data=dev)
summary(m)
##
## Call:
## lm(formula = pcfat ~ gender + age + bmi + weight, data = dev)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.4126 -2.2632 0.0179 2.4536 15.5918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.29732 1.06561 7.786 2.38e-14 ***
## genderM -11.50467 0.43735 -26.305 < 2e-16 ***
## age 0.05962 0.00935 6.377 3.21e-10 ***
## bmi 0.83290 0.10185 8.177 1.30e-15 ***
## weight 0.09556 0.03613 2.645 0.00834 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.848 on 725 degrees of freedom
## Multiple R-squared: 0.7064, Adjusted R-squared: 0.7048
## F-statistic: 436.2 on 4 and 725 DF, p-value: < 2.2e-16
Bước 5: Kiểm tra mô hình (validation) # Kiểm tra mô hình dùng dữ liệu của val # Dùng hàm predict
m = lm(pcfat ~ gender + age + bmi + weight, data = dev)
val$pred = predict(m, newdata = val)
val$resid = val$pred-val$pcfat
plot(val$resid ~ val$pred)
# Tính RMSE – residual mean square error
sum(val$resid^2) / (nrow(val)-5)
## [1] 17.25179
cor(val$pred, val$pcfat)^2
## [1] 0.6822016
Chia mẫu thành development và validation
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
sample = createDataPartition(ob$pcfat, p=0.6, list=F)
dev = ob[sample, ]
val = ob[-sample, ]
control = trainControl(method="cv", number=10)
training = train(pcfat ~ gender + age + bmi + weight, data=dev,
method="lm", trControl=control, metric="Rsquared")
summary(training)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.3448 -2.6180 0.2357 2.5550 15.6455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.353905 1.063678 7.854 1.45e-14 ***
## genderM -11.816194 0.446131 -26.486 < 2e-16 ***
## age 0.057895 0.009489 6.101 1.71e-09 ***
## bmi 0.802709 0.101419 7.915 9.27e-15 ***
## weight 0.107050 0.036585 2.926 0.00354 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.873 on 726 degrees of freedom
## Multiple R-squared: 0.7172, Adjusted R-squared: 0.7157
## F-statistic: 460.4 on 4 and 726 DF, p-value: < 2.2e-16
Đánh giá tầm quan trọng của các biến
m = lm(pcfat ~ gender + age + bmi + weight, data=ob)
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
calc.relimp(m, type="lmg", rela=T, rank=T)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 4 Regressors:
## gender age bmi weight
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## gender 0.59317775
## age 0.06893066
## bmi 0.24613695
## weight 0.09175463
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## gender -10.51634414 -11.71834412 -11.80453842 -11.44430262
## age 0.12768705 0.10445197 0.05168496 0.05496623
## bmi 1.03619023 1.50631405 1.54278433 0.89419395
## weight 0.04319324 -0.05539405 -0.06907993 0.07920690