Muc tieu: Tim mo hinh sao cho it bien so nhat, nhung giai thich phuong sai nhieu nhat

#Trong mot mo hinh da bien, cau hoi quan trong la bien X nao co lien quan den Y mot cach doc lap? Lam sao de phat hien nhung bien do? #Phan tich mo hinh da bien, thay ca bien khi phan tich don bien co y nghia lai tro nen ko co YNTK?? (Ans: Do la weight va height co lien quan den bmi - la bien co lien quan den pcfat –> YNTK gia) #Giai phap Akaike: AIC (Akaike Information Criterion) #Giai phap BIC (Bayesian Information Criterion) # AIC, BIC cang thap, mo hinh cang TOT # –> “BMA” package trong R.

# Load the obesity data set and preview its first rows.
ob <- read.csv("C:\\Users\\Thu Bo\\Desktop\\obesity data.csv")
head(ob)

# Full linear model: pcfat regressed on every candidate predictor at once.
m <- lm(pcfat ~ gender + age + height + weight + bmi, data = ob)
summary(m)

# Reading the BMA summary:
# A variable shown in a single colour, with pi = 100, appears in all 3 models
# --> it is important.
# pi: the proportion of models in which the variable appears.
# EV: expected value, i.e. the mean of the parameter (the variable's
#   coefficient) across all models. E.g. on average men have a fat percentage
#   11.2% lower than women; each extra cm of height is associated with a 0.02%
#   increase in fat percentage.
# model: nVar (number of variables), r2: R-squared, post prob: posterior
#   probability (degree of consistency) in %, BIC.
# Choosing the "optimal" model (there is no "best" model): highest r2, highest
#   post prob (posterior probability), lowest BIC, fewest variables
#   ---> choose 1-3-2.
# With too many input variables, post prob is usually low (< 10%) even when R2
#   is high --> pre-select the input variables (based on the medical
#   literature and mechanism).
library(BMA)
yvar <- ob[["pcfat"]]
xvars <- ob[c("gender", "height", "weight", "bmi", "age")]
bma <- bicreg(xvars, yvar, strict = FALSE, OR = 20)
summary(bma)

# Build the prognostic model for the population.
#
# `dev` (the development subset) is only created further down, in the
# data-splitting step, so on a fresh top-to-bottom run it does not exist yet
# and lm() would stop with "object 'dev' not found". Use `dev` when it is
# already available (same fit as before); otherwise fall back to the full
# data set `ob`.
fit_data <- if (exists("dev") && is.data.frame(dev)) dev else ob
m <- lm(pcfat ~ gender + age + bmi + weight, data = fit_data)
m

KET QUA: pcfat = (he so chan) - 12.13*Male + 0.052*age + 0.85*bmi + 0.11*weight

MO HINH TIEN LUONG (5 BUOC) #Buoc1: Phan tich mo ta/ khai thac

# Keep the outcome and the candidate predictors, then draw the pairwise
# plot matrix for a first descriptive look at the data.
library(GGally)
dat <- ob[, c("age", "gender", "bmi", "height", "weight", "pcfat")]
ggpairs(dat)

# Step 2: Use BMA to find the relevant variables --> choose the optimal
# prognostic model (as above)

library(BMA)
# Outcome vector and predictor data frame passed to bicreg().
yvar <- ob$pcfat
xvars <- ob[, c("gender", "height", "weight", "bmi", "age")]
bma <- bicreg(
  xvars,
  yvar,
  strict = FALSE,
  OR = 20
)
summary(bma)

# Step 3: Randomly split the data into two groups (validation and development)

# Manual approach
# Fix the RNG seed so the shuffle -- and therefore everything computed from
# `dev` and `val` afterwards -- is the same on every run.
set.seed(2019)
rows <- nrow(ob)
prop <- 0.6 # share of rows assigned to the development set
upper <- floor(prop * rows)
permutation <- ob[sample(rows), ]
# Index with seq_len()/setdiff() rather than 1:upper and (upper + 1):rows:
# the colon forms count backwards when upper is 0 or equals rows, which would
# put the wrong rows (or the same row twice) into the two subsets.
dev_rows <- seq_len(upper)
dev <- permutation[dev_rows, ]
val <- permutation[setdiff(seq_len(rows), dev_rows), ]
dim(val)
dim(dev)

# Step 4: Build the training model

# Fit the selected predictors on the development subset only.
m <- lm(
  formula = pcfat ~ gender + age + bmi + weight,
  data = dev
)
summary(m)

# Step 5: Validate the model

# Check the model against the validation data (val)
# Use predict()
val$pred <- predict(m, newdata = val)
val$resid <- val$pred - val$pcfat # predicted minus observed
head(val, 3)
# Plot residuals against predicted values
plot(val$resid ~ val$pred)
# RMSE -- root mean square error of the validation predictions.
# The earlier expression, sum(resid^2) / (nrow(val) - 5), never took the
# square root, so it was a mean square on the squared scale of pcfat, and its
# "- 5" degrees-of-freedom correction belongs to the data the model was fitted
# on, not to held-out data. sqrt(mean(.)) is on the scale of pcfat and can be
# compared with the RMSE that defaultSummary() reports in the caret section.
RMSE <- sqrt(mean(val$resid^2))
# R-squared: squared correlation between predicted and observed values
Rsqr <- cor(val$pred, val$pcfat)^2
plot(val$pcfat ~ val$pred, pch = 16)
abline(lm(val$pcfat ~ val$pred), col = "blue")

Training và testing mô hình qua “caret”

library(caret)
# Split the sample into development and validation sets.
# The row index is called train_rows rather than `sample`, so it no longer
# reuses the name of base::sample(); list = FALSE is spelled out because T/F
# are ordinary variables that can be reassigned.
train_rows <- createDataPartition(ob$pcfat, p = 0.6, list = FALSE)
dev <- ob[train_rows, ]
val <- ob[-train_rows, ]
# Train the model with train(), using 10-fold cross-validation
control <- trainControl(method = "cv", number = 10)
training <- train(
  pcfat ~ gender + age + bmi + weight,
  data = dev,
  method = "lm",
  trControl = control,
  metric = "Rsquared"
)
summary(training)
# Check the model on val: compute the predicted values
pred <- predict(training, newdata = val)
model.values <- data.frame(obs = val$pcfat, pred)
plot(pred ~ val$pcfat, pch = 16)
abline(lm(pred ~ val$pcfat), col = "blue")
defaultSummary(model.values)

# Assess the relative importance of the variables

library(relaimpo)

# Refit the chosen model on the full data set, then decompose its fit with
# calc.relimp() using the "lmg" metric.
m <- lm(pcfat ~ age + bmi + gender + weight, data = ob)
summary(m)
calc.relimp(m, type = "lmg", rela = TRUE, rank = TRUE)

Exercise 15.6.19 Chon mo hinh hoi quy tuyen tinh

Thu Nguyen

6/15/2019

Muc tieu: Tim mo hinh sao cho it bien so nhat, nhung giai thich phuong sai nhieu nhat