Lý thuyết mô hình hồi quy tuyến tính đa biến

ob=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Obesity data.csv")
library(lessR)
## 
## lessR 4.4.3                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
## 
##     sort_by
fit=lm(pcfat~bmi,data=ob); summary(fit)
## 
## Call:
## lm(formula = pcfat ~ bmi, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.612  -4.181   1.392   4.690  18.241 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  8.39889    1.36777   6.141        0.00000000111 ***
## bmi          1.03619    0.06051  17.123 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.45 on 1215 degrees of freedom
## Multiple R-squared:  0.1944, Adjusted R-squared:  0.1937 
## F-statistic: 293.2 on 1 and 1215 DF,  p-value: < 0.00000000000000022
fit=lm(pcfat~weight,data=ob); summary(fit)
## 
## Call:
## lm(formula = pcfat ~ weight, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3122  -4.5234   0.8902   5.2695  16.9742 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.22295    1.22370  23.881 <0.0000000000000002 ***
## weight       0.04319    0.02188   1.975              0.0485 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared:  0.003199,   Adjusted R-squared:  0.002378 
## F-statistic: 3.899 on 1 and 1215 DF,  p-value: 0.04855
fit=lm(pcfat~gender,data=ob); summary(fit)
## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9 <0.0000000000000002 ***
## genderM     -10.5163     0.3381   -31.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 0.00000000000000022

Thử với 2 biến

fit=lm(pcfat~bmi+gender,data=ob); summary(fit)
## 
## Call:
## lm(formula = pcfat ~ bmi + gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.4709  -2.4780   0.1773   2.6903  15.1761 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   9.01035    0.85880   10.49 <0.0000000000000002 ***
## bmi           1.15303    0.03809   30.27 <0.0000000000000002 ***
## genderM     -11.06631    0.25599  -43.23 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.049 on 1214 degrees of freedom
## Multiple R-squared:  0.6828, Adjusted R-squared:  0.6822 
## F-statistic:  1306 on 2 and 1214 DF,  p-value: < 0.00000000000000022

Thử với nhiều biến

fit=lm(pcfat~bmi+gender+age+weight+height,data=ob); summary(fit)
## 
## Call:
## lm(formula = pcfat ~ bmi + gender + age + weight + height, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.221  -2.552   0.033   2.619  15.649 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   9.677701  15.611478   0.620               0.5354    
## bmi           0.858035   0.337409   2.543               0.0111 *  
## genderM     -11.441050   0.343970 -33.262 < 0.0000000000000002 ***
## age           0.054933   0.007404   7.420     0.00000000000022 ***
## weight        0.093949   0.136641   0.688               0.4919    
## height       -0.010991   0.099609  -0.110               0.9122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.965 on 1211 degrees of freedom
## Multiple R-squared:  0.6966, Adjusted R-squared:  0.6953 
## F-statistic: 556.1 on 5 and 1211 DF,  p-value: < 0.00000000000000022

Biểu diễn scatterplot xem bmi và weight có tương quan thế nào

ScatterPlot(bmi,weight,data=ob)

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(bmi, weight, enhance=TRUE)  # many options
## Plot(bmi, weight, fill="skyblue")  # interior fill color of points
## Plot(bmi, weight, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(bmi, weight, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of bmi and weight: r = 0.787 
##   
## Hypothesis Test of 0 Correlation:  t = 44.452,  df = 1215,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.765 to 0.807 
## 

#Thực hành ## Phân tích tương quan ### Việc 1. Đọc dữ liệu “Bone data.csv” vào R và gọi dữ liệu là “df”

df=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Bone data.csv")

Việc 2. Đánh giá mối liên quan giữa cân nặng (weight) và mật độ xương tại cổ xương đùi (fnbmd)

library(lessR)
ScatterPlot(weight,fnbmd,data=df,fit="loess")

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(weight, fnbmd, enhance=TRUE)  # many options
## Plot(weight, fnbmd, fill="skyblue")  # interior fill color of points
## Plot(weight, fnbmd, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
##    Loess Model MSE = 0.0156
## 

Hồi qui tuyến tính

Việc 3. Đọc dữ liệu “Obesity data.csv” vào R và gọi dữ liệu là “ob”

ob=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Obesity data.csv")

Việc 4. So sánh tỉ trọng mỡ (pcfat) giữa nam và nữ (gender)

fit4=lm(pcfat~gender,data=ob)
summary(fit4)
## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9 <0.0000000000000002 ***
## genderM     -10.5163     0.3381   -31.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 0.00000000000000022
#genderM: chữ M là biến so sánh (Female sẽ là biến tham chiếu), intercept 34.67 chính là pcfat của biến tham chiếu - nữ

####Kiểm tra giả định

#cách 1:
par(mfrow=c(2,2)) #để chia màn hình thành 4 ô cho dễ thấy tất cả 4 hình biểu đồ
plot(fit4)

#hình 1: giá trị tuyệt đối: residual dao động từ -20 -> 10
#hình 3: giá trị dao động bao nhiêu độ lệch chuẩn
#hình Q-Q: các giá trị nằm trên đường gạch thì mô hình tốt
#hình Residual vs Leverage
#hình như ví dụ là mô hình tốt
#Kết luận: thực tế chỉ dùng biểu đồ để kiểm định mô hình đạt chuẩn hay ko

#Cách 2
library(lessR)
fit4.2=reg(pcfat~gender,data=ob); summary(fit4.2)
## 
## >>>  gender is not numeric. Converted to indicator variables.

##                 Length Class      Mode     
## out_suggest        1   out        character
## call               3   -none-     call     
## formula            3   formula    call     
## vars               2   -none-     character
## out_title_bck      1   out        character
## out_background     7   out        character
## out_title_basic    1   out        character
## out_estimates      3   out        character
## out_fit            7   out        character
## out_anova          6   out        character
## out_title_mod      1   out        character
## out_mod            1   out        character
## out_mdls           1   out        character
## out_title_kfold    1   out        character
## out_kfold          1   out        character
## out_title_rel      1   out        character
## out_cor            3   out        character
## out_collinear      1   out        character
## out_subsets        1   out        character
## out_title_res      1   out        character
## out_residuals     25   out        character
## out_title_pred     1   out        character
## out_predict       11   out        character
## out_ref            1   out        character
## out_Rmd            1   out        character
## out_Word           1   out        character
## out_pdf            1   out        character
## out_odt            1   out        character
## out_rtf            1   out        character
## out_plots          4   out        character
## n.vars             1   -none-     numeric  
## n.obs              1   -none-     numeric  
## n.keep             1   -none-     numeric  
## coefficients       2   -none-     numeric  
## sterrs             2   -none-     numeric  
## tvalues            2   -none-     numeric  
## pvalues            2   -none-     numeric  
## cilb               2   -none-     numeric  
## ciub               2   -none-     numeric  
## anova_model        5   -none-     numeric  
## anova_residual     3   -none-     numeric  
## anova_total        3   -none-     numeric  
## se                 1   -none-     numeric  
## resid_range        1   -none-     numeric  
## Rsq                1   -none-     numeric  
## Rsqadj             1   -none-     numeric  
## PRESS              1   -none-     numeric  
## RsqPRESS           1   -none-     numeric  
## m_se               1   -none-     logical  
## m_MSE              1   -none-     logical  
## m_Rsq              1   -none-     logical  
## cor                4   -none-     numeric  
## tolerances         1   -none-     logical  
## vif                1   -none-     logical  
## resid.max         20   -none-     numeric  
## pred_min_max       2   -none-     numeric  
## residuals       1217   -none-     numeric  
## fitted          1217   -none-     numeric  
## cooks.distance  1217   -none-     numeric  
## model              2   data.frame list     
## terms              3   terms      call
#R2: tính trực tiếp từ các lỗi(khác biệt giữa giá trị tính từ mô hình so với thực tế), nếu chỉ có 1 biến số thì bằng adjusted R2
#Adjusted R2: tính từ trung bình các lỗi, nên mô hình có nhiều biến số thì nên dùng Adjusted R2 
fit4.2
## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## reg(pcfat ~ gender, data=ob, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  ob 
##  
## Response Variable: pcfat 
## Predictor Variable: genderM 
##  
## Number of cases (rows) of data:  1217 
## Number of cases retained for analysis:  1217 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept)  34.672413   0.182622  189.859    0.000    34.314123    35.030703 
##     genderM -10.516344   0.338131  -31.101    0.000   -11.179729    -9.852959 
## 
## Standard deviation of pcfat: 7.182861 
##  
## Standard deviation of residuals:  5.361759 for df=1215 
## 95% range of residuals:  21.038669 = 2 * (1.962 * 5.361759) 
##  
## R-squared: 0.443    Adjusted R-squared: 0.443    PRESS R-squared: 0.441 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 967.297     df: 1 and 1215     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##                df        Sum Sq       Mean Sq     F-value   p-value 
## Model           1  27808.311497  27808.311497  967.297285     0.000 
## Residuals    1215  34929.384159     28.748464 
## pcfat        1216  62737.695656     51.593500 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           pcfat genderM 
##     pcfat  1.00   -0.67 
##   genderM -0.67    1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"] 
## --------------------------------------------------------------------------- 
##         genderM     pcfat    fitted      resid    rstdnt    dffits    cooks 
##    210        1  9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020 
##    509        1 39.000000 24.156069  14.843931  2.780055  0.147758 0.010860 
##    179        1 38.700000 24.156069  14.543931  2.723523  0.144754 0.010420 
##    518        1  9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300 
##    200        1  9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150 
##    563        1 38.300000 24.156069  14.143931  2.648179  0.140749 0.009860 
##    318        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    972        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    388        1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920 
##    203        1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530 
##   1137        0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150 
##    893        0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070 
##    688        1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020 
##    403        1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640 
##    858        1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400 
##    158        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##   1106        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##    827        1 36.000000 24.156069  11.843931  2.215637  0.117760 0.006910 
##    756        1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810 
##    196        1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##         genderM     pcfat      pred   s_pred    pi.lwr    pi.upr     width 
##      2        1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      5        1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
## ... 
##   1209        1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      1        0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##      3        0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869 
## ... 
##   1215        0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1216        0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1217        0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------
#fitted: giá trị tính toán theo mô hình dự đoán
#residual:sự khác biệt của giá trị dự đoán với thực tế
#dffits và cooks: đánh giá mức độ ảnh hưởng của biến số lên mô hình

Việc 5. Đánh giá mối liên quan giữa cân nặng (weight) và tỉ trọng mỡ (pcfat)

ScatterPlot(weight,pcfat,data=ob,fit="loess")

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE)  # many options
## Plot(weight, pcfat, color="red")  # exterior edge color of points
## Plot(weight, pcfat, out_cut=.10)  # label top 10% from center as outliers 
## 
##    Loess Model MSE = 49.325299
## 
fit5=reg(pcfat~weight,data=ob)

fit5
## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## reg(pcfat ~ weight, data=ob, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  ob 
##  
## Response Variable: pcfat 
## Predictor Variable: weight 
##  
## Number of cases (rows) of data:  1217 
## Number of cases retained for analysis:  1217 
## 
## 
##   BASIC ANALYSIS 
## 
##              Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
## (Intercept) 29.222947   1.223696   23.881    0.000   26.822156   31.623738 
##      weight  0.043193   0.021875    1.975    0.049    0.000276    0.086111 
## 
## Standard deviation of pcfat: 7.182861 
##  
## Standard deviation of residuals:  7.174315 for df=1215 
## 95% range of residuals:  28.150843 = 2 * (1.962 * 7.174315) 
##  
## R-squared: 0.003    Adjusted R-squared: 0.002    PRESS R-squared: 0.000 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 3.899     df: 1 and 1215     p-value:  0.049 
## 
## -- Analysis of Variance 
##  
##                df        Sum Sq     Mean Sq   F-value   p-value 
## Model           1    200.669670  200.669670  3.898709     0.049 
## Residuals    1215  62537.025985   51.470803 
## pcfat        1216  62737.695656   51.593500 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##          pcfat weight 
##    pcfat  1.00   0.06 
##   weight  0.06   1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"] 
## ---------------------------------------------------------------------------- 
##           weight     pcfat    fitted      resid    rstdnt    dffits    cooks 
##    373        85 47.400000 32.894372  14.505628  2.033775  0.194997 0.018960 
##    923        95 43.500000 33.326305  10.173695  1.429871  0.179944 0.016180 
##    972        42 10.300000 31.037063 -20.737063 -2.902805 -0.143205 0.010190 
##    716        74 17.600000 32.419246 -14.819246 -2.072679 -0.133434 0.008880 
##    858        42 11.900000 31.037063 -19.137063 -2.677456 -0.132088 0.008680 
##    177        38 15.700000 30.864290 -15.164290 -2.120502 -0.126644 0.008000 
##   1071        70 48.100000 32.246474  15.853526  2.216504  0.118990 0.007060 
##    943        73 18.700000 32.376053 -13.676053 -1.911957 -0.117867 0.006930 
##    876        34 18.800000 30.691517 -11.891517 -1.662860 -0.117617 0.006910 
##    245        35 18.400000 30.734710 -12.334710 -1.724650 -0.117167 0.006850 
##    762        80 22.600000 32.678406 -10.078406 -1.409997 -0.114628 0.006560 
##     88        68 15.300000 32.160087 -16.860087 -2.357246 -0.114610 0.006540 
##    200        47  9.800000 31.253029 -21.453029 -3.002259 -0.113942 0.006450 
##    688        46 11.400000 31.209836 -19.809836 -2.771011 -0.110895 0.006120 
##    184        39 17.200000 30.907483 -13.707483 -1.915842 -0.109309 0.005960 
##    388        47 10.700000 31.253029 -20.553029 -2.875431 -0.109129 0.005920 
##    895        65 13.600000 32.030507 -18.430507 -2.577138 -0.107125 0.005710 
##    276        40 16.900000 30.950676 -14.050676 -1.963672 -0.106882 0.005700 
##    528        39 17.600000 30.907483 -13.307483 -1.859774 -0.106110 0.005620 
##    441        72 20.000000 32.332860 -12.332860 -1.723410 -0.101599 0.005150 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##           weight     pcfat      pred   s_pred    pi.lwr    pi.upr     width 
##    876        34 18.800000 30.691517 7.192151 16.581104 44.801929 28.220825 
##     55        35 27.100000 30.734710 7.190777 16.626993 44.842427 28.215435 
##    245        35 18.400000 30.734710 7.190777 16.626993 44.842427 28.215435 
## ... 
##   1211        54 34.100000 31.555382 7.177306 17.474093 45.636670 28.162577 
##     20        55 19.300000 31.598575 7.177263 17.517370 45.679779 28.162409 
##     27        55 37.200000 31.598575 7.177263 17.517370 45.679779 28.162409 
## ... 
##    402        93 32.300000 33.239918 7.224879 19.065295 47.414541 28.349246 
##    628        95 32.900000 33.326305 7.230024 19.141587 47.511022 28.369435 
##    891        95 30.100000 33.326305 7.230024 19.141587 47.511022 28.369435 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------

####Kiểm định

fit5.1=lm(pcfat~weight,data=ob);summary(fit5.1)
## 
## Call:
## lm(formula = pcfat ~ weight, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3122  -4.5234   0.8902   5.2695  16.9742 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.22295    1.22370  23.881 <0.0000000000000002 ***
## weight       0.04319    0.02188   1.975              0.0485 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared:  0.003199,   Adjusted R-squared:  0.002378 
## F-statistic: 3.899 on 1 and 1215 DF,  p-value: 0.04855
library(ggfortify)
## Loading required package: ggplot2
autoplot(fit5.1)

###Việc 6 Đánh giá mối liên quan độc lập giữa cân nặng (weight) và tỉ trọng mỡ (pcfat)

fit6=lm(pcfat~weight+age+gender+height,data=ob); summary(fit6)
## 
## Call:
## lm(formula = pcfat ~ weight + age + gender + height, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.208  -2.543   0.019   2.582  15.706 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  48.368722   3.505431  13.798 < 0.0000000000000002 ***
## weight        0.439169   0.015594  28.163 < 0.0000000000000002 ***
## age           0.056166   0.007404   7.585   0.0000000000000658 ***
## genderM     -11.483254   0.344343 -33.348 < 0.0000000000000002 ***
## height       -0.257013   0.023768 -10.813 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.974 on 1212 degrees of freedom
## Multiple R-squared:  0.695,  Adjusted R-squared:  0.694 
## F-statistic: 690.4 on 4 and 1212 DF,  p-value: < 0.00000000000000022
autoplot(fit6)

# mô hình hiệu chỉnh: tất cả các biến số đều được đưa vào để xác định các yếu tố ảnh hưởng như thế nào
# dùng mô hình hồi quy tuyến tính đa biến để đánh giá liên quan độc lập của các biến số
#F stastistic: so sánh mô hình hiện tại với mô hình cơ bản, không có biến x (tức là so sánh với giá trị trung bình)

###Việc 7 Xây dựng mô hình dự báo tỉ trọng mỡ (pcfat)

# mô hình dự báo: tất cả các biến số dự báo đầu phải có ý nghĩa thống kê
#gọi package BMA
library(BMA) 
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
#định nghĩa biến phụ thuộc yvar và biến tiên lượng xvar
yvar=ob[,c("pcfat")]
xvar=ob[,c("gender","height","weight","bmi","age")]
#dùng hàm bicreg
m.bma=bicreg(xvar,yvar,strict=FALSE,OR=20)
summary(m.bma)
## 
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   3  models were selected
##  Best  3  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1      model 2      model 3    
## Intercept  100.0    5.26146  4.582901      7.95773     -0.79279      8.13735
## genderM    100.0  -11.25139  0.429659    -11.44430    -11.42764    -10.80625
## height      31.4    0.01759  0.028494        .          0.05598        .    
## weight      39.2    0.03102  0.042611      0.07921        .            .    
## bmi        100.0    1.01265  0.111625      0.89419      1.08852      1.08936
## age        100.0    0.05259  0.008048      0.05497      0.05473      0.04715
##                                                                             
## nVar                                         4            4            3    
## r2                                         0.697        0.696        0.695  
## BIC                                    -1423.06312  -1422.62198  -1422.49027
## post prob                                  0.392        0.314        0.294
imageplot.bma(m.bma)

#p!=0: giải thích các biến số xuất hiện trong bao nhiêu mô hình (VD: gender xuất hiện trong 100% mô hình, weight xuất hiện trong 39.2%)
#bic: chênh lệch 2 đơn vị thì tuong đương nhau
#có quyền chọn 1 trong 5 mô hình tối ưu nhất

chứng minh có hiện tượng đa cộng tuyến

#chạy mô hình tuyến tính theo model tối ưu
fit7=lm(pcfat~gender+weight+age+bmi,data=ob)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
## 
##     bc, recode, sp
vif(fit7) #vif(m.bma) 
##   gender   weight      age      bmi 
## 1.878778 5.609651 1.263468 4.663425
##vif=1: ko có, <5: có thể, 5-10: trung bình (thận trọng), >10: bắt buộc làm lại mô hình

####Kiểm định tầm quan trọng

m.bma=lm(pcfat~gender + age + weight + bmi, data = ob);summary(m.bma)
## 
## Call:
## lm(formula = pcfat ~ gender + age + weight + bmi, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.225  -2.557   0.033   2.608  15.646 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   7.957732   0.852500   9.335 < 0.0000000000000002 ***
## genderM     -11.444303   0.342565 -33.408 < 0.0000000000000002 ***
## age           0.054966   0.007395   7.433    0.000000000000199 ***
## weight        0.079207   0.028620   2.768              0.00573 ** 
## bmi           0.894194   0.080297  11.136 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.963 on 1212 degrees of freedom
## Multiple R-squared:  0.6966, Adjusted R-squared:  0.6956 
## F-statistic: 695.7 on 4 and 1212 DF,  p-value: < 0.00000000000000022
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
ob$sex = ifelse(ob$gender == "F", 1, 0)
m.bma2 = lm(pcfat ~ sex + age + weight + bmi, data = ob)
calc.relimp(m.bma2, type = "lmg", rela = TRUE, rank = TRUE)
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 4 Regressors: 
## sex age weight bmi 
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##               lmg
## sex    0.59317775
## age    0.06893066
## weight 0.09175463
## bmi    0.24613695
## 
## Average coefficients for different model sizes: 
## 
##                 1X         2Xs         3Xs         4Xs
## sex    10.51634414 11.71834412 11.80453842 11.44430262
## age     0.12768705  0.10445197  0.05168496  0.05496623
## weight  0.04319324 -0.05539405 -0.06907993  0.07920690
## bmi     1.03619023  1.50631405  1.54278433  0.89419395
#biến nhị phân: dùng random forrest

#Hồi quy logistic: 1 biến số cần tối thiểu 20 mẫu #Gói chuyên biệt giải quyết cỡ mẫu: pmsamplesize