ob=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Obesity data.csv")
library(lessR)
##
## lessR 4.4.3 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
##
## sort_by
fit=lm(pcfat~bmi,data=ob); summary(fit)
##
## Call:
## lm(formula = pcfat ~ bmi, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.612 -4.181 1.392 4.690 18.241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.39889 1.36777 6.141 0.00000000111 ***
## bmi 1.03619 0.06051 17.123 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.45 on 1215 degrees of freedom
## Multiple R-squared: 0.1944, Adjusted R-squared: 0.1937
## F-statistic: 293.2 on 1 and 1215 DF, p-value: < 0.00000000000000022
fit=lm(pcfat~weight,data=ob); summary(fit)
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <0.0000000000000002 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
fit=lm(pcfat~gender,data=ob); summary(fit)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <0.0000000000000002 ***
## genderM -10.5163 0.3381 -31.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 0.00000000000000022
fit=lm(pcfat~bmi+gender,data=ob); summary(fit)
##
## Call:
## lm(formula = pcfat ~ bmi + gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.4709 -2.4780 0.1773 2.6903 15.1761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.01035 0.85880 10.49 <0.0000000000000002 ***
## bmi 1.15303 0.03809 30.27 <0.0000000000000002 ***
## genderM -11.06631 0.25599 -43.23 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.049 on 1214 degrees of freedom
## Multiple R-squared: 0.6828, Adjusted R-squared: 0.6822
## F-statistic: 1306 on 2 and 1214 DF, p-value: < 0.00000000000000022
fit=lm(pcfat~bmi+gender+age+weight+height,data=ob); summary(fit)
##
## Call:
## lm(formula = pcfat ~ bmi + gender + age + weight + height, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.221 -2.552 0.033 2.619 15.649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.677701 15.611478 0.620 0.5354
## bmi 0.858035 0.337409 2.543 0.0111 *
## genderM -11.441050 0.343970 -33.262 < 0.0000000000000002 ***
## age 0.054933 0.007404 7.420 0.00000000000022 ***
## weight 0.093949 0.136641 0.688 0.4919
## height -0.010991 0.099609 -0.110 0.9122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.965 on 1211 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6953
## F-statistic: 556.1 on 5 and 1211 DF, p-value: < 0.00000000000000022
ScatterPlot(bmi,weight,data=ob)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(bmi, weight, enhance=TRUE) # many options
## Plot(bmi, weight, fill="skyblue") # interior fill color of points
## Plot(bmi, weight, fit="lm", fit_se=c(.90,.99)) # fit line, stnd errors
## Plot(bmi, weight, out_cut=.10) # label top 10% from center as outliers
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 1217
## Sample Correlation of bmi and weight: r = 0.787
##
## Hypothesis Test of 0 Correlation: t = 44.452, df = 1215, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.765 to 0.807
##
#Thực hành ## Phân tích tương quan ### Việc 1. Đọc dữ liệu “Bone data.csv” vào R và gọi dữ liệu là “df”
df=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Bone data.csv")
library(lessR)
ScatterPlot(weight,fnbmd,data=df,fit="loess")
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, fnbmd, enhance=TRUE) # many options
## Plot(weight, fnbmd, fill="skyblue") # interior fill color of points
## Plot(weight, fnbmd, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
## Loess Model MSE = 0.0156
##
ob=read.csv("D:/Learning/CME/R Statistic 2025/DỮ LIỆU THỰC HÀNH (TS Thạch gửi)/Obesity data.csv")
fit4=lm(pcfat~gender,data=ob)
summary(fit4)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <0.0000000000000002 ***
## genderM -10.5163 0.3381 -31.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 0.00000000000000022
#genderM: chữ M là biến so sánh (Female sẽ là biến tham chiếu), intercept 34.67 chính là pcfat của biến tham chiếu - nữ
####Kiểm tra giả định
#cách 1:
par(mfrow=c(2,2)) #để chia màn hình thành 4 ô cho dễ thấy tất cả 4 hình biểu đồ
plot(fit4)
#hình 1: giá trị tuyệt đối: residual dao động từ -20 -> 10
#hình 3: giá trị dao động bao nhiêu độ lệch chuẩn
#hình Q-Q: các giá trị nằm trên đường gạch thì mô hình tốt
#hình Residual vs Leverage
#hình như ví dụ là mô hình tốt
#Kết luận: thực tế chỉ dùng biểu đồ để kiểm định mô hình đạt chuẩn hay ko
#Cách 2
library(lessR)
fit4.2=reg(pcfat~gender,data=ob); summary(fit4.2)
##
## >>> gender is not numeric. Converted to indicator variables.
## Length Class Mode
## out_suggest 1 out character
## call 3 -none- call
## formula 3 formula call
## vars 2 -none- character
## out_title_bck 1 out character
## out_background 7 out character
## out_title_basic 1 out character
## out_estimates 3 out character
## out_fit 7 out character
## out_anova 6 out character
## out_title_mod 1 out character
## out_mod 1 out character
## out_mdls 1 out character
## out_title_kfold 1 out character
## out_kfold 1 out character
## out_title_rel 1 out character
## out_cor 3 out character
## out_collinear 1 out character
## out_subsets 1 out character
## out_title_res 1 out character
## out_residuals 25 out character
## out_title_pred 1 out character
## out_predict 11 out character
## out_ref 1 out character
## out_Rmd 1 out character
## out_Word 1 out character
## out_pdf 1 out character
## out_odt 1 out character
## out_rtf 1 out character
## out_plots 4 out character
## n.vars 1 -none- numeric
## n.obs 1 -none- numeric
## n.keep 1 -none- numeric
## coefficients 2 -none- numeric
## sterrs 2 -none- numeric
## tvalues 2 -none- numeric
## pvalues 2 -none- numeric
## cilb 2 -none- numeric
## ciub 2 -none- numeric
## anova_model 5 -none- numeric
## anova_residual 3 -none- numeric
## anova_total 3 -none- numeric
## se 1 -none- numeric
## resid_range 1 -none- numeric
## Rsq 1 -none- numeric
## Rsqadj 1 -none- numeric
## PRESS 1 -none- numeric
## RsqPRESS 1 -none- numeric
## m_se 1 -none- logical
## m_MSE 1 -none- logical
## m_Rsq 1 -none- logical
## cor 4 -none- numeric
## tolerances 1 -none- logical
## vif 1 -none- logical
## resid.max 20 -none- numeric
## pred_min_max 2 -none- numeric
## residuals 1217 -none- numeric
## fitted 1217 -none- numeric
## cooks.distance 1217 -none- numeric
## model 2 data.frame list
## terms 3 terms call
#R2: tính trực tiếp từ các lỗi(khác biệt giữa giá trị tính từ mô hình so với thực tế), nếu chỉ có 1 biến số thì bằng adjusted R2
#Adjusted R2: tính từ trung bình các lỗi, nên mô hình có nhiều biến số thì nên dùng Adjusted R2
fit4.2
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## reg(pcfat ~ gender, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: genderM
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 34.672413 0.182622 189.859 0.000 34.314123 35.030703
## genderM -10.516344 0.338131 -31.101 0.000 -11.179729 -9.852959
##
## Standard deviation of pcfat: 7.182861
##
## Standard deviation of residuals: 5.361759 for df=1215
## 95% range of residuals: 21.038669 = 2 * (1.962 * 5.361759)
##
## R-squared: 0.443 Adjusted R-squared: 0.443 PRESS R-squared: 0.441
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 967.297 df: 1 and 1215 p-value: 0.000
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 27808.311497 27808.311497 967.297285 0.000
## Residuals 1215 34929.384159 28.748464
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat genderM
## pcfat 1.00 -0.67
## genderM -0.67 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ---------------------------------------------------------------------------
## genderM pcfat fitted resid rstdnt dffits cooks
## 210 1 9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020
## 509 1 39.000000 24.156069 14.843931 2.780055 0.147758 0.010860
## 179 1 38.700000 24.156069 14.543931 2.723523 0.144754 0.010420
## 518 1 9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300
## 200 1 9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150
## 563 1 38.300000 24.156069 14.143931 2.648179 0.140749 0.009860
## 318 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 972 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 388 1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920
## 203 1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530
## 1137 0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150
## 893 0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070
## 688 1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020
## 403 1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640
## 858 1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400
## 158 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 1106 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 827 1 36.000000 24.156069 11.843931 2.215637 0.117760 0.006910
## 756 1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810
## 196 1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## genderM pcfat pred s_pred pi.lwr pi.upr width
## 2 1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## 5 1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## ...
## 1209 1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280
## 1 0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 3 0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869
## ...
## 1215 0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1216 0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1217 0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
#fitted: giá trị tính toán theo mô hình dự đoán
#residual:sự khác biệt của giá trị dự đoán với thực tế
#dffits và cooks: đánh giá mức độ ảnh hưởng của biến số lên mô hình
ScatterPlot(weight,pcfat,data=ob,fit="loess")
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE) # many options
## Plot(weight, pcfat, color="red") # exterior edge color of points
## Plot(weight, pcfat, out_cut=.10) # label top 10% from center as outliers
##
## Loess Model MSE = 49.325299
##
fit5=reg(pcfat~weight,data=ob)
fit5
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## reg(pcfat ~ weight, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: weight
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 29.222947 1.223696 23.881 0.000 26.822156 31.623738
## weight 0.043193 0.021875 1.975 0.049 0.000276 0.086111
##
## Standard deviation of pcfat: 7.182861
##
## Standard deviation of residuals: 7.174315 for df=1215
## 95% range of residuals: 28.150843 = 2 * (1.962 * 7.174315)
##
## R-squared: 0.003 Adjusted R-squared: 0.002 PRESS R-squared: 0.000
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 3.899 df: 1 and 1215 p-value: 0.049
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 200.669670 200.669670 3.898709 0.049
## Residuals 1215 62537.025985 51.470803
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat weight
## pcfat 1.00 0.06
## weight 0.06 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ----------------------------------------------------------------------------
## weight pcfat fitted resid rstdnt dffits cooks
## 373 85 47.400000 32.894372 14.505628 2.033775 0.194997 0.018960
## 923 95 43.500000 33.326305 10.173695 1.429871 0.179944 0.016180
## 972 42 10.300000 31.037063 -20.737063 -2.902805 -0.143205 0.010190
## 716 74 17.600000 32.419246 -14.819246 -2.072679 -0.133434 0.008880
## 858 42 11.900000 31.037063 -19.137063 -2.677456 -0.132088 0.008680
## 177 38 15.700000 30.864290 -15.164290 -2.120502 -0.126644 0.008000
## 1071 70 48.100000 32.246474 15.853526 2.216504 0.118990 0.007060
## 943 73 18.700000 32.376053 -13.676053 -1.911957 -0.117867 0.006930
## 876 34 18.800000 30.691517 -11.891517 -1.662860 -0.117617 0.006910
## 245 35 18.400000 30.734710 -12.334710 -1.724650 -0.117167 0.006850
## 762 80 22.600000 32.678406 -10.078406 -1.409997 -0.114628 0.006560
## 88 68 15.300000 32.160087 -16.860087 -2.357246 -0.114610 0.006540
## 200 47 9.800000 31.253029 -21.453029 -3.002259 -0.113942 0.006450
## 688 46 11.400000 31.209836 -19.809836 -2.771011 -0.110895 0.006120
## 184 39 17.200000 30.907483 -13.707483 -1.915842 -0.109309 0.005960
## 388 47 10.700000 31.253029 -20.553029 -2.875431 -0.109129 0.005920
## 895 65 13.600000 32.030507 -18.430507 -2.577138 -0.107125 0.005710
## 276 40 16.900000 30.950676 -14.050676 -1.963672 -0.106882 0.005700
## 528 39 17.600000 30.907483 -13.307483 -1.859774 -0.106110 0.005620
## 441 72 20.000000 32.332860 -12.332860 -1.723410 -0.101599 0.005150
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## weight pcfat pred s_pred pi.lwr pi.upr width
## 876 34 18.800000 30.691517 7.192151 16.581104 44.801929 28.220825
## 55 35 27.100000 30.734710 7.190777 16.626993 44.842427 28.215435
## 245 35 18.400000 30.734710 7.190777 16.626993 44.842427 28.215435
## ...
## 1211 54 34.100000 31.555382 7.177306 17.474093 45.636670 28.162577
## 20 55 19.300000 31.598575 7.177263 17.517370 45.679779 28.162409
## 27 55 37.200000 31.598575 7.177263 17.517370 45.679779 28.162409
## ...
## 402 93 32.300000 33.239918 7.224879 19.065295 47.414541 28.349246
## 628 95 32.900000 33.326305 7.230024 19.141587 47.511022 28.369435
## 891 95 30.100000 33.326305 7.230024 19.141587 47.511022 28.369435
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
####Kiểm định
fit5.1=lm(pcfat~weight,data=ob);summary(fit5.1)
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <0.0000000000000002 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
library(ggfortify)
## Loading required package: ggplot2
autoplot(fit5.1)
###Việc 6 Đánh giá mối liên quan độc lập giữa cân nặng (weight) và tỉ
trọng mỡ (pcfat)
fit6=lm(pcfat~weight+age+gender+height,data=ob); summary(fit6)
##
## Call:
## lm(formula = pcfat ~ weight + age + gender + height, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.208 -2.543 0.019 2.582 15.706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.368722 3.505431 13.798 < 0.0000000000000002 ***
## weight 0.439169 0.015594 28.163 < 0.0000000000000002 ***
## age 0.056166 0.007404 7.585 0.0000000000000658 ***
## genderM -11.483254 0.344343 -33.348 < 0.0000000000000002 ***
## height -0.257013 0.023768 -10.813 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.974 on 1212 degrees of freedom
## Multiple R-squared: 0.695, Adjusted R-squared: 0.694
## F-statistic: 690.4 on 4 and 1212 DF, p-value: < 0.00000000000000022
autoplot(fit6)
# mô hình hiệu chỉnh: tất cả các biến số đều được đưa vào để xác định các yếu tố ảnh hưởng như thế nào
# dùng mô hình hồi quy tuyến tính đa biến để đánh giá liên quan độc lập của các biến số
#F stastistic: so sánh mô hình hiện tại với mô hình cơ bản, không có biến x (tức là so sánh với giá trị trung bình)
###Việc 7 Xây dựng mô hình dự báo tỉ trọng mỡ (pcfat)
# mô hình dự báo: tất cả các biến số dự báo đầu phải có ý nghĩa thống kê
#gọi package BMA
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
#định nghĩa biến phụ thuộc yvar và biến tiên lượng xvar
yvar=ob[,c("pcfat")]
xvar=ob[,c("gender","height","weight","bmi","age")]
#dùng hàm bicreg
m.bma=bicreg(xvar,yvar,strict=FALSE,OR=20)
summary(m.bma)
##
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.95773 -0.79279 8.13735
## genderM 100.0 -11.25139 0.429659 -11.44430 -11.42764 -10.80625
## height 31.4 0.01759 0.028494 . 0.05598 .
## weight 39.2 0.03102 0.042611 0.07921 . .
## bmi 100.0 1.01265 0.111625 0.89419 1.08852 1.08936
## age 100.0 0.05259 0.008048 0.05497 0.05473 0.04715
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1423.06312 -1422.62198 -1422.49027
## post prob 0.392 0.314 0.294
imageplot.bma(m.bma)
#p!=0: giải thích các biến số xuất hiện trong bao nhiêu mô hình (VD: gender xuất hiện trong 100% mô hình, weight xuất hiện trong 39.2%)
#bic: chênh lệch 2 đơn vị thì tuong đương nhau
#có quyền chọn 1 trong 5 mô hình tối ưu nhất
#chạy mô hình tuyến tính theo model tối ưu
fit7=lm(pcfat~gender+weight+age+bmi,data=ob)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
##
## bc, recode, sp
vif(fit7) #vif(m.bma)
## gender weight age bmi
## 1.878778 5.609651 1.263468 4.663425
##vif=1: ko có, <5: có thể, 5-10: trung bình (thận trọng), >10: bắt buộc làm lại mô hình
####Kiểm định tầm quan trọng
m.bma=lm(pcfat~gender + age + weight + bmi, data = ob);summary(m.bma)
##
## Call:
## lm(formula = pcfat ~ gender + age + weight + bmi, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.225 -2.557 0.033 2.608 15.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.957732 0.852500 9.335 < 0.0000000000000002 ***
## genderM -11.444303 0.342565 -33.408 < 0.0000000000000002 ***
## age 0.054966 0.007395 7.433 0.000000000000199 ***
## weight 0.079207 0.028620 2.768 0.00573 **
## bmi 0.894194 0.080297 11.136 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.963 on 1212 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6956
## F-statistic: 695.7 on 4 and 1212 DF, p-value: < 0.00000000000000022
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
ob$sex = ifelse(ob$gender == "F", 1, 0)
m.bma2 = lm(pcfat ~ sex + age + weight + bmi, data = ob)
calc.relimp(m.bma2, type = "lmg", rela = TRUE, rank = TRUE)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 4 Regressors:
## sex age weight bmi
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## sex 0.59317775
## age 0.06893066
## weight 0.09175463
## bmi 0.24613695
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## sex 10.51634414 11.71834412 11.80453842 11.44430262
## age 0.12768705 0.10445197 0.05168496 0.05496623
## weight 0.04319324 -0.05539405 -0.06907993 0.07920690
## bmi 1.03619023 1.50631405 1.54278433 0.89419395
#biến nhị phân: dùng random forrest
#Hồi quy logistic: 1 biến số cần tối thiểu 20 mẫu #Gói chuyên biệt giải quyết cỡ mẫu: pmsamplesize