df = read.csv("C:\\Thach\\VN trips\\2025_3Jun\\VNUHCM\\Datasets\\Bone data.csv")
PROMPT: Dùng gói lệnh ‘lessR’ vẽ biểu đồ tương quan giữa cân nặng (weight) và mật độ xương tại cổ xương đùi (fnbmd) cùng với đường biểu diễn mối liên quan thật sự của cân nặng và mật độ xương
library(lessR)
## Warning: package 'lessR' was built under R version 4.3.3
##
## lessR 4.3.9 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
Plot(weight, fnbmd,
fit = "loess",
xlab = "Cân nặng (kg)",
ylab = "Mật độ xương cổ xương đùi (g/cm²)",
main = "Quan hệ giữa cân nặng và mật độ xương",
data = df)
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, fnbmd, enhance=TRUE) # many options
## Plot(weight, fnbmd, fill="skyblue") # interior fill color of points
## Plot(weight, fnbmd, out_cut=.10) # label top 10% from center as outliers
##
## Fit: Mean Squared Error, MSE = 0.016
##
PROMPT: Dùng gói lệnh ‘lessR’ thực hiện phân tích tương quan đánh giá mối liên quan giữa cân nặng (weight) và mật độ xương tại cổ xương đùi (fnbmd).
Correlation(weight, fnbmd, data = df)
## Correlation Analysis for Variables weight and fnbmd
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 2121
## Number of cases (rows of data) deleted: 41
##
## Sample Covariance: s = 1.269
##
## Sample Correlation: r = 0.581
##
## Hypothesis Test of 0 Correlation: t = 32.882, df = 2119, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.552 to 0.609
ob = read.csv("C:\\Thach\\VN trips\\2025_3Jun\\VNUHCM\\Datasets\\Obesity data.csv")
PROMPT: Dùng gói lệnh ‘lessR’ vẽ biểu đồ histogram đánh giá phân bố của tỉ trọng mỡ (pcfat), tô màu xanh và đặt tên trục x là ‘Tỉ trọng mỡ (%)’, trục y là “Số người’ và tựa là ‘Phân bố tỉ trọng mỡ’
library(lessR)
Histogram(pcfat,
data = ob,
fill = "blue",
xlab = "Tỉ trọng mỡ (%)",
ylab = "Số người",
main = "Phân bố tỉ trọng mỡ")
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(pcfat, density=TRUE) # smoothed curve + histogram
## Plot(pcfat) # Violin/Box/Scatterplot (VBS) plot
##
## --- pcfat ---
##
## n miss mean sd min mdn max
## 1217 0 31.604786 7.182862 9.200000 32.400000 48.400000
##
##
## --- Outliers --- from the box plot: 10
##
## Small Large
## ----- -----
## 9.2
## 9.7
## 9.8
## 10.3
## 10.3
## 10.7
## 11.0
## 11.4
## 11.7
## 11.9
##
##
## Bin Width: 5
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -------------------------------------------------
## 5 > 10 7.5 3 0.00 3 0.00
## 10 > 15 12.5 26 0.02 29 0.02
## 15 > 20 17.5 61 0.05 90 0.07
## 20 > 25 22.5 128 0.11 218 0.18
## 25 > 30 27.5 244 0.20 462 0.38
## 30 > 35 32.5 338 0.28 800 0.66
## 35 > 40 37.5 294 0.24 1094 0.90
## 40 > 45 42.5 107 0.09 1201 0.99
## 45 > 50 47.5 16 0.01 1217 1.00
PROMPT: Sử dụng kiểm định t trong gói lệnh ‘lessR’ để so sánh tỉ trọng mỡ (pcfat) giữa nam và nữ (gender).
ttest(pcfat ~ gender, data = ob)
##
## Compare pcfat across gender with levels F and M
## Grouping Variable: gender
## Response Variable: pcfat
##
##
## ------ Describe ------
##
## pcfat for gender F: n.miss = 0, n = 862, mean = 34.672, sd = 5.187
## pcfat for gender M: n.miss = 0, n = 355, mean = 24.156, sd = 5.764
##
## Mean Difference of pcfat: 10.516
##
## Weighted Average Standard Deviation: 5.362
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
##
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test: F = 33.223/26.909 = 1.235, df = 354;861, p-value = 0.016
## Levene's test, Brown-Forsythe: t = -2.232, df = 1215, p-value = 0.026
##
##
## ------ Infer ------
##
## --- Assume equal population variances of pcfat for each gender
##
## t-cutoff for 95% range of variation: tcut = 1.962
## Standard Error of Mean Difference: SE = 0.338
##
## Hypothesis Test of 0 Mean Diff: t-value = 31.101, df = 1215, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.663
## 95% Confidence Interval for Mean Difference: 9.853 to 11.180
##
##
## --- Do not assume equal population variances of pcfat for each gender
##
## t-cutoff: tcut = 1.964
## Standard Error of Mean Difference: SE = 0.353
##
## Hypothesis Test of 0 Mean Diff: t = 29.768, df = 602.015, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.694
## 95% Confidence Interval for Mean Difference: 9.823 to 11.210
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of pcfat for each gender
##
## Standardized Mean Difference of pcfat, Cohen's d: 1.961
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867
PROMPT 1: Xây dựng mô hình hổi qui tuyến tính để so sánh tỉ trọng mỡ (pcfat) giữa nam và nữ (gender)
# Cách 1:
model <- lm(pcfat ~ gender, data = ob)
summary(model)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <0.0000000000000002 ***
## genderM -10.5163 0.3381 -31.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 0.00000000000000022
# Cách 2:
library(lessR)
Regression(pcfat ~ gender, data = ob)
##
## >>> gender is not numeric. Converted to indicator variables.
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## Regression(my_formula=pcfat ~ gender, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: genderM
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 34.672413 0.182622 189.859 0.000 34.314123 35.030703
## genderM -10.516344 0.338131 -31.101 0.000 -11.179729 -9.852959
##
## Standard deviation of pcfat: 7.182862
##
## Standard deviation of residuals: 5.361759 for df=1215
## 95% range of residuals: 21.038669 = 2 * (1.962 * 5.361759)
##
## R-squared: 0.443 Adjusted R-squared: 0.443 PRESS R-squared: 0.441
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 967.297 df: 1 and 1215 p-value: 0.000
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 27808.311497 27808.311497 967.297285 0.000
## Residuals 1215 34929.384159 28.748464
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat genderM
## pcfat 1.00 -0.67
## genderM -0.67 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ---------------------------------------------------------------------------
## genderM pcfat fitted resid rstdnt dffits cooks
## 210 1 9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020
## 509 1 39.000000 24.156069 14.843931 2.780055 0.147758 0.010860
## 179 1 38.700000 24.156069 14.543931 2.723523 0.144754 0.010420
## 518 1 9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300
## 200 1 9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150
## 563 1 38.300000 24.156069 14.143931 2.648179 0.140749 0.009860
## 318 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 972 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 388 1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920
## 203 1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530
## 1137 0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150
## 893 0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070
## 688 1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020
## 403 1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640
## 858 1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400
## 158 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 1106 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 827 1 36.000000 24.156069 11.843931 2.215637 0.117760 0.006910
## 756 1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810
## 196 1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## genderM pcfat pred s_pred pi.lwr pi.upr width
## 2 1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## 5 1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## ...
## 1209 1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280
## 1 0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 3 0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869
## ...
## 1215 0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1216 0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1217 0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
# Cách 1:
par(mfrow = c(2, 2))
plot(model)
# Cách 2:
Regression(pcfat ~ gender, data = ob, graphics = TRUE)
##
## >>> gender is not numeric. Converted to indicator variables.
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## Regression(my_formula=pcfat ~ gender, data=ob, graphics=TRUE, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: genderM
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 34.672413 0.182622 189.859 0.000 34.314123 35.030703
## genderM -10.516344 0.338131 -31.101 0.000 -11.179729 -9.852959
##
## Standard deviation of pcfat: 7.182862
##
## Standard deviation of residuals: 5.361759 for df=1215
## 95% range of residuals: 21.038669 = 2 * (1.962 * 5.361759)
##
## R-squared: 0.443 Adjusted R-squared: 0.443 PRESS R-squared: 0.441
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 967.297 df: 1 and 1215 p-value: 0.000
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 27808.311497 27808.311497 967.297285 0.000
## Residuals 1215 34929.384159 28.748464
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat genderM
## pcfat 1.00 -0.67
## genderM -0.67 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ---------------------------------------------------------------------------
## genderM pcfat fitted resid rstdnt dffits cooks
## 210 1 9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020
## 509 1 39.000000 24.156069 14.843931 2.780055 0.147758 0.010860
## 179 1 38.700000 24.156069 14.543931 2.723523 0.144754 0.010420
## 518 1 9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300
## 200 1 9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150
## 563 1 38.300000 24.156069 14.143931 2.648179 0.140749 0.009860
## 318 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 972 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 388 1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920
## 203 1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530
## 1137 0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150
## 893 0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070
## 688 1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020
## 403 1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640
## 858 1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400
## 158 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 1106 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 827 1 36.000000 24.156069 11.843931 2.215637 0.117760 0.006910
## 756 1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810
## 196 1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## genderM pcfat pred s_pred pi.lwr pi.upr width
## 2 1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## 5 1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## ...
## 1209 1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280
## 1 0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 3 0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869
## ...
## 1215 0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1216 0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1217 0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
PROMPT: Vẽ biểu đồ đánh giá mối liên quan giữa cân nặng (weight) và tỉ trọng mỡ (pcfat) có đường biểu diễn mối liên quan thật sự
# Cách 1:
library(lessR)
Plot(pcfat, weight, data = ob, fit = "lm",
main = "Mối liên hệ giữa cân nặng và tỉ trọng mỡ",
xlab = "Cân nặng (kg)",
ylab = "Tỉ trọng mỡ (%)")
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(pcfat, weight, enhance=TRUE) # many options
## Plot(pcfat, weight, color="red") # exterior edge color of points
## Plot(pcfat, weight, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 1217
## Sample Correlation of pcfat and weight: r = 0.057
##
## Hypothesis Test of 0 Correlation: t = 1.975, df = 1215, p-value = 0.049
## 95% Confidence Interval for Correlation: 0.000 to 0.112
##
##
## Line: b0 = 52.80 b1 = 0.07 Fit: MSE = 88.243 Rsq = 0.003
##
# Cách 2:
library(ggplot2)
ggplot(ob, aes(x = weight, y = pcfat)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red", se = TRUE) +
labs(title = "Mối liên hệ giữa cân nặng và tỉ trọng mỡ",
x = "Cân nặng (kg)",
y = "Tỉ trọng mỡ (%)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
PROMPT: Sử dụng mô hình tuyến tính đánh giá mối liên quan giữa cân nặng (weight) và tỉ trọng mỡ (pcfat)
# Cách 1:
model <- lm(pcfat ~ weight, data = ob)
par(mfrow = c(2, 2))
plot(model)
summary(model)
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <0.0000000000000002 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
# Cách 2:
Regression(pcfat ~ weight, data = ob)
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## Regression(my_formula=pcfat ~ weight, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: weight
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 29.222947 1.223696 23.881 0.000 26.822156 31.623738
## weight 0.043193 0.021875 1.975 0.049 0.000276 0.086111
##
## Standard deviation of pcfat: 7.182862
##
## Standard deviation of residuals: 7.174316 for df=1215
## 95% range of residuals: 28.150843 = 2 * (1.962 * 7.174316)
##
## R-squared: 0.003 Adjusted R-squared: 0.002 PRESS R-squared: 0.000
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 3.899 df: 1 and 1215 p-value: 0.049
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 200.669670 200.669670 3.898709 0.049
## Residuals 1215 62537.025985 51.470803
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat weight
## pcfat 1.00 0.06
## weight 0.06 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ----------------------------------------------------------------------------
## weight pcfat fitted resid rstdnt dffits cooks
## 373 85 47.400000 32.894372 14.505628 2.033775 0.194997 0.018960
## 923 95 43.500000 33.326305 10.173695 1.429871 0.179944 0.016180
## 972 42 10.300000 31.037063 -20.737063 -2.902805 -0.143205 0.010190
## 716 74 17.600000 32.419246 -14.819246 -2.072679 -0.133434 0.008880
## 858 42 11.900000 31.037063 -19.137063 -2.677456 -0.132088 0.008680
## 177 38 15.700000 30.864290 -15.164290 -2.120502 -0.126644 0.008000
## 1071 70 48.100000 32.246474 15.853526 2.216504 0.118990 0.007060
## 943 73 18.700000 32.376053 -13.676053 -1.911957 -0.117867 0.006930
## 876 34 18.800000 30.691517 -11.891517 -1.662860 -0.117617 0.006910
## 245 35 18.400000 30.734710 -12.334710 -1.724650 -0.117167 0.006850
## 762 80 22.600000 32.678406 -10.078406 -1.409997 -0.114628 0.006560
## 88 68 15.300000 32.160087 -16.860087 -2.357246 -0.114610 0.006540
## 200 47 9.800000 31.253029 -21.453029 -3.002259 -0.113942 0.006450
## 688 46 11.400000 31.209836 -19.809836 -2.771011 -0.110895 0.006120
## 184 39 17.200000 30.907483 -13.707483 -1.915842 -0.109309 0.005960
## 388 47 10.700000 31.253029 -20.553029 -2.875431 -0.109129 0.005920
## 895 65 13.600000 32.030507 -18.430507 -2.577138 -0.107125 0.005710
## 276 40 16.900000 30.950676 -14.050676 -1.963672 -0.106882 0.005700
## 528 39 17.600000 30.907483 -13.307483 -1.859774 -0.106110 0.005620
## 441 72 20.000000 32.332860 -12.332860 -1.723410 -0.101599 0.005150
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## weight pcfat pred s_pred pi.lwr pi.upr width
## 876 34 18.800000 30.691517 7.192151 16.581104 44.801929 28.220825
## 55 35 27.100000 30.734710 7.190777 16.626993 44.842427 28.215435
## 245 35 18.400000 30.734710 7.190777 16.626993 44.842427 28.215435
## ...
## 1211 54 34.100000 31.555382 7.177306 17.474093 45.636670 28.162577
## 20 55 19.300000 31.598575 7.177263 17.517370 45.679779 28.162409
## 27 55 37.200000 31.598575 7.177263 17.517370 45.679779 28.162409
## ...
## 402 93 32.300000 33.239918 7.224879 19.065295 47.414541 28.349246
## 628 95 32.900000 33.326305 7.230024 19.141587 47.511022 28.369435
## 891 95 30.100000 33.326305 7.230024 19.141587 47.511022 28.369435
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
PROMPT: Qua y văn bạn xác định các yếu tố có thể gây nhiễu (confounder) mối liên quan giữa cân nặng và tỉ trong mỡ là giới tính, tuổi, và chiều cao. Hãy xây dựng mô hình đa biến đánh giá mối liên quan độc lập giữa cân nặng và tỉ trọng mỡ sau khi hiệu chỉnh cho các yếu tố gây nhiễu.
# Cách 1:
model_adj <- lm(pcfat ~ weight + gender + age + height, data = ob)
par(mfrow = c(2, 2))
plot(model_adj)
summary(model_adj)
##
## Call:
## lm(formula = pcfat ~ weight + gender + age + height, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.208 -2.543 0.019 2.582 15.706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.368722 3.505431 13.798 < 0.0000000000000002 ***
## weight 0.439169 0.015594 28.163 < 0.0000000000000002 ***
## genderM -11.483254 0.344343 -33.348 < 0.0000000000000002 ***
## age 0.056166 0.007404 7.585 0.0000000000000658 ***
## height -0.257013 0.023768 -10.813 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.974 on 1212 degrees of freedom
## Multiple R-squared: 0.695, Adjusted R-squared: 0.694
## F-statistic: 690.4 on 4 and 1212 DF, p-value: < 0.00000000000000022
# Cách 2:
Regression(pcfat ~ weight + gender + age + height, data = ob)
##
## >>> gender is not numeric. Converted to indicator variables.
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## Regression(my_formula=pcfat ~ weight + gender + age + height, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable 1: weight
## Predictor Variable 2: genderM
## Predictor Variable 3: age
## Predictor Variable 4: height
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 48.368722 3.505431 13.798 0.000 41.491335 55.246110
## weight 0.439169 0.015594 28.163 0.000 0.408576 0.469763
## genderM -11.483254 0.344343 -33.348 0.000 -12.158828 -10.807679
## age 0.056166 0.007404 7.585 0.000 0.041639 0.070693
## height -0.257013 0.023768 -10.813 0.000 -0.303644 -0.210382
##
## Standard deviation of pcfat: 7.182862
##
## Standard deviation of residuals: 3.973577 for df=1212
## 95% range of residuals: 15.591705 = 2 * (1.962 * 3.973577)
##
## R-squared: 0.695 Adjusted R-squared: 0.694 PRESS R-squared: 0.692
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 690.357 df: 4 and 1212 p-value: 0.000
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## weight 1 200.669670 200.669670 12.709209 0.000
## genderM 1 38576.550161 38576.550161 2443.206529 0.000
## age 1 2977.577719 2977.577719 188.581853 0.000
## height 1 1846.251961 1846.251961 116.930488 0.000
##
## Model 4 43601.049512 10900.262378 690.357020 0.000
## Residuals 1212 19136.646144 15.789312
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat weight genderM age height
## pcfat 1.00 0.06 -0.67 0.31 -0.48
## weight 0.06 1.00 0.47 -0.05 0.60
## genderM -0.67 0.47 1.00 -0.13 0.67
## age 0.31 -0.05 -0.13 1.00 -0.37
## height -0.48 0.60 0.67 -0.37 1.00
##
## Tolerance VIF
## weight 0.604 1.656
## genderM 0.530 1.888
## age 0.794 1.260
## height 0.361 2.769
##
## weight genderM age height R2adj X's
## 1 1 1 1 0.694 4
## 1 1 0 1 0.680 3
## 1 1 1 0 0.665 3
## 1 1 0 0 0.617 2
## 0 1 1 1 0.494 3
## 0 1 1 0 0.492 2
## 0 1 0 1 0.444 2
## 0 1 0 0 0.443 1
## 1 0 1 1 0.414 3
## 1 0 0 1 0.413 2
## 0 0 1 1 0.248 2
## 0 0 0 1 0.230 1
## 1 0 1 0 0.098 2
## 0 0 1 0 0.094 1
## 1 0 0 0 0.002 1
##
## [based on Thomas Lumley's leaps function from the leaps package]
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ----------------------------------------------------------------------------------------------------------
## weight genderM age height pcfat fitted resid rstdnt dffits cooks
## 563 50 1 48 150 38.300000 22.987969 15.312031 3.892904 0.365059 0.026350
## 923 95 0 57 160 43.500000 52.169213 -8.669213 -2.212032 -0.347600 0.024090
## 1000 51 1 43 148 35.200000 23.660335 11.539665 2.929329 0.308975 0.018970
## 509 67 1 13 167 39.000000 24.118827 14.881173 3.777390 0.300716 0.017890
## 1106 49 1 67 150 36.300000 23.615951 12.684049 3.218255 0.299627 0.017820
## 562 85 0 21 167 34.800000 43.956458 -9.156458 -2.327441 -0.298503 0.017760
## 377 76 1 40 148 26.900000 34.471075 -7.571075 -1.929208 -0.291924 0.017010
## 893 53 0 18 163 14.700000 30.762588 -16.062588 -4.078127 -0.283131 0.015830
## 245 35 0 80 145 18.400000 30.966052 -12.566052 -3.186550 -0.279949 0.015560
## 49 45 1 77 156 32.100000 20.878854 11.221146 2.845738 0.278598 0.015430
## 876 34 0 76 141 18.800000 31.330271 -12.530271 -3.177361 -0.278691 0.015420
## 1008 54 0 82 165 25.500000 34.282346 -8.782346 -2.228558 -0.257750 0.013240
## 316 62 1 19 176 32.100000 19.946858 12.153142 3.079313 0.250519 0.012460
## 1137 50 0 55 158 14.600000 32.808281 -18.208281 -4.627167 -0.243679 0.011680
## 269 52 1 36 156 34.000000 21.650241 12.349759 3.128488 0.241383 0.011570
## 16 70 1 49 150 24.300000 31.827525 -7.527525 -1.909655 -0.225639 0.010160
## 179 75 1 23 168 38.700000 27.936828 10.763172 2.724895 0.222502 0.009850
## 891 95 1 41 172 30.100000 36.703152 -6.603152 -1.676741 -0.215937 0.009310
## 135 52 1 72 160 33.200000 22.644159 10.555841 2.671843 0.215079 0.009210
## 264 55 1 78 154 35.600000 25.840740 9.759260 2.470316 0.212774 0.009020
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## weight genderM age height pcfat pred s_pred pi.lwr pi.upr width
## 177 38 1 55 162 15.700000 15.026941 3.996117 7.186868 22.867015 15.680148
## 186 52 1 22 175 15.400000 15.980674 3.991308 8.150033 23.811315 15.661281
## 331 45 1 50 169 17.500000 16.021208 3.993505 8.186259 23.856158 15.669899
## ...
## 734 55 0 47 158 36.900000 34.554801 3.977017 26.752200 42.357403 15.605203
## 348 52 0 48 153 33.600000 34.578523 3.975890 26.778132 42.378915 15.600783
## 164 50 0 50 150 31.900000 34.583554 3.976421 26.782122 42.384987 15.602865
## ...
## 373 85 0 61 153 47.400000 49.801272 4.007267 41.939322 57.663223 15.723901
## 923 95 0 57 160 43.500000 52.169213 4.021169 44.279988 60.058439 15.778452
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
pcfat = 48.4 + 0.4weight + 0.06age - 11.5genderM - 0.3height
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
# Xác định các biến độc lập
X <- ob[, c("gender", "height", "weight", "bmi", "age")]
# Biến phụ thuộc là pcfat
Y <- ob$pcfat
# Chạy mô hình BMA
bma_model <- bicreg(x = X, y = Y)
# Tóm tắt kết quả
summary(bma_model)
##
## Call:
## bicreg(x = X, y = Y)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.95773 -0.79279 8.13735
## genderM 100.0 -11.25139 0.429659 -11.44430 -11.42764 -10.80625
## height 31.4 0.01759 0.028494 . 0.05598 .
## weight 39.2 0.03102 0.042611 0.07921 . .
## bmi 100.0 1.01265 0.111625 0.89419 1.08852 1.08936
## age 100.0 0.05259 0.008048 0.05497 0.05473 0.04715
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1423.06312 -1422.62198 -1422.49027
## post prob 0.392 0.314 0.294
m.bma = lm(pcfat ~ gender + age + weight + bmi, data = ob)
par(mfrow = c(2, 2))
plot(m.bma)
summary(m.bma)
##
## Call:
## lm(formula = pcfat ~ gender + age + weight + bmi, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.225 -2.557 0.033 2.608 15.646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.957732 0.852500 9.335 < 0.0000000000000002 ***
## genderM -11.444303 0.342565 -33.408 < 0.0000000000000002 ***
## age 0.054966 0.007395 7.433 0.000000000000199 ***
## weight 0.079207 0.028620 2.768 0.00573 **
## bmi 0.894194 0.080297 11.136 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.963 on 1212 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6956
## F-statistic: 695.7 on 4 and 1212 DF, p-value: < 0.00000000000000022
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
##
## bc, recode, sp
vif(m.bma)
## gender age weight bmi
## 1.878778 1.263468 5.609651 4.663425
pcfat = 8.0 - 11.4genderM + 0.05age + 0.08weight + 0.89bmi
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
ob$sex = ifelse(ob$gender == "F", 1, 0)
model <- lm(pcfat ~ sex + age + weight + bmi, data = ob)
relimp_result <- calc.relimp(model, type = "lmg", rela = TRUE)
print(relimp_result)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 4 Regressors:
## sex age weight bmi
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## sex 0.59317775
## age 0.06893066
## weight 0.09175463
## bmi 0.24613695
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## sex 10.51634414 11.71834412 11.80453842 11.44430262
## age 0.12768705 0.10445197 0.05168496 0.05496623
## weight 0.04319324 -0.05539405 -0.06907993 0.07920690
## bmi 1.03619023 1.50631405 1.54278433 0.89419395
# Biểu đồ thanh thể hiện tầm quan trọng tương đối
boot <- boot.relimp(model, type = "lmg", b = 1000, rela = TRUE)
plot(booteval.relimp(boot, level = 0.95))