Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ (2-6/1/2025)

Ngày 3: Hồi qui tuyến tính

Phân tích tương quan

Việc 1. Đọc dữ liệu vào R

df = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Bone data.csv")

Việc 2. Đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi

2.1 Vẽ biểu đồ đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi

library(ggplot2)
p = ggplot(data = df, aes(x = weight, y = fnbmd))
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 41 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 41 rows containing missing values (`geom_point()`).

2.2 Phân tích đánh giá tương quan giữa cân nặng và mật độ cổ xương đùi

library(lessR)
## Warning: package 'lessR' was built under R version 4.3.3
## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
Correlation(weight, fnbmd, data = df)
## Correlation Analysis for Variables weight and fnbmd 
##   
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 2121 
## Number of cases (rows of data) deleted: 41 
## 
## Sample Covariance: s = 1.269 
##  
## Sample Correlation: r = 0.581 
## 
## Hypothesis Test of 0 Correlation:  t = 32.882,  df = 2119,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.552 to 0.609

Việc 3. Đánh giá mối tương quan đa biến

vars = df[, c("sex", "age", "weight", "height", "fnbmd")]
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(data = vars, mapping = aes(color = sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Việc 4. Đọc dữ liệu vào R

ob = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Obesity data.csv")

Việc 5. So sánh tỉ trọng mỡ giữa nam và nữ

5.1 Vẽ biểu đồ hộp so sánh tỉ trọng mỡ giữa nam và nữ

p = ggplot(data = ob, aes(x = gender, y = pcfat, col = gender))
p1 = p + geom_boxplot() 
p1

5.2 Sử dụng kiểm định t

library(lessR)
ttest(pcfat ~ gender, data = ob)
## 
## Compare pcfat across gender with levels F and M 
## Grouping Variable:  gender
## Response Variable:  pcfat
## 
## 
## ------ Describe ------
## 
## pcfat for gender F:  n.miss = 0,  n = 862,  mean = 34.672,  sd = 5.187
## pcfat for gender M:  n.miss = 0,  n = 355,  mean = 24.156,  sd = 5.764
## 
## Mean Difference of pcfat:  10.516
## 
## Weighted Average Standard Deviation:   5.362 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
## 
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test:  F = 33.223/26.909 = 1.235,  df = 354;861,  p-value = 0.016
## Levene's test, Brown-Forsythe:  t = -2.232,  df = 1215,  p-value = 0.026
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## t-cutoff for 95% range of variation: tcut =  1.962 
## Standard Error of Mean Difference: SE =  0.338 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 31.101,  df = 1215,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.663
## 95% Confidence Interval for Mean Difference:  9.853 to 11.180
## 
## 
## --- Do not assume equal population variances of pcfat for each gender 
## 
## t-cutoff: tcut =  1.964 
## Standard Error of Mean Difference: SE =  0.353 
## 
## Hypothesis Test of 0 Mean Diff:  t = 29.768,  df = 602.015, p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.694
## 95% Confidence Interval for Mean Difference:  9.823 to 11.210
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## Standardized Mean Difference of pcfat, Cohen's d:  1.961
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867

5.3 Sử dụng mô hình hồi qui tuyến tính

m.1 = lm(pcfat ~ gender, data = ob)
summary(m.1)
## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9 <0.0000000000000002 ***
## genderM     -10.5163     0.3381   -31.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 0.00000000000000022

Dùng gói lessR

library(lessR)
m.2 = reg(pcfat ~ gender, data = ob)
## 
## >>>  gender is not numeric. Converted to indicator variables.

m.2
## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## reg(pcfat ~ gender, data=ob, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  ob 
##  
## Response Variable: pcfat 
## Predictor Variable: genderM 
##  
## Number of cases (rows) of data:  1217 
## Number of cases retained for analysis:  1217 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept)  34.672413   0.182622  189.859    0.000    34.314123    35.030703 
##     genderM -10.516344   0.338131  -31.101    0.000   -11.179729    -9.852959 
## 
## Standard deviation of pcfat: 7.182862 
##  
## Standard deviation of residuals:  5.361759 for df=1215 
## 95% range of residuals:  21.038669 = 2 * (1.962 * 5.361759) 
##  
## R-squared: 0.443    Adjusted R-squared: 0.443    PRESS R-squared: 0.441 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 967.297     df: 1 and 1215     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##                df        Sum Sq       Mean Sq     F-value   p-value 
## Model           1  27808.311497  27808.311497  967.297285     0.000 
## Residuals    1215  34929.384159     28.748464 
## pcfat        1216  62737.695656     51.593500 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           pcfat genderM 
##     pcfat  1.00   -0.67 
##   genderM -0.67    1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"] 
## --------------------------------------------------------------------------- 
##         genderM     pcfat    fitted      resid    rstdnt    dffits    cooks 
##    210        1  9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020 
##    509        1 39.000000 24.156069  14.843931  2.780055  0.147758 0.010860 
##    179        1 38.700000 24.156069  14.543931  2.723523  0.144754 0.010420 
##    518        1  9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300 
##    200        1  9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150 
##    563        1 38.300000 24.156069  14.143931  2.648179  0.140749 0.009860 
##    318        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    972        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    388        1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920 
##    203        1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530 
##   1137        0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150 
##    893        0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070 
##    688        1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020 
##    403        1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640 
##    858        1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400 
##    158        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##   1106        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##    827        1 36.000000 24.156069  11.843931  2.215637  0.117760 0.006910 
##    756        1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810 
##    196        1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##         genderM     pcfat      pred   s_pred    pi.lwr    pi.upr     width 
##      2        1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      5        1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
## ... 
##   1209        1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      1        0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##      3        0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869 
## ... 
##   1215        0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1216        0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1217        0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------

Kiểm tra giả định

Cách đơn giản

plot(m.1)

Dùng gói ggfortify

library(ggfortify)
autoplot(m.1)

Việc 6. Đánh giá mối liên quan giữa cân nặng và tỉ trọng mỡ

6.1 Vẽ biểu đồ tán xạ

p = ggplot(data = ob, aes(x = weight, y = pcfat))
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

6.2 Sử dụng mô hình hồi qui tuyến tính

m.3 = lm(pcfat ~ weight, data = ob)
summary(m.3)
## 
## Call:
## lm(formula = pcfat ~ weight, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3122  -4.5234   0.8902   5.2695  16.9742 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.22295    1.22370  23.881 <0.0000000000000002 ***
## weight       0.04319    0.02188   1.975              0.0485 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared:  0.003199,   Adjusted R-squared:  0.002378 
## F-statistic: 3.899 on 1 and 1215 DF,  p-value: 0.04855
autoplot(m.3)

Việc 7. Đánh giá mối liên quan độc lập giữa cân nặng và tỉ trọng mỡ

m.4 = lm(pcfat ~ weight + age + gender + height, data = ob)
summary(m.4)
## 
## Call:
## lm(formula = pcfat ~ weight + age + gender + height, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.208  -2.543   0.019   2.582  15.706 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  48.368722   3.505431  13.798 < 0.0000000000000002 ***
## weight        0.439169   0.015594  28.163 < 0.0000000000000002 ***
## age           0.056166   0.007404   7.585   0.0000000000000658 ***
## genderM     -11.483254   0.344343 -33.348 < 0.0000000000000002 ***
## height       -0.257013   0.023768 -10.813 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.974 on 1212 degrees of freedom
## Multiple R-squared:  0.695,  Adjusted R-squared:  0.694 
## F-statistic: 690.4 on 4 and 1212 DF,  p-value: < 0.00000000000000022
autoplot(m.4)

Việc 8. Xây dựng mô hình dự báo tỉ trọng mỡ

8.1 Sử dụng phương pháp BMA

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
yvar = ob[, c("pcfat")]
xvar = ob[, c("gender", "age", "height", "weight", "bmi")]
m.bma = bicreg(xvar, yvar, strict = FALSE, OR = 20)
summary(m.bma)
## 
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   3  models were selected
##  Best  3  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1      model 2      model 3    
## Intercept  100.0    5.26146  4.582901      7.95773     -0.79279      8.13735
## genderM    100.0  -11.25139  0.429659    -11.44430    -11.42764    -10.80625
## age        100.0    0.05259  0.008048      0.05497      0.05473      0.04715
## height      31.4    0.01759  0.028494        .          0.05598        .    
## weight      39.2    0.03102  0.042611      0.07921        .            .    
## bmi        100.0    1.01265  0.111625      0.89419      1.08852      1.08936
##                                                                             
## nVar                                         4            4            3    
## r2                                         0.697        0.696        0.695  
## BIC                                    -1423.06312  -1422.62198  -1422.49027
## post prob                                  0.392        0.314        0.294
imageplot.bma(m.bma)

8.2 Kiểm tra giả định của mô hình

m.bma = lm(pcfat ~ gender + age + weight + bmi, data = ob)
summary(m.bma)
## 
## Call:
## lm(formula = pcfat ~ gender + age + weight + bmi, data = ob)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.225  -2.557   0.033   2.608  15.646 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   7.957732   0.852500   9.335 < 0.0000000000000002 ***
## genderM     -11.444303   0.342565 -33.408 < 0.0000000000000002 ***
## age           0.054966   0.007395   7.433    0.000000000000199 ***
## weight        0.079207   0.028620   2.768              0.00573 ** 
## bmi           0.894194   0.080297  11.136 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.963 on 1212 degrees of freedom
## Multiple R-squared:  0.6966, Adjusted R-squared:  0.6956 
## F-statistic: 695.7 on 4 and 1212 DF,  p-value: < 0.00000000000000022
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
## 
##     bc, recode, sp
vif(m.bma)
##   gender      age   weight      bmi 
## 1.878778 1.263468 5.609651 4.663425
autoplot(m.bma)

8.3 Viết phương trình

8.4 Xác định tầm quan trọng

library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
ob$sex = ifelse(ob$gender == "F", 1, 0)
m.bma2 = lm(pcfat ~ sex + age + weight + bmi, data = ob)
calc.relimp(m.bma2, type = "lmg", rela = TRUE, rank = TRUE)
## Response variable: pcfat 
## Total response variance: 51.5935 
## Analysis based on 1217 observations 
## 
## 4 Regressors: 
## sex age weight bmi 
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##               lmg
## sex    0.59317775
## age    0.06893066
## weight 0.09175463
## bmi    0.24613695
## 
## Average coefficients for different model sizes: 
## 
##                 1X         2Xs         3Xs         4Xs
## sex    10.51634414 11.71834412 11.80453842 11.44430262
## age     0.12768705  0.10445197  0.05168496  0.05496623
## weight  0.04319324 -0.05539405 -0.06907993  0.07920690
## bmi     1.03619023  1.50631405  1.54278433  0.89419395

Việc 9. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs