Ngày 2: Hồi qui tuyến tính

Việc 1. Đọc dữ liệu vào R

library(carData)
data(Salaries)
dim(Salaries)
## [1] 397   6
head(Salaries)
##        rank discipline yrs.since.phd yrs.service  sex salary
## 1      Prof          B            19          18 Male 139750
## 2      Prof          B            20          16 Male 173200
## 3  AsstProf          B             4           3 Male  79750
## 4      Prof          B            45          39 Male 115000
## 5      Prof          B            40          41 Male 141500
## 6 AssocProf          B             6           6 Male  97000

Việc 2. Đánh giá mối liên quan giữa giới tính và lương GS

2.1 Đánh giá phân bố của lương GS

library(ggplot2)
ggplot(data = Salaries, aes(x = salary)) + geom_histogram(fill = "blue", col = "white") + labs(x = "Lương (USD)", y = "Số người", title = "Phân bố lương GS (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 Vẽ biểu đồ hộp thể hiện mối liên quan giữa giới tính và lương GS

p = ggplot(data = Salaries, aes(x = sex, y = salary, col = sex))
p + geom_boxplot() 

2.3 Sử dụng mô hình hồi qui tuyến tính

m.1 = lm(salary ~ sex, data = Salaries)
summary(m.1)
## 
## Call:
## lm(formula = salary ~ sex, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## sexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667

Kiểm tra giả định của mô hình

par(mfrow = c(2,2))
plot(m.1)

# Dùng gói ggfortify:
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.2
autoplot(m.1)

Việc 3. Đánh giá mối tương quan đa biến

vars = Salaries[, c("rank", "discipline", "yrs.since.phd", "yrs.service", "sex", "salary")]
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(data = vars, mapping = aes(color = sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Việc 4. Đánh giá mối liên quan độc lập giữa giới tính và lương GS

4.1 Mô hình hồi qui tuyến tính

m.2 = lm(salary ~ sex + discipline + yrs.service, data = Salaries)
summary(m.2)
## 
## Call:
## lm(formula = salary ~ sex + discipline + yrs.service, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77424 -19404  -4635  15539 105391 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  84361.1     4941.4  17.072  < 2e-16 ***
## sexMale       8423.3     4744.5   1.775   0.0766 .  
## disciplineB  13033.8     2840.3   4.589 6.01e-06 ***
## yrs.service    832.2      110.2   7.550 3.07e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27790 on 393 degrees of freedom
## Multiple R-squared:  0.1646, Adjusted R-squared:  0.1582 
## F-statistic: 25.81 on 3 and 393 DF,  p-value: 2.928e-15

4.2 Kiểm tra giả định mô hình

autoplot((m.2))

Việc 5. Xây dựng mô hình dự báo lương GS

5.1 Xây dựng mô hình bằng phương pháp BMA

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
yvar = Salaries[, c("salary")]
xvar = Salaries[, c("sex", "rank", "discipline", "yrs.since.phd", "yrs.service")]
m.bma = bicreg(xvar, yvar, strict = FALSE, OR = 20)
summary(m.bma)
## 
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##                p!=0    EV        SD       model 1    model 2    model 3  
## Intercept      100.0  71631.552  3429.93  71944.33   68223.53   72253.53 
## sexMale          8.1    365.573  1649.61      .       4491.80       .    
## rankAssocProf  100.0  13760.698  3984.90  13761.54   13723.42   14483.23 
## rankProf       100.0  47809.009  3259.03  47843.84   47403.32   49377.50 
## disciplineB    100.0  13759.250  2300.72  13760.96   13708.69   13561.43 
## yrs.since.phd    4.8      3.481    31.86      .          .          .    
## yrs.service      5.2     -3.983    30.56      .          .        -76.33 
##                                                                          
## nVar                                            3          4          4  
## r2                                            0.445      0.447      0.446
## BIC                                        -215.78    -211.17    -210.28 
## post prob                                     0.818      0.081      0.052
##                model 4  
## Intercept      71405.40 
## sexMale            .    
## rankAssocProf  13030.16 
## rankProf       46211.57 
## disciplineB    14028.68 
## yrs.since.phd     71.92 
## yrs.service        .    
##                         
## nVar                 4  
## r2                 0.445
## BIC             -210.13 
## post prob          0.048
imageplot.bma(m.bma)

5.2 Mô hình dự báo lương GS

m.3 = lm(salary ~ rank + discipline, data = Salaries)
summary(m.3)
## 
## Call:
## lm(formula = salary ~ rank + discipline, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65990 -14049  -1288  10760  97996 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      71944       3135  22.948  < 2e-16 ***
## rankAssocProf    13762       3961   3.475 0.000569 ***
## rankProf         47844       3112  15.376  < 2e-16 ***
## disciplineB      13761       2296   5.993 4.65e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared:  0.445,  Adjusted R-squared:  0.4407 
## F-statistic:   105 on 3 and 393 DF,  p-value: < 2.2e-16

5.3 Kiểm tra giả định của mô hình

autoplot(m.3)

Việc 6. Dữ liệu ‘Insurance’

6.1 Đọc dữ liệu

library(readxl)
df = read_excel("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Insurance data for reg analysis.xlsx")

6.2 Đặc điểm dữ liệu

dim(df)
## [1] 1338    8
head(df)
## # A tibble: 6 × 8
##      id   age sex      bmi children smoking region    charges
##   <dbl> <dbl> <chr>  <dbl>    <dbl> <chr>   <chr>       <dbl>
## 1     1    19 female  27.9        0 yes     southwest  16885.
## 2     2    18 male    33.8        1 no      southeast   1726.
## 3     3    28 male    33          3 no      southeast   4449.
## 4     4    33 male    22.7        0 no      northwest  21984.
## 5     5    32 male    28.9        0 no      northwest   3867.
## 6     6    31 female  25.7        0 no      southeast   3757.

6.3 Phân bố phí bảo hiểm

library(ggplot2)
ggplot(data = df, aes(x = charges)) + geom_histogram(fill = "blue", col = "white") + labs(x = "Phí bảo hiểm (USD)", y = "Số người", title = "Phân bố phí bảo hiểm (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Chuyển đổi dữ liệu

df$y = log(df$charges)
ggplot(data = df, aes(x = y)) + geom_histogram(fill = "blue", col = "white") + labs(x = "log(Phí bảo hiểm (USD))", y = "Số người", title = "Phân bố phí bảo hiểm (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Việc 7. Đánh giá mô hình dự báo phí bảo hiểm bằng pp tách dữ liệu

7.1 Tách dữ liệu

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
set.seed(1234)

index = createDataPartition(df$y, p = 0.7, list = FALSE)
training = df[index, ]
  dim(training)
## [1] 938   9
testing = df[-index, ]
  dim(testing)
## [1] 400   9

7.2 Huấn luyện mô hình

fit = lm(y ~ age + sex + bmi + children + smoking + region, data = training)
summary(fit)
## 
## Call:
## lm(formula = y ~ age + sex + bmi + children + smoking + region, 
##     data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08746 -0.21190 -0.05229  0.06475  2.15423 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.102467   0.088675  80.095  < 2e-16 ***
## age              0.033259   0.001067  31.181  < 2e-16 ***
## sexmale         -0.061977   0.029935  -2.070   0.0387 *  
## bmi              0.013598   0.002538   5.357 1.07e-07 ***
## children         0.105015   0.012314   8.528  < 2e-16 ***
## smokingyes       1.573470   0.037395  42.077  < 2e-16 ***
## regionnorthwest -0.083466   0.042530  -1.963   0.0500 *  
## regionsoutheast -0.214992   0.043499  -4.942 9.15e-07 ***
## regionsouthwest -0.177750   0.043724  -4.065 5.20e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4561 on 929 degrees of freedom
## Multiple R-squared:  0.7557, Adjusted R-squared:  0.7536 
## F-statistic: 359.1 on 8 and 929 DF,  p-value: < 2.2e-16

7.3 Đánh giá mô hình

testing$predicted = predict(fit, testing)
error = testing$predicted - testing$y

# RMSE:
sqrt(mean(error^2))
## [1] 0.4196563
# R2:
cor(testing$predicted, testing$y)^2
## [1] 0.7935273

Việc 8. Đánh giá mô hình dự báo phí bảo hiểm bằng pp k-fole cross-validation

fit.2 = train(form = y ~ age + sex + bmi + children + smoking + region, data = df, method = "lm", trContrl = trainControl(method = "cv", number = 10))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded
fit.2
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.4458389  0.7619611  0.2814355
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Việc 9. Đánh giá mô hình dự báo phí bảo hiểm bằng pp bootstrap

fit.3 = train(form = y ~ age + sex + bmi + children + smoking + region, data = df, method = "lm", trContrl = trainControl(method = "boot", number = 100))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'trContrl' will be disregarded
fit.3
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1338, 1338, 1338, 1338, 1338, 1338, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.4442467  0.767426  0.2822197
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Việc 10. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs (https://rpubs.com/ThachTran/1259866)

Back-up: Đánh giá tầm quan trọng của biến số

library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
df$gender = ifelse(df$sex == "female", 1, 0)
df$smoke = ifelse(df$smoking == "yes", 1, 0)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::select() masks MASS::select()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df = df %>% 
  mutate(area = case_when(region == "northeast" ~ 1,
                          region == "northwest" ~ 2,
                          region == "southeast" ~ 3,
                          region == "southwest" ~ 4)) 
m.rel = lm(y ~ age + gender + bmi + children + smoke + area, data = df)
calc.relimp(m.rel, type = "lmg", rela = TRUE, rank = TRUE)
## Response variable: y 
## Total response variance: 0.8455301 
## Analysis based on 1338 observations 
## 
## 6 Regressors: 
## age gender bmi children smoke area 
## Proportion of variance explained by model: 76.7%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##                  lmg
## age      0.362162950
## gender   0.001258041
## bmi      0.015751896
## children 0.028567714
## smoke    0.588736915
## area     0.003522485
## 
## Average coefficients for different model sizes: 
## 
##                   1X          2Xs         3Xs         4Xs         5Xs
## age       0.03454513  0.034587378  0.03462174  0.03464601  0.03465802
## gender   -0.01035399  0.007543188  0.02507011  0.04221895  0.05898003
## bmi       0.02000482  0.018466697  0.01692440  0.01537887  0.01383119
## children  0.12306432  0.118758126  0.11454432  0.11041611  0.10636778
## smoke     1.51587709  1.522850531  1.52978138  1.53662037  1.54332089
## area     -0.03552783 -0.039673654 -0.04294073 -0.04534090 -0.04688970
##                  6Xs
## age       0.03465562
## gender    0.07534136
## bmi       0.01228258
## children  0.10239500
## smoke     1.54983839
## area     -0.04760667