Van Lang University Workshop on Machine Learning - Day 2: Linear regression

Việc 1. So sánh đặc điểm của mẫu nghiên cứu tiền lương

Đọc dữ liệu “Professorial Salaries” vào R

library(readxl)
salary = read_excel("C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\Professorial Salaries.xlsx")
head(salary)

## # A tibble: 6 × 7
##      ID Rank      Discipline Yrs.since.phd Yrs.service Sex   Salary
##   <dbl> <chr>     <chr>              <dbl>       <dbl> <chr>  <dbl>
## 1     1 Prof      B                     19          18 Male  139750
## 2     2 Prof      B                     20          16 Male  173200
## 3     3 AsstProf  B                      4           3 Male   79750
## 4     4 Prof      B                     45          39 Male  115000
## 5     5 Prof      B                     40          41 Male  141500
## 6     6 AssocProf B                      6           6 Male   97000

So sánh đặc điểm mẫu nghiên cứu

library(compareGroups)
createTable(compareGroups(Sex ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Salary, data = salary))

## 
## --------Summary descriptives table by 'Sex'---------
## 
## _____________________________________________________ 
##                   Female          Male      p.overall 
##                    N=39          N=358                
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Rank:                                         0.014   
##     AssocProf   10 (25.6%)     54 (15.1%)             
##     AsstProf    11 (28.2%)     56 (15.6%)             
##     Prof        18 (46.2%)    248 (69.3%)             
## Discipline:                                   1.000   
##     A           18 (46.2%)    163 (45.5%)             
##     B           21 (53.8%)    195 (54.5%)             
## Yrs.since.phd  16.5 (9.78)    22.9 (13.0)    <0.001   
## Yrs.service    11.6 (8.81)    18.3 (13.2)    <0.001   
## Salary        101002 (25952) 115090 (30437)   0.003   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 2. Đánh giá sơ bộ mối liên quan giữa các biến

library(GGally)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

vars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "Salary")]
ggpairs(data = vars, mapping = aes(color = Sex))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Việc 3. Đánh giá mối liên quan giữa giới tính và lương GS

3.1 Thực hiện mô hình hồi qui

m1 = lm(Salary ~ Sex, data = salary)
summary(m1)

## 
## Call:
## lm(formula = Salary ~ Sex, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## SexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667

3.2 Kiểm tra giả định của mô hình

par(mfrow = c(2, 2))
plot(m1)

3.3 Trình bày kết quả

3.4 Đánh giá khả năng tiên lượng của mô hình bằng phương pháp bootstrap

library(rms)

## Loading required package: Hmisc

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

m1.v = ols(Salary ~ Sex, data = salary, x = TRUE, y = TRUE)
validate = validate(m1.v, B = 100)
validate

##             index.orig     training          test      optimism index.corrected
## R-square  1.920000e-02 2.190000e-02  1.440000e-02        0.0075    1.170000e-02
## MSE       8.975331e+08 8.954530e+08  9.019175e+08 -6464537.1892    9.039976e+08
## g         2.502313e+03 2.559476e+03  2.502313e+03       57.1639    2.445149e+03
## Intercept 0.000000e+00 0.000000e+00 -1.290136e+04    12901.3608   -1.290136e+04
## Slope     1.000000e+00 1.000000e+00  1.113300e+00       -0.1133    1.113300e+00
##             n
## R-square  100
## MSE       100
## g         100
## Intercept 100
## Slope     100

Việc 4. Đánh giá mối liên quan giữa tiền lương với giới tính và thời gian sau tiến sĩ

4.1 Tại sao cần mô hình này?

Thời gian sau tiến sĩ rất khác nhau giữa nam và nữ GS nên có thể ảnh hưởng đến mối liến quan giữa lương và giới tính.

4.2 Thực hiện mô hình

m2 = lm(Salary ~ Sex + Yrs.since.phd, data = salary)
summary(m2)

## 
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84167 -19735  -2551  15427 102033 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    85181.8     4748.3  17.939   <2e-16 ***
## SexMale         7923.6     4684.1   1.692   0.0915 .  
## Yrs.since.phd    958.1      108.3   8.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared:  0.1817, Adjusted R-squared:  0.1775 
## F-statistic: 43.74 on 2 and 394 DF,  p-value: < 2.2e-16

4.3 Trình bày kết quả

4.4 Đánh giá khả năng tiên lượng bằng phương pháp bootstrap

m2.v = ols(Salary ~ Sex + Yrs.since.phd, data = salary, x = TRUE, y = TRUE)
validate = validate(m2.v, B = 100)
validate

##             index.orig     training         test   optimism index.corrected   n
## R-square  1.817000e-01 1.893000e-01 1.745000e-01     0.0148    1.669000e-01 100
## MSE       7.488405e+08 7.553878e+08 7.553888e+08 -1032.3432    7.488416e+08 100
## g         1.480181e+04 1.515046e+04 1.472677e+04   423.6947    1.437811e+04 100
## Intercept 0.000000e+00 0.000000e+00 1.870635e+03 -1870.6347    1.870635e+03 100
## Slope     1.000000e+00 1.000000e+00 9.805000e-01     0.0195    9.805000e-01 100

Việc 5. Đánh giá khả năng tiên lượng của mô hình

5.1 Sử dụng package ‘caret’

Chia dữ liệu thành 2 tập dữ liệu nhò

library(caret)

## Loading required package: lattice

set.seed(234)
sample = createDataPartition(y = salary$Salary, p = 0.6, list = FALSE)
train = salary[sample,]
dim(train)

## [1] 240   7

test = salary[-sample,]
dim(test)

## [1] 157   7

Thực hiện mô hình trên tập dữ liệu train

m3 = train(Salary ~ Sex + Rank + Discipline + Yrs.since.phd, data = train, method = "lm", metric = "Rsquared")
summary(m3)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67696 -14677   -500  11519  83456 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    83567.33    6683.26  12.504  < 2e-16 ***
## SexMale         3288.41    5301.46   0.620   0.5357    
## RankAsstProf  -13476.54    5378.20  -2.506   0.0129 *  
## RankProf       31859.51    4553.46   6.997 2.73e-11 ***
## DisciplineB    12978.84    3035.31   4.276 2.77e-05 ***
## Yrs.since.phd     77.42     167.46   0.462   0.6443    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22980 on 234 degrees of freedom
## Multiple R-squared:  0.4388, Adjusted R-squared:  0.4268 
## F-statistic: 36.59 on 5 and 234 DF,  p-value: < 2.2e-16

Kiểm tra mô hình trên tập dữ liệu test

m3.pred = predict(m3, test)
test.data = data.frame(obs = test$Salary, pred = m3.pred)
defaultSummary(test.data)

##         RMSE     Rsquared          MAE 
## 2.223733e+04 4.618031e-01 1.634904e+04

5.2 Bằng phương pháp bootstrap

m3.v = ols(Salary ~ Sex + Rank + Discipline + Yrs.since.phd, data = salary, x = TRUE, y = TRUE)
validate = validate(m3.v, B = 100)
validate

##             index.orig     training         test      optimism index.corrected
## R-square  4.472000e-01 4.511000e-01 4.409000e-01  1.020000e-02    4.370000e-01
## MSE       5.058598e+08 5.014979e+08 5.116585e+08 -1.016055e+07    5.160204e+08
## g         2.202228e+04 2.212248e+04 2.201399e+04  1.084880e+02    2.191379e+04
## Intercept 0.000000e+00 0.000000e+00 5.726126e+02 -5.726126e+02    5.726126e+02
## Slope     1.000000e+00 1.000000e+00 9.952000e-01  4.800000e-03    9.952000e-01
##             n
## R-square  100
## MSE       100
## g         100
## Intercept 100
## Slope     100

Việc 6. Đánh giá tầm quan trọng của biến số

plot(varImp(m3))

Việc 7. Bạn hãy ghi lại tất cả những hàm/lệnh trên trong RMarkdown và chia sẻ trên mạng rpubs.com/tài khoản của bạn (https://rpubs.com/ThachTran/1051712).

VLU Workshop on Machine Learning - Day 2

Thach Tran

2023-06-07