Day 3: Linear regression

(3.1) Việc 1: Đọc dữ liệu lương GS (Professiorial salary)

library(ggplot2)
library(rms)

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(readxl)

salary = read_excel("C:\\VN trips\\VN trip 4 (Dec 2022)\\VLU\\Regression analysis\\Datasets\\Professorial Salaries.xlsx")

head(salary)

## # A tibble: 6 x 7
##      ID Rank      Discipline Yrs.since.phd Yrs.service Sex   Salary
##   <dbl> <chr>     <chr>              <dbl>       <dbl> <chr>  <dbl>
## 1     1 Prof      B                     19          18 Male  139750
## 2     2 Prof      B                     20          16 Male  173200
## 3     3 AsstProf  B                      4           3 Male   79750
## 4     4 Prof      B                     45          39 Male  115000
## 5     5 Prof      B                     40          41 Male  141500
## 6     6 AssocProf B                      6           6 Male   97000

(3.2) Việc 2: Kiểm tra mối liên quan giữa các biến

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

vars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "Salary")]

ggpairs(data = vars, mapping = aes(color = Sex))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(3.3) Việc 3: Vẻ biểu đồ phân bố (histogram) của lương GS

ggplot(data = salary, aes(x = Salary)) + geom_histogram(fill = "blue", col = "white") + labs(title = "Distribution of Professiorial salary", x = "Salary (dollars)", y = "Number of participants")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## (3.4) Đánh giá mối liên quan giữa giới tính và lương GS

(3.4.1) Vẽ biểu đồ hộp

ggplot(salary, aes(group = Sex, x = Sex, y = Salary, fill = Sex, color = Sex)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = Sex))+theme(legend.position = "none")+ labs(x = "Sex", y = "Salary (dollars)")

### (3.4.2) Fit mô hình hồi qui giữa giới tính và lương GS

m1 = lm(Salary ~ Sex, data = salary)

summary(m1)

## 
## Call:
## lm(formula = Salary ~ Sex, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## SexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667

(3.4.3) Kiểm tra giả định mô hình

par(mfrow = c(2, 2))

plot(m1)

## (3.5) Đánh giá mối liên quan giữa thời gian sau tiến sĩ và lương GS

(3.5.1) Vẽ biểu đồ tán xạ

ggplot(data = salary, aes(x = Yrs.since.phd, y = Salary)) + geom_point() + geom_smooth(method="loess") + 
  labs(x = "Time since PhD (years)", y= "Salary (dollars)")

## `geom_smooth()` using formula 'y ~ x'

### (3.5.2) Fit mô hình hồi qui

m2 = lm(Salary ~ Yrs.since.phd, data = salary)

summary(m2)

## 
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84171 -19432  -2858  16086 102383 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    91718.7     2765.8  33.162   <2e-16 ***
## Yrs.since.phd    985.3      107.4   9.177   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1737 
## F-statistic: 84.23 on 1 and 395 DF,  p-value: < 2.2e-16

(3.5.3) Kiểm tra giả định mô hình

par(mfrow = c(2, 2))

plot(m2)

## (3.6) Đánh giá mối liên quan giữa giới tính, thời gian sau TS với lương GS

(3.6.1) Fit mô hình

m3 = lm(Salary ~ Sex + Yrs.since.phd, data = salary)

summary(m3)

## 
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84167 -19735  -2551  15427 102033 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    85181.8     4748.3  17.939   <2e-16 ***
## SexMale         7923.6     4684.1   1.692   0.0915 .  
## Yrs.since.phd    958.1      108.3   8.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared:  0.1817, Adjusted R-squared:  0.1775 
## F-statistic: 43.74 on 2 and 394 DF,  p-value: < 2.2e-16

(3.6.2) Kiểm tra giả định mô hình

par(mfrow = c(2, 2))

plot(m3)

## (3.7) Phân tích mô hình tương tác giữa giới tính, thời gian làm việc và lương GS

(3.7.1) Đánh giá mối liên quan giữa thời gian làm việc và lương GS theo giới tính

ggplot(salary, aes(y = Salary, x = Yrs.service, color = Sex)) + geom_point() + stat_smooth(method = "lm", se = TRUE)

## `geom_smooth()` using formula 'y ~ x'

(3.7.2) Fit mô hình

m4 = lm(Salary ~ Sex*Yrs.service, data = salary)

summary(m4)

## 
## Call:
## lm(formula = Salary ~ Sex * Yrs.service, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80381 -20258  -3727  16353 102536 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          82068.5     7568.7  10.843  < 2e-16 ***
## SexMale              20128.6     7991.1   2.519  0.01217 *  
## Yrs.service           1637.3      523.0   3.130  0.00188 ** 
## SexMale:Yrs.service   -931.7      535.2  -1.741  0.08251 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28420 on 393 degrees of freedom
## Multiple R-squared:  0.1266, Adjusted R-squared:  0.1199 
## F-statistic: 18.98 on 3 and 393 DF,  p-value: 1.622e-11

(3.7.2) Viết ra mô hình

Nữ (sex = 0): Lương = 82068.5 + 1637.3*Yrs.service

Nam (sex = 1): Lương = 82068.5 + 20128.6 + 1637.3Yrs.service - 931.7Yrs.service

(3.8) Tìm mô hình tối ưu dự báo lương GS bằng pp BMA

library(carData)
data("Salaries")
summary(Salaries)

##         rank     discipline yrs.since.phd    yrs.service        sex     
##  AsstProf : 67   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AssocProf: 64   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      salary      
##  Min.   : 57800  
##  1st Qu.: 91000  
##  Median :107300  
##  Mean   :113706  
##  3rd Qu.:134185  
##  Max.   :231545

dim(Salaries)

## [1] 397   6

library(BMA)

## Loading required package: leaps

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Loading required package: inline

## Loading required package: rrcov

## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)

yvar = Salaries[, 6]
xvars = Salaries[, -6]
bma = bicreg(xvars, yvar)
summary(bma)

## 
## Call:
## bicreg(x = xvars, y = yvar)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##                p!=0    EV        SD       model 1    model 2    model 3  
## Intercept      100.0  71631.552  3429.93  71944.33   68223.53   72253.53 
## rankAssocProf  100.0  13760.698  3984.90  13761.54   13723.42   14483.23 
## rankProf       100.0  47809.009  3259.03  47843.84   47403.32   49377.50 
## disciplineB    100.0  13759.250  2300.72  13760.96   13708.69   13561.43 
## yrs.since.phd    4.8      3.481    31.86      .          .          .    
## yrs.service      5.2     -3.983    30.56      .          .        -76.33 
## sexMale          8.1    365.573  1649.61      .       4491.80       .    
##                                                                          
## nVar                                            3          4          4  
## r2                                            0.445      0.447      0.446
## BIC                                        -215.78    -211.17    -210.28 
## post prob                                     0.818      0.081      0.052
##                model 4  
## Intercept      71405.40 
## rankAssocProf  13030.16 
## rankProf       46211.57 
## disciplineB    14028.68 
## yrs.since.phd     71.92 
## yrs.service        .    
## sexMale            .    
##                         
## nVar                 4  
## r2                 0.445
## BIC             -210.13 
## post prob          0.048

imageplot.bma(bma)

## (3.9) Đánh giá tầm quan trọng của biến số

m5 = lm(salary ~ rank + discipline + yrs.since.phd + yrs.service + sex, data = Salaries)
summary(m5)

## 
## Call:
## lm(formula = salary ~ rank + discipline + yrs.since.phd + yrs.service + 
##     sex, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    65955.2     4588.6  14.374  < 2e-16 ***
## rankAssocProf  12907.6     4145.3   3.114  0.00198 ** 
## rankProf       45066.0     4237.5  10.635  < 2e-16 ***
## disciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd    535.1      241.0   2.220  0.02698 *  
## yrs.service     -489.5      211.9  -2.310  0.02143 *  
## sexMale         4783.5     3858.7   1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16

library(relaimpo)

## Loading required package: MASS

## Loading required package: boot

## 
## Attaching package: 'boot'

## The following object is masked from 'package:robustbase':
## 
##     salinity

## The following object is masked from 'package:survival':
## 
##     aml

## The following object is masked from 'package:lattice':
## 
##     melanoma

## Loading required package: survey

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'survey'

## The following object is masked from 'package:rms':
## 
##     calibrate

## The following object is masked from 'package:Hmisc':
## 
##     deff

## The following object is masked from 'package:graphics':
## 
##     dotchart

## Loading required package: mitools

## This is the global version of package relaimpo.

## If you are a non-US user, a version with the interesting additional metric pmvd is available

## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.

calc.relimp(m5, type = "lmg", rela = TRUE, rank = FALSE)

## Response variable: salary 
## Total response variance: 917425865 
## Analysis based on 397 observations 
## 
## 6 Regressors: 
## Some regressors combined in groups: 
##         Group  rank : rankAssocProf rankProf 
## 
##  Relative importance of 5 (groups of) regressors assessed: 
##  rank discipline yrs.since.phd yrs.service sex 
##  
## Proportion of variance explained by model: 45.47%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##                      lmg
## rank          0.61582062
## discipline    0.10246615
## yrs.since.phd 0.17005888
## yrs.service   0.09521401
## sex           0.01644033
## 
## Average coefficients for different model sizes: 
## 
##                   1group    2groups    3groups    4groups    5groups
## rankAssocProf 13100.4524 13842.5917 13959.7320 13595.6740 12907.5879
## rankProf      45996.1239 47613.0413 47802.7848 46854.9747 45065.9987
## discipline     9480.2635 13044.5687 14406.6157 14560.3549 14417.6256
## yrs.since.phd   985.3421   889.7866   784.1488   666.4088   535.0583
## yrs.service     779.5691   205.7876  -196.1370  -427.3471  -489.5157
## sex           14088.0087  8991.8762  6490.6444  5541.2511  4783.4928

(3.10) Đánh giá mô hình

(3.10.1) Dùng package ‘caret’

library(caret)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

sample = createDataPartition(y = Salaries$salary, p = 0.6, list = FALSE)
develop = Salaries[sample,]
test = Salaries[-sample,]

m6 = train(salary ~ rank + discipline + sex, data = develop, method = "lm", metric = "Rsquared")
summary(m6)

## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67895 -14510  -1301   9707  96091 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      67254       6821   9.860  < 2e-16 ***
## rankAssocProf    13783       5451   2.529   0.0121 *  
## rankProf         48657       4560  10.671  < 2e-16 ***
## disciplineB      14736       3162   4.661 5.28e-06 ***
## sexMale           4806       5732   0.838   0.4026    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24130 on 235 degrees of freedom
## Multiple R-squared:  0.4142, Adjusted R-squared:  0.4042 
## F-statistic: 41.54 on 4 and 235 DF,  p-value: < 2.2e-16

m6.pred = predict(m6, test)
test.data = data.frame(obs = test$salary, pred = m6.pred)
defaultSummary(test.data)

##         RMSE     Rsquared          MAE 
## 2.027966e+04 5.086577e-01 1.563664e+04

(3.10.2) Dùng package ‘rms’

library(rms)
m7 = ols(salary ~ rank + discipline + sex, data = Salaries, x = TRUE, y = TRUE)
validate = validate(m7, B = 100)
validate

##             index.orig     training         test      optimism index.corrected
## R-square  4.469000e-01 4.475000e-01 4.424000e-01        0.0051    4.418000e-01
## MSE       5.061584e+08 5.092307e+08 5.102798e+08 -1049126.8786    5.072075e+08
## g         2.187650e+04 2.190876e+04 2.180504e+04      103.7251    2.177277e+04
## Intercept 0.000000e+00 0.000000e+00 3.159127e+02     -315.9127    3.159127e+02
## Slope     1.000000e+00 1.000000e+00 9.973000e-01        0.0027    9.973000e-01
##             n
## R-square  100
## MSE       100
## g         100
## Intercept 100
## Slope     100

Statistical analysis course- Van Lang University

Thach Tran

2022-12-09