Linear regression model

1: Use “Salaries”

# Use dataset Salaries in Library "carData"
# Ctrl + Alt + I
# Use: library(readxl); library(ggplot2); library(caret); library(rms); library(GGally)

library(carData)
## Warning: package 'carData' was built under R version 4.2.2
data("Salaries")

2: Preliminary assessment of the relationship between variables by package “GGally”

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
head(Salaries)
##        rank discipline yrs.since.phd yrs.service  sex salary
## 1      Prof          B            19          18 Male 139750
## 2      Prof          B            20          16 Male 173200
## 3  AsstProf          B             4           3 Male  79750
## 4      Prof          B            45          39 Male 115000
## 5      Prof          B            40          41 Male 141500
## 6 AssocProf          B             6           6 Male  97000
colnames(Salaries)
## [1] "rank"          "discipline"    "yrs.since.phd" "yrs.service"  
## [5] "sex"           "salary"
vars = Salaries[, c("rank", "discipline","yrs.since.phd", "yrs.service", "sex", "salary")]

ggpairs(data = Salaries, aes(color = sex) )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3: Plot a histogram of wages (Salary).

library(ggplot2)
str(Salaries)
## 'data.frame':    397 obs. of  6 variables:
##  $ rank         : Factor w/ 3 levels "AsstProf","AssocProf",..: 3 3 1 3 3 2 3 3 3 3 ...
##  $ discipline   : Factor w/ 2 levels "A","B": 2 2 2 2 2 2 2 2 2 2 ...
##  $ yrs.since.phd: int  19 20 4 45 40 6 30 45 21 18 ...
##  $ yrs.service  : int  18 16 3 39 41 6 23 45 20 18 ...
##  $ sex          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 1 ...
##  $ salary       : int  139750 173200 79750 115000 141500 97000 175000 147765 119250 129000 ...
ggplot(data = Salaries, aes(x = salary)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = Salaries, mapping = aes(x = salary)) + geom_histogram(fill = "blue", col = "white") + labs(title = "Distribution of Professor's Salary", x = "Salary (US dollars)", y = "Number of Participants")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

4: How about the relationship between gender and salary?

4.1 Boxplot of y = Salary và x = Sex.

ggplot(Salaries, aes(group = sex, x = sex, y = salary, fill = sex, color = sex)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = sex))+theme(legend.position = "none")+ labs(x = "Gender", y = "Salary (dollars)")

library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.2
ggplot(Salaries, aes(group = sex, x = sex, y = salary, fill = sex, color = sex)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = sex))+theme_economist()+ labs(x = "Gender", y = "Salary (dollars)")

## 4.2 Linear Regression model: Salary= α + β×Sex

m1 = lm(salary ~ sex, data = Salaries)
summary(m1)
## 
## Call:
## lm(formula = salary ~ sex, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57290 -23502  -6828  19710 116455 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   101002       4809  21.001  < 2e-16 ***
## sexMale        14088       5065   2.782  0.00567 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared:  0.01921,    Adjusted R-squared:  0.01673 
## F-statistic: 7.738 on 1 and 395 DF,  p-value: 0.005667

4.3 Check the assumption of above model.

par(mfrow = c(2, 2))

plot(m1)

5: How about the relationship between (yrs.since.phd) and (Salary)

5.1 Scatterplot for y = Salary and x = yrs.since.phd

ggplot(Salaries, aes(y = salary, x = yrs.since.phd)) + geom_point() 

ggplot(Salaries, aes(y = salary, x = yrs.since.phd)) + geom_point() + geom_smooth(method = "loess")
## `geom_smooth()` using formula 'y ~ x'

ggplot(Salaries, aes(y = salary, x = yrs.since.phd)) + geom_point() + geom_smooth(method = "gam") + theme_economist() + labs(title = "Scatter plot of Years since PhD and Salary", x = "The number of years after PhD", y = "Salary (US dollars)")
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'

5.2 Linear Regression model: Salary= α + β × yrs.since.phd

m2 = lm(salary ~ yrs.since.phd, data = Salaries)
summary(m2)
## 
## Call:
## lm(formula = salary ~ yrs.since.phd, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84171 -19432  -2858  16086 102383 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    91718.7     2765.8  33.162   <2e-16 ***
## yrs.since.phd    985.3      107.4   9.177   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared:  0.1758, Adjusted R-squared:  0.1737 
## F-statistic: 84.23 on 1 and 395 DF,  p-value: < 2.2e-16

5.3 Check the assumpts of this above model.

par(mfrow = c(2, 2))

plot(m2)

6: The relationship between (Sex) and (Yrs.since.phd) with (Salary)

6.1 Linear Regression model: Salary = α + β×Sex + γ×Yrs.since.phd

m3 = lm(salary ~ sex + yrs.since.phd, data = Salaries)
summary(m3)
## 
## Call:
## lm(formula = salary ~ sex + yrs.since.phd, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -84167 -19735  -2551  15427 102033 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    85181.8     4748.3  17.939   <2e-16 ***
## sexMale         7923.6     4684.1   1.692   0.0915 .  
## yrs.since.phd    958.1      108.3   8.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared:  0.1817, Adjusted R-squared:  0.1775 
## F-statistic: 43.74 on 2 and 394 DF,  p-value: < 2.2e-16

6.2 Check the assumptions.

par(mfrow = c(2, 2))

plot(m3)

6.3 Interpret these above information.

6.4 Compare β-coefficents between 4.2 và 5.1.

7: Interaction between (Sex) and (Yrs.service) in the relationship with (Salary)

7.1 Linear Regression model: Salary= α + β×Sex + γ×Yrs.service + δ×Sex×Yrs.service

m4 = lm(salary ~ sex + yrs.service + sex*yrs.service, data = Salaries)
summary(m4)
## 
## Call:
## lm(formula = salary ~ sex + yrs.service + sex * yrs.service, 
##     data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80381 -20258  -3727  16353 102536 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          82068.5     7568.7  10.843  < 2e-16 ***
## sexMale              20128.6     7991.1   2.519  0.01217 *  
## yrs.service           1637.3      523.0   3.130  0.00188 ** 
## sexMale:yrs.service   -931.7      535.2  -1.741  0.08251 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28420 on 393 degrees of freedom
## Multiple R-squared:  0.1266, Adjusted R-squared:  0.1199 
## F-statistic: 18.98 on 3 and 393 DF,  p-value: 1.622e-11
ggplot(Salaries, aes(y = salary, x = yrs.service, color = sex)) + geom_point() + stat_smooth(method = "lm", se = TRUE) + theme_economist() 
## `geom_smooth()` using formula 'y ~ x'

7.2 Interpret the results of model in item 7.1.

8: Find the optimal model for salary prediction.

library(BMA)
## Warning: package 'BMA' was built under R version 4.2.2
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.2.1
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-0)
yvar = Salaries[, 6]
xvars = Salaries[, -6]
bma = bicreg(xvars, yvar)
summary(bma)
## 
## Call:
## bicreg(x = xvars, y = yvar)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##                p!=0    EV        SD       model 1    model 2    model 3  
## Intercept      100.0  71631.552  3429.93  71944.33   68223.53   72253.53 
## rankAssocProf  100.0  13760.698  3984.90  13761.54   13723.42   14483.23 
## rankProf       100.0  47809.009  3259.03  47843.84   47403.32   49377.50 
## disciplineB    100.0  13759.250  2300.72  13760.96   13708.69   13561.43 
## yrs.since.phd    4.8      3.481    31.86      .          .          .    
## yrs.service      5.2     -3.983    30.56      .          .        -76.33 
## sexMale          8.1    365.573  1649.61      .       4491.80       .    
##                                                                          
## nVar                                            3          4          4  
## r2                                            0.445      0.447      0.446
## BIC                                        -215.78    -211.17    -210.28 
## post prob                                     0.818      0.081      0.052
##                model 4  
## Intercept      71405.40 
## rankAssocProf  13030.16 
## rankProf       46211.57 
## disciplineB    14028.68 
## yrs.since.phd     71.92 
## yrs.service        .    
## sexMale            .    
##                         
## nVar                 4  
## r2                 0.445
## BIC             -210.13 
## post prob          0.048
imageplot.bma(bma)

Việc 9: Đánh giá mô hình tiên lượng

m5 = lm(salary ~ rank + discipline + yrs.since.phd + yrs.service + sex, data = Salaries)
summary(m5)
## 
## Call:
## lm(formula = salary ~ rank + discipline + yrs.since.phd + yrs.service + 
##     sex, data = Salaries)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65248 -13211  -1775  10384  99592 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    65955.2     4588.6  14.374  < 2e-16 ***
## rankAssocProf  12907.6     4145.3   3.114  0.00198 ** 
## rankProf       45066.0     4237.5  10.635  < 2e-16 ***
## disciplineB    14417.6     2342.9   6.154 1.88e-09 ***
## yrs.since.phd    535.1      241.0   2.220  0.02698 *  
## yrs.service     -489.5      211.9  -2.310  0.02143 *  
## sexMale         4783.5     3858.7   1.240  0.21584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared:  0.4547, Adjusted R-squared:  0.4463 
## F-statistic:  54.2 on 6 and 390 DF,  p-value: < 2.2e-16
library(relaimpo)
## Warning: package 'relaimpo' was built under R version 4.2.2
## Loading required package: MASS
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
## 
##     salinity
## The following object is masked from 'package:survival':
## 
##     aml
## Loading required package: survey
## Warning: package 'survey' was built under R version 4.2.2
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## Warning: package 'mitools' was built under R version 4.2.2
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
calc.relimp(m5, type = "lmg", rela = TRUE, rank = FALSE)
## Response variable: salary 
## Total response variance: 917425865 
## Analysis based on 397 observations 
## 
## 6 Regressors: 
## Some regressors combined in groups: 
##         Group  rank : rankAssocProf rankProf 
## 
##  Relative importance of 5 (groups of) regressors assessed: 
##  rank discipline yrs.since.phd yrs.service sex 
##  
## Proportion of variance explained by model: 45.47%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##                      lmg
## rank          0.61582062
## discipline    0.10246615
## yrs.since.phd 0.17005888
## yrs.service   0.09521401
## sex           0.01644033
## 
## Average coefficients for different model sizes: 
## 
##                   1group    2groups    3groups    4groups    5groups
## rankAssocProf 13100.4524 13842.5917 13959.7320 13595.6740 12907.5879
## rankProf      45996.1239 47613.0413 47802.7848 46854.9747 45065.9987
## discipline     9480.2635 13044.5687 14406.6157 14560.3549 14417.6256
## yrs.since.phd   985.3421   889.7866   784.1488   666.4088   535.0583
## yrs.service     779.5691   205.7876  -196.1370  -427.3471  -489.5157
## sex           14088.0087  8991.8762  6490.6444  5541.2511  4783.4928

9.1 Use package “caret” for model assessment

library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
sample = createDataPartition(y = Salaries$salary, p = 0.6, list = FALSE)
develop = Salaries[sample,]
test = Salaries[-sample,]

m6 = train(salary ~ rank + discipline + sex, data = develop, method = "lm", metric = "Rsquared")
summary(m6)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65615 -13249  -1351  10849  98371 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      69428       6380  10.882  < 2e-16 ***
## rankAssocProf    14694       5605   2.621  0.00933 ** 
## rankProf         47382       4374  10.834  < 2e-16 ***
## disciplineB      15128       3059   4.945 1.45e-06 ***
## sexMale           1237       5517   0.224  0.82284    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23420 on 235 degrees of freedom
## Multiple R-squared:  0.4141, Adjusted R-squared:  0.4041 
## F-statistic: 41.52 on 4 and 235 DF,  p-value: < 2.2e-16
m6.pred = predict(m6, test)
test.data = data.frame(obs = test$salary, pred = m6.pred)
defaultSummary(test.data)
##         RMSE     Rsquared          MAE 
## 2.155489e+04 4.952666e-01 1.611562e+04

9.2 Use package “rms” for model assessment.