library(ggplot2)
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(readxl)
salary = read_excel("C:\\VN trips\\VN trip 4 (Dec 2022)\\VLU\\Regression analysis\\Datasets\\Professorial Salaries.xlsx")
head(salary)
## # A tibble: 6 x 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "Salary")]
ggpairs(data = vars, mapping = aes(color = Sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = salary, aes(x = Salary)) + geom_histogram(fill = "blue", col = "white") + labs(title = "Distribution of Professiorial salary", x = "Salary (dollars)", y = "Number of participants")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## (3.4) Đánh giá mối liên quan giữa giới tính và lương GS
ggplot(salary, aes(group = Sex, x = Sex, y = Salary, fill = Sex, color = Sex)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = Sex))+theme(legend.position = "none")+ labs(x = "Sex", y = "Salary (dollars)")
### (3.4.2) Fit mô hình hồi qui giữa giới tính và lương GS
m1 = lm(Salary ~ Sex, data = salary)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Sex, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## SexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
par(mfrow = c(2, 2))
plot(m1)
## (3.5) Đánh giá mối liên quan giữa thời gian sau tiến sĩ và lương
GS
ggplot(data = salary, aes(x = Yrs.since.phd, y = Salary)) + geom_point() + geom_smooth(method="loess") +
labs(x = "Time since PhD (years)", y= "Salary (dollars)")
## `geom_smooth()` using formula 'y ~ x'
### (3.5.2) Fit mô hình hồi qui
m2 = lm(Salary ~ Yrs.since.phd, data = salary)
summary(m2)
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84171 -19432 -2858 16086 102383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91718.7 2765.8 33.162 <2e-16 ***
## Yrs.since.phd 985.3 107.4 9.177 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1737
## F-statistic: 84.23 on 1 and 395 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(m2)
## (3.6) Đánh giá mối liên quan giữa giới tính, thời gian sau TS với
lương GS
m3 = lm(Salary ~ Sex + Yrs.since.phd, data = salary)
summary(m3)
##
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84167 -19735 -2551 15427 102033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85181.8 4748.3 17.939 <2e-16 ***
## SexMale 7923.6 4684.1 1.692 0.0915 .
## Yrs.since.phd 958.1 108.3 8.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared: 0.1817, Adjusted R-squared: 0.1775
## F-statistic: 43.74 on 2 and 394 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(m3)
## (3.7) Phân tích mô hình tương tác giữa giới tính, thời gian làm việc
và lương GS
ggplot(salary, aes(y = Salary, x = Yrs.service, color = Sex)) + geom_point() + stat_smooth(method = "lm", se = TRUE)
## `geom_smooth()` using formula 'y ~ x'
m4 = lm(Salary ~ Sex*Yrs.service, data = salary)
summary(m4)
##
## Call:
## lm(formula = Salary ~ Sex * Yrs.service, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80381 -20258 -3727 16353 102536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82068.5 7568.7 10.843 < 2e-16 ***
## SexMale 20128.6 7991.1 2.519 0.01217 *
## Yrs.service 1637.3 523.0 3.130 0.00188 **
## SexMale:Yrs.service -931.7 535.2 -1.741 0.08251 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28420 on 393 degrees of freedom
## Multiple R-squared: 0.1266, Adjusted R-squared: 0.1199
## F-statistic: 18.98 on 3 and 393 DF, p-value: 1.622e-11
library(carData)
data("Salaries")
summary(Salaries)
## rank discipline yrs.since.phd yrs.service sex
## AsstProf : 67 A:181 Min. : 1.00 Min. : 0.00 Female: 39
## AssocProf: 64 B:216 1st Qu.:12.00 1st Qu.: 7.00 Male :358
## Prof :266 Median :21.00 Median :16.00
## Mean :22.31 Mean :17.61
## 3rd Qu.:32.00 3rd Qu.:27.00
## Max. :56.00 Max. :60.00
## salary
## Min. : 57800
## 1st Qu.: 91000
## Median :107300
## Mean :113706
## 3rd Qu.:134185
## Max. :231545
dim(Salaries)
## [1] 397 6
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.6-2)
yvar = Salaries[, 6]
xvars = Salaries[, -6]
bma = bicreg(xvars, yvar)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar)
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 71631.552 3429.93 71944.33 68223.53 72253.53
## rankAssocProf 100.0 13760.698 3984.90 13761.54 13723.42 14483.23
## rankProf 100.0 47809.009 3259.03 47843.84 47403.32 49377.50
## disciplineB 100.0 13759.250 2300.72 13760.96 13708.69 13561.43
## yrs.since.phd 4.8 3.481 31.86 . . .
## yrs.service 5.2 -3.983 30.56 . . -76.33
## sexMale 8.1 365.573 1649.61 . 4491.80 .
##
## nVar 3 4 4
## r2 0.445 0.447 0.446
## BIC -215.78 -211.17 -210.28
## post prob 0.818 0.081 0.052
## model 4
## Intercept 71405.40
## rankAssocProf 13030.16
## rankProf 46211.57
## disciplineB 14028.68
## yrs.since.phd 71.92
## yrs.service .
## sexMale .
##
## nVar 4
## r2 0.445
## BIC -210.13
## post prob 0.048
imageplot.bma(bma)
## (3.9) Đánh giá tầm quan trọng của biến số
m5 = lm(salary ~ rank + discipline + yrs.since.phd + yrs.service + sex, data = Salaries)
summary(m5)
##
## Call:
## lm(formula = salary ~ rank + discipline + yrs.since.phd + yrs.service +
## sex, data = Salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65248 -13211 -1775 10384 99592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65955.2 4588.6 14.374 < 2e-16 ***
## rankAssocProf 12907.6 4145.3 3.114 0.00198 **
## rankProf 45066.0 4237.5 10.635 < 2e-16 ***
## disciplineB 14417.6 2342.9 6.154 1.88e-09 ***
## yrs.since.phd 535.1 241.0 2.220 0.02698 *
## yrs.service -489.5 211.9 -2.310 0.02143 *
## sexMale 4783.5 3858.7 1.240 0.21584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22540 on 390 degrees of freedom
## Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463
## F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:rms':
##
## calibrate
## The following object is masked from 'package:Hmisc':
##
## deff
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
calc.relimp(m5, type = "lmg", rela = TRUE, rank = FALSE)
## Response variable: salary
## Total response variance: 917425865
## Analysis based on 397 observations
##
## 6 Regressors:
## Some regressors combined in groups:
## Group rank : rankAssocProf rankProf
##
## Relative importance of 5 (groups of) regressors assessed:
## rank discipline yrs.since.phd yrs.service sex
##
## Proportion of variance explained by model: 45.47%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## rank 0.61582062
## discipline 0.10246615
## yrs.since.phd 0.17005888
## yrs.service 0.09521401
## sex 0.01644033
##
## Average coefficients for different model sizes:
##
## 1group 2groups 3groups 4groups 5groups
## rankAssocProf 13100.4524 13842.5917 13959.7320 13595.6740 12907.5879
## rankProf 45996.1239 47613.0413 47802.7848 46854.9747 45065.9987
## discipline 9480.2635 13044.5687 14406.6157 14560.3549 14417.6256
## yrs.since.phd 985.3421 889.7866 784.1488 666.4088 535.0583
## yrs.service 779.5691 205.7876 -196.1370 -427.3471 -489.5157
## sex 14088.0087 8991.8762 6490.6444 5541.2511 4783.4928
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
sample = createDataPartition(y = Salaries$salary, p = 0.6, list = FALSE)
develop = Salaries[sample,]
test = Salaries[-sample,]
m6 = train(salary ~ rank + discipline + sex, data = develop, method = "lm", metric = "Rsquared")
summary(m6)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67895 -14510 -1301 9707 96091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67254 6821 9.860 < 2e-16 ***
## rankAssocProf 13783 5451 2.529 0.0121 *
## rankProf 48657 4560 10.671 < 2e-16 ***
## disciplineB 14736 3162 4.661 5.28e-06 ***
## sexMale 4806 5732 0.838 0.4026
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24130 on 235 degrees of freedom
## Multiple R-squared: 0.4142, Adjusted R-squared: 0.4042
## F-statistic: 41.54 on 4 and 235 DF, p-value: < 2.2e-16
m6.pred = predict(m6, test)
test.data = data.frame(obs = test$salary, pred = m6.pred)
defaultSummary(test.data)
## RMSE Rsquared MAE
## 2.027966e+04 5.086577e-01 1.563664e+04
library(rms)
m7 = ols(salary ~ rank + discipline + sex, data = Salaries, x = TRUE, y = TRUE)
validate = validate(m7, B = 100)
validate
## index.orig training test optimism index.corrected
## R-square 4.469000e-01 4.475000e-01 4.424000e-01 0.0051 4.418000e-01
## MSE 5.061584e+08 5.092307e+08 5.102798e+08 -1049126.8786 5.072075e+08
## g 2.187650e+04 2.190876e+04 2.180504e+04 103.7251 2.177277e+04
## Intercept 0.000000e+00 0.000000e+00 3.159127e+02 -315.9127 3.159127e+02
## Slope 1.000000e+00 1.000000e+00 9.973000e-01 0.0027 9.973000e-01
## n
## R-square 100
## MSE 100
## g 100
## Intercept 100
## Slope 100