salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex, data = salary)
Female (N=39) |
Male (N=358) |
Overall (N=397) |
|
---|---|---|---|
Rank | |||
AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
Discipline | |||
A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
Yrs.since.phd | |||
Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
library(ggplot2)
library(grid)
library(gridExtra)
p = ggplot(data = salary, aes(x = Salary))
p1 = p + geom_histogram(aes(y = ..density..), color = "white", fill = "blue")
p2 = p1 + geom_density(col="red")
p2 + ggtitle("Distribution of professors' salaries") + theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars = salary[, c("Yrs.since.phd", "Yrs.service", "NPubs", "Ncits", "Sex", "Salary")]
ggpairs(data = vars, mapping = aes(color = Sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The potential confounders include Ranks, Disciplines, Yrs.service (or Yrs.since.phd), NPubs (or Ncits).
m1 = lm(Salary ~ Sex + Rank + Discipline + Yrs.since.phd + NPubs , data = salary)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Sex + Rank + Discipline + Yrs.since.phd +
## NPubs, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68282 -13908 -1341 10216 97403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80408.77 5204.02 15.451 < 2e-16 ***
## SexMale 4428.23 3886.18 1.139 0.25520
## RankAsstProf -13026.45 4177.81 -3.118 0.00196 **
## RankProf 32928.83 3548.38 9.280 < 2e-16 ***
## DisciplineB 13902.33 2351.28 5.913 7.36e-09 ***
## Yrs.since.phd 60.53 127.16 0.476 0.63433
## NPubs 28.93 82.05 0.353 0.72464
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22690 on 390 degrees of freedom
## Multiple R-squared: 0.4474, Adjusted R-squared: 0.4389
## F-statistic: 52.62 on 6 and 390 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m1)
The assumptions are reasonably met. Interpretation of the analysis
output: there is no evidence (P= 0.21) that professors’ salaries were
independently associated with their salaries after accounting for all
potential confounding effects. Alternative: Given the same ranks of
professor, same discipline, same time in service and same number of
publications, there is no evidence (P= 0.21) that professors’ sexes were
associated with their salaries.
m3 = lm(Salary ~ Sex + Rank + Discipline + Yrs.service + Yrs.since.phd + NPubs , data = salary)
summary(m3)
##
## Call:
## lm(formula = Salary ~ Sex + Rank + Discipline + Yrs.service +
## Yrs.since.phd + NPubs, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66068 -13512 -1502 10709 99966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78291.51 5256.22 14.895 < 2e-16 ***
## SexMale 4861.17 3869.40 1.256 0.20976
## RankAsstProf -12830.98 4155.73 -3.088 0.00216 **
## RankProf 32159.07 3544.64 9.073 < 2e-16 ***
## DisciplineB 14382.81 2347.63 6.127 2.2e-09 ***
## Yrs.service -489.36 212.18 -2.306 0.02161 *
## Yrs.since.phd 534.44 241.27 2.215 0.02733 *
## NPubs 28.54 81.60 0.350 0.72672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22560 on 389 degrees of freedom
## Multiple R-squared: 0.4548, Adjusted R-squared: 0.445
## F-statistic: 46.37 on 7 and 389 DF, p-value: < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
vif(m3)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.034212 1 1.016962
## Rank 2.019820 2 1.192143
## Discipline 1.066022 1 1.032483
## Yrs.service 5.923063 1 2.433734
## Yrs.since.phd 7.519345 1 2.742142
## NPubs 1.010137 1 1.005056
m4= lm(Salary ~ Sex + Rank + Discipline + Yrs.since.phd + NPubs , data = salary)
summary(m4)
##
## Call:
## lm(formula = Salary ~ Sex + Rank + Discipline + Yrs.since.phd +
## NPubs, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68282 -13908 -1341 10216 97403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80408.77 5204.02 15.451 < 2e-16 ***
## SexMale 4428.23 3886.18 1.139 0.25520
## RankAsstProf -13026.45 4177.81 -3.118 0.00196 **
## RankProf 32928.83 3548.38 9.280 < 2e-16 ***
## DisciplineB 13902.33 2351.28 5.913 7.36e-09 ***
## Yrs.since.phd 60.53 127.16 0.476 0.63433
## NPubs 28.93 82.05 0.353 0.72464
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22690 on 390 degrees of freedom
## Multiple R-squared: 0.4474, Adjusted R-squared: 0.4389
## F-statistic: 52.62 on 6 and 390 DF, p-value: < 2.2e-16
vif(m4)
## GVIF Df GVIF^(1/(2*Df))
## Sex 1.031778 1 1.015765
## Rank 1.993803 2 1.188285
## Discipline 1.057628 1 1.028410
## Yrs.since.phd 2.065755 1 1.437274
## NPubs 1.010133 1 1.005054
salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
table1(~ Sex + Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary, data = salary)
Overall (N=397) |
|
---|---|
Sex | |
Female | 39 (9.8%) |
Male | 358 (90.2%) |
Rank | |
AssocProf | 64 (16.1%) |
AsstProf | 67 (16.9%) |
Prof | 266 (67.0%) |
Discipline | |
A | 181 (45.6%) |
B | 216 (54.4%) |
Yrs.since.phd | |
Mean (SD) | 22.3 (12.9) |
Median [Min, Max] | 21.0 [1.00, 56.0] |
Yrs.service | |
Mean (SD) | 17.6 (13.0) |
Median [Min, Max] | 16.0 [0, 60.0] |
NPubs | |
Mean (SD) | 18.2 (14.0) |
Median [Min, Max] | 13.0 [1.00, 69.0] |
Ncits | |
Mean (SD) | 40.2 (16.9) |
Median [Min, Max] | 35.0 [1.00, 90.0] |
Salary | |
Mean (SD) | 114000 (30300) |
Median [Min, Max] | 107000 [57800, 232000] |
ggplot(data = salary, aes(x = Salary)) + geom_histogram(aes(y = ..density..), color = "white", fill = "blue") + ggtitle("Professors' salaries (USD)") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(GGally)
vars = salary[, c("Yrs.since.phd", "Yrs.service", "NPubs", "Ncits", "Sex", "Salary")]
ggpairs(data = vars, mapping = aes(color = Sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
xvars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "NPubs", "Ncits")]
yvar = salary[,c("Salary")]
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
bma = bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 6 models were selected
## Best 5 models (cumulative posterior probability = 0.9676 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 8.428e+04 4246.08 85705.87 80159.33 81946.95
## RankAsstProf 100.0 -1.360e+04 3992.00 -13761.54 -13011.52 -13723.42
## RankProf 100.0 3.409e+04 3193.57 34082.30 34252.88 33679.90
## DisciplineB 100.0 1.376e+04 2298.00 13760.96 13779.31 13708.69
## Yrs.since.phd 3.7 2.628e+00 27.72 . . .
## Yrs.service 3.9 -3.007e+00 26.61 . . .
## SexMale 6.1 2.760e+02 1441.87 . . 4491.80
## NPubs 3.2 7.712e-01 15.32 . . .
## Ncits 21.3 2.801e+01 62.18 . 131.65 .
##
## nVar 3 4 4
## r2 0.445 0.450 0.447
## BIC -215.78 -213.65 -211.17
## post prob 0.618 0.213 0.061
## model 4 model 5
## Intercept 86736.76 84435.56
## RankAsstProf -14483.23 -13030.16
## RankProf 34894.27 33181.41
## DisciplineB 13561.43 14028.68
## Yrs.since.phd . 71.92
## Yrs.service -76.33 .
## SexMale . .
## NPubs . .
## Ncits . .
##
## nVar 4 4
## r2 0.446 0.445
## BIC -210.28 -210.13
## post prob 0.039 0.037
par(mfrow = c(1,1))
imageplot.bma(bma)
m5 = lm(Salary ~ Rank + Discipline, data = salary)
summary(m5)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65990 -14049 -1288 10760 97996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85706 3142 27.273 < 2e-16 ***
## RankAsstProf -13762 3961 -3.475 0.000569 ***
## RankProf 34082 3160 10.786 < 2e-16 ***
## DisciplineB 13761 2296 5.993 4.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared: 0.445, Adjusted R-squared: 0.4407
## F-statistic: 105 on 3 and 393 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m5)
vif(m5)
## GVIF Df GVIF^(1/(2*Df))
## Rank 1.011848 2 1.002949
## Discipline 1.011848 1 1.005907
m.sw = lm(Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex + NPubs + Ncits, data = salary)
step1 = step(m.sw)
## Start: AIC=7965.36
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex +
## NPubs + Ncits
##
## Df Sum of Sq RSS AIC
## - NPubs 1 4.5771e+07 1.9626e+11 7963.5
## - Sex 1 7.9652e+08 1.9701e+11 7965.0
## <none> 1.9622e+11 7965.4
## - Ncits 1 1.8373e+09 1.9805e+11 7967.1
## - Yrs.since.phd 1 2.3730e+09 1.9859e+11 7968.1
## - Yrs.service 1 2.5825e+09 1.9880e+11 7968.6
## - Discipline 1 1.9267e+10 2.1548e+11 8000.5
## - Rank 2 6.8891e+10 2.6511e+11 8080.8
##
## Step: AIC=7963.46
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex +
## Ncits
##
## Df Sum of Sq RSS AIC
## - Sex 1 8.2020e+08 1.9708e+11 7963.1
## <none> 1.9626e+11 7963.5
## - Ncits 1 1.8538e+09 1.9812e+11 7965.2
## - Yrs.since.phd 1 2.3748e+09 1.9864e+11 7966.2
## - Yrs.service 1 2.5877e+09 1.9885e+11 7966.7
## - Discipline 1 1.9221e+10 2.1548e+11 7998.5
## - Rank 2 6.8845e+10 2.6511e+11 8078.8
##
## Step: AIC=7963.11
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Ncits
##
## Df Sum of Sq RSS AIC
## <none> 1.9708e+11 7963.1
## - Ncits 1 1.8143e+09 1.9890e+11 7964.8
## - Yrs.since.phd 1 2.3721e+09 1.9945e+11 7965.9
## - Yrs.service 1 2.4548e+09 1.9954e+11 7966.0
## - Discipline 1 1.9479e+10 2.1656e+11 7998.5
## - Rank 2 7.0053e+10 2.6714e+11 8079.9
summary(step1)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service +
## Ncits, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67185 -13280 -1513 9905 101141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77435.68 4791.69 16.160 < 2e-16 ***
## RankAsstProf -12140.21 4150.07 -2.925 0.00364 **
## RankProf 32672.39 3525.11 9.268 < 2e-16 ***
## DisciplineB 14501.42 2335.70 6.209 1.37e-09 ***
## Yrs.since.phd 521.01 240.47 2.167 0.03087 *
## Yrs.service -465.52 211.22 -2.204 0.02811 *
## Ncits 127.09 67.07 1.895 0.05886 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22480 on 390 degrees of freedom
## Multiple R-squared: 0.4575, Adjusted R-squared: 0.4492
## F-statistic: 54.82 on 6 and 390 DF, p-value: < 2.2e-16