salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
salary$Prof.Rank = factor(salary$Rank, levels = c("AsstProf", "AssocProf", "Prof"))
table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Prof.Rank, data = salary)
AsstProf (N=67) |
AssocProf (N=64) |
Prof (N=266) |
Overall (N=397) |
|
---|---|---|---|---|
Sex | ||||
Female | 11 (16.4%) | 10 (15.6%) | 18 (6.8%) | 39 (9.8%) |
Male | 56 (83.6%) | 54 (84.4%) | 248 (93.2%) | 358 (90.2%) |
Discipline | ||||
A | 24 (35.8%) | 26 (40.6%) | 131 (49.2%) | 181 (45.6%) |
B | 43 (64.2%) | 38 (59.4%) | 135 (50.8%) | 216 (54.4%) |
Yrs.since.phd | ||||
Mean (SD) | 5.10 (2.54) | 15.5 (9.65) | 28.3 (10.1) | 22.3 (12.9) |
Median [Min, Max] | 4.00 [1.00, 11.0] | 12.0 [6.00, 49.0] | 28.0 [11.0, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | ||||
Mean (SD) | 2.37 (1.50) | 12.0 (10.1) | 22.8 (11.6) | 17.6 (13.0) |
Median [Min, Max] | 3.00 [0, 6.00] | 8.00 [1.00, 53.0] | 21.0 [0, 60.0] | 16.0 [0, 60.0] |
NPubs | ||||
Mean (SD) | 15.9 (11.2) | 18.7 (13.5) | 18.6 (14.7) | 18.2 (14.0) |
Median [Min, Max] | 12.0 [2.00, 50.0] | 16.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | ||||
Mean (SD) | 36.3 (16.7) | 42.0 (15.6) | 40.8 (17.2) | 40.2 (16.9) |
Median [Min, Max] | 34.0 [1.00, 83.0] | 36.5 [14.0, 83.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | ||||
Mean (SD) | 80800 (8170) | 93900 (13800) | 127000 (27700) | 114000 (30300) |
Median [Min, Max] | 79800 [63100, 97000] | 95600 [62900, 126000] | 123000 [57800, 232000] | 107000 [57800, 232000] |
library(ggplot2)
p = ggplot(data = salary, aes(x = Salary))
p1 = p + geom_histogram(aes(y = ..density..), color = "white", fill = "blue")
p2 = p1 + geom_density(col="red")
p2 + ggtitle("Distribution of professors' salaries") + theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = salary, aes(x = Prof.Rank, y = Salary, fill = Prof.Rank, col = Prof.Rank))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05)
p1 + labs(x = "Rank", y = "Salaries (USD)") + ggtitle("Professors' salaries by rank") + theme_bw()
As professors’ rank has 3 groups (Assistant Prof, Associate Prof, Prof), an ANOVA test is recommended.
sal.rank = aov(Salary ~ Prof.Rank, data = salary)
summary(sal.rank)
## Df Sum Sq Mean Sq F value Pr(>F)
## Prof.Rank 2 1.432e+11 7.162e+10 128.2 <2e-16 ***
## Residuals 394 2.201e+11 5.586e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation: there is evidence (P< 0.0001) that professors’ salaries significantly differed among their ranks.
par(mfrow=c(2,2))
plot(sal.rank)
Interpretation: the assumptions of normality and homoscedasticity are
met.
tukey.sal.rank = TukeyHSD(sal.rank)
tukey.sal.rank
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Salary ~ Prof.Rank, data = salary)
##
## $Prof.Rank
## diff lwr upr p adj
## AssocProf-AsstProf 13100.45 3382.195 22818.71 0.0046514
## Prof-AsstProf 45996.12 38395.941 53596.31 0.0000000
## Prof-AssocProf 32895.67 25154.507 40636.84 0.0000000
Interpretation: The post-hoc analyses indicate the average salaries of associate professors were $13,100 higher than assistant professors, ranging from $3,382 to $22,819 (P= 0.005). Additionally, professors had, on average, $45,966 (95% CI: $38,396 to $53,596; P< 0.0001) and $32,896 ($25,155 to 40,637; P< 0.0001) higher than assistant and associate professors, respectively.
female.B = subset(salary, Sex == "Female" & Discipline == "B")
dim(female.B)
## [1] 21 10
table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Prof.Rank, data = female.B)
AsstProf (N=5) |
AssocProf (N=6) |
Prof (N=10) |
Overall (N=21) |
|
---|---|---|---|---|
Sex | ||||
Female | 5 (100%) | 6 (100%) | 10 (100%) | 21 (100%) |
Discipline | ||||
B | 5 (100%) | 6 (100%) | 10 (100%) | 21 (100%) |
Yrs.since.phd | ||||
Mean (SD) | 6.60 (3.65) | 13.5 (2.88) | 21.5 (5.95) | 15.7 (7.72) |
Median [Min, Max] | 5.00 [3.00, 11.0] | 12.5 [11.0, 19.0] | 19.0 [17.0, 36.0] | 17.0 [3.00, 36.0] |
Yrs.service | ||||
Mean (SD) | 2.60 (1.82) | 8.83 (1.94) | 17.9 (4.77) | 11.7 (7.36) |
Median [Min, Max] | 3.00 [0, 5.00] | 9.50 [6.00, 11.0] | 17.5 [10.0, 26.0] | 10.0 [0, 26.0] |
NPubs | ||||
Mean (SD) | 18.8 (18.3) | 16.3 (14.3) | 21.3 (14.2) | 19.3 (14.6) |
Median [Min, Max] | 11.0 [3.00, 50.0] | 14.5 [1.00, 38.0] | 20.0 [6.00, 50.0] | 18.0 [1.00, 50.0] |
Ncits | ||||
Mean (SD) | 46.4 (24.1) | 45.8 (12.9) | 38.0 (15.7) | 42.2 (16.9) |
Median [Min, Max] | 49.0 [14.0, 69.0] | 49.0 [26.0, 60.0] | 36.5 [18.0, 60.0] | 48.0 [14.0, 69.0] |
Salary | ||||
Mean (SD) | 84200 (9790) | 99400 (14100) | 132000 (17500) | 111000 (25400) |
Median [Min, Max] | 80200 [74700, 97000] | 104000 [71100, 110000] | 128000 [105000, 161000] | 105000 [71100, 161000] |
p = ggplot(data = female.B, aes(x = NPubs))
p1 = p + geom_histogram(aes(y = ..density..), color = "white", fill = "blue")
p2 = p1 + geom_density(col="red")
p2 + ggtitle("Distribution of number of publications") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = female.B, aes(x = Prof.Rank, y = Salary, fill = Prof.Rank, col = Prof.Rank))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05)
p1 + labs(x = "Rank", y = "Number of publications") + ggtitle("Number of pulbications by professors' rank") + theme_bw()
As the number of publications is not normally distributed, an ANOVA test is not appropriate. The alternative options include (i) non-parametric Kruskal-Wallis test, or (ii) Bootstrap
kruskal.test(NPubs ~ Prof.Rank, data = female.B)
##
## Kruskal-Wallis rank sum test
##
## data: NPubs by Prof.Rank
## Kruskal-Wallis chi-squared = 0.81015, df = 2, p-value = 0.6669
Interpretation: there is no evidence that the number of publications differed among professors’ ranks.
library(lmboot)
## Warning: package 'lmboot' was built under R version 4.3.2
boot = ANOVA.boot(NPubs ~ Prof.Rank, B = 1000, seed = 1234, data = female.B)
boot$'p-values'
## [1] 0.821
Interpretation: The bootstrapping method indicates there is no evidence (P= 0.82) that the number of publications differed among ranks in a group of female professors in Applied discipline.