library(ggplot2)
library(grid)
library(gridExtra)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(lmboot)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
salary = read.csv("D:\\OneDrive\\ANDREAS\\ACADEMICS_UNIVERSITY\\Year_4_2024\\Autumn\\32931\\Classes\\R_Basic\\Professorial_Salaries.csv")
dim(salary)
## [1] 397 9
head(salary)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Rank, data = salary)
| AssocProf (N=64) |
AsstProf (N=67) |
Prof (N=266) |
Overall (N=397) |
|
|---|---|---|---|---|
| Sex | ||||
| Female | 10 (15.6%) | 11 (16.4%) | 18 (6.8%) | 39 (9.8%) |
| Male | 54 (84.4%) | 56 (83.6%) | 248 (93.2%) | 358 (90.2%) |
| Discipline | ||||
| A | 26 (40.6%) | 24 (35.8%) | 131 (49.2%) | 181 (45.6%) |
| B | 38 (59.4%) | 43 (64.2%) | 135 (50.8%) | 216 (54.4%) |
| Yrs.since.phd | ||||
| Mean (SD) | 15.5 (9.65) | 5.10 (2.54) | 28.3 (10.1) | 22.3 (12.9) |
| Median [Min, Max] | 12.0 [6.00, 49.0] | 4.00 [1.00, 11.0] | 28.0 [11.0, 56.0] | 21.0 [1.00, 56.0] |
| Yrs.service | ||||
| Mean (SD) | 12.0 (10.1) | 2.37 (1.50) | 22.8 (11.6) | 17.6 (13.0) |
| Median [Min, Max] | 8.00 [1.00, 53.0] | 3.00 [0, 6.00] | 21.0 [0, 60.0] | 16.0 [0, 60.0] |
| NPubs | ||||
| Mean (SD) | 18.7 (13.5) | 15.9 (11.2) | 18.6 (14.7) | 18.2 (14.0) |
| Median [Min, Max] | 16.0 [1.00, 50.0] | 12.0 [2.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
| Ncits | ||||
| Mean (SD) | 42.0 (15.6) | 36.3 (16.7) | 40.8 (17.2) | 40.2 (16.9) |
| Median [Min, Max] | 36.5 [14.0, 83.0] | 34.0 [1.00, 83.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
| Salary | ||||
| Mean (SD) | 93900 (13800) | 80800 (8170) | 127000 (27700) | 114000 (30300) |
| Median [Min, Max] | 95600 [62900, 126000] | 79800 [63100, 97000] | 123000 [57800, 232000] | 107000 [57800, 232000] |
p = ggplot(data = salary, aes(x = Salary))
p1 = p + geom_histogram(aes(y = after_stat(density)), color = "black", fill = "lightblue")
p1 = p1 + geom_density(col="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
- The distribution is skewed right.
p = ggplot(data = salary, aes(x = Rank, y = Salary, fill = Rank, col = Rank))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05)
p1 + labs(x = "Rank", y = "Salaries (USD)") + ggtitle("Professors' salaries by rank") + theme_bw()
sal.rank = aov(Salary ~ Rank, data = salary)
summary(sal.rank)
## Df Sum Sq Mean Sq F value Pr(>F)
## Rank 2 1.432e+11 7.162e+10 128.2 <2e-16 ***
## Residuals 394 2.201e+11 5.586e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))
plot(sal.rank)
tukey.sal.rank = TukeyHSD(sal.rank)
tukey.sal.rank
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Salary ~ Rank, data = salary)
##
## $Rank
## diff lwr upr p adj
## AsstProf-AssocProf -13100.45 -22818.71 -3382.195 0.0046514
## Prof-AssocProf 32895.67 25154.51 40636.836 0.0000000
## Prof-AsstProf 45996.12 38395.94 53596.307 0.0000000
Fem.P = subset(salary, Sex == "Female" & Discipline == "B")
dim(Fem.P)
## [1] 21 9
head(Fem.P)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 10 10 Prof B 18 18 Female 22 29 129000
## 35 35 AsstProf B 4 2 Female 19 69 80225
## 36 36 AsstProf B 5 0 Female 11 69 77000
## 48 48 Prof B 23 19 Female 50 55 151768
## 49 49 Prof B 25 25 Female 18 33 140096
## 53 53 AsstProf B 11 3 Female 50 31 74692
table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Rank, data = Fem.P)
| AssocProf (N=6) |
AsstProf (N=5) |
Prof (N=10) |
Overall (N=21) |
|
|---|---|---|---|---|
| Sex | ||||
| Female | 6 (100%) | 5 (100%) | 10 (100%) | 21 (100%) |
| Discipline | ||||
| B | 6 (100%) | 5 (100%) | 10 (100%) | 21 (100%) |
| Yrs.since.phd | ||||
| Mean (SD) | 13.5 (2.88) | 6.60 (3.65) | 21.5 (5.95) | 15.7 (7.72) |
| Median [Min, Max] | 12.5 [11.0, 19.0] | 5.00 [3.00, 11.0] | 19.0 [17.0, 36.0] | 17.0 [3.00, 36.0] |
| Yrs.service | ||||
| Mean (SD) | 8.83 (1.94) | 2.60 (1.82) | 17.9 (4.77) | 11.7 (7.36) |
| Median [Min, Max] | 9.50 [6.00, 11.0] | 3.00 [0, 5.00] | 17.5 [10.0, 26.0] | 10.0 [0, 26.0] |
| NPubs | ||||
| Mean (SD) | 16.3 (14.3) | 18.8 (18.3) | 21.3 (14.2) | 19.3 (14.6) |
| Median [Min, Max] | 14.5 [1.00, 38.0] | 11.0 [3.00, 50.0] | 20.0 [6.00, 50.0] | 18.0 [1.00, 50.0] |
| Ncits | ||||
| Mean (SD) | 45.8 (12.9) | 46.4 (24.1) | 38.0 (15.7) | 42.2 (16.9) |
| Median [Min, Max] | 49.0 [26.0, 60.0] | 49.0 [14.0, 69.0] | 36.5 [18.0, 60.0] | 48.0 [14.0, 69.0] |
| Salary | ||||
| Mean (SD) | 99400 (14100) | 84200 (9790) | 132000 (17500) | 111000 (25400) |
| Median [Min, Max] | 104000 [71100, 110000] | 80200 [74700, 97000] | 128000 [105000, 161000] | 105000 [71100, 161000] |
p = ggplot(data = Fem.P, aes(x = NPubs))
p1 = p + geom_histogram(aes(y = after_stat(density)), color = "black", fill = "lightblue")
p1 = p1 + geom_density(col="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
kruskal.test(NPubs ~ Rank, data = Fem.P)
##
## Kruskal-Wallis rank sum test
##
## data: NPubs by Rank
## Kruskal-Wallis chi-squared = 0.81015, df = 2, p-value = 0.6669
boot = ANOVA.boot(NPubs ~ Rank, B = 1000, seed = 1234, data = Fem.P)
boot$'p-values'
## [1] 0.821