library(ggplot2)
library(grid)
library(gridExtra)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the professorial salary data from CSV into `salary`, then print the
# first six rows as a quick check that the file was parsed as expected.
# NOTE(review): the path is absolute and machine-specific -- confirm it before
# running this on another computer.
salary <- read.csv(
  file = "D:\\OneDrive\\ANDREAS\\ACADEMICS_UNIVERSITY\\Year_4_2024\\Autumn\\32931\\Classes\\R_Basic\\Professorial_Salaries.csv"
)
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
# Descriptive summary of every covariate and of Salary, stratified by Sex
# (the right-hand side of `|` is the stratification variable).
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = salary
)
| | Female (N=39) | Male (N=358) | Overall (N=397) |
|---|---|---|---|
| Rank | |||
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
| Discipline | |||
| A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
| B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
| Yrs.since.phd | |||
| Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
| Yrs.service | |||
| Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
| Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
| NPubs | |||
| Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
| Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
| Ncits | |||
| Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
| Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
| Salary | |||
| Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
| Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
# Distribution of Salary across all professors: a histogram scaled to density
# (so it shares a y-axis with the curve) with a kernel density estimate on top.
p <- ggplot(salary, aes(x = Salary))
p1 <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "lightblue"
  ) +
  geom_density(colour = "blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Two-sample comparison of mean Salary between the sexes. `var.equal = FALSE`
# is the default and gives Welch's test, which does not assume equal variances.
t.test(Salary ~ Sex, data = salary, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -23037.916 -5138.102
## sample estimates:
## mean in group Female mean in group Male
## 101002.4 115090.4
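How did male professors’ salaries differ from female professors’ salaries? What was the range of the differences? - On average, male professors earned about $14,088 more than female professors (mean salary ~$115,090 vs. ~$101,002). The difference is statistically significant (Welch two-sample t-test, p ≈ 0.003), and the 95% confidence interval for the difference ranges from ~$5,138 to ~$23,038.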
# Restrict the analysis to associate professors in discipline A, then report
# the number of rows and columns of the resulting subset.
Assoc.P <- subset(
  salary,
  subset = Discipline == "A" & Rank == "AssocProf"
)
dim(Assoc.P)
## [1] 26 9
# Same descriptive summary as for the full data set, now limited to the
# associate professors in discipline A and again stratified by Sex.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = Assoc.P
)
| | Female (N=4) | Male (N=22) | Overall (N=26) |
|---|---|---|---|
| Rank | |||
| AssocProf | 4 (100%) | 22 (100%) | 26 (100%) |
| Discipline | |||
| A | 4 (100%) | 22 (100%) | 26 (100%) |
| Yrs.since.phd | |||
| Mean (SD) | 18.5 (8.19) | 17.7 (12.2) | 17.8 (11.5) |
| Median [Min, Max] | 19.0 [10.0, 26.0] | 12.5 [8.00, 49.0] | 13.0 [8.00, 49.0] |
| Yrs.service | |||
| Mean (SD) | 15.5 (8.70) | 13.1 (12.3) | 13.5 (11.7) |
| Median [Min, Max] | 15.0 [8.00, 24.0] | 8.00 [1.00, 49.0] | 8.00 [1.00, 49.0] |
| NPubs | |||
| Mean (SD) | 10.0 (4.97) | 21.6 (14.2) | 19.8 (13.8) |
| Median [Min, Max] | 10.0 [4.00, 16.0] | 16.0 [3.00, 48.0] | 16.0 [3.00, 48.0] |
| Ncits | |||
| Mean (SD) | 38.5 (18.5) | 44.3 (15.2) | 43.4 (15.5) |
| Median [Min, Max] | 37.5 [19.0, 60.0] | 47.0 [24.0, 69.0] | 47.0 [19.0, 69.0] |
| Salary | |||
| Mean (SD) | 72100 (6400) | 85000 (10600) | 83100 (11100) |
| Median [Min, Max] | 74100 [62900, 77500] | 82400 [70000, 108000] | 81900 [62900, 108000] |
# Distribution of publication counts in the subset: density-scaled histogram
# with a kernel density estimate drawn over it.
p <- ggplot(Assoc.P, aes(x = NPubs))
p1 <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "black",
    fill = "lightblue"
  ) +
  geom_density(colour = "blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Non-parametric (rank-based) comparison of NPubs between the sexes;
# "two.sided" is the default alternative and is spelled out for clarity.
wilcox.test(NPubs ~ Sex, data = Assoc.P, alternative = "two.sided")
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: NPubs by Sex
## W = 21.5, p-value = 0.1168
## alternative hypothesis: true location shift is not equal to 0
Is there evidence that the number of publications significantly differed between male and female professors?
What is the limitation of this non-parametric test?
# Split the subset by sex so each group's NPubs can be resampled separately.
# Base R equivalent: male <- subset(Assoc.P, Sex == "Male")
female <- filter(Assoc.P, Sex == "Female")
male <- filter(Assoc.P, Sex == "Male")

# Fix the RNG seed so the bootstrap resamples are reproducible.
set.seed(1234)

# Two-sample bootstrap of the mean of NPubs with 1000 replicates; per the
# simpleboot documentation the statistic is the first sample's value minus the
# second's, i.e. male minus female here.
b.means <- two.boot(male$NPubs, female$NPubs, mean, R = 1000)

# Histogram of the bootstrap replicates stored in the `t` component.
hist(b.means$t, breaks = 20)
Is there evidence that the mean number of publications significantly differed between male and female professors?
How did the mean number of publications of male professors differ from that of female professors? What was the range of the differences?