library(ggplot2)
library(grid)
library(gridExtra)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Task 1:

Task 1.1:

  • Study Design: A cross-sectional investigation of 397 collegiate professors in the US from 2008 to 2009 was conducted to determine whether salaries differed between male and female professors.
  • Null Hypothesis: Mean salaries do not differ between male and female professors.
  • Alternative Hypothesis: Mean salaries differ between male and female professors.

Task 1.2: Import Data set

# Import the professorial salaries data from the local CSV export and
# preview the first rows to confirm the columns were read as expected.
salary <- read.csv(
  file = "D:\\OneDrive\\ANDREAS\\ACADEMICS_UNIVERSITY\\Year_4_2024\\Autumn\\32931\\Classes\\R_Basic\\Professorial_Salaries.csv"
)
head(x = salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000

Task 1.3:

# Descriptive table of the full sample: one column per sex plus an
# "Overall" column, covering rank, discipline, experience, output and salary.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = salary
)
Female
(N=39)
Male
(N=358)
Overall
(N=397)
Rank
AssocProf 10 (25.6%) 54 (15.1%) 64 (16.1%)
AsstProf 11 (28.2%) 56 (15.6%) 67 (16.9%)
Prof 18 (46.2%) 248 (69.3%) 266 (67.0%)
Discipline
A 18 (46.2%) 163 (45.5%) 181 (45.6%)
B 21 (53.8%) 195 (54.5%) 216 (54.4%)
Yrs.since.phd
Mean (SD) 16.5 (9.78) 22.9 (13.0) 22.3 (12.9)
Median [Min, Max] 17.0 [2.00, 39.0] 22.0 [1.00, 56.0] 21.0 [1.00, 56.0]
Yrs.service
Mean (SD) 11.6 (8.81) 18.3 (13.2) 17.6 (13.0)
Median [Min, Max] 10.0 [0, 36.0] 18.0 [0, 60.0] 16.0 [0, 60.0]
NPubs
Mean (SD) 20.2 (14.4) 17.9 (13.9) 18.2 (14.0)
Median [Min, Max] 18.0 [1.00, 50.0] 13.0 [1.00, 69.0] 13.0 [1.00, 69.0]
Ncits
Mean (SD) 40.7 (16.2) 40.2 (17.0) 40.2 (16.9)
Median [Min, Max] 36.0 [14.0, 70.0] 35.0 [1.00, 90.0] 35.0 [1.00, 90.0]
Salary
Mean (SD) 101000 (26000) 115000 (30400) 114000 (30300)
Median [Min, Max] 104000 [62900, 161000] 108000 [57800, 232000] 107000 [57800, 232000]

Task 1.4: Develop a graph to check the distribution of professors’ salaries. Write a sentence to describe the graph

# Histogram of salaries on the density scale with a kernel density curve
# overlaid, used to judge the shape of the salary distribution.
p <- ggplot(salary, aes(x = Salary))
p1 <- p +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    colour = "black",
    fill = "lightblue"
  ) +
  geom_density(colour = "blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

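  • The distribution of salaries is right-skewed: most salaries cluster at the lower end, with a long tail towards higher salaries.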

Task 1.5: Determine whether salaries significantly differed between male and female professors

# Two-sided Welch t-test (variances not assumed equal) comparing mean
# salary between the sexes, with a 95% confidence interval.
t.test(
  Salary ~ Sex,
  data = salary,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
## 
##  Welch Two Sample t-test
## 
## data:  Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -23037.916  -5138.102
## sample estimates:
## mean in group Female   mean in group Male 
##             101002.4             115090.4

How did male professors’ salaries differ from female professors’ salaries? What was the range of the differences? - On average, male professors earned about $14,088 more than female professors ($115,090 vs $101,002), and this difference was statistically significant (P ≈ 0.003). The 95% confidence interval for the difference ranged from about $5,138 to $23,038.

Task 2:

Task 2.1:

  • Study Design: A cross-sectional investigation of 26 associate professors in the Theoretical discipline was conducted to determine whether the average number of publications differed between male and female professors.
  • Null Hypothesis: The average number of publications does not differ between male and female professors.
  • Alternative Hypothesis: The average number of publications differs between male and female professors.

Task 2.2: Select a subgroup of associate professors in the Theoretical discipline

# Keep only associate professors in discipline "A", then report the
# dimensions (rows, columns) of the resulting subgroup.
Assoc.P <- subset(
  salary,
  subset = Rank == "AssocProf" & Discipline == "A"
)
dim(Assoc.P)
## [1] 26  9

Task 2.3: Describe characteristics of the study sample by sex

# Descriptive table of the associate-professor subgroup, split by sex
# with an "Overall" column, using the same variables as the full sample.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = Assoc.P
)
Female
(N=4)
Male
(N=22)
Overall
(N=26)
Rank
AssocProf 4 (100%) 22 (100%) 26 (100%)
Discipline
A 4 (100%) 22 (100%) 26 (100%)
Yrs.since.phd
Mean (SD) 18.5 (8.19) 17.7 (12.2) 17.8 (11.5)
Median [Min, Max] 19.0 [10.0, 26.0] 12.5 [8.00, 49.0] 13.0 [8.00, 49.0]
Yrs.service
Mean (SD) 15.5 (8.70) 13.1 (12.3) 13.5 (11.7)
Median [Min, Max] 15.0 [8.00, 24.0] 8.00 [1.00, 49.0] 8.00 [1.00, 49.0]
NPubs
Mean (SD) 10.0 (4.97) 21.6 (14.2) 19.8 (13.8)
Median [Min, Max] 10.0 [4.00, 16.0] 16.0 [3.00, 48.0] 16.0 [3.00, 48.0]
Ncits
Mean (SD) 38.5 (18.5) 44.3 (15.2) 43.4 (15.5)
Median [Min, Max] 37.5 [19.0, 60.0] 47.0 [24.0, 69.0] 47.0 [19.0, 69.0]
Salary
Mean (SD) 72100 (6400) 85000 (10600) 83100 (11100)
Median [Min, Max] 74100 [62900, 77500] 82400 [70000, 108000] 81900 [62900, 108000]

Task 2.4: Develop a graph to check the distribution of the number of publications. Write a sentence to describe the graph

# Histogram of the number of publications on the density scale with a
# kernel density curve overlaid, for the associate-professor subgroup.
p <- ggplot(Assoc.P, aes(x = NPubs))
p1 <- p +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    colour = "black",
    fill = "lightblue"
  ) +
  geom_density(colour = "blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • The distribution of the number of publications fluctuates across its range, with the highest density between about 10 and 20 papers, after which it gradually decreases.

Task 2.5: Conduct a non-parametric test to determine whether the number of publications differed between male and female professors

# Two-sided Wilcoxon rank-sum test (with continuity correction) comparing
# the number of publications between the sexes in the subgroup.
wilcox.test(
  NPubs ~ Sex,
  data = Assoc.P,
  alternative = "two.sided",
  correct = TRUE
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  NPubs by Sex
## W = 21.5, p-value = 0.1168
## alternative hypothesis: true location shift is not equal to 0
  • Is there evidence that the number of publications significantly differed between male and female professors?

  • What is the limitation of this non-parametric test?

Task 2.6: Carry out a bootstrap to determine whether the mean number of publications differed between male and female professors

# Split the associate-professor subgroup by sex so that each group is
# resampled separately in the bootstrap.
male <- Assoc.P %>% filter(Sex == "Male")
female <- Assoc.P %>% filter(Sex == "Female")

# Fix the RNG seed so the bootstrap replicates are reproducible.
set.seed(1234)

# Bootstrap the difference in mean publications (male minus female)
# over 1000 resamples and inspect the distribution of the replicates.
b.means <- two.boot(male$NPubs, female$NPubs, mean, R = 1000)
hist(b.means$t, breaks = 20)

# Observed difference in means, followed by the 95% percentile bootstrap
# interval of the difference. An interval that excludes 0 is evidence that
# the mean number of publications differs between the sexes; its end points
# give the range of the differences.
b.means$t0
quantile(b.means$t, probs = c(0.025, 0.975))

  • Is there evidence that the mean number of publications significantly differed between male and female professors?

  • How did the mean number of publications of male professors differ from that of female professors? What was the range of the differences?