#TRM Practical Data Analysis - Basic Level ##Task 1. Read the “Professorial Salaries” and name this dataset “salary”
# Load the Professorial Salaries CSV into the data frame `salary4`.
# NOTE(review): absolute, machine-specific path - this only runs where the file exists.
salary4 <- read.csv(
  file = "C:/Users/24544355/OneDrive - UTS/Desktop/Assignment/32931/Module10/Professorial Salaries.csv"
)
##Task 2. Describe characteristics of the study sample by sex
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of the study variables in the full sample, one column per
# sex (right-hand side of `|`).
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = salary4
)
| Female (N=39) |
Male (N=358) |
Overall (N=397) |
|
|---|---|---|---|
| Rank | |||
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
| Discipline | |||
| A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
| B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
| Yrs.since.phd | |||
| Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
| Yrs.service | |||
| Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
| Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
| NPubs | |||
| Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
| Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
| Ncits | |||
| Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
| Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
| Salary | |||
| Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
| Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
##Task 3. Comparison of salaries between male and female professors ###3a. Check the distribution of professors’ salaries
library(ggplot2)
# Histogram of salaries on the density scale with a kernel density curve on top.
# after_stat(density) replaces the `..density..` notation deprecated in
# ggplot2 3.4.0; bins = 30 states the default bin count explicitly so that
# stat_bin() no longer asks for one.
p <- ggplot(data = salary4, aes(x = Salary))
p1 <- p + geom_histogram(
  aes(y = after_stat(density)),
  bins = 30,
  color = "white",
  fill = "blue"
)
p2 <- p1 + geom_density(col = "red")
p2 + ggtitle("Distribution of professors' salaries") + theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###3b. Student’s t-test to determine whether salaries were different
between male and female professors
# Two-sample comparison of mean salary between the sexes. The defaults are
# written out: unequal variances (Welch), two-sided alternative, 95% interval.
t.test(
  Salary ~ Sex,
  data = salary4,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -23037.916 -5138.102
## sample estimates:
## mean in group Female mean in group Male
## 101002.4 115090.4
##Task 4. Select a subgroup of associate professors in Theoretical discipline
###4a
# Keep only the rows whose Rank is "AssocProf" and whose Discipline is "A".
Assoc.A <- subset(
  salary4,
  subset = Discipline == "A" & Rank == "AssocProf"
)
# Number of rows and columns in the subgroup.
dim(Assoc.A)
## [1] 26 9
###4b. Describe characteristics of associate professors in Theoretical discipline by sex
library(table1)
# Same descriptive table as for the full sample, restricted to the Assoc.A
# subgroup and split by sex.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = Assoc.A
)
| Female (N=4) |
Male (N=22) |
Overall (N=26) |
|
|---|---|---|---|
| Rank | |||
| AssocProf | 4 (100%) | 22 (100%) | 26 (100%) |
| Discipline | |||
| A | 4 (100%) | 22 (100%) | 26 (100%) |
| Yrs.since.phd | |||
| Mean (SD) | 18.5 (8.19) | 17.7 (12.2) | 17.8 (11.5) |
| Median [Min, Max] | 19.0 [10.0, 26.0] | 12.5 [8.00, 49.0] | 13.0 [8.00, 49.0] |
| Yrs.service | |||
| Mean (SD) | 15.5 (8.70) | 13.1 (12.3) | 13.5 (11.7) |
| Median [Min, Max] | 15.0 [8.00, 24.0] | 8.00 [1.00, 49.0] | 8.00 [1.00, 49.0] |
| NPubs | |||
| Mean (SD) | 10.0 (4.97) | 21.6 (14.2) | 19.8 (13.8) |
| Median [Min, Max] | 10.0 [4.00, 16.0] | 16.0 [3.00, 48.0] | 16.0 [3.00, 48.0] |
| Ncits | |||
| Mean (SD) | 38.5 (18.5) | 44.3 (15.2) | 43.4 (15.5) |
| Median [Min, Max] | 37.5 [19.0, 60.0] | 47.0 [24.0, 69.0] | 47.0 [19.0, 69.0] |
| Salary | |||
| Mean (SD) | 72100 (6400) | 85000 (10600) | 83100 (11100) |
| Median [Min, Max] | 74100 [62900, 77500] | 82400 [70000, 108000] | 81900 [62900, 108000] |
##Task 5. Comparison of number of publications between male and female associate professors in Theoretical discipline ###5a. Check the distribution of number of publications among associate professors in Theoretical discipline
# Histogram of publication counts in the Assoc.A subgroup on the density scale,
# with a kernel density curve on top.
# after_stat(density) replaces the `..density..` notation deprecated in
# ggplot2 3.4.0; bins = 30 states the default bin count explicitly so that
# stat_bin() no longer asks for one.
p <- ggplot(data = Assoc.A, aes(x = NPubs))
p1 <- p + geom_histogram(
  aes(y = after_stat(density)),
  bins = 30,
  color = "white",
  fill = "blue"
)
p2 <- p1 + geom_density(col = "red")
p2 +
  ggtitle("Distribution of number of publications among associate professors in Theoretical discipline") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###5b. Describe the differences in number of publications among
associate professors in Theoretical discipline
# Box plot of publication counts for each sex, with the individual
# observations drawn on top as jittered points.
p <- ggplot(
  data = Assoc.A,
  mapping = aes(x = Sex, y = NPubs, fill = Sex, colour = Sex)
)
p1 <- p +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.05)
p1 +
  labs(x = "Sex", y = "Number of publications") +
  ggtitle("Number of publications by sex") +
  theme_bw()
###5c. Mann-Whitney non-parametric test to determine whether number of
publications differed between male and female associate professors in
Theoretical discipline
# Wilcoxon rank-sum (Mann-Whitney) test of NPubs between the sexes.
# NPubs contains tied values, so an exact p-value cannot be computed;
# exact = FALSE requests the normal approximation (with continuity correction)
# directly, giving the same W and p-value without the ties warning.
wilcox.test(NPubs ~ Sex, data = Assoc.A, exact = FALSE)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: NPubs by Sex
## W = 21.5, p-value = 0.1168
## alternative hypothesis: true location shift is not equal to 0
###5d. Bootstrap to determine whether number of publications differed between male and female associate professors in Theoretical discipline ####Differences in the MEAN number of publications
library(simpleboot)
## Simple Bootstrap Routines (1.1-7)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Split the Assoc.A subgroup into one data frame per sex.
male <- filter(Assoc.A, Sex == "Male")
female <- filter(Assoc.A, Sex == "Female")

# Fix the seed so the resampling below is reproducible.
set.seed(1234)
# 1000 two-sample bootstrap replicates of the mean of NPubs
# (male sample passed first, female sample second).
b.means <- two.boot(male$NPubs, female$NPubs, FUN = mean, R = 1000)
hist(b.means$t, breaks = 20)
# 2.5th, 50th and 97.5th percentiles of the bootstrap replicates.
quantile(b.means$t, probs = c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 4.976136 11.568182 18.273295
####Differences in the MEDIAN number of publications
# Re-seed so the median bootstrap starts from the same random state.
set.seed(1234)
# 1000 two-sample bootstrap replicates of the median of NPubs
# (male sample passed first, female sample second).
b.medians <- two.boot(male$NPubs, female$NPubs, FUN = median, R = 1000)
hist(b.medians$t, breaks = 20)
# 2.5th, 50th and 97.5th percentiles of the bootstrap replicates.
quantile(b.medians$t, probs = c(0.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## -0.5000 7.0000 19.5125