professors. The libraries needed for this task: table1, epiDisplay. 1.1 Provide a brief description of the project
The cross-sectional investigation of 397 professors to determine whether the chance to get a high salary differed between male and female professors.
Null hypothesis: Professors’ sexes were not associated with the chance to get a high salary.
Alternative hypothesis: Professors’ sexes were associated with the chance to get a high salary ###
salary = read.csv("C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Professorial Salaries.csv")
salary$high.salary = ifelse(salary$Salary>= 130000, 1, 0)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex | high.salary, data = salary)
## Warning in table1.formula(~Rank + Discipline + Yrs.since.phd + Yrs.service + :
## Terms to the right of '|' in formula 'x' define table columns and are expected
## to be factors with meaningful labels.
| 0 (N=287) |
1 (N=110) |
Overall (N=397) |
|
|---|---|---|---|
| Rank | |||
| AssocProf | 64 (22.3%) | 0 (0%) | 64 (16.1%) |
| AsstProf | 67 (23.3%) | 0 (0%) | 67 (16.9%) |
| Prof | 156 (54.4%) | 110 (100%) | 266 (67.0%) |
| Discipline | |||
| A | 140 (48.8%) | 41 (37.3%) | 181 (45.6%) |
| B | 147 (51.2%) | 69 (62.7%) | 216 (54.4%) |
| Yrs.since.phd | |||
| Mean (SD) | 19.7 (13.1) | 29.2 (9.41) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [1.00, 56.0] | 29.0 [11.0, 56.0] | 21.0 [1.00, 56.0] |
| Yrs.service | |||
| Mean (SD) | 15.4 (13.0) | 23.3 (11.2) | 17.6 (13.0) |
| Median [Min, Max] | 11.0 [0, 57.0] | 21.0 [2.00, 60.0] | 16.0 [0, 60.0] |
| NPubs | |||
| Mean (SD) | 17.3 (13.0) | 20.3 (16.2) | 18.2 (14.0) |
| Median [Min, Max] | 13.0 [1.00, 69.0] | 16.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
| Ncits | |||
| Mean (SD) | 38.7 (15.6) | 44.3 (19.4) | 40.2 (16.9) |
| Median [Min, Max] | 35.0 [1.00, 90.0] | 41.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
| Salary | |||
| Mean (SD) | 98400 (16900) | 154000 (18900) | 114000 (30300) |
| Median [Min, Max] | 100000 [57800, 130000] | 149000 [131000, 232000] | 107000 [57800, 232000] |
| Sex | |||
| Female | 34 (11.8%) | 5 (4.5%) | 39 (9.8%) |
| Male | 253 (88.2%) | 105 (95.5%) | 358 (90.2%) |
m.1 = glm(high.salary ~ Sex, family = "binomial", data = salary)
summary(m.1)
##
## Call:
## glm(formula = high.salary ~ Sex, family = "binomial", data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9169 0.4790 -4.002 6.28e-05 ***
## SexMale 1.0375 0.4928 2.105 0.0353 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.60 on 396 degrees of freedom
## Residual deviance: 463.11 on 395 degrees of freedom
## AIC: 467.11
##
## Number of Fisher Scoring iterations: 4
# install.packages("epiDisplay")
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
logistic.display(m.1)
##
## Logistic regression predicting high.salary
##
## OR(95%CI) P(Wald's test) P(LR-test)
## Sex (cont. var.) 2.82 (1.07,7.41) 0.035 0.019
##
## Log-likelihood = -231.5529
## No. of observations = 397
## AIC value = 467.1058
professors. The libraries needed for this task: table1, Publish. 1.1 Provide a brief description of the project
The cross-sectional investigation of 397 professors to determine whether the chance to get a high salary differed between male and female professors, accounting for potential confounding effects.
Null hypothesis: Professors’ sexes were not independently associated with the chance to get a high salary.
Alternative hypothesis: Professors’ sexes were independently associated with the chance to get a high salary ###
salary = read.csv("C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Professorial Salaries.csv")
salary$high.salary = ifelse(salary$Salary> 120000, 1, 0)
salary$Prof[salary$Rank == "Prof"] = "Professor"
salary$Prof[salary$Rank != "Prof"] = "AssProfessor"
library(table1)
table1(~ Rank + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex | as.factor(high.salary), data = salary)
| 0 (N=255) |
1 (N=142) |
Overall (N=397) |
|
|---|---|---|---|
| Rank | |||
| AssocProf | 63 (24.7%) | 1 (0.7%) | 64 (16.1%) |
| AsstProf | 67 (26.3%) | 0 (0%) | 67 (16.9%) |
| Prof | 125 (49.0%) | 141 (99.3%) | 266 (67.0%) |
| Prof | |||
| AssProfessor | 130 (51.0%) | 1 (0.7%) | 131 (33.0%) |
| Professor | 125 (49.0%) | 141 (99.3%) | 266 (67.0%) |
| Discipline | |||
| A | 127 (49.8%) | 54 (38.0%) | 181 (45.6%) |
| B | 128 (50.2%) | 88 (62.0%) | 216 (54.4%) |
| Yrs.since.phd | |||
| Mean (SD) | 19.1 (13.5) | 28.1 (9.29) | 22.3 (12.9) |
| Median [Min, Max] | 15.0 [1.00, 56.0] | 28.0 [11.0, 56.0] | 21.0 [1.00, 56.0] |
| Yrs.service | |||
| Mean (SD) | 14.9 (13.4) | 22.5 (10.7) | 17.6 (13.0) |
| Median [Min, Max] | 9.00 [0, 57.0] | 20.0 [2.00, 60.0] | 16.0 [0, 60.0] |
| NPubs | |||
| Mean (SD) | 17.6 (13.2) | 19.2 (15.2) | 18.2 (14.0) |
| Median [Min, Max] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
| Ncits | |||
| Mean (SD) | 38.6 (15.8) | 43.1 (18.6) | 40.2 (16.9) |
| Median [Min, Max] | 35.0 [1.00, 90.0] | 38.5 [1.00, 90.0] | 35.0 [1.00, 90.0] |
| Salary | |||
| Mean (SD) | 95000 (14600) | 147000 (20400) | 114000 (30300) |
| Median [Min, Max] | 96200 [57800, 120000] | 144000 [121000, 232000] | 107000 [57800, 232000] |
| Sex | |||
| Female | 30 (11.8%) | 9 (6.3%) | 39 (9.8%) |
| Male | 225 (88.2%) | 133 (93.7%) | 358 (90.2%) |
m.1 = glm(high.salary ~ Sex , family = "binomial", data = salary)
summary(m.1)
##
## Call:
## glm(formula = high.salary ~ Sex, family = "binomial", data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2040 0.3801 -3.168 0.00154 **
## SexMale 0.6782 0.3955 1.715 0.08636 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 517.75 on 396 degrees of freedom
## Residual deviance: 514.52 on 395 degrees of freedom
## AIC: 518.52
##
## Number of Fisher Scoring iterations: 4
## install.packages("Publish")
library(Publish)
## Loading required package: prodlim
publish(m.1)
## Variable Units OddsRatio CI.95 p-value
## Sex Female Ref
## Male 1.97 [0.91;4.28] 0.08636
## Multiple model
m.2 = glm(high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + NPubs, family = "binomial", data = salary)
summary(m.2)
##
## Call:
## glm(formula = high.salary ~ Sex + Prof + Discipline + Yrs.since.phd +
## NPubs, family = "binomial", data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.941036 1.140946 -5.207 1.92e-07 ***
## SexMale 0.215592 0.503442 0.428 0.668480
## ProfProfessor 5.031301 1.037844 4.848 1.25e-06 ***
## DisciplineB 0.997854 0.259625 3.843 0.000121 ***
## Yrs.since.phd 0.008645 0.012943 0.668 0.504147
## NPubs 0.004598 0.008735 0.526 0.598648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 517.75 on 396 degrees of freedom
## Residual deviance: 363.49 on 391 degrees of freedom
## AIC: 375.49
##
## Number of Fisher Scoring iterations: 7
publish(m.2)
## Variable Units OddsRatio CI.95 p-value
## Sex Female Ref
## Male 1.24 [0.46;3.33] 0.6684800
## Prof AssProfessor Ref
## Professor 153.13 [20.03;1170.79] < 1e-04
## Discipline A Ref
## B 2.71 [1.63;4.51] 0.0001213
## Yrs.since.phd 1.01 [0.98;1.03] 0.5041467
## NPubs 1.00 [0.99;1.02] 0.5986477
The libraries needed for this task: table1, Publish, BMA, rms. 1.1 Provide a brief description of the project - What is the study design? - What is the null hypothesis? - What is the alternative hypothesis? ###
salary = read.csv("C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Professorial Salaries.csv")
salary$high.salary = ifelse(salary$Salary> 120000, 1, 0)
salary$Prof[salary$Rank == "Prof"] = "Professor"
salary$Prof[salary$Rank != "Prof"] = "AssProfessor"
## install.packages("BMA")
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-5)
m.3 = bic.glm(high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, strict = F, OR = 20, glm.family = "binomial", data = salary)
summary(m.3)
##
## Call:
## bic.glm.formula(f = high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, data = salary, glm.family = "binomial", strict = F, OR = 20)
##
##
## 9 models were selected
## Best 5 models (cumulative posterior probability = 0.9093 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 -5.978e+00 1.129453 -6.336e+00 -5.559e+00 -5.691e+00
## SexMale 4.6 1.053e-02 0.117232 . . .
## ProfProfessor 100.0 5.202e+00 1.018200 5.222e+00 5.198e+00 5.033e+00
## DisciplineB 100.0 9.733e-01 0.254090 9.797e-01 9.618e-01 1.006e+00
## Yrs.since.phd 5.3 4.565e-04 0.003544 . . 9.694e-03
## Yrs.service 2.1 1.670e-05 0.001626 . . .
## NPubs 4.7 1.025e-05 0.002167 . . .
## Ncits 51.8 9.540e-03 0.010718 1.837e-02 . .
##
## nVar 3 2 3
## BIC -1.993e+03 -1.993e+03 -1.988e+03
## post prob 0.425 0.408 0.027
## model 4 model 5
## Intercept -6.421e+00 -6.542e+00
## SexMale . 2.341e-01
## ProfProfessor 5.090e+00 5.206e+00
## DisciplineB 1.013e+00 9.837e-01
## Yrs.since.phd 7.581e-03 .
## Yrs.service . .
## NPubs . .
## Ncits 1.802e-02 1.843e-02
##
## nVar 4 4
## BIC -1.988e+03 -1.988e+03
## post prob 0.025 0.024
##
## 1 observations deleted due to missingness.
imageplot.bma(m.3)
m.4 = glm(high.salary ~ Prof + Discipline, family = "binomial", data = salary)
summary(m.4)
##
## Call:
## glm(formula = high.salary ~ Prof + Discipline, family = "binomial",
## data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.5592 1.0247 -5.425 5.78e-08 ***
## ProfProfessor 5.1983 1.0153 5.120 3.06e-07 ***
## DisciplineB 0.9618 0.2516 3.823 0.000132 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 517.75 on 396 degrees of freedom
## Residual deviance: 364.47 on 394 degrees of freedom
## AIC: 370.47
##
## Number of Fisher Scoring iterations: 7
## install.packages("rms")
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
ddist = datadist(salary)
options(datadist = 'ddist')
m = lrm(high.salary ~ Prof + Discipline, data = salary)
summary(m)
## Effects Response : high.salary
##
## Factor Low High Diff. Effect S.E. Lower 0.95
## Prof - AssProfessor:Professor 2 1 NA -5.1983000 1.0154 -7.18850000
## Odds Ratio 2 1 NA 0.0055257 NA 0.00075519
## Discipline - A:B 2 1 NA -0.9617800 0.2516 -1.45490000
## Odds Ratio 2 1 NA 0.3822100 NA 0.23342000
## Upper 0.95
## -3.208100
## 0.040431
## -0.468660
## 0.625840
nom.m = nomogram(m, fun = function(x)1/(1+exp(-x)), fun.at = c(.001, .01, .10, seq(.2, .8, by = .2), .90, .99, .999), funlabel = "Predicted likelihood of a high salary")
plot(nom.m)
## Stepwise approach
m.sw = lm(Salary ~ Prof + Discipline + Yrs.since.phd + Yrs.service + Sex + NPubs + Ncits, data = salary)
step1 = step(m.sw)
## Start: AIC=7972.16
## Salary ~ Prof + Discipline + Yrs.since.phd + Yrs.service + Sex +
## NPubs + Ncits
##
## Df Sum of Sq RSS AIC
## - NPubs 1 2.6173e+07 2.0064e+11 7970.2
## - Sex 1 7.5268e+08 2.0136e+11 7971.6
## <none> 2.0061e+11 7972.2
## - Ncits 1 2.2974e+09 2.0291e+11 7974.7
## - Yrs.service 1 2.7077e+09 2.0332e+11 7975.5
## - Yrs.since.phd 1 3.7642e+09 2.0437e+11 7977.5
## - Discipline 1 1.9980e+10 2.2059e+11 8007.8
## - Prof 1 6.4497e+10 2.6511e+11 8080.8
##
## Step: AIC=7970.21
## Salary ~ Prof + Discipline + Yrs.since.phd + Yrs.service + Sex +
## Ncits
##
## Df Sum of Sq RSS AIC
## - Sex 1 7.7069e+08 2.0141e+11 7969.7
## <none> 2.0064e+11 7970.2
## - Ncits 1 2.4053e+09 2.0304e+11 7972.9
## - Yrs.service 1 2.7116e+09 2.0335e+11 7973.5
## - Yrs.since.phd 1 3.7631e+09 2.0440e+11 7975.6
## - Discipline 1 1.9956e+10 2.2059e+11 8005.9
## - Prof 1 6.4471e+10 2.6511e+11 8078.8
##
## Step: AIC=7969.73
## Salary ~ Prof + Discipline + Yrs.since.phd + Yrs.service + Ncits
##
## Df Sum of Sq RSS AIC
## <none> 2.0141e+11 7969.7
## - Ncits 1 2.3585e+09 2.0377e+11 7972.4
## - Yrs.service 1 2.5792e+09 2.0399e+11 7972.8
## - Yrs.since.phd 1 3.7518e+09 2.0516e+11 7975.1
## - Discipline 1 2.0207e+10 2.2161e+11 8005.7
## - Prof 1 6.5729e+10 2.6714e+11 8079.9
summary(step1)
##
## Call:
## lm(formula = Salary ~ Prof + Discipline + Yrs.since.phd + Yrs.service +
## Ncits, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69513 -13212 -2228 10409 100191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69214.26 3918.27 17.665 < 2e-16 ***
## ProfProfessor 36815.51 3259.13 11.296 < 2e-16 ***
## DisciplineB 14759.23 2356.48 6.263 9.95e-10 ***
## Yrs.since.phd 644.97 238.99 2.699 0.00726 **
## Yrs.service -477.09 213.21 -2.238 0.02581 *
## Ncits 144.34 67.46 2.140 0.03299 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22700 on 391 degrees of freedom
## Multiple R-squared: 0.4456, Adjusted R-squared: 0.4385
## F-statistic: 62.86 on 5 and 391 DF, p-value: < 2.2e-16