# Load the professorial salaries data set into the `salary` data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script
# only runs where the file exists at exactly this location.
salary <- read.csv(
  file = "C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Professorial Salaries.csv"
)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive summary table of every study variable for the whole sample.
# The one-sided formula lists the variables to summarise, one section each.
table1(
  ~ Sex + Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits +
    Salary,
  data = salary
)
| | Overall (N=397) |
|---|---|
| Sex | |
| Female | 39 (9.8%) |
| Male | 358 (90.2%) |
| Rank | |
| AssocProf | 64 (16.1%) |
| AsstProf | 67 (16.9%) |
| Prof | 266 (67.0%) |
| Discipline | |
| A | 181 (45.6%) |
| B | 216 (54.4%) |
| Yrs.since.phd | |
| Mean (SD) | 22.3 (12.9) |
| Median [Min, Max] | 21.0 [1.00, 56.0] |
| Yrs.service | |
| Mean (SD) | 17.6 (13.0) |
| Median [Min, Max] | 16.0 [0, 60.0] |
| NPubs | |
| Mean (SD) | 18.2 (14.0) |
| Median [Min, Max] | 13.0 [1.00, 69.0] |
| Ncits | |
| Mean (SD) | 40.2 (16.9) |
| Median [Min, Max] | 35.0 [1.00, 90.0] |
| Salary | |
| Mean (SD) | 114000 (30300) |
| Median [Min, Max] | 107000 [57800, 232000] |
library(ggplot2)
library(grid)
library(gridExtra)
# Density-scaled histograms of the two time variables, drawn side by side
# under a shared title.
#
# `after_stat(density)` replaces the `..density..` dot-dot notation, which
# was deprecated in ggplot2 3.4.0. `bins = 30` spells out the value that
# `stat_bin()` was already using by default, so the plots are unchanged but
# the "Pick better value with `binwidth`" message is no longer emitted.
p1 <- ggplot(data = salary, aes(x = Yrs.since.phd)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, color = "white", fill = "blue"
  ) +
  ggtitle("Time since PhD (years)") +
  theme_bw()

p2 <- ggplot(data = salary, aes(x = Yrs.service)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, color = "white", fill = "blue"
  ) +
  ggtitle("Time in service (years)") +
  theme_bw()

# Place both panels in a single row with a common heading above them.
grid.arrange(
  p1, p2,
  nrow = 1,
  top = textGrob(
    "Distribution of numeric variables",
    gp = gpar(fontsize = 20, font = 1)
  )
)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of time in service against time since PhD, with a smoothed
# trend line and its confidence band.
#
# The smoother's method and formula are stated explicitly. They are the same
# values `geom_smooth()` picked automatically for this data (loess, y ~ x),
# so the figure is unchanged, but the choice is now visible in the code and
# the informational message about the auto-selected method is not printed.
ggplot(data = salary, aes(x = Yrs.since.phd, y = Yrs.service)) +
  geom_point() +
  geom_smooth(method = "loess", formula = y ~ x) +
  labs(x = "Time since PhD (years)", y = "Time in service (years)") +
  ggtitle("Correlation between time since PhD and time in service") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
2.1 Study design: A cross-sectional investigation of 397 professors was conducted to determine whether the number of citations is associated with professors’ salaries.
- Null hypothesis: The number of citations is not associated with professors’ salaries.
- Alternative hypothesis: The number of citations is associated with professors’ salaries.
# Reload the professorial salaries data for this section of the analysis.
# NOTE(review): this is an absolute, machine-specific path, so the script
# only runs where the file exists at exactly this location.
salary <- read.csv(
  file = "C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Int\\Professorial Salaries.csv"
)
library(table1)

# Descriptive summary table of every study variable for the whole sample.
table1(
  ~ Sex + Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits +
    Salary,
  data = salary
)
| | Overall (N=397) |
|---|---|
| Sex | |
| Female | 39 (9.8%) |
| Male | 358 (90.2%) |
| Rank | |
| AssocProf | 64 (16.1%) |
| AsstProf | 67 (16.9%) |
| Prof | 266 (67.0%) |
| Discipline | |
| A | 181 (45.6%) |
| B | 216 (54.4%) |
| Yrs.since.phd | |
| Mean (SD) | 22.3 (12.9) |
| Median [Min, Max] | 21.0 [1.00, 56.0] |
| Yrs.service | |
| Mean (SD) | 17.6 (13.0) |
| Median [Min, Max] | 16.0 [0, 60.0] |
| NPubs | |
| Mean (SD) | 18.2 (14.0) |
| Median [Min, Max] | 13.0 [1.00, 69.0] |
| Ncits | |
| Mean (SD) | 40.2 (16.9) |
| Median [Min, Max] | 35.0 [1.00, 90.0] |
| Salary | |
| Mean (SD) | 114000 (30300) |
| Median [Min, Max] | 107000 [57800, 232000] |
library(ggplot2)
# Density-scaled histogram of the outcome variable (salary).
#
# `after_stat(density)` replaces the `..density..` dot-dot notation, which
# was deprecated in ggplot2 3.4.0. `bins = 30` spells out the value that
# `stat_bin()` was already using by default, so the plot is unchanged but
# the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(data = salary, aes(x = Salary)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, color = "white", fill = "blue"
  ) +
  ggtitle("Professors' salaries (USD)") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Simple linear regression with salary as the response and number of
# citations as the single predictor, followed by the model summary.
m1 <- lm(formula = Salary ~ Ncits, data = salary)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Ncits, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61660 -23012 -5654 20638 120083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 105664.57 3899.87 27.094 <2e-16 ***
## Ncits 199.93 89.36 2.237 0.0258 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30140 on 395 degrees of freedom
## Multiple R-squared: 0.01251, Adjusted R-squared: 0.01001
## F-statistic: 5.005 on 1 and 395 DF, p-value: 0.02583
# Draw the diagnostic plots produced by `plot()` for the fitted model `m1`
# on a 2 x 2 grid so they share one figure.
#
# `par()` returns the previous settings when it changes them; keeping and
# restoring them stops the 2 x 2 layout from leaking into later base plots.
old_par <- par(mfrow = c(2, 2))
plot(m1)
par(old_par)