# Load the professorial salaries data set and preview its first rows.
salary <- read.csv(
  file = "D:\\OneDrive\\ANDREAS\\ACADEMICS_UNIVERSITY\\Year_4_2024\\Autumn\\32931\\Classes\\R_Basic\\Professorial_Salaries.csv"
)
head(salary)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
Import libraries
library(ggplot2)
library(gridExtra)
library(grid)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggthemes)
# Histogram of Salary across all rows of `salary`, using the default binning.
p <- ggplot(salary, aes(x = Salary))
p1 <- p +
  geom_histogram(fill = "blue", color = "white")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Were professors’ salaries normally distributed?
# Stack two views of the Salary distribution to judge its shape:
#   p1 - histogram of raw counts
#   p2 - histogram rescaled to density, overlaid with a density curve
p <- ggplot(data = salary, aes(x = Salary))
p1 <- p + geom_histogram(color = "white", fill = "blue")
p2 <- p +
  geom_histogram(aes(y = after_stat(density)), color = "white", fill = "blue") +
  geom_density(col = "red")

# Neither panel maps Sex to any aesthetic, so the title must not claim the
# figure is broken down "by sex".
grid.arrange(
  p1, p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' salaries",
    gp = gpar(fontsize = 20, font = 1)
  )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
- The distribution of professors’ salaries is skewed.
# Salary distribution split by Sex: side-by-side histogram bars on top,
# overlapping semi-transparent density curves below.
p <- ggplot(salary, aes(x = Salary, fill = Sex))
p1 <- p + geom_histogram(position = "dodge")
p2 <- ggplot(salary, aes(x = Salary, fill = Sex, color = Sex)) +
  geom_density(alpha = 0.1)
grid.arrange(
  p1, p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' salaries by sex",
    gp = gpar(fontsize = 20, font = 1)
  )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
- Based on the density curves, the difference between male and female
professors’ salaries is minimal.
# Bar chart of Rank using the column as read from the file.
p <- ggplot(salary, aes(x = Rank, fill = Rank, col = Rank))
p1 <- p + geom_bar(position = "dodge")

# Store Rank as a factor with an explicit level order
# (AsstProf, AssocProf, Prof); later plots use this Prof.Rank column.
salary$Prof.Rank <- factor(
  salary$Rank,
  levels = c("AsstProf", "AssocProf", "Prof")
)

# Same bar chart, now following the explicit level order.
p <- ggplot(salary, aes(x = Prof.Rank, fill = Prof.Rank, col = Prof.Rank))
p2 <- p + geom_bar(position = "dodge")

grid.arrange(
  p1, p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' rank",
    gp = gpar(fontsize = 20, font = 1)
  )
)
- Based on the graph, the count of Professors is approximately 260,
making it the most common rank, while Assistant Professors and
Associate Professors have similar counts of roughly 160.
# Count of professors per rank, with one dodged bar per Sex.
p <- ggplot(salary, aes(x = Prof.Rank, fill = Sex, col = Sex))
p1 <- p + geom_bar(position = "dodge")
p1 +
  ggtitle("Distribution of professors' rank by sex")
- The distribution of ranks among male and female professors has a
similar ratio, where Prof has the highest count compared to AsstProf and
AssocProf.
# Boxplot of Salary for each Sex, with the individual observations drawn
# on top as faint jittered points.
p <- ggplot(salary, aes(x = Sex, y = Salary, fill = Sex, col = Sex))
p1 <- p +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.05)
p1 +
  labs(x = "Sex", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by sex") +
  theme_bw()
- Graph 2b shows that the salary distributions of male and female
professors follow similar trends: both are skewed right.
However, graph 4 shows that the range of male professors’ salaries is
slightly higher than that of female professors’ salaries.

### Task 4b: Create a graph to describe the differences in salaries by professors’ rank and sex. What do you think about the graph?
# Boxplot of Salary for each rank, with the individual observations drawn
# on top as faint jittered points.
p <- ggplot(
  salary,
  aes(x = Prof.Rank, y = Salary, fill = Prof.Rank, col = Prof.Rank)
)
p1 <- p +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.05)
p1 +
  labs(x = "Rank", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by rank") +
  theme_bw()
# Boxplot of Salary for each rank, split by Sex within the rank.
p <- ggplot(data = salary, aes(x = Prof.Rank, y = Salary, fill = Sex, col = Sex))

# The boxplots are dodged by Sex inside each rank, so the raw points are
# jittered AND dodged too. Plain geom_jitter() does not dodge: it would
# scatter both sexes across the whole rank slot instead of over their own box.
p1 <- p +
  geom_boxplot(col = "black") +
  geom_point(alpha = 0.05, position = position_jitterdodge())

p1 +
  labs(x = "Professors' Ranks", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by rank and sex") +
  theme_bw()
- Overall, while male and female professors’ salaries follow a similar
trend across ranks, female professors’ salaries have a smaller range
and are lower overall.

## Task 5: Correlation between numeric data
# Scatter plot of Salary against years of service, coloured by Sex, with a
# cubic polynomial linear-model fit drawn for each Sex.
p <- ggplot(salary, aes(x = Yrs.service, y = Salary, fill = Sex, col = Sex))
p2 <- p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3)) +
  labs(x = "Time in service (years)", y = "Professors' salaries (USD)") +
  ggtitle("Correlation between professors' salaries and time in service by sex") +
  theme_bw()
p2
- Female professors’ salaries have an overall increasing trend, while male
professors’ salaries have a more fluctuating trend, increasing and then
decreasing. It should also be noted that there are no data for female
professors between 35 and 60 years in service.