Task 1. Import the CSV data set “Professorial Salaries” into R.

# Read the professorial salaries CSV file into the `salary` data frame.
salary <- read.csv(
  file = "C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Basic\\Professorial Salaries.csv"
)

Task 2. Distribution of numeric data

2a. Create a graph to describe the distribution of professors’ salaries. Are professors’ salaries normally distributed?

# install.packages("ggplot2")
# "geom" is short for "geometric object": the shape (bars, points, lines, ...) a ggplot2 layer uses to draw the data.
library(ggplot2)
library(grid)
library(gridExtra)

# First graph: histogram of salary counts
p <- ggplot(data = salary, aes(x = Salary))
p1 <- p + geom_histogram(color = "white", fill = "blue")

# Second graph: the same histogram with density on the y-axis, so the kernel
# density curve drawn on top of it uses the same scale.
# after_stat(density) replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0.
p2 <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    color = "white",
    fill = "blue"
  ) +
  geom_density(col = "red")

# Combine the graphs together. Neither graph is split by sex, so the title
# describes the overall salary distribution only.
grid.arrange(
  p1, p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' salaries",
    gp = gpar(fontsize = 20, font = 1)
  )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# geom_density() computes and draws a kernel density estimate, which is a smoothed version of the histogram.

2b. Create a graph to describe the differences in salaries between male and female professors. Were the professors’ salaries different between male and female professors?

# Salary on the x-axis, with the fill colour taken from Sex
p <- ggplot(data = salary, aes(x = Salary, fill = Sex))

# First graph: histograms for each sex drawn side by side
p1 <- p + geom_histogram(position = "dodge")

# Second graph: one lightly shaded density curve per sex
p2 <- ggplot(data = salary, aes(x = Salary, fill = Sex, color = Sex)) +
  geom_density(alpha = 0.1)

# Stack the two graphs under a shared title
grid.arrange(
  p1,
  p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' salaries by sex",
    gp = gpar(fontsize = 20, font = 1)
  )
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Task 3. Distribution of categorical data

3a. Create a graph to describe the distribution of professors’ rank. Write a sentence to interpret the graph.

# First graph: bar chart of the Rank column exactly as it was imported
p <- ggplot(data = salary, aes(x = Rank, fill = Rank, col = Rank))
p1 <- p + geom_bar(position = "dodge")

# Second graph: store the rank as a factor with explicit level order
# (AsstProf, AssocProf, Prof) so that AsstProf is the leftmost bar
salary$Prof.Rank <- factor(
  salary$Rank,
  levels = c("AsstProf", "AssocProf", "Prof")
)
p <- ggplot(
  data = salary,
  aes(x = Prof.Rank, fill = Prof.Rank, col = Prof.Rank)
)
p2 <- p + geom_bar(position = "dodge")

# Show both versions one above the other under a shared title
grid.arrange(
  p1,
  p2,
  nrow = 2,
  top = textGrob(
    "Distribution of professors' rank",
    gp = gpar(fontsize = 20, font = 1)
  )
)

3b. Create a graph to describe whether professors’ ranks differed between male and female professors. Write a sentence to interpret the graph.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Grouped bar chart: number of professors at each rank, one bar per sex
p <- ggplot(data = salary, aes(x = Prof.Rank, fill = Sex, col = Sex))
p1 <- p + geom_bar(position = "dodge")
p1 + ggtitle("Distribution of professors' rank by sex")

# Stacked bar chart built from pre-computed counts. `pct` is each
# rank-by-sex count divided by the total of all counts, and is printed as a
# percentage label in the middle of its bar segment.
p <- ggplot(
  salary %>%
    count(Prof.Rank, Sex) %>%
    mutate(pct = n / sum(n)),
  aes(factor(Prof.Rank), n, fill = Sex)
)
p <- p + geom_bar(stat = "identity")
p <- p + geom_text(
  aes(label = paste0(sprintf("%1.1f", pct * 100), "%")),
  position = position_stack(vjust = 0.5)
)
p +
  labs(x = "Professors' ranks", y = "Number of cases") +
  ggtitle("Professors' rank by sex")

Task 4. Comparison of numeric data

4a. Create a graph to describe the differences in salaries between male and female professors. What are the differences between the graphs 2b and 4a?

# One box per sex, with salary on the y-axis
p <- ggplot(
  data = salary,
  aes(x = Sex, y = Salary, fill = Sex, col = Sex)
)

# geom_jitter() adds a small random offset to each point so that the
# individual observations do not all sit on top of each other.
p1 <- p +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.05)

p1 +
  labs(x = "Sex", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by sex") +
  theme_bw()

4b. Create a graph to describe the differences in salaries by professors’ rank and sex. What do you think about the graph?

# One box per professorial rank, with the individual salaries overlaid as
# faint jittered points
p <- ggplot(
  data = salary,
  aes(x = Prof.Rank, y = Salary, fill = Prof.Rank, col = Prof.Rank)
)
p1 <- p +
  geom_boxplot(col = "black") +
  geom_jitter(alpha = 0.05)

p1 +
  labs(x = "Rank", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by rank") +
  theme_bw()

# Salaries by rank, with a separate box for each sex within every rank
p <- ggplot(
  data = salary,
  aes(x = Prof.Rank, y = Salary, fill = Sex, col = Sex)
)

# The boxes are dodged by Sex, so the points must be dodged as well.
# position_jitterdodge() jitters each point inside its own sex's box; a plain
# geom_jitter() would spread both sexes around the centre of the rank.
p1 <- p +
  geom_boxplot(col = "black") +
  geom_point(position = position_jitterdodge(), alpha = 0.05)

p1 +
  labs(x = "Professors' Ranks", y = "Salaries (USD)") +
  ggtitle("Professors' salaries by rank and sex") +
  theme_bw()

Task 5. Correlation between numeric data

5a. Create a graph to describe the correlation between professors’ salaries and their time in service. How were professors’ salaries correlated with their time in service?

# Scatter plot of salary against years of service, with ggplot2's default
# smoother drawn over the points
p <- ggplot(data = salary, aes(x = Yrs.service, y = Salary))
p1 <- p +
  geom_point() +
  geom_smooth() +
  labs(
    x = "Time in service (years)",
    y = "Professors' salaries (USD)"
  ) +
  ggtitle("Correlation between professors' salaries and time in service") +
  theme_bw()
p1
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

5b. Create a graph to describe whether the correlation between professors’ salaries and their time in service differed between male and female professors. Write a sentence to interpret the graph.

# Same scatter plot, with points and fitted curves coloured by sex
p <- ggplot(
  data = salary,
  aes(x = Yrs.service, y = Salary, fill = Sex, col = Sex)
)

# Fit a cubic polynomial in years of service with lm, one curve per sex
p2 <- p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3)) +
  labs(
    x = "Time in service (years)",
    y = "Professors' salaries (USD)"
  ) +
  ggtitle(
    "Correlation between professors' salaries and time in service by sex"
  ) +
  theme_bw()
p2

Task 6. Save your work and upload it to your own RPubs account.

Lecture example

# Points only, built on the plot object `p` that already exists at this point
p1 <- p + geom_point()
p1

# Add a smoother with ggplot2's default settings on top of the points
p2 <- p1 + geom_smooth()
p2
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The same plot written as a single expression
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Swap the default smoother for a cubic polynomial fitted with lm
p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3))

# Rebuild the base plot with colour and fill mapped to Sex, then redraw
p <- ggplot(
  data = salary,
  aes(x = Yrs.service, y = Salary, fill = Sex, col = Sex)
)
p +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x + I(x^2) + I(x^3))

# Axis labels and a title on the points-only plot
p1 +
  labs(
    x = "Time in service (years)",
    y = "Professors' salary (USD)"
  ) +
  ggtitle("Correlation between professors' salaries and time in service by sex")

# Restyle the smoothed plot with theme_economist() from ggthemes
library(ggthemes)
p2 + theme_economist()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'