Code
# Installing and loading required packages
# install.packages("car")
library(car)
library(tidyverse)
From Basic Layers to Multi-Dimensional Visuals
ggplot2 and customize graphs to publication-quality levels.
car package’s Salaries dataset (information regarding 9-month salaries of professors in the US).
geom_jitter(), custom color scales, modifying axis tick marks, and positioning legends inside the plot.
At the beginning, we will load the necessary packages and take a look at the data.
# Installing and loading required packages
# install.packages("car")
library(car)
library(tidyverse)# Viewing the first few rows of the dataset
# View(Salaries)
# Print the first eight rows of the Salaries data set.
head(Salaries, n = 8)
 rank discipline yrs.since.phd yrs.service sex salary
1 Prof B 19 18 Male 139750
2 Prof B 20 16 Male 173200
3 AsstProf B 4 3 Male 79750
4 Prof B 45 39 Male 115000
5 Prof B 40 41 Male 141500
6 AssocProf B 6 6 Male 97000
7 Prof B 30 23 Male 175000
8 Prof B 45 45 Male 147765
Let’s look at the relationship between the number of years since PhD (yrs.since.phd) and salary (salary).
# Step 1: Basic Scatter Plot
# Salary against years since PhD, with each point coloured by academic rank.
ggplot(
  data = Salaries,
  mapping = aes(x = yrs.since.phd, y = salary, color = rank)
) +
  # Large, slightly transparent markers
  geom_point(size = 4, alpha = 0.8)
Observation: Here we can see how salary changes according to professional rank. However, many data points are overlapping each other (Overlapping).
To reduce the overlap between data points, we will use geom_jitter() and add a regression line.
# Step 2: Advanced Customization
# Jittered scatter of salary vs. years since PhD with a linear trend line,
# manual rank colours, and descriptive title, axis and legend labels.
Salaries %>%
  ggplot(aes(yrs.since.phd, salary)) +
  # geom_jitter spreads the data slightly so that overlapping is reduced
  geom_jitter(aes(color = rank,
                  shape = discipline)) +
  # Adding a regression line
  geom_smooth(method = lm,
              se = FALSE,
              color = "red") +
  # Setting colors using preferred hex codes
  scale_color_manual(values = c(
    "AsstProf" = "#A88EF2",
    "AssocProf" = "#FFB222",
    "Prof" = "#D5A6E2"
  )) +
  theme_test() +
  labs(
    title = "Salary vs. Years since PhD",
    x = "Years since PhD",
    y = "Income",
    # Label names must match the aesthetic names exactly (lower case);
    # "Color"/"Shape" do not match the color/shape aesthetics, so the legend
    # titles would stay at their defaults ("rank" / "discipline").
    color = "Position",
    shape = "Research Area"
  )
# Step 3: Advanced Customization
# Jittered scatter (colour = rank, shape = discipline) plus a linear fit.
ggplot(data = Salaries, mapping = aes(x = yrs.since.phd, y = salary)) +
  # Jitter nudges points apart so overlapping observations stay visible
  geom_jitter(mapping = aes(color = rank, shape = discipline), size = 4) +
  # Straight-line (lm) trend without the confidence band
  geom_smooth(method = "lm", se = FALSE, color = "red")
# Step 4: Advanced Customization
# Same jittered scatter and linear fit, now with a manual colour per rank.
ggplot(data = Salaries, mapping = aes(x = yrs.since.phd, y = salary)) +
  # Jitter nudges points apart so overlapping observations stay visible
  geom_jitter(mapping = aes(color = rank, shape = discipline), size = 4) +
  # Straight-line (lm) trend without the confidence band
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  # One hex colour per rank level
  scale_color_manual(
    values = c(
      "AsstProf" = "#A88EF2",
      "AssocProf" = "#FFB222",
      "Prof" = "#D5A6E2"
    )
  )
# Step 5: Advanced Customization
# Final scatter plot: jittered points, linear trend line, manual rank colours,
# theme_test(), and descriptive title, axis and legend labels.
Salaries %>%
  ggplot(aes(yrs.since.phd, salary)) +
  # geom_jitter spreads the data slightly so that overlapping is reduced
  geom_jitter(aes(color = rank,
                  shape = discipline), size = 4) +
  # Adding a regression line
  geom_smooth(method = lm,
              se = FALSE,
              color = "red") +
  # Setting colors using preferred hex codes
  scale_color_manual(values = c(
    "AsstProf" = "#A88EF2",
    "AssocProf" = "#FFB222",
    "Prof" = "#D5A6E2"
  )) +
  theme_test() +
  labs(
    title = "Salary vs. Years since PhD",
    x = "Years since PhD",
    y = "Income",
    # Label names must match the aesthetic names exactly (lower case);
    # "Color"/"Shape" do not match the color/shape aesthetics, so the legend
    # titles would stay at their defaults ("rank" / "discipline").
    color = "Position",
    shape = "Research Area"
  )
Now we will see the salary distribution according to rank and make the axis labels more readable.
# Step 1: Intermediate Customization
# Box plots of salary per academic rank, split by sex, for salaries
# below 150000.
Salaries %>%
  # Keep only rows with salary under 150000 (drops the high-salary outliers)
  filter(salary < 150000) %>%
  # fill = sex draws one box per sex within each rank
  ggplot(mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  # Manual fill colour for each sex
  scale_fill_manual(
    values = c("Female" = "#A88EF2", "Male" = "#D5A6E2")
  ) +
  labs(
    title = "Faculty Salary by Rank and Gender",
    x = "Academic Rank",
    y = "Annual Salary",
    fill = "Gender"
  ) +
  theme_test()
# Step 2: Intermediate Customization
# Box plots of salary per rank and sex, now with readable x-axis labels.
Salaries %>%
  # Keep only rows with salary under 150000 (drops the high-salary outliers)
  filter(salary < 150000) %>%
  # fill = sex draws one box per sex within each rank
  ggplot(mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  # Manual fill colour for each sex
  scale_fill_manual(
    values = c("Female" = "#A88EF2", "Male" = "#D5A6E2")
  ) +
  # Spell out the rank codes; "\n" wraps long labels onto two lines
  scale_x_discrete(
    breaks = c("AsstProf", "AssocProf", "Prof"),
    labels = c("Assistant\nProfessor", "Associate\nProfessor", "Professor")
  ) +
  labs(
    title = "Faculty Salary by Rank and Gender",
    x = "Academic Rank",
    y = "Annual Salary",
    fill = "Gender"
  ) +
  theme_test()
# Step 3: Intermediate Customization
# Box plots of salary per rank and sex with readable x labels and
# currency-style y-axis tick labels.
Salaries %>%
  # Keep only rows with salary under 150000 (drops the high-salary outliers)
  filter(salary < 150000) %>%
  # fill = sex draws one box per sex within each rank
  ggplot(mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  # Manual fill colour for each sex
  scale_fill_manual(
    values = c("Female" = "#A88EF2", "Male" = "#D5A6E2")
  ) +
  # Spell out the rank codes; "\n" wraps long labels onto two lines
  scale_x_discrete(
    breaks = c("AsstProf", "AssocProf", "Prof"),
    labels = c("Assistant\nProfessor", "Associate\nProfessor", "Professor")
  ) +
  # Fixed tick positions with hand-written "$..k" labels
  scale_y_continuous(
    breaks = c(50000, 100000, 150000, 200000),
    labels = c("$50k", "$100k", "$150k", "$200k")
  ) +
  labs(
    title = "Faculty Salary by Rank and Gender",
    x = "Academic Rank",
    y = "Annual Salary",
    fill = "Gender"
  ) +
  theme_test()
# Step 4: Intermediate Customization
# Final box plot: salary per rank and sex, readable x labels, currency-style
# y labels, and the legend placed inside the plotting area.
Salaries %>%
  # Keep only rows with salary under 150000 (drops the high-salary outliers)
  filter(salary < 150000) %>%
  # fill = sex draws one box per sex within each rank
  ggplot(mapping = aes(x = rank, y = salary, fill = sex)) +
  geom_boxplot() +
  # Manual fill colour for each sex
  scale_fill_manual(
    values = c("Female" = "#A88EF2", "Male" = "#D5A6E2")
  ) +
  # Spell out the rank codes; "\n" wraps long labels onto two lines
  scale_x_discrete(
    breaks = c("AsstProf", "AssocProf", "Prof"),
    labels = c("Assistant\nProfessor", "Associate\nProfessor", "Professor")
  ) +
  # Fixed tick positions with hand-written "$..k" labels
  scale_y_continuous(
    breaks = c(50000, 100000, 150000, 200000),
    labels = c("$50k", "$100k", "$150k", "$200k")
  ) +
  labs(
    title = "Faculty Salary by Rank and Gender",
    x = "Academic Rank",
    y = "Annual Salary",
    fill = "Gender"
  ) +
  theme_test() +
  # Legend goes at relative coordinates (x = 0.15, y = 0.80) inside the plot;
  # this must come after theme_test() so the complete theme does not reset it.
  # NOTE(review): ggplot2 >= 3.5.0 deprecates a numeric legend.position; there,
  # use legend.position = "inside" with legend.position.inside = c(0.15, 0.80).
  theme(legend.position = c(0.15, 0.80))
geom_jitter(): When you have a lot of data points, it is mandatory to use this. It gives a visual clue as to where the data density is highest.
scale_x_discrete(): Using \n to shorten long labels (Assistant Professor) is a great practice.
legend.position = c(x, y) stays between 0 and 1. Using this saves space around the graph.
Great work! You have successfully transformed a complex salary dataset into a professional graph.