Loading Necessary Packages:
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.3.1
library(forcats)
Loading Data:
data_scientist = read_csv('data_scientist.csv')
## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Area name, Employment(1), Employment percent relative standard err...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
database_admin = read_csv('database_administrator.csv')
## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Area name
## dbl (17): Employment(1), Employment percent relative standard error(3), Hour...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
database_architect = read_csv('database_architect.csv')
## Rows: 48 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Area name, Employment(1), Employment percent relative standard err...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
operation_analyst = read_csv('operation_research_analyst.csv')
## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Area name
## dbl (17): Employment(1), Employment percent relative standard error(3), Hour...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
statistician = read_csv('statistician.csv')
## Rows: 47 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Area name, Hourly 90th percentile wage, Annual 90th percentile wag...
## dbl (15): Employment(1), Employment percent relative standard error(3), Hour...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Adding Roles:
data_scientist$role = 'Data Scientist'
database_admin$role = 'Database Administrator'
database_architect$role = 'Database Architect'
operation_analyst$role = 'Operation Research Analyst'
statistician$role = 'Statistican'
Combining Data:
df = rbind(data_scientist, database_admin, database_architect, operation_analyst, statistician)
Selecting Relevant Columns:
df = df %>%
select(-c(`Employment percent relative standard error(3)`, `Wage percent relative standard error(3)`, `Employment per 1,000 jobs`, `Location Quotient`))
Cleaning the Dataframe
df = df %>%
mutate(Area = str_replace_all(`Area name`, "\\([0-9]*\\)", "")) %>%
select(-`Area name`)
colnames(df) = c('Employment', 'Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile', 'Role', 'Area')
df[df == '(8) -'] = NA
df = df %>%
mutate_at(c('Employment','Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile'), as.numeric)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Hourly Wage 90th Percentile = .Primitive("as.double")(`Hourly
## Wage 90th Percentile`)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Employment by Role:
emp_by_role = df %>%
select(c(Role, Employment)) %>%
group_by(Role) %>%
summarise(Jobs = sum(Employment, na.rm = TRUE))
emp_by_role$Role <- factor(emp_by_role$Role, levels = emp_by_role$Role[order(emp_by_role$Jobs)])
ggplot(emp_by_role, aes(x = Jobs, y = Role)) +
geom_bar(stat = "identity", fill = "skyblue") + # Create horizontal bars
geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
theme_minimal() + # Set minimal theme
labs(x = "Jobs", y = "", title = "Employment by Role")

Employment by Area (Top 5):
emp_by_area = df %>%
select(c(Area, Employment)) %>%
group_by(Area) %>%
summarise(Jobs = sum(Employment, na.rm = TRUE))
# Arrange the dataframe by the number of jobs
emp_by_area <- emp_by_area %>%
arrange(desc(Jobs))
emp_by_area$Area <- factor(emp_by_area$Area, levels = emp_by_area$Area[order(emp_by_area$Jobs)])
# Select the top 5 areas by jobs
emp_by_area_top <- head(emp_by_area, 5)
# Plotting the horizontal bar chart for top 5 areas by jobs
ggplot(emp_by_area_top, aes(x = Jobs, y = Area)) +
geom_bar(stat = "identity", fill = "skyblue") + # Create horizontal bars
geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
theme_minimal() + # Set minimal theme
labs(x = "Jobs", y = "", title = "Top 5 Areas by Employment")

Employment by Area (Bottom 5):
emp_by_area = df %>%
select(c(Area, Employment)) %>%
group_by(Area) %>%
summarise(Jobs = sum(Employment, na.rm = TRUE))
# Arrange the dataframe by the number of jobs
emp_by_area <- emp_by_area %>%
arrange(desc(Jobs))
emp_by_area$Area <- factor(emp_by_area$Area, levels = emp_by_area$Area[order(emp_by_area$Jobs)])
# Select the top 5 areas by jobs
emp_by_area_top <- tail(emp_by_area, 5)
# Plotting the horizontal bar chart for top 5 areas by jobs
ggplot(emp_by_area_top, aes(x = Jobs, y = Area)) +
geom_bar(stat = "identity", fill = "skyblue") + # Create horizontal bars
geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
theme_minimal() + # Set minimal theme
labs(x = "Jobs", y = "", title = "Bottom 5 Areas by Employment")

Hourly Wage IQR Lollipop Chart by Role
by_role$Role <- factor(by_role$Role, levels = by_role$Role[order(by_role$`Hourly Wage 75th Percentile`)])
ggplot(by_role) +
geom_segment(aes(x = Role, xend = Role, y = `Hourly Wage 25th Percentile`, yend = `Hourly Wage 75th Percentile`), color = "black") +
geom_point(aes(x = Role, y = `Hourly Wage 25th Percentile`), color = 'black', size = 3) +
geom_point(aes(x = Role, y = `Hourly Wage 75th Percentile`), color = 'black', size = 3) +
geom_point(aes(x = Role, y = `Median Hourly Wage`, color = 'red'), size = 3) + # Add median points with color aesthetic
coord_flip() +
theme_ipsum() +
theme(
legend.position = "right", # Position the legend on the right
legend.title = element_blank() # Remove the legend title
) +
xlab("") +
ylab("") +
ggtitle("IQR of Hourly Wages (USD) by Data-Role") +
scale_color_manual(values = c("red" = "red"), labels = c("Median")) + # Customize legend labels
scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Annual Wage IQR Lollipop Chart by Role
by_role$Role <- factor(by_role$Role, levels = by_role$Role[order(by_role$`Annual Wage 75th Percentile`)])
ggplot(by_role) +
geom_segment(aes(x = Role, xend = Role, y = `Annual Wage 25th Percentile`, yend = `Annual Wage 75th Percentile`), color = "black") +
geom_point(aes(x = Role, y = `Annual Wage 25th Percentile`), color = 'black', size = 3) +
geom_point(aes(x = Role, y = `Annual Wage 75th Percentile`), color = 'black', size = 3) +
geom_point(aes(x = Role, y = `Median Annual Wage`, color = 'red'), size = 3) +
coord_flip() +
theme_ipsum() +
theme(
legend.position = "right", # Position the legend on the right
legend.title = element_blank() # Remove the legend title
) +
xlab("") +
ylab("") +
ggtitle("IQR of Annual Wages (USD) by Data-Role") +
scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
scale_y_continuous(labels = scales::dollar_format(prefix = "$"))
