Abstract

This exploratory data analysis (EDA) aims to provide insights into the salaries and demographics of data science and cyber security professionals. As both fields continue to experience rapid growth, understanding the trends and patterns in salaries and demographics can provide valuable information for both job seekers and employers. The two datasets used in this analysis contain detailed information on job titles, company locations, work experience, employee residence, and salaries of data science and cyber security professionals across various countries. By conducting an EDA, we hope to identify key trends and patterns in the data and gain insights into the factors that contribute to variations in salaries and demographics. This information can help inform strategic business decisions related to recruitment, retention, and compensation in the fields of data science and cyber security.

# Read the two cleaned, comma-separated salary datasets (paths are relative).
salaries_cybersecurity <- read.csv(
  file = "salaries_cyberSecurity_CLEANED.csv",
  sep = ","
)
salaries_datascience <- read.csv(
  file = "salaries_dataScience_CLEANED.csv",
  sep = ","
)

# load libraries
library(ggplot2)
library(scales)
library(hrbrthemes)
library(dplyr)
library(tidyverse)
library(scales)
library(treemap)
library(choroplethr)

5.0.1.1 Experience Level

# Cybersecurity: bar chart of record counts per experience level.
# Keep at most the 10 most frequent levels. Indexing the sorted table with a
# fixed 1:10 pads the result with NA entries when fewer than 10 levels exist,
# and those NA names let rows with a missing experience_level pass the %in%
# filter below; capping the index at the table length avoids that.
# (The variable name is kept as-is in case later code refers to it.)
employee_residence_viz <- sort(table(salaries_cybersecurity$experience_level), decreasing = TRUE)
employee_residence_viz <- employee_residence_viz[seq_len(min(10, length(employee_residence_viz)))]

salaries_cybersecurity %>%
  filter(experience_level %in% names(employee_residence_viz)) %>%
  ggplot(aes(x = experience_level, fill = experience_level)) +
  geom_bar() +
  # Print the count above each bar; after_stat(count) replaces the deprecated
  # ..count.. notation.
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.3, size = 3.5) +
  scale_fill_discrete(name = "Experience Level") +
  labs(
    x = "Experience Level",
    y = "Count",
    title = "Experience Level of Cyber Securities Professional",
    subtitle = " Cybersecurity Salaries",
    caption = "Source: Cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: same chart for the data science dataset.
employee_residence_viz <- sort(table(salaries_datascience$experience_level), decreasing = TRUE)
employee_residence_viz <- employee_residence_viz[seq_len(min(10, length(employee_residence_viz)))]

salaries_datascience %>%
  filter(experience_level %in% names(employee_residence_viz)) %>%
  ggplot(aes(x = experience_level, fill = experience_level)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.3, size = 3.5) +
  scale_fill_discrete(name = "Experience Level") +
  labs(
    x = "Experience Level",
    y = "Count",
    title = "Experience Level of Data Science Professional",
    subtitle = "DataScience Salaries",
    caption = "Source: DataScience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.2 Does Experience level affect how much money we make?

# 5.0.1.2   Does Experience level affect how much money we make? (Sa --------

# Cybersecurity: one salary (USD) box per experience level.
salaries_cybersecurity %>%
  ggplot(aes(x = experience_level, y = salary_in_usd, fill = experience_level)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Experience Level",
    y = "Salary (USD)",
    title = "Does experience level affect how much money we make?",
    subtitle = " Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_classic() +
  theme(
    # The fill colour repeats the x axis, so the legend is hidden.
    legend.position = "none",
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: same boxplot for the data science dataset.
salaries_datascience %>%
  ggplot(aes(x = experience_level, y = salary_in_usd, fill = experience_level)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Experience Level",
    y = "Salary (USD)",
    title = "Does experience level affect how much money we make?",
    subtitle = " DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.3 Top 10 Job Titles

# 5.0.1.3   Top 10 job titles -----------------------------------------------

# Cybersecurity: the ten job titles with the most records.
top_job_titles <- salaries_cybersecurity %>%
  group_by(job_title) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_head(n = 10)

top_job_titles %>%
  ggplot(aes(x = job_title, y = count)) +
  # Counts are already computed, so bar heights are taken directly from `count`.
  geom_col(fill = "steelblue") +
  labs(
    x = "Job Title",
    y = "Count",
    title = "Top 10 Job Titles of Cyber Security",
    subtitle = " Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: the ten job titles with the most records.
top_job_titles <- salaries_datascience %>%
  group_by(job_title) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_head(n = 10)

top_job_titles %>%
  ggplot(aes(x = job_title, y = count)) +
  geom_col(fill = "maroon") +
  labs(
    x = "Job Title",
    y = "Count",
    title = "Top 10 Job Titles of Data Science",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.4 Tree Map of Different Roles with Experience Level

# Cybersecurity: treemap nested by job title > company location > experience
# level, with rectangles sized by salary_in_usd (vSize).
cs_treemap <- salaries_cybersecurity[, c("job_title", "company_location", "experience_level", "salary_in_usd")]
treemap(
  cs_treemap,
  index = c("job_title", "company_location", "experience_level"),
  vSize = "salary_in_usd",
  type = "index",
  title = "Tree Map of Different Roles in Cyber Security with Experience Level",
  fontsize.title = 12
)

# Data Science: same layout, drawn from the data science subset. The previous
# version passed cs_treemap here, so the cybersecurity data was plotted twice
# under a Data Science title.
ds_treemap <- salaries_datascience[, c("job_title", "company_location", "experience_level", "salary_in_usd")]
treemap(
  ds_treemap,
  index = c("job_title", "company_location", "experience_level"),
  vSize = "salary_in_usd",
  type = "index",
  title = "Tree Map of Different Roles in Data Science with Experience Level",
  fontsize.title = 12
)

5.0.1.5 Average Salary by Job Title and Experience Level

# Cybersecurity: mean salary (USD) per job title and experience level.
# The chart is titled "Average Salary", so the mean is computed explicitly
# first; feeding raw rows to geom_col() would stack (i.e. sum) every salary
# in a group instead of averaging it.
salaries_cybersecurity %>%
  group_by(job_title, experience_level) %>%
  summarise(mean_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = mean_salary, y = job_title, fill = experience_level)) +
  # Dodge rather than stack so each bar length is one group's mean.
  geom_col(position = "dodge") +
  scale_fill_discrete(name = "Experience Level") +
  scale_x_continuous(labels = comma) +
  labs(
    x = "Salary in USD",
    y = "Job Title",
    title = "Average Salary by Job Title and Experience Level of Cyber Security (in USD)",
    subtitle = "CyberSecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: mean salary (USD) per job title and experience level.
salaries_datascience %>%
  group_by(job_title, experience_level) %>%
  summarise(mean_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = mean_salary, y = job_title, fill = experience_level)) +
  geom_col(position = "dodge") +
  scale_fill_discrete(name = "Experience Level") +
  scale_x_continuous(labels = comma) +
  labs(
    x = "Salary in USD",
    y = "Job Title",
    title = "Average Salary by Job Title and Experience Level of Data Science (in USD)",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.6 Does Employment Type Affect the Salary (USD)?

# Cybersecurity: one salary (USD) box per employment type.
salaries_cybersecurity %>%
  ggplot(aes(x = employment_type, y = salary_in_usd, fill = employment_type)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Employment Type",
    y = "Salary (USD)",
    title = "Does employment type affects the salary?",
    subtitle = " Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_classic() +
  theme(
    # The fill colour repeats the x axis, so the legend is hidden.
    legend.position = "none",
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: same boxplot for the data science dataset.
salaries_datascience %>%
  ggplot(aes(x = employment_type, y = salary_in_usd, fill = employment_type)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(
    x = "Employment Type",
    y = "Salary (USD)",
    title = "Does employment type affects the salary?",
    subtitle = " DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_classic() +
  theme(
    legend.position = "none",
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.7 Distribution of Work Year and Salary in USD

# Cybersecurity: salary (USD) against work year, one semi-transparent point
# per record, coloured by experience level.
ggplot(salaries_cybersecurity, aes(x = work_year, y = salary_in_usd, color = experience_level)) +
  geom_point(alpha = 0.4) +
  labs(
    x = "Work Year",
    y = "Salary in USD",
    title = "Distribution of Work Years and Salaries of Cyber Security in USD",
    subtitle = "Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  scale_color_discrete(name = "Experience Level") +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: same scatter plot for the data science dataset.
ggplot(salaries_datascience, aes(x = work_year, y = salary_in_usd, color = experience_level)) +
  geom_point(alpha = 0.4) +
  labs(
    x = "Work Year",
    y = "Salary in USD",
    title = "Distribution of Work Years and Salaries of Data Science in USD",
    subtitle = "DataScience Salaries",
    # Caption spelling corrected (was "daatscience").
    caption = "Source: datascience salaries dataset"
  ) +
  scale_color_discrete(name = "Experience Level") +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.8 Number of Employees by Company Location

# Cybersecurity: number of records per company location.
salaries_cybersecurity %>%
  ggplot(aes(x = company_location)) +
  geom_bar(fill = "steelblue", color = "white") +
  labs(
    x = "Company Location",
    y = "Number of Employees",
    title = "Number of Employees by Company Location of Cyber Security",
    subtitle = "Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: number of records per company location.
salaries_datascience %>%
  ggplot(aes(x = company_location)) +
  geom_bar(fill = "maroon", color = "white") +
  labs(
    x = "Company Location",
    y = "Number of Employees",
    title = "Number of Employees by Company Location of Data Science",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.9 Analysis between Employees’ Residence Location and Salaries

# Cybersecurity: mean salary (USD) per employee residence.
salary_by_location <- aggregate(
  x = salaries_cybersecurity$salary_in_usd,
  by = list(employee_residence = salaries_cybersecurity$employee_residence),
  FUN = mean
)
# aggregate() names the value column "x"; give it a descriptive name.
names(salary_by_location)[2] <- "mean_salary"

salary_by_location %>%
  ggplot(aes(x = employee_residence, y = mean_salary)) +
  geom_col(color = "white", fill = "steelblue") +
  labs(
    x = "Employee Residence",
    y = "Mean Salary of Cyber Security",
    title = "Mean Salary of Cyber Security by Employee Residence",
    subtitle = "CyberSecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: mean salary (USD) per employee residence.
salary_by_location <- aggregate(
  x = salaries_datascience$salary_in_usd,
  by = list(employee_residence = salaries_datascience$employee_residence),
  FUN = mean
)
names(salary_by_location)[2] <- "mean_salary"

salary_by_location %>%
  ggplot(aes(x = employee_residence, y = mean_salary)) +
  geom_col(color = "white", fill = "maroon") +
  labs(
    x = "Employee Residence",
    y = "Mean Salary of Data Science",
    title = "Mean Salary of Data Science by Employee Residence",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.10 Salaries by Company Location

# Cybersecurity: mean salary (USD) per company location, bars ordered by mean.
salary_by_location <- aggregate(
  x = salaries_cybersecurity$salary_in_usd,
  by = list(company_location = salaries_cybersecurity$company_location),
  FUN = mean
)
# aggregate() names the value column "x"; give it a descriptive name.
names(salary_by_location)[2] <- "mean_salary"

salary_by_location %>%
  ggplot(aes(x = reorder(company_location, mean_salary), y = mean_salary)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Company Location",
    y = "Mean Salary in USD",
    title = "Salaries by Company Location (Cyber Security)",
    subtitle = "CyberSecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic"),
    # Tilt the location labels so they do not run into each other.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Data Science: mean salary (USD) per company location, bars ordered by mean.
salary_by_location <- aggregate(
  x = salaries_datascience$salary_in_usd,
  by = list(company_location = salaries_datascience$company_location),
  FUN = mean
)
names(salary_by_location)[2] <- "mean_salary"

salary_by_location %>%
  ggplot(aes(x = reorder(company_location, mean_salary), y = mean_salary)) +
  geom_col(fill = "maroon") +
  labs(
    x = "Company Location",
    y = "Mean Salary in USD",
    title = "Salaries by Company Location (Data Science)",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

5.0.1.11 Salary in USD by Company Size

# Cybersecurity: salary (USD) boxplot per company size.
# work_year is converted to a factor in place; the plot below does not use it.
salaries_cybersecurity$work_year <- as.factor(salaries_cybersecurity$work_year)
salaries_cybersecurity %>%
  ggplot(aes(
    x = as.factor(company_size),
    y = salary_in_usd,
    group = as.factor(company_size),
    color = as.factor(company_size)
  )) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  scale_color_discrete(name = "Company Size") +
  labs(
    x = "Company Size",
    y = "Salary in USD",
    title = "Cyber Security Salaries by Company Size",
    subtitle = "Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: salary (USD) boxplot per company size.
salaries_datascience$work_year <- as.factor(salaries_datascience$work_year)
salaries_datascience %>%
  ggplot(aes(
    x = as.factor(company_size),
    y = salary_in_usd,
    group = as.factor(company_size),
    color = as.factor(company_size)
  )) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  scale_color_discrete(name = "Company Size") +
  labs(
    x = "Company Size",
    y = "Salary in USD",
    title = "Data Science Salaries by Company Size",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.12 Which Year has the highest average salary?

5.0.1.13 Top 10 Jobs based on Average Pay (USD)

# Cybersecurity
# Ten job titles with the highest mean salary (USD), missing salaries ignored.
top_10_cybersecurity_salaries_by_job_title <- salaries_cybersecurity %>%
  group_by(job_title) %>%
  summarise(mean_salary = mean(salary_in_usd, na.rm = TRUE)) %>%
  arrange(desc(mean_salary)) %>%
  slice_head(n = 10)

# Bar per job title, labelled with the dollar-formatted mean salary.
top_10_cybersecurity_salaries_by_job_title %>%
  ggplot(aes(x = job_title, y = mean_salary, fill = job_title)) +
  geom_col() +
  geom_text(aes(label = scales::dollar(mean_salary, prefix = "$")), vjust = -0.5) +
  scale_y_continuous(labels = scales::dollar) +
  scale_fill_discrete(name = "Job Title") +
  labs(
    x = "Job Title",
    y = "Salary (USD)",
    title = "Top 10 Jobs in CyberSecurity based on Average Pay (USD) ",
    subtitle = "Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Data Science
# Ten job titles with the highest mean salary (USD), missing salaries ignored.
top_10_datascience_salaries_by_job_title <- salaries_datascience %>%
  group_by(job_title) %>%
  summarise(mean_salary = mean(salary_in_usd, na.rm = TRUE)) %>%
  arrange(desc(mean_salary)) %>%
  slice_head(n = 10)

# Bar per job title, labelled with the dollar-formatted mean salary.
top_10_datascience_salaries_by_job_title %>%
  ggplot(aes(x = job_title, y = mean_salary, fill = job_title)) +
  geom_col() +
  geom_text(aes(label = scales::dollar(mean_salary, prefix = "$")), vjust = -0.5) +
  scale_y_continuous(labels = scales::dollar) +
  scale_fill_discrete(name = "Job Title") +
  labs(
    x = "Job Title",
    y = "Salary (USD)",
    title = "Top 10 Jobs in Data Science based on Average Pay (USD)",
    subtitle = "DataScience Salary",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

5.0.1.14 Are remote jobs more available in smaller or bigger companies?

# Cybersecurity: record counts per remote ratio, side-by-side bars per company size.
# company_size is stored back as a factor so the discrete viridis fill applies.
salaries_cybersecurity$company_size <- factor(salaries_cybersecurity$company_size)

salaries_cybersecurity %>%
  ggplot(aes(x = remote_ratio, fill = company_size)) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d() +
  # Continuous x scale: assumes remote_ratio is numeric (0 / 50 / 100) - TODO confirm.
  scale_x_continuous(breaks = c(0, 50, 100)) +
  labs(
    x = "Remote Ratio",
    y = "Count",
    title = "Are remote jobs more available in smaller or bigger companies?",
    subtitle = "Cybersecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  guides(fill = guide_legend(title = "Company Size")) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: record counts per remote ratio, side-by-side bars per company size.
salaries_datascience$company_size <- factor(salaries_datascience$company_size)

salaries_datascience %>%
  ggplot(aes(x = remote_ratio, fill = company_size)) +
  geom_bar(position = "dodge") +
  scale_fill_viridis_d() +
  scale_x_continuous(breaks = c(0, 50, 100)) +
  labs(
    x = "Remote Ratio",
    y = "Count",
    title = "Are remote jobs more available in smaller or bigger companies?",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  guides(fill = guide_legend(title = "Company Size")) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

5.0.1.15 Are non-remote jobs better paid than full-remote jobs?

# Cybersecurity: salary (USD) by employment type, dodged by remote ratio.
# NOTE(review): the bars are drawn from unaggregated rows, so every record
# becomes its own bar and bars sharing a dodge slot are drawn over one another.
# Confirm whether a per-group summary (e.g. mean or median) was intended.
salaries_cybersecurity %>%
  ggplot(aes(
    x = factor(employment_type),
    y = salary_in_usd,
    fill = factor(remote_ratio)
  )) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = comma) +
  # Legend labels are positional: assumes exactly three remote_ratio levels in
  # the order not / half / full remote - TODO confirm.
  scale_fill_discrete(
    name = "Remote ratio",
    labels = c("Not remote", "Half remote", "Full remote")
  ) +
  labs(
    x = "Employment type",
    y = "Salary in US dollars",
    title = "Are non-remote jobs better paid than full-remote jobs?",
    subtitle = "CyberSecurity Salaries",
    caption = "Source: cybersecurity salaries dataset"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )

# Data Science: salary (USD) by employment type, dodged by remote ratio.
salaries_datascience %>%
  ggplot(aes(
    x = factor(employment_type),
    y = salary_in_usd,
    fill = factor(remote_ratio)
  )) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = comma) +
  scale_fill_discrete(
    name = "Remote ratio",
    labels = c("Not remote", "Half remote", "Full remote")
  ) +
  labs(
    x = "Employment type",
    y = "Salary in US dollars",
    title = "Are non-remote jobs better paid than full-remote jobs?",
    subtitle = "DataScience Salaries",
    caption = "Source: datascience salaries dataset"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(color = "#0099f8", size = 16, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(face = "bold.italic", hjust = 0.5),
    plot.caption = element_text(face = "italic")
  )