library(tidyverse) # This includes dplyr, tidyr, ggplot2, and others
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
url='https://raw.githubusercontent.com/mehreengillani/DATA607/refs/heads/main/salaries_data.csv'
salaries_data <- read.csv(url)
# Examine the wide structure
glimpse(salaries_data)
## Rows: 825
## Columns: 8
## $ work_year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, …
## $ job_title <chr> "AI Scientist", "Azure Data Engineer", "BI Data Analys…
## $ company_size <chr> "S", "S", "M", "L", "M", "S", "L", "L", "M", "S", "L",…
## $ employment_type <chr> "FT", "FT", "FT", "FT", "FT", "FT", "CT", "FT", "FT", …
## $ avg_salary_EN <dbl> 45896.00, 100000.00, NA, 70000.00, NA, NA, 100000.00, …
## $ avg_salary_MI <dbl> NA, NA, 98000.00, NA, NA, NA, NA, 135000.00, 95000.00,…
## $ avg_salary_SE <dbl> NA, NA, NA, NA, 109024.0, 114047.0, NA, NA, NA, 60000.…
## $ avg_salary_EX <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 79833,…
head(salaries_data)
## work_year job_title company_size employment_type avg_salary_EN
## 1 2020 AI Scientist S FT 45896
## 2 2020 Azure Data Engineer S FT 100000
## 3 2020 BI Data Analyst M FT NA
## 4 2020 Big Data Engineer L FT 70000
## 5 2020 Big Data Engineer M FT NA
## 6 2020 Big Data Engineer S FT NA
## avg_salary_MI avg_salary_SE avg_salary_EX
## 1 NA NA NA
## 2 NA NA NA
## 3 98000 NA NA
## 4 NA NA NA
## 5 NA 109024 NA
## 6 NA 114047 NA
# Pivot from wide to long format
tidy_salaries <- salaries_data %>%
pivot_longer(
cols = starts_with("avg_salary_"), # Select all salary columns
names_to = "experience_level", # Put column names in this new column
values_to = "average_salary" # Put values in this new column
) %>%
# Clean up the experience_level column
mutate(
experience_level = str_remove(experience_level, "avg_salary_"), # Remove prefix
experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX")) #performs ordering. All analyses will respect the career progression order
) %>%
# Remove rows where salary data is missing
filter(!is.na(average_salary))
glimpse(tidy_salaries)
## Rows: 1,425
## Columns: 6
## $ work_year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,…
## $ job_title <chr> "AI Scientist", "Azure Data Engineer", "BI Data Analy…
## $ company_size <chr> "S", "S", "M", "L", "M", "S", "L", "L", "M", "S", "L"…
## $ employment_type <chr> "FT", "FT", "FT", "FT", "FT", "FT", "CT", "FT", "FT",…
## $ experience_level <fct> EN, EN, MI, EN, SE, SE, EN, MI, MI, SE, EN, MI, EN, E…
## $ average_salary <dbl> 45896.00, 100000.00, 98000.00, 70000.00, 109024.00, 1…
head(tidy_salaries)
## # A tibble: 6 × 6
## work_year job_title company_size employment_type experience_level
## <int> <chr> <chr> <chr> <fct>
## 1 2020 AI Scientist S FT EN
## 2 2020 Azure Data Engineer S FT EN
## 3 2020 BI Data Analyst M FT MI
## 4 2020 Big Data Engineer L FT EN
## 5 2020 Big Data Engineer M FT SE
## 6 2020 Big Data Engineer S FT SE
## # ℹ 1 more variable: average_salary <dbl>
Dataset shape before transformation:
Rows: 825
Columns: 8
after transforming wide to long data shape:
Rows: 1,425
Columns: 6
tidy_salaries <- tidy_salaries %>%
mutate(
# Convert to proper types
work_year = as.integer(work_year),
# Create meaningful Experience labels
experience_label = case_when(
experience_level == "EN" ~ "Entry",
experience_level == "MI" ~ "Mid",
experience_level == "SE" ~ "Senior",
experience_level == "EX" ~ "Executive"
),
experience_label = factor(experience_label,
levels = c("Entry", "Mid", "Senior", "Executive")),
# Employment type labels
employment_label = case_when(
employment_type == "PT" ~ "part-time",
employment_type == "FT" ~ "full-time",
employment_type == "CT" ~ "contract",
employment_type == "FL" ~ "freelance"
),
employment_label = factor(employment_label,
levels = c( "freelance", "part-time","contract","full-time")),
# company size labels
company_size_label = case_when(
company_size == "L" ~ "large",
company_size == "M" ~ "medium",
company_size == "S" ~ "small"
),
company_size_label = factor(company_size_label,
levels = c("small", "medium", "large")),
# Create salary categories for easy grouping
salary_range = case_when(
average_salary < 50000 ~ "Under $50K",
average_salary >= 50000 & average_salary < 100000 ~ "$50K-$100K",
average_salary >= 100000 & average_salary < 150000 ~ "$100K-$150K",
average_salary >= 150000 ~ "Over $150K"
),
# Handle any missing values
across(where(is.numeric), ~ replace_na(., median(., na.rm = TRUE)))
) %>%
# Remove duplicates
distinct()
glimpse(tidy_salaries)
## Rows: 1,425
## Columns: 10
## $ work_year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ job_title <chr> "AI Scientist", "Azure Data Engineer", "BI Data Ana…
## $ company_size <chr> "S", "S", "M", "L", "M", "S", "L", "L", "M", "S", "…
## $ employment_type <chr> "FT", "FT", "FT", "FT", "FT", "FT", "CT", "FT", "FT…
## $ experience_level <fct> EN, EN, MI, EN, SE, SE, EN, MI, MI, SE, EN, MI, EN,…
## $ average_salary <dbl> 45896.00, 100000.00, 98000.00, 70000.00, 109024.00,…
## $ experience_label <fct> Entry, Entry, Mid, Entry, Senior, Senior, Entry, Mi…
## $ employment_label <fct> full-time, full-time, full-time, full-time, full-ti…
## $ company_size_label <fct> small, small, medium, large, medium, small, large, …
## $ salary_range <chr> "Under $50K", "$100K-$150K", "$50K-$100K", "$50K-$1…
#unique(tidy_salaries$company_size)
4 new columns have been added
# Multiple summary statistics
summary_stats <- tidy_salaries %>%
group_by(company_size_label, experience_label) %>%
summarise(
count = n(),
mean_salary = mean(average_salary),
median_salary = median(average_salary),
sd_salary = sd(average_salary),
min_salary = min(average_salary),
max_salary = max(average_salary),
.groups = 'drop'
)
summary_stats
## # A tibble: 12 × 8
## company_size_label experience_label count mean_salary median_salary sd_salary
## <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 small Entry 39 61932. 55000 36766.
## 2 small Mid 48 74814. 62450. 42096.
## 3 small Senior 42 112872. 98586. 63458.
## 4 small Executive 7 166197 115222 121938.
## 5 medium Entry 191 87198. 85233. 41084.
## 6 medium Mid 306 118046. 111483. 62509.
## 7 medium Senior 365 151655. 151134. 47693.
## 8 medium Executive 84 173287. 175000 52379.
## 9 large Entry 83 82708. 70000 52259.
## 10 large Mid 111 108777. 97712 63724.
## 11 large Senior 133 147396. 145700 67630.
## 12 large Executive 16 163051. 150000 70668.
## # ℹ 2 more variables: min_salary <dbl>, max_salary <dbl>
highest_paid_by_year <- tidy_salaries %>%
group_by(work_year) %>%
summarise(
highest_paid_job = job_title[which.max(average_salary)],
highest_salary = max(average_salary),
.groups = 'drop'
)
print("Highest Paid Job Title Each Year:")
## [1] "Highest Paid Job Title Each Year:"
print(highest_paid_by_year)
## # A tibble: 6 × 3
## work_year highest_paid_job highest_salary
## <int> <chr> <dbl>
## 1 2020 Research Scientist 450000
## 2 2021 Applied Machine Learning Scientist 423000
## 3 2022 Data Analytics Lead 405000
## 4 2023 Analytics Engineering Manager 399880
## 5 2024 AI Architect 800000
## 6 2025 Engineering Manager 287000
ggplot() +
geom_histogram(data = tidy_salaries, aes(x = average_salary, fill = "average_salary"),
fill ='lightblue',alpha = 0.6, bins = 15, position = "identity") +
labs(title = "Distribution of avaerage salaries",
x = "salaries",
y = "Count") +
theme_minimal()
After analyzing the graph I can see there is one or few points
more than 45000USD salary. Which creates an outlier in the dataset.
first we will query about these data points.
high_salary_data <- tidy_salaries %>%
filter(average_salary > 450000)
print("Data points with salary > $450,000:")
## [1] "Data points with salary > $450,000:"
print(high_salary_data)
## # A tibble: 1 × 10
## work_year job_title company_size employment_type experience_level
## <int> <chr> <chr> <chr> <fct>
## 1 2024 AI Architect M FT MI
## # ℹ 5 more variables: average_salary <dbl>, experience_label <fct>,
## # employment_label <fct>, company_size_label <fct>, salary_range <chr>
Because there is only one data point which is above 450000 and
its value is almost double than the last value. I would remove this data
point because it is an outlier.
#Removed outlier by deleting avg salary more than 500000
cleaned_data <- tidy_salaries %>%
filter(average_salary < 500000)
cat("Original rows:", nrow(tidy_salaries), "\n")
## Original rows: 1425
cat("After removal:", nrow(cleaned_data), "\n")
## After removal: 1424
cat("Removed", nrow(tidy_salaries) - nrow(cleaned_data), "rows\n")
## Removed 1 rows
Lets re-plot the avg salary distribution graph.
ggplot() +
geom_histogram(data = cleaned_data, aes(x = average_salary, fill = "average_salary"),
fill='lightblue',alpha = 0.6, bins = 15, position = "identity") +
labs(title = "Distribution of avaerage salaries",
x = "salaries",
y = "Count") +
theme_minimal()
Still avg salary distribution is right or positive skewed.
analysis1 <- cleaned_data %>%
group_by(experience_label) %>%
summarise(
avg_salary = mean(average_salary),
median_salary = median(average_salary),
count = n()
) %>%
arrange(experience_label) # Use the logical order we set
print("Average Salary by Experience Level:")
## [1] "Average Salary by Experience Level:"
print(analysis1)
## # A tibble: 4 × 4
## experience_label avg_salary median_salary count
## <fct> <dbl> <dbl> <int>
## 1 Entry 82859. 77742. 313
## 2 Mid 109887. 104944 464
## 3 Senior 147590. 146115. 540
## 4 Executive 171293. 171167. 107
analysis2 <- cleaned_data %>%
group_by(company_size_label) %>%
summarise(
avg_salary = mean(average_salary),
count = n()
) %>%
arrange(desc(avg_salary))
print("Salary by Company Size:")
## [1] "Salary by Company Size:"
print(analysis2)
## # A tibble: 3 × 3
## company_size_label avg_salary count
## <fct> <dbl> <int>
## 1 medium 128981. 945
## 2 large 119975. 343
## 3 small 87577. 136
analysis3 <- cleaned_data %>%
group_by(job_title) %>%
summarise(
avg_salary = mean(average_salary),
count = n()
) %>%
filter(count >= 5) %>% # Only jobs with decent sample size
arrange(desc(avg_salary)) %>%
head(10) # Top 10 only
print("Top 10 Paying Job Titles:")
## [1] "Top 10 Paying Job Titles:"
print(analysis3)
## # A tibble: 10 × 3
## job_title avg_salary count
## <chr> <dbl> <int>
## 1 AI Architect 220107. 7
## 2 Principal Data Scientist 202735. 7
## 3 Data Infrastructure Engineer 184257. 5
## 4 Applied Scientist 182327. 17
## 5 Product Manager 181912. 9
## 6 Data Analytics Lead 181041. 6
## 7 Machine Learning Manager 176715. 5
## 8 Platform Engineer 176183. 5
## 9 Machine Learning Scientist 174931. 17
## 10 Head of Data 171553. 12
analysis4 <- cleaned_data %>%
group_by(work_year, experience_label) %>%
summarise(
avg_salary = mean(average_salary),
count = n()
) %>%
arrange(work_year, experience_label)
## `summarise()` has grouped output by 'work_year'. You can override using the
## `.groups` argument.
print("Salary Trends Over Years:")
## [1] "Salary Trends Over Years:"
print(analysis4)
## # A tibble: 24 × 4
## # Groups: work_year [6]
## work_year experience_label avg_salary count
## <int> <fct> <dbl> <int>
## 1 2020 Entry 70301. 16
## 2 2020 Mid 102640. 17
## 3 2020 Senior 132256. 16
## 4 2020 Executive 135778. 3
## 5 2021 Entry 61776. 24
## 6 2021 Mid 86976. 34
## 7 2021 Senior 129277. 42
## 8 2021 Executive 192932. 6
## 9 2022 Entry 74838. 43
## 10 2022 Mid 91365. 59
## # ℹ 14 more rows
# Plot 1: Salary by experience level
ggplot(analysis1, aes(x = experience_label, y = avg_salary)) +
geom_col(fill = "grey") +
labs(title = "Average Salary by Experience Level",
x = "Experience Level", y = "Average Salary") +
theme_minimal()
# Plot 2: Salary distribution by company size
ggplot(cleaned_data, aes(x = company_size_label, y = average_salary)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Salary Distribution by Company Size",
x = "Company Size", y = "Salary") +
theme_minimal()
Outliers Present: All company size categories show outliers in
average salaries
Salary Variability: Large companies exhibit
the widest interquartile range
Comparative Analysis: This
indicates more diverse salary structures in large organizations versus
more compressed ranges in medium-sized companies.
ggplot(analysis4, aes(x = factor(work_year), y = avg_salary, fill = experience_label)) +
geom_col(position = "dodge", color = "black", size = 0.3) +
scale_fill_manual(values = c("Entry" = "gray95",
"Mid" = "gray75",
"Senior" = "gray50",
"Executive" = "gray25")) +
labs(title = "Average Salary by Year and Experience Level",
x = "Year",
y = "Average Salary (USD)",
fill = "Experience Level") +
theme_minimal() +
theme(panel.background = element_rect(fill = "white")) +
scale_y_continuous(labels = scales::dollar)
## Warning in geom_col(position = "dodge", color = "black", size = 0.3): Ignoring
## unknown parameters: `size`
# Plot 2: Salary distribution by employment type
ggplot(cleaned_data, aes(x = employment_label, y = average_salary)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Salary Distribution by employment type",
x = "employment type", y = "Salary") +
theme_minimal()
The employment type salary distribution demonstrates a clear stratification:
Full-time: Maximum salary range and highest average
compensation, with numerous outliers indicating diverse pay scales
Contract: Secondary position in both salary level and
variability Part-time: Third-tier compensation range
Freelance: Most restricted salary band with lowest overall
earnings
data_scientist_trend <- tidy_salaries %>%
filter(job_title == "Data Scientist") %>%
group_by(work_year) %>%
summarise(
avg_salary = mean(average_salary),
median_salary = median(average_salary),
min_salary = min(average_salary),
max_salary = max(average_salary),
job_count = n(),
.groups = 'drop'
)
print("Data Scientist Salary Trend:")
## [1] "Data Scientist Salary Trend:"
print(data_scientist_trend)
## # A tibble: 6 × 6
## work_year avg_salary median_salary min_salary max_salary job_count
## <int> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2020 115084. 82023. 21669 312500 10
## 2 2021 93732. 87556. 53012. 181838. 12
## 3 2022 108988. 105000 52008 190159. 13
## 4 2023 114976. 103714 21000 213007. 14
## 5 2024 133270. 125987 81572. 200296. 14
## 6 2025 136995. 147864. 78947 173304 4
# Basic line plot for average salary trend
ggplot(data_scientist_trend, aes(x = work_year, y = avg_salary)) +
geom_line(color = "blue", size = 1.5) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = paste0("$", round(avg_salary/1000, 1), "K")),
vjust = -1, size = 3.5) +
labs(title = "Data Scientist Average Salary Trend Over Years",
x = "Year",
y = "Average Salary (USD)") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Enhanced Plot with Confidence Interval
# More detailed plot with min/max range
ggplot(data_scientist_trend, aes(x = work_year)) +
geom_ribbon(aes(ymin = min_salary, ymax = max_salary),
fill = "lightblue", alpha = 0.3) +
geom_line(aes(y = avg_salary), color = "blue", size = 1.5) +
geom_line(aes(y = median_salary), color = "red", size = 1, linetype = "dashed") +
geom_point(aes(y = avg_salary), color = "blue", size = 3) +
geom_point(aes(y = median_salary), color = "red", size = 2) +
labs(title = "Data Scientist Salary Range Over Years",
subtitle = "Blue line = Average, Red dashed line = Median, Shaded area = Min-Max Range",
x = "Year",
y = "Salary (USD)") +
theme_minimal() +
scale_y_continuous(labels = scales::dollar)
cat("DATA SCIENTIST SALARY TREND SUMMARY:\n")
## DATA SCIENTIST SALARY TREND SUMMARY:
if(nrow(data_scientist_trend) > 1) {
growth <- ((data_scientist_trend$avg_salary[nrow(data_scientist_trend)] -
data_scientist_trend$avg_salary[1]) /
data_scientist_trend$avg_salary[1]) * 100
cat("Overall salary change:", round(growth, 1), "%\n")
}
## Overall salary change: 19 %
cat("Years analyzed:", paste(data_scientist_trend$work_year, collapse = ", "), "\n")
## Years analyzed: 2020, 2021, 2022, 2023, 2024, 2025
cat("Average salaries:", paste0("$", round(data_scientist_trend$avg_salary), collapse = " → "), "\n")
## Average salaries: $115084 → $93732 → $108988 → $114976 → $133270 → $136995
This analysis successfully transformed and analyzed salary data for ML, AI, Data science professionals across multiple dimensions, including experience levels, company sizes, employment types, and temporal trends. The project demonstrated robust data wrangling techniques and produced meaningful insights.