library(tidyverse)
library(DataExplorer)
library(odbc)
library(DBI)
library(RSQLite)CMSC Group Project Observations
About the Project
For this group project, we will be observing the AI, ML, Data Science Salary data set created by Samith Chimminiyan via webscraping from https://aijobs.net/ . This data set can be located on kaggle via the following link: https://www.kaggle.com/datasets/samithsachidanandan/the-global-ai-ml-data-science-salary-for-2025?resource=download
About the Data Set
The data set contains details of the AI, Machine Learning, and Data Science Position Salaries from 2020 to 2025. Contained in the data set are the following variables:
- work_year: The year the salary was paid
- experience_level: offering ‘EN’ for entry level/Junior, ‘MI’ for Mid-Level/Intermediate, ‘SE’ for Senior-Level/Expert, and ‘EX’ for Executive Level/Director
- employment_type: The type of employment for the role with ‘PT’ for part time, ‘FT’ for full time, ‘CT’ for Contract, and ‘FL’ for Freelance
- job_title: The role worked in during the year
- salary: The total gross salary amount paid
- salary_currency: The currency of the salary paid
- salary_in_usd: The salary rate in USD (assuming if conversion is necessary)
- employee_residence: Employee’s primary country of residence during the given work year
- remote_ratio: Ratio of overall amount of work done remotely based on the scale of ‘0’ if no remote work or less than 20% of the time, ‘50’ if partially remote/hybrid, ‘100’ if fully remote or more than 80% of the time
- company_location: The country of the employer’s main office or contracting branch
- company_size: The average number of people that worked for the company during the year where ‘S’ is less than 50 employees(small), ‘M’ is 50 to 250 employees (medium), and ‘L’ is more than 250 employees (large)
Data Preperation
salaries <- read_csv("salaries.csv")
str(salaries)spc_tbl_ [88,584 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ work_year : num [1:88584] 2025 2025 2025 2025 2025 ...
$ experience_level : chr [1:88584] "MI" "SE" "SE" "SE" ...
$ employment_type : chr [1:88584] "FT" "FT" "FT" "FT" ...
$ job_title : chr [1:88584] "Customer Success Manager" "Engineer" "Engineer" "Applied Scientist" ...
$ salary : num [1:88584] 57000 165000 109000 294000 137600 ...
$ salary_currency : chr [1:88584] "EUR" "USD" "USD" "USD" ...
$ salary_in_usd : num [1:88584] 60000 165000 109000 294000 137600 ...
$ employee_residence: chr [1:88584] "NL" "US" "US" "US" ...
$ remote_ratio : num [1:88584] 50 0 0 0 0 0 0 0 0 0 ...
$ company_location : chr [1:88584] "NL" "US" "US" "US" ...
$ company_size : chr [1:88584] "L" "M" "M" "M" ...
- attr(*, "spec")=
.. cols(
.. work_year = col_double(),
.. experience_level = col_character(),
.. employment_type = col_character(),
.. job_title = col_character(),
.. salary = col_double(),
.. salary_currency = col_character(),
.. salary_in_usd = col_double(),
.. employee_residence = col_character(),
.. remote_ratio = col_double(),
.. company_location = col_character(),
.. company_size = col_character()
.. )
- attr(*, "problems")=<externalptr>
salaries.db <- dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(salaries.db, salaries)Average, Minimum, and Max Salary (Global in USD)
SELECT ROUND(AVG(salary_in_usd), 2) AS avg_salary, MIN(salary_in_usd) AS min_salary, MAX(salary_in_usd) as max_salary
FROM salaries| avg_salary | min_salary | max_salary |
|---|---|---|
| 157567.8 | 15000 | 8e+05 |
Top 10 Highest Salaries (in USD)
SELECT job_title, company_location, salary, salary_currency, salary_in_usd
FROM salaries
ORDER BY salary_in_usd DESC
LIMIT 10| job_title | company_location | salary | salary_currency | salary_in_usd |
|---|---|---|---|---|
| AI Architect | CA | 800000 | USD | 800000 |
| Data Engineer | AT | 753480 | EUR | 793136 |
| Data Engineer | AT | 753480 | EUR | 793136 |
| Data Analyst | MX | 774000 | USD | 774000 |
| Data Scientist | US | 750000 | USD | 750000 |
| Analytics Engineer | US | 750000 | USD | 750000 |
| Machine Learning Scientist | US | 750000 | USD | 750000 |
| Data Analyst | US | 750000 | USD | 750000 |
| Machine Learning Scientist | US | 750000 | USD | 750000 |
| Machine Learning Scientist | US | 750000 | USD | 750000 |
Average Salary by Country (with Salary Currency for reference)
SELECT company_location, ROUND(AVG(salary), 2) AS avg_salary, salary_currency
FROM salaries
GROUP BY company_location
ORDER BY avg_salary DESC| company_location | avg_salary | salary_currency |
|---|---|---|
| HU | 4750000.0 | HUF |
| JP | 2746920.0 | USD |
| CL | 1736740.4 | USD |
| IN | 1713398.0 | USD |
| ID | 1085637.0 | USD |
| TH | 618333.3 | THB |
| TW | 570000.0 | TWD |
| HK | 510000.0 | HKD |
| NO | 396278.6 | NOK |
| PH | 381944.2 | PHP |
This contains 90 rows of information
Top 5 Countries by Median Salary(using USD)
medians <- salaries |>
group_by(company_location) |>
summarise(median_salary = median(salary_in_usd, na.rm = TRUE)) |>
arrange(desc(median_salary)) |>
slice_head(n = 5)
medians# A tibble: 5 × 2
company_location median_salary
<chr> <dbl>
1 QA 300000
2 VE 192500
3 US 151500
4 SA 150000
5 PR 140100
Average Salary by Role and Country(in USD)
SELECT job_title, company_location, ROUND(AVG(salary_in_usd), 2) AS avg_salary
FROM salaries
GROUP BY job_title, company_location
ORDER BY avg_salary DESC| job_title | company_location | avg_salary |
|---|---|---|
| AI Architect | CA | 800000 |
| ML Infrastructure Engineer | GB | 465625 |
| AI Scientist | IL | 417937 |
| Analytics Engineering Manager | GB | 399880 |
| Data Science Tech Lead | US | 375000 |
| Architect | GB | 368354 |
| Head of AI | AU | 358000 |
| Finance Data Analyst | US | 323905 |
| Head of Machine Learning | US | 304300 |
| AI Engineer | QA | 300000 |
This contains 1000 rows of information
Salary by Experience Level and Country (in USD)
SELECT company_location, experience_level, ROUND(AVG(salary_in_usd), 2) AS avg_salary
FROM salaries
GROUP BY experience_level, company_location
ORDER BY company_location| company_location | experience_level | avg_salary |
|---|---|---|
| AD | MI | 50745.00 |
| AE | MI | 115000.00 |
| AE | SE | 78750.00 |
| AM | EN | 21333.00 |
| AM | MI | 50000.00 |
| AM | SE | 75550.00 |
| AR | EN | 55111.11 |
| AR | MI | 57690.63 |
| AR | SE | 100921.00 |
| AS | EN | 31684.33 |
This contains 221 rows of information
Average Salary by Year and Country(in USD)
SELECT company_location, work_year, ROUND(AVG(salary_in_usd), 2) AS avg_salary
FROM salaries
GROUP BY work_year, company_location
ORDER BY company_location, work_year| company_location | work_year | avg_salary |
|---|---|---|
| AD | 2023 | 50745.00 |
| AE | 2020 | 115000.00 |
| AE | 2022 | 92500.00 |
| AE | 2024 | 65000.00 |
| AM | 2023 | 50000.00 |
| AM | 2024 | 48441.50 |
| AR | 2022 | 50000.00 |
| AR | 2023 | 65000.00 |
| AR | 2024 | 69231.53 |
| AR | 2025 | 94738.95 |
This contains 269 rows of information
Average Salary by Work Setting (Remote Vs Onsite) from 2020-2025 (in USD)
SELECT work_year, remote_ratio, ROUND(AVG(salary_in_usd), 2) AS avg_salary, COUNT(*) AS count
FROM salaries
GROUP BY work_year, remote_ratio
ORDER BY remote_ratio DESC| work_year | remote_ratio | avg_salary | count |
|---|---|---|---|
| 2020 | 100 | 115214.08 | 36 |
| 2021 | 100 | 109905.28 | 116 |
| 2022 | 100 | 137093.88 | 887 |
| 2023 | 100 | 148079.49 | 2674 |
| 2024 | 100 | 150533.99 | 11855 |
| 2025 | 100 | 149181.95 | 3140 |
| 2020 | 50 | 86879.14 | 21 |
| 2021 | 50 | 81311.82 | 73 |
| 2022 | 50 | 83170.71 | 63 |
| 2023 | 50 | 79669.34 | 73 |
This contains 18 rows of information
Top 5 Roles per Country (Based on Salary in USD) from 2020-2025
SELECT work_year, company_location, job_title, ROUND(AVG(salary_in_usd), 2) AS avg_salary
FROM salaries
GROUP BY work_year, company_location, job_title
HAVING COUNT(*) > 5
ORDER BY company_location, work_year, avg_salary| work_year | company_location | job_title | avg_salary |
|---|---|---|---|
| 2024 | AR | Data Engineer | 63481.25 |
| 2024 | AR | Data Analyst | 64371.43 |
| 2024 | AT | AI Engineer | 50006.00 |
| 2024 | AT | Data Scientist | 56815.47 |
| 2024 | AT | Engineer | 58840.40 |
| 2024 | AT | Data Analyst | 59416.00 |
| 2024 | AT | Data Engineer | 59894.40 |
| 2024 | AT | Manager | 60435.00 |
| 2025 | AT | Data Scientist | 44765.36 |
| 2025 | AT | Data Analyst | 55785.92 |
(This contains 638 rows of information)
Which Country Has the Widest Salary Range?
SELECT work_year, company_location, MAX(salary) - MIN(salary) AS salary_range, salary_currency
FROM salaries
GROUP BY work_year, company_location
ORDER BY salary_range DESC| work_year | company_location | salary_range | salary_currency |
|---|---|---|---|
| 2025 | IN | 17982000 | USD |
| 2024 | JP | 11939700 | USD |
| 2021 | US | 10980000 | USD |
| 2021 | JP | 8426000 | USD |
| 2022 | IN | 7477000 | USD |
| 2021 | IN | 6980000 | USD |
| 2024 | IN | 6482000 | USD |
| 2025 | CA | 6303000 | USD |
| 2024 | HU | 5343000 | EUR |
| 2023 | IN | 4982000 | USD |
This contains 269 rows of information
dbDisconnect(salaries.db)