#loading necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(dplyr)
# 1) Import the dataset
library(readr)
ds_salaries <- read_csv("C:/Users/hp/Downloads/ds_salaries.csv")
## Rows: 3755 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): experience_level, employment_type, job_title, salary_currency, empl...
## dbl (4): work_year, salary, salary_in_usd, remote_ratio
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(tidyverse)
library(dplyr)
# 1) Import the dataset
library(readr)
ds_salaries_Copy <- read_csv("C:/Users/hp/Downloads/ds_salaries.csv")
## Rows: 3755 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): experience_level, employment_type, job_title, salary_currency, empl...
## dbl (4): work_year, salary, salary_in_usd, remote_ratio
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(ds_salaries_Copy)
summary(ds_salaries_Copy)
## work_year experience_level employment_type job_title
## Min. :2020 Length:3755 Length:3755 Length:3755
## 1st Qu.:2022 Class :character Class :character Class :character
## Median :2022 Mode :character Mode :character Mode :character
## Mean :2022
## 3rd Qu.:2023
## Max. :2023
## salary salary_currency salary_in_usd employee_residence
## Min. : 6000 Length:3755 Min. : 5132 Length:3755
## 1st Qu.: 100000 Class :character 1st Qu.: 95000 Class :character
## Median : 138000 Mode :character Median :135000 Mode :character
## Mean : 190696 Mean :137570
## 3rd Qu.: 180000 3rd Qu.:175000
## Max. :30400000 Max. :450000
## remote_ratio company_location company_size
## Min. : 0.00 Length:3755 Length:3755
## 1st Qu.: 0.00 Class :character Class :character
## Median : 0.00 Mode :character Mode :character
## Mean : 46.27
## 3rd Qu.:100.00
## Max. :100.00
str(ds_salaries_Copy)
## spc_tbl_ [3,755 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ work_year : num [1:3755] 2023 2023 2023 2023 2023 ...
## $ experience_level : chr [1:3755] "SE" "MI" "MI" "SE" ...
## $ employment_type : chr [1:3755] "FT" "CT" "CT" "FT" ...
## $ job_title : chr [1:3755] "Principal Data Scientist" "ML Engineer" "ML Engineer" "Data Scientist" ...
## $ salary : num [1:3755] 80000 30000 25500 175000 120000 ...
## $ salary_currency : chr [1:3755] "EUR" "USD" "USD" "USD" ...
## $ salary_in_usd : num [1:3755] 85847 30000 25500 175000 120000 ...
## $ employee_residence: chr [1:3755] "ES" "US" "US" "CA" ...
## $ remote_ratio : num [1:3755] 100 100 100 100 100 0 0 0 0 0 ...
## $ company_location : chr [1:3755] "ES" "US" "US" "CA" ...
## $ company_size : chr [1:3755] "L" "S" "S" "M" ...
## - attr(*, "spec")=
## .. cols(
## .. work_year = col_double(),
## .. experience_level = col_character(),
## .. employment_type = col_character(),
## .. job_title = col_character(),
## .. salary = col_double(),
## .. salary_currency = col_character(),
## .. salary_in_usd = col_double(),
## .. employee_residence = col_character(),
## .. remote_ratio = col_double(),
## .. company_location = col_character(),
## .. company_size = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# 4) Find the top 6 records,find the last 6 records
head(ds_salaries_Copy)
## # A tibble: 6 × 11
## work_year experience_level employment_type job_title salary salary_currency
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 2023 SE FT Principal D… 80000 EUR
## 2 2023 MI CT ML Engineer 30000 USD
## 3 2023 MI CT ML Engineer 25500 USD
## 4 2023 SE FT Data Scient… 175000 USD
## 5 2023 SE FT Data Scient… 120000 USD
## 6 2023 SE FT Applied Sci… 222200 USD
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
tail(ds_salaries_Copy)
## # A tibble: 6 × 11
## work_year experience_level employment_type job_title salary salary_currency
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 2021 SE FT Data Specia… 1.65e5 USD
## 2 2020 SE FT Data Scient… 4.12e5 USD
## 3 2021 MI FT Principal D… 1.51e5 USD
## 4 2020 EN FT Data Scient… 1.05e5 USD
## 5 2020 EN CT Business Da… 1 e5 USD
## 6 2021 SE FT Data Scienc… 7 e6 INR
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
# 5)Find the no. of rows and cols in the dataset
nrow(ds_salaries_Copy)
## [1] 3755
ncol(ds_salaries_Copy)
## [1] 11
# 6) What are the values corresponding to the job title,experience level,employment type.
# Explore unique values in categorical variables
unique(ds_salaries_Copy$job_title)
## [1] "Principal Data Scientist"
## [2] "ML Engineer"
## [3] "Data Scientist"
## [4] "Applied Scientist"
## [5] "Data Analyst"
## [6] "Data Modeler"
## [7] "Research Engineer"
## [8] "Analytics Engineer"
## [9] "Business Intelligence Engineer"
## [10] "Machine Learning Engineer"
## [11] "Data Strategist"
## [12] "Data Engineer"
## [13] "Computer Vision Engineer"
## [14] "Data Quality Analyst"
## [15] "Compliance Data Analyst"
## [16] "Data Architect"
## [17] "Applied Machine Learning Engineer"
## [18] "AI Developer"
## [19] "Research Scientist"
## [20] "Data Analytics Manager"
## [21] "Business Data Analyst"
## [22] "Applied Data Scientist"
## [23] "Staff Data Analyst"
## [24] "ETL Engineer"
## [25] "Data DevOps Engineer"
## [26] "Head of Data"
## [27] "Data Science Manager"
## [28] "Data Manager"
## [29] "Machine Learning Researcher"
## [30] "Big Data Engineer"
## [31] "Data Specialist"
## [32] "Lead Data Analyst"
## [33] "BI Data Engineer"
## [34] "Director of Data Science"
## [35] "Machine Learning Scientist"
## [36] "MLOps Engineer"
## [37] "AI Scientist"
## [38] "Autonomous Vehicle Technician"
## [39] "Applied Machine Learning Scientist"
## [40] "Lead Data Scientist"
## [41] "Cloud Database Engineer"
## [42] "Financial Data Analyst"
## [43] "Data Infrastructure Engineer"
## [44] "Software Data Engineer"
## [45] "AI Programmer"
## [46] "Data Operations Engineer"
## [47] "BI Developer"
## [48] "Data Science Lead"
## [49] "Deep Learning Researcher"
## [50] "BI Analyst"
## [51] "Data Science Consultant"
## [52] "Data Analytics Specialist"
## [53] "Machine Learning Infrastructure Engineer"
## [54] "BI Data Analyst"
## [55] "Head of Data Science"
## [56] "Insight Analyst"
## [57] "Deep Learning Engineer"
## [58] "Machine Learning Software Engineer"
## [59] "Big Data Architect"
## [60] "Product Data Analyst"
## [61] "Computer Vision Software Engineer"
## [62] "Azure Data Engineer"
## [63] "Marketing Data Engineer"
## [64] "Data Analytics Lead"
## [65] "Data Lead"
## [66] "Data Science Engineer"
## [67] "Machine Learning Research Engineer"
## [68] "NLP Engineer"
## [69] "Manager Data Management"
## [70] "Machine Learning Developer"
## [71] "3D Computer Vision Researcher"
## [72] "Principal Machine Learning Engineer"
## [73] "Data Analytics Engineer"
## [74] "Data Analytics Consultant"
## [75] "Data Management Specialist"
## [76] "Data Science Tech Lead"
## [77] "Data Scientist Lead"
## [78] "Cloud Data Engineer"
## [79] "Data Operations Analyst"
## [80] "Marketing Data Analyst"
## [81] "Power BI Developer"
## [82] "Product Data Scientist"
## [83] "Principal Data Architect"
## [84] "Machine Learning Manager"
## [85] "Lead Machine Learning Engineer"
## [86] "ETL Developer"
## [87] "Cloud Data Architect"
## [88] "Lead Data Engineer"
## [89] "Head of Machine Learning"
## [90] "Principal Data Analyst"
## [91] "Principal Data Engineer"
## [92] "Staff Data Scientist"
## [93] "Finance Data Analyst"
unique(ds_salaries_Copy$employment_type)
## [1] "FT" "CT" "FL" "PT"
unique(ds_salaries_Copy$experience_level)
## [1] "SE" "MI" "EN" "EX"
#Data Cleaning: If there are missing values or data quality issues, you may need to clean the dataset.
# 7) Find all the NA values available in the dataset
#To view all the NA values inside the dataset
ds_salaries_Copy[!complete.cases(ds_salaries_Copy),]
## # A tibble: 0 × 11
## # ℹ 11 variables: work_year <dbl>, experience_level <chr>,
## # employment_type <chr>, job_title <chr>, salary <dbl>,
## # salary_currency <chr>, salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
# 8) Convert Job_Title to a factor with custom levels.
# factor and levels in r
# Convert "Job_Title" to a factor with custom levels
custom_levels <- c("Data Analyst", "Data Scientist", "Machine Learning Engineer", "Statistician", "Data Engineer")
ds_salaries_Copy$job_title <- factor(ds_salaries_Copy$job_title, levels = custom_levels)
# new custom levels
job_title_levels <- levels(ds_salaries_Copy$job_title)
print(job_title_levels)
## [1] "Data Analyst" "Data Scientist"
## [3] "Machine Learning Engineer" "Statistician"
## [5] "Data Engineer"
# Calculate the mean salary for each combination of factors
mean_salary_by_factors <- ds_salaries_Copy %>%
group_by(experience_level, employment_type,company_location) %>%
summarise(Mean_Salary = mean(salary))
## `summarise()` has grouped output by 'experience_level', 'employment_type'. You
## can override using the `.groups` argument.
mean_salary_by_factors
## # A tibble: 159 × 4
## # Groups: experience_level, employment_type [13]
## experience_level employment_type company_location Mean_Salary
## <chr> <chr> <chr> <dbl>
## 1 EN CT CZ 29000
## 2 EN CT US 100000
## 3 EN FL IR 100000
## 4 EN FL US 50000
## 5 EN FT AR 25000
## 6 EN FT AS 468333.
## 7 EN FT AT 50000
## 8 EN FT AU 79333.
## 9 EN FT BA 120000
## 10 EN FT BE 64750
## # ℹ 149 more rows
# 10) What are the median salaries for Indians in different positions related to Data Science and Machine Learning?
#using filter,median,groupby,summarise
# Filter the data for Indian salaries
indian_data <- ds_salaries_Copy %>%
filter(employee_residence == "IN" | company_location == "IN")
indian_data
## # A tibble: 73 × 11
## work_year experience_level employment_type job_title salary salary_currency
## <dbl> <chr> <chr> <fct> <dbl> <chr>
## 1 2022 MI FT Machine Le… 1.65e6 INR
## 2 2023 MI FT <NA> 6.5 e4 EUR
## 3 2022 EN FT <NA> 3 e5 USD
## 4 2023 MI FT <NA> 1.7 e6 INR
## 5 2023 EN FT Data Engin… 1.4 e6 INR
## 6 2023 MI FT Data Scien… 1.4 e6 INR
## 7 2023 EN FT <NA> 7 e4 USD
## 8 2023 MI FT <NA> 1.5 e6 INR
## 9 2023 EN FT Data Analy… 3 e4 USD
## 10 2023 SE FT Data Analy… 1.3 e6 INR
## # ℹ 63 more rows
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
# Group by position and calculate median salary
median_salaries <- indian_data %>%
group_by(job_title) %>%
summarise(Median_Salary = median(salary, na.rm = TRUE))
print(median_salaries)
## # A tibble: 5 × 2
## job_title Median_Salary
## <fct> <dbl>
## 1 Data Analyst 475000
## 2 Data Scientist 1400000
## 3 Machine Learning Engineer 1724998.
## 4 Data Engineer 1500000
## 5 <NA> 1292500
# 11) Find the top 10 countries with the highest average salary
#using mean,head,fill,reorder,bar chart
# Group by country and calculate average salary
average_salaries <- ds_salaries_Copy%>%
group_by(company_location) %>%
summarise(Average_Salary = mean(salary, na.rm = TRUE))
average_salaries
## # A tibble: 72 × 2
## company_location Average_Salary
## <chr> <dbl>
## 1 AE 100000
## 2 AL 10000
## 3 AM 50000
## 4 AR 25000
## 5 AS 468333.
## 6 AT 65167.
## 7 AU 98071.
## 8 BA 120000
## 9 BE 68625
## 10 BO 7500
## # ℹ 62 more rows
# Sort the data in descending order of average salary
average_salaries <- average_salaries %>%
arrange(desc(Average_Salary))
# Select the top 10 countries with the highest average salary
top_10_countries <- head(average_salaries, 10)
# Print the top 10 countries with their average salaries
print(top_10_countries)
## # A tibble: 10 × 2
## company_location Average_Salary
## <chr> <dbl>
## 1 CL 30400000
## 2 HU 8800000
## 3 JP 3408667.
## 4 ID 2107500
## 5 IN 1814034.
## 6 IL 830000
## 7 TH 618333.
## 8 HK 510000
## 9 AS 468333.
## 10 SG 308000
ggplot(top_10_countries, aes(x = reorder(company_location, -Average_Salary), y = Average_Salary)) +
geom_bar(stat = "identity", fill =c("skyblue","blue","red","orange","green","yellow","pink","white","black","grey") )+
labs(x = "Country", y = "Average Salary")

# 12) Median salary for each company size
#line chart
median_salaries_by_company_size <- ds_salaries_Copy %>%
group_by(company_size) %>%
summarise(MedianSalary = median(salary, na.rm = TRUE))
print(median_salaries_by_company_size)
## # A tibble: 3 × 2
## company_size MedianSalary
## <chr> <dbl>
## 1 L 131300
## 2 M 140000
## 3 S 73000
ggplot(median_salaries_by_company_size, aes(x = company_size, y = MedianSalary, group = 1)) +
geom_line() +
geom_point() +
labs(x = "Company Size", y = "Median Salary", title = "Median Salary by Company Size")

# 13) create pie chart of company size and employee type
# Create a pie chart for Company Size
company_size_counts <- ds_salaries_Copy %>%
group_by(company_size) %>%
summarise(Count = n())
company_size_counts
## # A tibble: 3 × 2
## company_size Count
## <chr> <int>
## 1 L 454
## 2 M 3153
## 3 S 148
# Create a pie chart for Employee Type
employee_type_counts <- ds_salaries_Copy %>%
group_by(employment_type) %>%
summarise(Count = n())
employee_type_counts
## # A tibble: 4 × 2
## employment_type Count
## <chr> <int>
## 1 CT 10
## 2 FL 10
## 3 FT 3718
## 4 PT 17
# 14) What is the highest salary in the dataset and which job title corresponds to it
highest_salary <- max(ds_salaries_Copy$salary,na.rm=TRUE)
# Find the job title corresponding to the highest salary
job_title <- ds_salaries_Copy %>%
filter(salary == highest_salary) %>%
select(job_title)
highest_salary
## [1] 30400000
job_title
## # A tibble: 1 × 1
## job_title
## <fct>
## 1 Data Scientist
# 15) What is the lowest salary in the dataset and which job title corresponds to it
lowest_salary <- min(ds_salaries_Copy$salary,na.rm=TRUE)
# Find the job title corresponding to the lowest salary
job_title <- ds_salaries_Copy %>%
filter(salary == lowest_salary) %>%
select(job_title)
lowest_salary
## [1] 6000
# 16) Histogram for Salary distribution in india
data_india <- ds_salaries_Copy[ds_salaries_Copy$company_location == "IN", ]
# Create a histogram for the Salary distribution in India
ggplot(data_india, aes(x = salary_in_usd)) +
geom_histogram(binwidth = 10000, fill = "skyblue",color="black") +
labs(x = "Salary (INR)", y = "Frequency", title = "Salary Distribution in India")

labs(x = "Salary (INR)", y = "Frequency", title = "Salary Distribution in India")
## $x
## [1] "Salary (INR)"
##
## $y
## [1] "Frequency"
##
## $title
## [1] "Salary Distribution in India"
##
## attr(,"class")
## [1] "labels"
# 17)Salary Distribution by Top 5 Job Titles and Employment Types
# Calculate the top 5 job titles (modify as needed)
top_job_titles <- ds_salaries_Copy %>%
group_by(job_title) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
head(5)
top_job_titles
## # A tibble: 5 × 2
## job_title Count
## <fct> <int>
## 1 Data Engineer 1040
## 2 <NA> 974
## 3 Data Scientist 840
## 4 Data Analyst 612
## 5 Machine Learning Engineer 289
# Filter the dataset to include only the top 5 job titles
filtered_data <- ds_salaries_Copy %>%
filter(job_title %in% top_job_titles$job_title)
filtered_data
## # A tibble: 3,755 × 11
## work_year experience_level employment_type job_title salary salary_currency
## <dbl> <chr> <chr> <fct> <dbl> <chr>
## 1 2023 SE FT <NA> 80000 EUR
## 2 2023 MI CT <NA> 30000 USD
## 3 2023 MI CT <NA> 25500 USD
## 4 2023 SE FT Data Scien… 175000 USD
## 5 2023 SE FT Data Scien… 120000 USD
## 6 2023 SE FT <NA> 222200 USD
## 7 2023 SE FT <NA> 136000 USD
## 8 2023 SE FT Data Scien… 219000 USD
## 9 2023 SE FT Data Scien… 141000 USD
## 10 2023 SE FT Data Scien… 147100 USD
## # ℹ 3,745 more rows
## # ℹ 5 more variables: salary_in_usd <dbl>, employee_residence <chr>,
## # remote_ratio <dbl>, company_location <chr>, company_size <chr>
# Group by Job_Title and Employment_Type, calculate mean salary
summary_data <- filtered_data %>%
group_by(job_title, employment_type) %>%
summarise(Mean_Salary = mean(salary_in_usd))
## `summarise()` has grouped output by 'job_title'. You can override using the
## `.groups` argument.
# Create a bar graph for Salary Distribution
ggplot(summary_data, aes(x = job_title, y = Mean_Salary, fill = employment_type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Job Title", y = "Mean Salary (USD)", title = "Salary Distribution by Top 5 Job Titles and Employment Types") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 18) Salary Distribution by Company Size and Employment Type using box plot
ggplot(ds_salaries_Copy, aes(x = work_year, y = salary_in_usd, fill = experience_level)) +
geom_boxplot() +
labs(x = "year", y = "Salary (USD)", title = "Salary Distribution by Year and Experience Level") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
