# Use the CRAN cloud mirror for any package installation done in this session.
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Load the salary data set. read.csv() is used rather than
# read.table(sep = ","): read.table() also treats ' as a quote character and
# # as a comment marker by default, which can corrupt rows whose text fields
# contain an apostrophe or a hash.
my_data <- read.csv("./ds_salaries.csv", header = TRUE)

# Preview the first rows to check that the file was parsed as expected.
head(my_data)
## X work_year experience_level employment_type job_title salary salary_currency salary_in_usd
## 1 0 2020 MI FT Data Scientist 70000 EUR 79833
## 2 1 2020 SE FT Machine Learning Scientist 260000 USD 260000
## 3 2 2020 SE FT Big Data Engineer 85000 GBP 109024
## 4 3 2020 MI FT Product Data Analyst 20000 USD 20000
## 5 4 2020 SE FT Machine Learning Engineer 150000 USD 150000
## 6 5 2020 EN FT Data Analyst 72000 USD 72000
## employee_residence remote_ratio company_location company_size
## 1 DE 0 DE L
## 2 JP 0 JP S
## 3 GB 50 GB M
## 4 HN 0 HN S
## 5 US 50 US L
## 6 US 100 US L
# Attach psych for describe(), then print descriptive statistics for every
# column except the first one (the row-index column X).
library(psych)
describe(my_data[-1])
## vars n mean sd median trimmed mad min max range skew kurtosis
## work_year 1 607 2021.41 0.69 2022 2021.51 0.00 2020 2022 2 -0.73 -0.66
## experience_level* 2 607 3.13 1.03 3 3.28 1.48 1 4 3 -1.04 -0.10
## employment_type* 3 607 2.99 0.24 3 3.00 0.00 1 4 3 -4.14 45.81
## job_title* 4 607 21.96 10.49 18 21.00 7.41 1 50 49 0.88 0.40
## salary 5 607 324000.06 1544357.49 115000 118919.11 68706.65 4000 30400000 30396000 13.98 244.57
## salary_currency* 6 607 14.03 4.38 17 14.67 0.00 1 17 16 -1.03 -0.38
## salary_in_usd 7 607 112297.87 70957.26 101570 106157.63 62906.72 2859 600000 597141 1.66 6.26
## employee_residence* 8 607 41.41 18.27 56 43.66 0.00 1 57 56 -0.67 -1.22
## remote_ratio 9 607 70.92 40.71 100 76.08 0.00 0 100 100 -0.90 -0.90
## company_location* 10 607 36.89 16.03 49 39.07 0.00 1 50 49 -0.77 -1.09
## company_size* 11 607 1.81 0.65 2 1.76 0.00 1 3 2 0.21 -0.73
## se
## work_year 0.03
## experience_level* 0.04
## employment_type* 0.01
## job_title* 0.43
## salary 62683.54
## salary_currency* 0.18
## salary_in_usd 2880.07
## employee_residence* 0.74
## remote_ratio 1.65
## company_location* 0.65
## company_size* 0.03
# Working copy of the data without the first (row-index) column.
mydata <- my_data[-1]
head(mydata, n = 6L)
## work_year experience_level employment_type job_title salary salary_currency salary_in_usd
## 1 2020 MI FT Data Scientist 70000 EUR 79833
## 2 2020 SE FT Machine Learning Scientist 260000 USD 260000
## 3 2020 SE FT Big Data Engineer 85000 GBP 109024
## 4 2020 MI FT Product Data Analyst 20000 USD 20000
## 5 2020 SE FT Machine Learning Engineer 150000 USD 150000
## 6 2020 EN FT Data Analyst 72000 USD 72000
## employee_residence remote_ratio company_location company_size
## 1 DE 0 DE L
## 2 JP 0 JP S
## 3 GB 50 GB M
## 4 HN 0 HN S
## 5 US 50 US L
## 6 US 100 US L
work_year: The year the salary was paid (Numeric, Interval); experience_level: Experience level in the job during the year, with the following possible values: EN = Entry-level/Junior, MI = Mid-level/Intermediate, SE = Senior-level/Expert, EX = Executive-level/Director (Categorical, Ordinal); employment_type: The type of employment: PT = Part-time, FT = Full-time, CT = Contract, FL = Freelance (Categorical, Nominal); job_title: The role worked in during the recorded year (Categorical, Nominal); salary: Gross salary paid during the year (Numeric, Ratio); salary_currency: Currency of the salary paid during the recorded year (Categorical, Nominal); salary_in_usd: Salary in USD, converted from the original currency (Numeric, Ratio); employee_residence: The country of residence of the employee (Categorical, Nominal); remote_ratio: Share of work completed remotely: 0 = No remote work (less than 20%), 50 = Partially remote, 100 = Fully remote (more than 80%) (Categorical, Ordinal); company_location: The country of the employer’s main office or contracting branch as an ISO 3166 country code (Categorical, Nominal); company_size: The average number of people who worked for the company during the year: S = fewer than 50 employees (small), M = 50 to 250 employees (medium), L = more than 250 employees (large) (Categorical, Ordinal)
# Categorical columns with a meaningful order get their levels spelled out so
# that tables and plots list them from lowest to highest.
my_data$experience_level <- factor(
  my_data$experience_level,
  levels = c("EN", "MI", "SE", "EX")
)
my_data$company_size <- factor(
  my_data$company_size,
  levels = c("S", "M", "L")
)

# The remaining categorical columns take factor()'s default level order.
nominal_cols <- c(
  "employment_type",
  "job_title",
  "salary_currency",
  "employee_residence",
  "company_location"
)
my_data[nominal_cols] <- lapply(my_data[nominal_cols], factor)
I analyzed the median and mean of the salary in USD variable: the median is $101,570 and the mean is $112,298. The median indicates that 50% of salaries in USD are $101,570 or below, and 50% are above $101,570. The mean is the sum of all salaries divided by the number of salaries (607). Additionally, the 3rd quartile is $150,000, which means that 75% of salaries in USD are $150,000 or below, and the other 25% are above $150,000.
# Print a per-column summary of my_data: quartiles and mean for numeric
# columns, level counts for factor columns. The row-index column X is still
# part of my_data here, so it is summarised as well.
summary(my_data)
## X work_year experience_level employment_type job_title salary
## Min. : 0.0 Min. :2020 EN: 88 CT: 5 Data Scientist :143 Min. : 4000
## 1st Qu.:151.5 1st Qu.:2021 MI:213 FL: 4 Data Engineer :132 1st Qu.: 70000
## Median :303.0 Median :2022 SE:280 FT:588 Data Analyst : 97 Median : 115000
## Mean :303.0 Mean :2021 EX: 26 PT: 10 Machine Learning Engineer: 41 Mean : 324000
## 3rd Qu.:454.5 3rd Qu.:2022 Research Scientist : 16 3rd Qu.: 165000
## Max. :606.0 Max. :2022 Data Science Manager : 12 Max. :30400000
## (Other) :166
## salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
## USD :398 Min. : 2859 US :332 Min. : 0.00 US :355 S: 83
## EUR : 95 1st Qu.: 62726 GB : 44 1st Qu.: 50.00 GB : 47 M:326
## GBP : 44 Median :101570 IN : 30 Median :100.00 CA : 30 L:198
## INR : 27 Mean :112298 CA : 29 Mean : 70.92 DE : 28
## CAD : 18 3rd Qu.:150000 DE : 25 3rd Qu.:100.00 IN : 24
## JPY : 3 Max. :600000 FR : 18 Max. :100.00 FR : 15
## (Other): 22 (Other):129 (Other):108
#install.packages("car")
# Attach the plotting packages before the first ggplot() call; calling
# ggplot() ahead of library(ggplot2) fails with "could not find function"
# in a fresh session.
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale

# Boxplot of salary in USD for each experience level.
ggplot(my_data, aes(x = experience_level, y = salary_in_usd)) +
  geom_boxplot() +
  labs(
    title = "Salary Distribution by Experience Level",
    x = "Experience Level",
    y = "Salary in USD"
  )
# Scatter plot of salary in USD against work year with a fitted linear trend
# line; the y axis uses comma-formatted labels and the x axis has one break
# per year.
ggplot(data = mydata, mapping = aes(x = work_year, y = salary_in_usd)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(color = "blue", method = "lm") +
  scale_x_continuous(breaks = seq(from = 2020, to = 2022, by = 1)) +
  scale_y_continuous(labels = comma) +
  xlab("Year Salary was Dispersed") +
  ylab("Salary in USD") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Research Question: Does experience level significantly impact salary in USD? Hypothesis: - H0: There is no significant difference in salary based on experience level. - H1: There is a significant difference in salary based on experience level.
# Shapiro-Wilk normality test on the salary_in_usd column of my_data.
# NOTE(review): this tests the pooled salaries, not the values within each
# experience level or the model residuals -- confirm that is what is intended.
shapiro.test(my_data$salary_in_usd)
##
## Shapiro-Wilk normality test
##
## data: my_data$salary_in_usd
## W = 0.89836, p-value < 2.2e-16
library(onewaytests)
##
## Attaching package: 'onewaytests'
## The following object is masked from 'package:psych':
##
## describe
# Welch's heteroscedastic F test of salary in USD across experience levels.
welch.test(formula = salary_in_usd ~ experience_level, data = my_data)
##
## Welch's Heteroscedastic F Test (alpha = 0.05)
## -------------------------------------------------------------
## data : salary_in_usd and experience_level
##
## statistic : 69.35689
## num df : 3
## denom df : 101.6379
## p.value : 1.710586e-24
##
## Result : Difference is statistically significant.
## -------------------------------------------------------------
# Fit a one-way ANOVA of salary in USD on experience level and print the
# ANOVA table. anova_result is kept because later code reads it.
anova_result <- aov(
  formula = salary_in_usd ~ experience_level,
  data = my_data
)
summary(object = anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## experience_level 3 7.428e+11 2.476e+11 64.68 <2e-16 ***
## Residuals 603 2.308e+12 3.828e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Kruskal-Wallis rank sum test of salary_in_usd across the experience_level
# groups in my_data.
kruskal.test(salary_in_usd ~ experience_level, data = my_data)
##
## Kruskal-Wallis rank sum test
##
## data: salary_in_usd by experience_level
## Kruskal-Wallis chi-squared = 190.11, df = 3, p-value < 2.2e-16
# Effect size (eta squared) for the one-way ANOVA: the share of the total sum
# of squares that is attributed to the first term of the model.
anova_table <- summary(anova_result)[[1]]
ss_effect <- anova_table[["Sum Sq"]][1]
ss_total <- sum(anova_table[["Sum Sq"]])
eta_squared_value <- ss_effect / ss_total
paste("Effect Size (Eta Squared):", round(eta_squared_value, digits = 3))
## [1] "Effect Size (Eta Squared): 0.243"