# Penalise scientific notation heavily so large salary figures are printed
# in full fixed notation.
options(scipen = 999)

# Read the jobs data from the working directory and list its column names.
jobs <- read.csv("jobs_in_data.csv")
names(jobs)
## [1] "work_year" "job_title" "job_category"
## [4] "salary_currency" "salary" "salary_in_usd"
## [7] "employee_residence" "experience_level" "employment_type"
## [10] "work_setting" "company_location" "company_size"
# Count the number of rows in each job category.
total_jobs <- table(jobs$job_category)

# Bar plot of the counts with a colour-keyed legend.
# The data has ten job categories, so ten colours are supplied: with only
# nine, R recycles the vector and two bars end up sharing a colour, which
# makes the legend ambiguous.
barplot(
  total_jobs,
  main = "Total amount of jobs",
  legend.text = rownames(total_jobs),
  col = c(
    "blue", "red", "green", "yellow", "purple",
    "brown", "orange", "cyan", "grey", "pink"
  )
)
This bar plot shows the total number of jobs in each job category in the data set.
# Median of salary_in_usd within each job_category, one row per category.
median_salary <- aggregate(
  salary_in_usd ~ job_category,
  data = jobs,
  FUN = median
)
print(median_salary)
## job_category salary_in_usd
## 1 BI and Visualization 130000
## 2 Cloud and Database 160000
## 3 Data Analysis 105000
## 4 Data Architecture and Modeling 150000
## 5 Data Engineering 139930
## 6 Data Management and Strategy 90000
## 7 Data Quality and Operations 82000
## 8 Data Science and Research 159100
## 9 Leadership and Management 138750
## 10 Machine Learning and AI 176000
# Draw axis labels perpendicular to their axis and widen the left margin so
# the category names fit next to the horizontal bars.
par(las = 2, mar = c(5, 8, 4, 2))

# Horizontal bar plot of the median salary per job category. The default
# axes are suppressed so a custom value axis can be drawn afterwards.
# median_salary has ten rows, so ten colours are supplied: with only nine,
# R recycles the vector and two bars end up sharing a colour.
barplot(
  median_salary$salary_in_usd,
  horiz = TRUE,
  names.arg = median_salary$job_category,
  main = "Median Salary for Each Job Category",
  col = c(
    "blue", "red", "green", "yellow", "purple",
    "brown", "orange", "cyan", "grey", "pink"
  ),
  axes = FALSE
)

# Tick positions every 20,000 from zero up to the largest median salary.
y_max <- max(median_salary$salary_in_usd)
y_ticks <- seq(0, y_max, by = 20000)

# With horiz = TRUE the salary values run along the bottom axis (side = 1).
# Label the ticks with thousands separators; las = 1 keeps them horizontal.
axis(side = 1, at = y_ticks, labels = format(y_ticks, big.mark = ","), las = 1)
This shows the median salary (in USD) for each job category.
# Correlation test between the salary and salary_in_usd columns
# (cor.test defaults to Pearson's product-moment correlation).
cor.test(jobs$salary, jobs$salary_in_usd)
##
## Pearson's product-moment correlation
##
## data: jobs$salary and jobs$salary_in_usd
## t = 728.74, df = 9353, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9909507 0.9916524
## sample estimates:
## cor
## 0.9913086
I wanted to see whether the two salary columns were related in any way. The result is a correlation of 0.9913.
# Draw all axis labels horizontally.
par(las = 1)

# Histogram of the USD salaries. The default x-axis is suppressed
# (xaxt = "n") so the custom axis below is the only one drawn; otherwise the
# two axes are plotted on top of each other.
hist(jobs$salary_in_usd, breaks = 15, xaxt = "n")

# Custom x-axis with a tick every 50,000, computed once and reused for both
# the positions and the labels.
salary_ticks <- seq(0, max(jobs$salary_in_usd), by = 50000)
axis(
  1,
  at = salary_ticks,
  labels = format(salary_ticks, big.mark = ",", trim = TRUE)
)
This tells us that most of the jobs in this table are in the $100,000 to
$150,000 range.
# Two-sample t-test comparing the means of salary_in_usd and salary.
# With default arguments t.test() runs Welch's unequal-variance test.
# NOTE(review): both vectors come from the same rows of `jobs`, so the samples
# are not independent -- confirm whether a paired test (paired = TRUE) was
# intended.
t_test_results <- t.test(jobs$salary_in_usd, y = jobs$salary)
print(t_test_results)
##
## Welch Two Sample t-test
##
## data: jobs$salary_in_usd and jobs$salary
## t = 0.40081, df = 18707, p-value = 0.6886
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1445.314 2188.342
## sample estimates:
## mean of x mean of y
## 150299.5 149928.0
This is the t-test comparing the salaries in US dollars with those same salaries in their original currencies.