# Load libraries
library('DBI') # Database interface in R
library('RMySQL') # MySQL driver for R
library('tidyr') # Data tidying functions
library('dplyr') # Data tidying functions
library(ggplot2)
# Connect to SQL DB in R
mydb <- dbConnect(MySQL(), user='chhiring.lama65', password='lama65', dbname='chhiring.lama65', host='cunydata607sql.mysql.database.azure.com')
# Retrieve data from database tables
job <- dbGetQuery(mydb,'select * from job')
company <- dbGetQuery(mydb,'select * from company')
# Join tables based on a common column 'cid'
total_df_joined <- left_join(company, job, by='cid')
# Join tables and filter data
total_df <- left_join(company, job, by='cid')
total_df <- subset(total_df, total_df$employment_type=='FT')
# Count null rows in the dataframe
count_null_rows <- sum(rowSums(is.na(total_df)) > 0)
# Calculate mean and median salary by company
mean_salary_by_company <- total_df |> group_by(cid, salary_currency) |> # Group by 'cid' and 'salary_currency'
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n()) # Calculate mean and median salaries, and count of records
result_by_company <- mean_salary_by_company |> arrange(desc(mean_salary)) # Arrange by descending mean salary
result_by_company <- subset(result_by_company, result_by_company$count>5) # Subset companies with more than 5 records
# Add cost of living information
eur_avg_monthly_cost <- mean(c(1260, 1128, 1414)) # Calculate average monthly cost for EUR currency
cost_of_living <- tibble(
salary_currency = c("USD", "GBP", "EUR", "INR"), # Currency codes
monthly_cost = c(1951, 1929, eur_avg_monthly_cost, 423) # Monthly costs for each currency
)
w_monthly_cost <- left_join(result_by_company, cost_of_living, by='salary_currency') # Join salary and cost of living data
w_monthly_cost$annual_cost <- w_monthly_cost$monthly_cost*12 # Calculate annual cost
w_monthly_cost$proportion_income_monthly_exp <- w_monthly_cost$annual_cost/w_monthly_cost$mean_salary # Calculate wealthiness based on annual cost and mean salary
# Arrange companies by wealthiness
wealthiness <- w_monthly_cost |> arrange(proportion_income_monthly_exp) # Arrange by descending wealthiness
wealthiness[c("cid","salary_currency", "mean_salary","proportion_income_monthly_exp")]
## # A tibble: 12 × 4
## # Groups: cid [12]
## cid salary_currency mean_salary proportion_income_monthly_exp
## <chr> <chr> <dbl> <dbl>
## 1 US_100_US_L USD 171861. 0.136
## 2 US_50_US_L USD 160917. 0.145
## 3 US_0_US_L USD 160882. 0.146
## 4 US_100_US_M USD 145474. 0.161
## 5 US_0_US_M USD 138308. 0.169
## 6 US_100_US_S USD 117423. 0.199
## 7 IN_100_IN_L INR 21382. 0.237
## 8 GR_100_GR_M EUR 56773 0.268
## 9 GB_0_GB_M GBP 85797. 0.270
## 10 ES_100_ES_M EUR 55351. 0.275
## 11 GB_50_GB_L GBP 81310. 0.285
## 12 GB_100_GB_M GBP 73514. 0.315
# Calculate mean and median salary by job title
mean_salary_by_job_title <- total_df |> group_by(job_title) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange job titles by descending mean salary
result_job_title <- mean_salary_by_job_title |> arrange(desc(mean_salary))
result_job_title <- subset(result_job_title, result_job_title$count>5)
result_job_title
## # A tibble: 15 × 4
## job_title mean_salary median_salary count
## <chr> <dbl> <dbl> <int>
## 1 Director of Data Science 195074 168000 7
## 2 Principal Data Scientist 181783. 168218 6
## 3 Machine Learning Scientist 179329. 160000 7
## 4 Data Architect 177874. 180000 11
## 5 Data Science Manager 158328. 155750 12
## 6 Lead Data Engineer 139724. 121594. 6
## 7 Data Analytics Manager 127134. 120000 7
## 8 Data Engineer 111314. 103136 118
## 9 Research Scientist 109020. 76264. 16
## 10 Data Scientist 104032. 100000 127
## 11 Machine Learning Engineer 101165. 87425 39
## 12 Data Analyst 91074. 90000 81
## 13 BI Data Analyst 74755. 76500 6
## 14 Data Science Consultant 69421. 76833 7
## 15 Big Data Engineer 51974 41306. 8
# Perform sub-analysis for specific job titles and locations
engineer_consultant_country <- subset(total_df, total_df$job_title=='Big Data Engineer' | total_df$job_title=='Data Science Consultant') |>
group_by(company_location) |> summarize(mean_by_country_title = mean(salary_in_usd), median = median(salary_in_usd), count = n()) |> arrange(desc(mean_by_country_title))
engineer_consultant_country
## # A tibble: 8 × 4
## company_location mean_by_country_title median count
## <chr> <dbl> <dbl> <int>
## 1 GB 111536. 111536. 2
## 2 US 87667. 90000 3
## 3 DE 72499 76833 3
## 4 ES 69741 69741 1
## 5 RO 60000 60000 1
## 6 MD 18000 18000 1
## 7 IN 14849. 16228 3
## 8 CH 5882 5882 1
# Calculate mean salary by remote work ratio
mean_salary_by_remote_ratio <- total_df |> group_by(remote_ratio) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange remote work ratios by descending mean salary
result_remote_ratio <- mean_salary_by_remote_ratio |> arrange(desc(mean_salary))
result_remote_ratio <- subset(result_remote_ratio, result_remote_ratio$count>5)
result_remote_ratio
## # A tibble: 3 × 4
## remote_ratio mean_salary median_salary count
## <int> <dbl> <dbl> <int>
## 1 100 121169. 112900 335
## 2 0 106500. 98579 120
## 3 50 84371. 70912 91
# Perform sub-analysis for specific remote work ratios and company locations
mean_salary_by_remote_ratio <- total_df |> group_by(remote_ratio, company_location) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange remote work ratios by descending mean salary
result_remote_ratio <- mean_salary_by_remote_ratio |> arrange(desc(mean_salary))
# Subset remote work ratios with specific values and counts
result_remote_ratio_100 <- subset(result_remote_ratio, remote_ratio==100 & count>2)
result_remote_ratio_50 <- subset(result_remote_ratio, remote_ratio==50 & count>2)
result_remote_ratio_0 <- subset(result_remote_ratio, remote_ratio==0 & count>2)
result_remote_ratio_100
## # A tibble: 8 × 5
## # Groups: remote_ratio [1]
## remote_ratio company_location mean_salary median_salary count
## <int> <chr> <dbl> <dbl> <int>
## 1 100 US 147454. 138475 226
## 2 100 CA 103655. 78791 17
## 3 100 DE 91934. 79197 11
## 4 100 GB 74946. 77742 16
## 5 100 PL 66082. 41094. 4
## 6 100 ES 54388. 48372. 12
## 7 100 GR 52176. 46714. 8
## 8 100 IN 20953. 19609 11
result_remote_ratio_50
## # A tibble: 7 × 5
## # Groups: remote_ratio [1]
## remote_ratio company_location mean_salary median_salary count
## <int> <chr> <dbl> <dbl> <int>
## 1 50 US 131379. 120000 20
## 2 50 CA 108896. 99703 7
## 3 50 GB 82033. 76833 11
## 4 50 DE 76022. 65013 9
## 5 50 JP 71692. 74000 3
## 6 50 FR 58925. 56738 11
## 7 50 IN 45228. 35160. 6
result_remote_ratio_0
## # A tibble: 5 × 5
## # Groups: remote_ratio [1]
## remote_ratio company_location mean_salary median_salary count
## <int> <chr> <dbl> <dbl> <int>
## 1 0 US 140285. 129000 63
## 2 0 DE 87142. 79833 5
## 3 0 GB 87073. 78526 19
## 4 0 CA 69750 71000 4
## 5 0 IN 29784. 28210. 6
# Calculate mean salary by offered currency
mean_salary_by_currency_offered <- total_df |> group_by(salary_currency) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange currencies by descending mean salary
result_currency_offered <- mean_salary_by_currency_offered |> arrange(desc(mean_salary))
result_currency_offered <- subset(result_currency_offered, result_currency_offered$count>5)
result_currency_offered
## # A tibble: 5 × 4
## salary_currency mean_salary median_salary count
## <chr> <dbl> <dbl> <int>
## 1 USD 137870. 130000 348
## 2 CAD 97223. 83264. 18
## 3 GBP 81744. 78526 43
## 4 EUR 67772. 62726 87
## 5 INR 28625. 22124 26
# Calculate mean salary by year
mean_salary_by_year <- total_df |> group_by(work_year) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange years by ascending mean salary
result_by_year <- mean_salary_by_year |> arrange(mean_salary)
result_by_year <- subset(result_by_year, result_by_year$count>5)
result_by_year
## # A tibble: 3 × 4
## work_year mean_salary median_salary count
## <int> <dbl> <dbl> <int>
## 1 2020 98543. 78396. 68
## 2 2021 100081. 83872 204
## 3 2022 123839. 120000 274
# Add inflation data
inflation <- tibble(
work_year = c(2020, 2021, 2022),
inflation_rate = c(1.40, 7.00, 6.50)
)
results_inflation <- left_join(result_by_year, inflation, by = 'work_year')
# Figure out salary growth vs inflation
results_inflation <- results_inflation |> mutate(percent_change = (mean_salary / lag(mean_salary) - 1) * 100)
perfomance_vs_inflation <- (125195)/((1 + 1.40 / 100)*(1 + 7 / 100)*(1 + 6.5 / 100)*(98543))
perfomance_vs_inflation <- (perfomance_vs_inflation - 1)*100
paste('Salary growth has outperformed inflation rate by ', perfomance_vs_inflation, '% since 2020', sep = '')
## [1] "Salary growth has outperformed inflation rate by 9.9486406753853% since 2020"
# Calculate mean salary by experience level
mean_salary_by_exp_level <- total_df |> group_by(experience_level) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
# Arrange experience levels by descending mean salary
result_exp_level <- mean_salary_by_exp_level |> arrange(desc(mean_salary))
result_exp_level <- subset(result_exp_level, result_exp_level$count>5)
result_exp_level
## # A tibble: 4 × 4
## experience_level mean_salary median_salary count
## <chr> <dbl> <int> <int>
## 1 EX 190728. 167875 25
## 2 SE 138839. 136000 241
## 3 MI 88203. 76958 201
## 4 EN 64457. 59102 79
# Define years of experience for each level
yoe <- tibble(
experience_level = c('EX', 'SE', 'MI', 'EN'),
year_of_experience = c(10, 8, 4, 1)
)
# Join experience levels with years of experience
exp_leve_w_years <- left_join(result_exp_level, yoe, by = 'experience_level')
exp_leve_w_years <- exp_leve_w_years |> arrange(mean_salary)
exp_leve_w_years <- exp_leve_w_years |> arrange(mean_salary)
exp_leve_w_years
## # A tibble: 4 × 5
## experience_level mean_salary median_salary count year_of_experience
## <chr> <dbl> <int> <int> <dbl>
## 1 EN 64457. 59102 79 1
## 2 MI 88203. 76958 201 4
## 3 SE 138839. 136000 241 8
## 4 EX 190728. 167875 25 10
# Calculate mean salary changes and average pay increase by year
exp_leve_w_years <- exp_leve_w_years |>
mutate(mean_salary_change = mean_salary - lag(mean_salary, default = first(mean_salary)),
year_of_experience_change = year_of_experience - lag(year_of_experience, default = first(year_of_experience)))
exp_leve_w_years$mean_salary_change_average <- exp_leve_w_years$mean_salary_change/exp_leve_w_years$year_of_experience_change
average_pay_increase_by_year <- mean(exp_leve_w_years$mean_salary_change_average, na.rm = TRUE)
average_pay_increase_by_year
## [1] 15506.22
# Reshape data into long format
data_long <- pivot_longer(exp_leve_w_years, cols = c(mean_salary, median_salary),
names_to = "salary_type", values_to = "salary")
# Create a grouped bar chart
ggplot(data = data_long, aes(x = experience_level, y = salary, fill = salary_type)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black") +
labs(title = "Mean and Median Salary by Experience Level",
x = "Experience Level",
y = "Income",
fill = "Salary Type") +
scale_fill_manual(values = c("mean_salary" = "blue", "median_salary" = "red")) +
theme_minimal()
### Analyze pay by employee residence
# Calculate mean salary by employee residence
mean_salary_by_employee_res <- total_df |> group_by(employee_residence) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
result_employee_res <- mean_salary_by_employee_res |> arrange(desc(mean_salary))
result_employee_res <- subset(result_employee_res, result_employee_res$count>5)
result_employee_res
## # A tibble: 10 × 4
## employee_residence mean_salary median_salary count
## <chr> <dbl> <dbl> <int>
## 1 US 149096. 138000 291
## 2 JP 103538. 74000 7
## 3 CA 97084. 81896. 26
## 4 DE 89871. 79515 22
## 5 GB 81470. 78526 43
## 6 ES 60968. 52230. 14
## 7 FR 59887. 57920 18
## 8 GR 56446. 52209 12
## 9 PT 42862. 53090 6
## 10 IN 38423. 22611 29
# Calculate mean salary by company location
mean_salary_by_company_loc <- total_df |> group_by(company_location) |>
summarize(mean_salary = mean(salary_in_usd), median_salary = median(salary_in_usd), count = n())
result_company_loc <- mean_salary_by_company_loc |> arrange(desc(mean_salary))
result_company_loc <- subset(result_company_loc, result_company_loc$count>5)
result_company_loc
## # A tibble: 9 × 4
## company_location mean_salary median_salary count
## <chr> <dbl> <dbl> <int>
## 1 US 144952. 135000 309
## 2 JP 114127. 75682 6
## 3 CA 100122. 81896. 28
## 4 DE 85247. 79197 25
## 5 GB 81650. 78526 46
## 6 FR 63971. 56738 15
## 7 ES 56345. 49461 13
## 8 GR 52027. 48680 10
## 9 IN 29589. 22611 23