Dataset : Data Science Salaries 2023 💸
“Salaries of Different Data Science Fields in the Data Science Domain”. Sumber dataset: Kaggle. Kreator dataset menyatakan “Data has been sourced from aijobs.net” untuk gaji tahunan (yearly salary).
Dengan begitu dapat kita pahami bahwa dataset tersebut menggambarkan perbedaan gaji tiap pekerjaan di bidang data (terutama data scientist), yang dikhususkan untuk pendapatan per tahun (annual salary).
# Packages used below: dplyr (wrangling), plotly, glue (tooltip strings),
# scales (number labels).
# NOTE(review): ggplot2 is never attached explicitly although ggplot()/theme()
# are used later; presumably it is attached through plotly -- confirm.
library(dplyr)
library(plotly)
library(glue)
library(scales)

# Read the salary data from the working directory and preview the first rows.
ds_salary <- read.csv("ds_salaries.csv")
head(ds_salary)

# dplyr: column-wise overview of names, types and first values.
glimpse(ds_salary)
#> Rows: 3,755
#> Columns: 11
#> $ work_year <int> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 202…
#> $ experience_level <chr> "SE", "MI", "MI", "SE", "SE", "SE", "SE", "SE", "SE…
#> $ employment_type <chr> "FT", "CT", "CT", "FT", "FT", "FT", "FT", "FT", "FT…
#> $ job_title <chr> "Principal Data Scientist", "ML Engineer", "ML Engi…
#> $ salary <int> 80000, 30000, 25500, 175000, 120000, 222200, 136000…
#> $ salary_currency <chr> "EUR", "USD", "USD", "USD", "USD", "USD", "USD", "U…
#> $ salary_in_usd <int> 85847, 30000, 25500, 175000, 120000, 222200, 136000…
#> $ employee_residence <chr> "ES", "US", "US", "CA", "CA", "US", "US", "CA", "CA…
#> $ remote_ratio <int> 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, 100, 100…
#> $ company_location <chr> "ES", "US", "US", "CA", "CA", "US", "US", "CA", "CA…
#> $ company_size <chr> "L", "S", "S", "M", "M", "L", "L", "M", "M", "M", "…
Data Science Job Salaries Dataset contains 11 columns, each are:
IN BAHASA INDONESIA
# Convert the categorical columns to factors in one pass; job_title, salary
# and salary_in_usd keep their original types.
ds_salary <- ds_salary %>%
  mutate(
    across(
      c(
        work_year,
        experience_level,
        employment_type,
        salary_currency,
        employee_residence,
        remote_ratio,
        company_location,
        company_size
      ),
      as.factor
    )
  )

# Re-inspect the column types after the conversion.
glimpse(ds_salary)
#> Rows: 3,755
#> Columns: 11
#> $ work_year <fct> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 202…
#> $ experience_level <fct> SE, MI, MI, SE, SE, SE, SE, SE, SE, SE, SE, SE, SE,…
#> $ employment_type <fct> FT, CT, CT, FT, FT, FT, FT, FT, FT, FT, FT, FT, FT,…
#> $ job_title <chr> "Principal Data Scientist", "ML Engineer", "ML Engi…
#> $ salary <int> 80000, 30000, 25500, 175000, 120000, 222200, 136000…
#> $ salary_currency <fct> EUR, USD, USD, USD, USD, USD, USD, USD, USD, USD, U…
#> $ salary_in_usd <int> 85847, 30000, 25500, 175000, 120000, 222200, 136000…
#> $ employee_residence <fct> ES, US, US, CA, CA, US, US, CA, CA, US, US, US, US,…
#> $ remote_ratio <fct> 100, 100, 100, 100, 100, 0, 0, 0, 0, 0, 0, 100, 100…
#> $ company_location <fct> ES, US, US, CA, CA, US, US, CA, CA, US, US, US, US,…
#> $ company_size <fct> L, S, S, M, M, L, L, M, M, M, M, M, M, L, L, M, M, …
Modifikasi Kolom
Kita tidak akan mengambil kolom salary dan salary_currency karena masih tergolong subjektif dalam kasus kali ini. Setelah penyortiran kolom, simpan ke dalam objek ds_select.
# dplyr / with piping: drop the salary and salary_currency columns and keep
# the result as ds_select.
ds_select <- ds_salary %>%
  select(-c(salary, salary_currency))

# dplyr: count the missing values in every remaining column.
ds_select %>%
  is.na() %>%
  colSums()
karena tidak ada nilai yg NA, maka tidak dilakukan handling missing value
tema algoritma untuk branding visualization
# Partial ggplot2 theme shared by the plots below: dark plot and legend
# backgrounds, white text, and only dashed horizontal major grid lines.
theme_algoritma <- theme(
  legend.key = element_rect(fill = "black"),
  legend.background = element_rect(color = "white", fill = "#263238"),
  plot.subtitle = element_text(size = 6, color = "white"),
  panel.background = element_rect(fill = "#dddddd"),
  panel.border = element_rect(fill = NA),
  panel.grid.minor.x = element_blank(),
  panel.grid.major.x = element_blank(),
  panel.grid.major.y = element_line(color = "darkgrey", linetype = 2),
  panel.grid.minor.y = element_blank(),
  plot.background = element_rect(fill = "#263238"),
  text = element_text(color = "white"),
  axis.text = element_text(color = "white")
)
📊 Tahapan pembuatan interactive plot menggunakan
plotly:
- ggplot()
- ggplotly()
- job_title yang dipublish berdasarkan tahun
- Data Science Fields yang dipublish berdasarkan experience_level
- (salary_in_usd) berdasarkan experience_level
- (salary_in_usd) pada tiap company_size
job_title yang di
publish berdasarkan tahun
# Keep one row per (job_title, work_year) pair, then count the distinct job
# titles in each year.
plot_agg1 <- ds_select %>%
  distinct(job_title, work_year, .keep_all = TRUE) %>% # remove duplicates
  group_by(work_year) %>%
  summarise(freq = n())
Plot Statis 1
# Static plot 1 (ggplot2): one bar per year from plot_agg1, with the count
# printed above each bar. The `text` aesthetic holds a two-line glue label
# (year and frequency); the string literal keeps its original line break.
plot_1 <- ggplot(
  data = plot_agg1,
  mapping = aes(
    x = work_year,
    y = freq,
    text = glue("Tahun : {work_year}
Frekuensi : {freq}")
  )
) +
  geom_col(fill = "#FF7F50", width = 0.3) +
  geom_text(mapping = aes(label = freq), nudge_y = 2) +
  labs(
    title = "Number of Distinct 'Job Titles' in Data Science Fields by year",
    x = NULL,
    y = NULL,
    caption = "Source: aijobs.net"
  ) +
  theme_algoritma

# Render the static plot.
plot_1
> Insight:
job_title yang dipublish berdasarkan tahun: dari 2020-2022 perkembangan job_title mengalami peningkatan yang signifikan. Seiring berjalannya waktu, mungkin saja suatu perusahaan membutuhkan role data yang lebih spesifik dalam menjalankan jobdesk-nya.
Data Science Fields yang dipublish
berdasarkan experience_level
# Replace the experience_level codes with readable labels through a lookup
# table. Values without an entry in the table are left unchanged, so the
# column is always a plain (unnamed) character vector and re-running this
# step does not corrupt it.
experience_labels <- c(
  "EN" = "Entry Level",
  "EX" = "Executive Level",
  "MI" = "Mid Level",
  "SE" = "Senior Level"
)
experience_codes <- as.character(ds_select$experience_level)
has_label <- experience_codes %in% names(experience_labels)
experience_codes[has_label] <- experience_labels[experience_codes[has_label]]
ds_select$experience_level <- experience_codes

# Count the rows for every work_year / experience_level combination.
plot_agg2 <- ds_select %>%
  group_by(work_year, experience_level) %>%
  summarise(freq = n()) %>%
  ungroup()
Plot Statis 2
# Static plot 2: one line per experience_level showing the yearly row counts
# from plot_agg2, with points carrying a glue-built `text` label and the
# count printed next to each point.
# The trailing comma of the original labs() call was removed so that no empty
# argument is passed.
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept here -- confirm against the installed version.
plot_2 <- plot_agg2 %>%
  ggplot(mapping = aes(x = work_year, y = freq)) +
  geom_line(
    aes(group = experience_level, color = experience_level),
    size = 1.2
  ) +
  geom_point(
    aes(
      text = glue("{experience_level}
Frekuensi: {freq}"),
      color = experience_level
    ),
    size = 2
  ) +
  geom_text(aes(label = freq), check_overlap = TRUE) +
  labs(
    title = "Number of Data Science Fields published by Experience",
    x = NULL,
    y = NULL,
    caption = "Source: aijobs.net"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  theme_algoritma

# Render the static plot.
plot_2
> Insight:
- experience_level menunjukkan tren: tiap kategori experience_level mengalami kenaikan tiap tahun.
- Senior Level mengalami kenaikan yang paling tinggi, dimulai pada periode 2021-2022 dan terus meningkat hingga periode 2022-2023; bisa jadi pada saat itu perusahaan sedang berlomba-lomba untuk merekrut tenaga ahli yang berpengalaman di bidang data tech dan bisa me-manage tim data.
- Executive Level tidak begitu signifikan karena role tersebut bisa jadi tetap diisi oleh orang yang sudah lama di perusahaan, baik dari sisi internal ataupun yang berpengaruh dalam tatanan struktural perusahaan itu dan mengerti benar akan role tersebut.
(salary_in_usd) berdasarkan
experience_level
# Mean salary_in_usd (rounded to 2 decimals) for every
# work_year / experience_level combination.
plot_agg3 <- ds_select %>%
  group_by(work_year, experience_level) %>%
  summarise(mean_salary = round(mean(salary_in_usd), 2)) %>%
  ungroup()
Plot Statis 3
# Static plot 3: one line per experience_level showing mean_salary per year
# from plot_agg3, with points carrying a glue-built `text` label and an
# abbreviated salary label next to each point.
# The trailing commas of the original geom_line() and labs() calls were
# removed so that no empty argument is passed.
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines, and newer scales releases deprecate label_number_si(); both are kept
# as written -- confirm against the installed versions.
plot_3 <- plot_agg3 %>%
  ggplot(mapping = aes(x = work_year, y = mean_salary)) +
  geom_line(
    aes(group = experience_level, color = experience_level),
    size = 1.2
  ) +
  geom_point(
    aes(
      text = glue("{experience_level}
mean_salary: ${mean_salary}"),
      color = experience_level
    ),
    size = 2
  ) +
  geom_text(
    aes(label = label_number_si()(mean_salary)),
    check_overlap = TRUE
  ) +
  labs(
    title = "Annual Average Salary(US$) of Data Science Fields published by Experience",
    x = NULL,
    y = NULL,
    caption = "Source: aijobs.net"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  theme_algoritma

# Render the static plot.
plot_3
> Insight:
Executive Level mengalami kenaikan, diperkirakan karena role tersebut mengambil peranan penting dalam pengambilan keputusan suatu perusahaan pada periode kritis yang sedang terjadi.
company_size by
average
# Mean salary_in_usd (rounded to 2 decimals) for every
# company_size / work_year combination.
plot_agg4 <- ds_select %>%
  group_by(company_size, work_year) %>%
  summarise(salary_movement = round(mean(salary_in_usd), 2)) %>%
  ungroup()
Plot Statis 4
# Static plot 4: one line per company_size showing salary_movement per year
# from plot_agg4, with points carrying a glue-built `text` label and an
# abbreviated salary label next to each point.
# The trailing commas of the original geom_line() and labs() calls were
# removed so that no empty argument is passed.
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines, and newer scales releases deprecate label_number_si(); both are kept
# as written -- confirm against the installed versions.
plot_4 <- plot_agg4 %>%
  ggplot(mapping = aes(x = work_year, y = salary_movement)) +
  geom_line(
    aes(group = company_size, color = company_size),
    size = 1.2
  ) +
  geom_point(
    aes(
      text = glue("Company Size: {company_size}
Salary Movement: ${salary_movement}"),
      color = company_size
    ),
    size = 2
  ) +
  geom_text(aes(label = label_number_si()(salary_movement))) +
  labs(
    title = "Trend of Salary Movement by Company Size",
    x = NULL,
    y = NULL,
    color = "Company Size",
    caption = "Source: aijobs.net"
  ) +
  theme(legend.position = "bottom") +
  theme_algoritma

# Render the static plot.
plot_4
> Insight:
Dapat diakses disini: link rpubs