library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Read the AI job-postings CSV into `ai_jobs`; column types are guessed by readr.
# NOTE(review): absolute, user-specific path — presumably only resolves on the
# author's machine; consider a project-relative path.
ai_jobs <- readr::read_csv("C:/Users/Lenovo/Documents/School/Mgr/R - Ekonometria/Cvicenia/My dataset/ai_job_dataset.csv")
## Rows: 15000 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): job_id, job_title, salary_currency, experience_level, employment_...
## dbl (5): salary_usd, remote_ratio, years_experience, job_description_lengt...
## date (2): posting_date, application_deadline
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a compact overview of every column: name, type and first values.
glimpse(ai_jobs)
## Rows: 15,000
## Columns: 19
## $ job_id <chr> "AI00001", "AI00002", "AI00003", "AI00004", "AI…
## $ job_title <chr> "AI Research Scientist", "AI Software Engineer"…
## $ salary_usd <dbl> 90376, 61895, 152626, 80215, 54624, 123574, 796…
## $ salary_currency <chr> "USD", "USD", "USD", "USD", "EUR", "EUR", "GBP"…
## $ experience_level <chr> "SE", "EN", "MI", "SE", "EN", "SE", "MI", "EN",…
## $ employment_type <chr> "CT", "CT", "FL", "FL", "PT", "CT", "FL", "FL",…
## $ company_location <chr> "China", "Canada", "Switzerland", "India", "Fra…
## $ company_size <chr> "M", "M", "L", "M", "S", "M", "S", "L", "L", "M…
## $ employee_residence <chr> "China", "Ireland", "South Korea", "India", "Si…
## $ remote_ratio <dbl> 50, 100, 0, 50, 100, 50, 0, 0, 0, 0, 100, 0, 10…
## $ required_skills <chr> "Tableau, PyTorch, Kubernetes, Linux, NLP", "De…
## $ education_required <chr> "Bachelor", "Master", "Associate", "PhD", "Mast…
## $ years_experience <dbl> 9, 1, 2, 7, 0, 7, 3, 0, 7, 5, 8, 15, 5, 0, 6, 0…
## $ industry <chr> "Automotive", "Media", "Education", "Consulting…
## $ posting_date <date> 2024-10-18, 2024-11-20, 2025-03-18, 2024-12-23…
## $ application_deadline <date> 2024-11-07, 2025-01-11, 2025-04-07, 2025-02-24…
## $ job_description_length <dbl> 1076, 1268, 1974, 1345, 1989, 819, 1936, 1286, …
## $ benefits_score <dbl> 5.9, 5.2, 9.4, 8.6, 6.6, 5.9, 6.3, 7.6, 9.3, 5.…
## $ company_name <chr> "Smart Analytics", "TechCorp Inc", "Autonomous …
Komentár: Dataset má 19 premenných a 15 000 riadkov. Premenné sú správne načítané (chr, dbl, date). Dáta sú konzistentné.
# Descriptive statistics (min, quartiles, mean, max) for the five numeric
# variables of the dataset.
summary(
  select(
    ai_jobs,
    salary_usd,
    years_experience,
    remote_ratio,
    job_description_length,
    benefits_score
  )
)
## salary_usd years_experience remote_ratio job_description_length
## Min. : 32519 Min. : 0.000 Min. : 0.00 Min. : 500
## 1st Qu.: 70180 1st Qu.: 2.000 1st Qu.: 0.00 1st Qu.:1004
## Median : 99705 Median : 5.000 Median : 50.00 Median :1512
## Mean :115349 Mean : 6.253 Mean : 49.48 Mean :1503
## 3rd Qu.:146409 3rd Qu.:10.000 3rd Qu.:100.00 3rd Qu.:2000
## Max. :399095 Max. :19.000 Max. :100.00 Max. :2499
## benefits_score
## Min. : 5.000
## 1st Qu.: 6.200
## Median : 7.500
## Mean : 7.504
## 3rd Qu.: 8.800
## Max. :10.000
Komentár:
- Priemerná mzda ~115k USD, medián ~100k USD → priemer je vyšší než medián, rozdelenie je
pravostranne zošikmené.
- Roky praxe: priemer 6.25, medián 5, rozsah 0–19 → vzorka pokrýva juniorov aj seniorov, pričom polovica pozícií vyžaduje najviac 5 rokov praxe.
- Remote_ratio má tri hodnoty (0/50/100).
- job_description_length ~1500 znakov → texty sú stredne dlhé.
- benefits_score ~7.5 → firmy ponúkajú relatívne dobré benefity.
# Histograms of salary and years of experience, drawn side by side.
# par() returns the previous settings, so they are saved and restored afterwards
# instead of hard-coding mfrow = c(1, 1), which would overwrite any layout that
# was active before this chunk.
old_par <- par(mfrow = c(1, 2))
hist(ai_jobs$salary_usd)
hist(ai_jobs$years_experience)
par(old_par)
Komentár:
Mzdy sú silne šikmé, čo naznačuje potrebu logaritmickej transformácie.
Roky praxe majú normálnejšie rozdelenie.
# Feature engineering for the regressions.
# Step 1 adds the numeric transforms (log salary, squared experience).
# Step 2 adds remote_factor and recodes the categorical columns as factors with
# an explicit level order, so the first level of each is the reference category.
ai_jobs <- ai_jobs %>%
  mutate(
    log_salary = log(salary_usd),
    exp2 = years_experience^2
  ) %>%
  mutate(
    remote_factor = factor(
      remote_ratio,
      levels = c(0, 50, 100),
      labels = c("on_site", "hybrid", "remote")
    ),
    company_size = factor(
      company_size,
      levels = c("S", "M", "L"),
      labels = c("small", "medium", "large")
    ),
    experience_level = factor(
      experience_level,
      levels = c("EN", "MI", "SE", "EX"),
      labels = c("entry", "mid", "senior", "executive")
    ),
    education_required = factor(
      education_required,
      levels = c("Associate", "Bachelor", "Master", "PhD")
    )
  )
Komentár:
Transformácia prebehla správne. log_salary je pripravené na
regresiu.
# Baseline OLS model with salary in levels (USD) as the response.
# years_experience and benefits_score enter as numeric regressors; the other
# regressors are factors, so each non-reference level gets its own coefficient.
model_lin_salary <- lm(
salary_usd ~ years_experience + remote_factor +
benefits_score + company_size +
education_required + experience_level,
data = ai_jobs
)
# Coefficient table, residual summary and overall fit statistics.
summary(model_lin_salary)
##
## Call:
## lm(formula = salary_usd ~ years_experience + remote_factor +
## benefits_score + company_size + education_required + experience_level,
## data = ai_jobs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93621 -22098 -4144 18624 194703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47262.0 1810.3 26.107 <2e-16 ***
## years_experience -383.3 176.4 -2.173 0.0298 *
## remote_factorhybrid 1084.8 717.8 1.511 0.1307
## remote_factorremote 346.0 720.9 0.480 0.6313
## benefits_score 213.9 202.8 1.055 0.2915
## company_sizemedium 11963.3 720.5 16.604 <2e-16 ***
## company_sizelarge 29261.6 720.5 40.613 <2e-16 ***
## education_requiredBachelor 903.2 827.9 1.091 0.2753
## education_requiredMaster 431.1 830.2 0.519 0.6036
## education_requiredPhD 433.7 834.2 0.520 0.6032
## experience_levelmid 25343.4 943.6 26.859 <2e-16 ***
## experience_levelsenior 61033.6 1416.7 43.082 <2e-16 ***
## experience_levelexecutive 130121.0 2604.2 49.966 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36020 on 14987 degrees of freedom
## Multiple R-squared: 0.643, Adjusted R-squared: 0.6427
## F-statistic: 2249 on 12 and 14987 DF, p-value: < 2.2e-16
Interpretácia:
- Adjusted R² = 0.643 → model vysvetľuje 64% variácie
miezd.
- Koeficient pri years_experience =
-383.3, p=0.0298 → negatívny efekt pri kontrole za experience_level → ekonomicky
nezmyselné, znak zlej špecifikácie (roky praxe a úroveň seniority sa pravdepodobne silno prekrývajú).
- Veľkosť firmy (medium +12k, large +29k) veľmi významné
p<0.001.
- Seniorita (mid +25k, senior +61k, executive +130k) extrémne významná →
dáva perfektný zmysel.
- Remote režim a benefity sú nevýznamné v tejto špecifikácii.
# Two standard lm diagnostics side by side:
# which = 1 is Residuals vs Fitted, which = 2 is the Normal Q-Q plot.
# The previous graphics settings returned by par() are restored afterwards
# instead of hard-coding mfrow = c(1, 1).
old_par <- par(mfrow = c(1, 2))
plot(model_lin_salary, which = 1)
plot(model_lin_salary, which = 2)
par(old_par)
Komentár:
Residuals vs Fitted ukazuje heteroskedasticitu. QQ-plot ukazuje porušenú
normalitu.
# Ramsey RESET test (lmtest) of the functional form of the level model,
# run with the function's default settings.
resettest(model_lin_salary)
##
## RESET test
##
## data: model_lin_salary
## RESET = 140.46, df1 = 2, df2 = 14985, p-value < 2.2e-16
Komentár:
RESET p < 2e-16 → model je funkčne zle
špecifikovaný, treba transformáciu.
# Log-linear model: same regressors as the level model, but the response is
# log_salary (log of salary_usd), so coefficients are in log points.
model_log_salary <- lm(
log_salary ~ years_experience + remote_factor +
benefits_score + company_size +
education_required + experience_level,
data = ai_jobs
)
# Coefficient table, residual summary and overall fit statistics.
summary(model_log_salary)
##
## Call:
## lm(formula = log_salary ~ years_experience + remote_factor +
## benefits_score + company_size + education_required + experience_level,
## data = ai_jobs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61804 -0.20151 -0.01739 0.20936 0.70759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8850608 0.0137258 793.036 <2e-16 ***
## years_experience -0.0020644 0.0013373 -1.544 0.123
## remote_factorhybrid 0.0073834 0.0054421 1.357 0.175
## remote_factorremote 0.0057899 0.0054655 1.059 0.289
## benefits_score 0.0002966 0.0015375 0.193 0.847
## company_sizemedium 0.1083028 0.0054629 19.825 <2e-16 ***
## company_sizelarge 0.2531485 0.0054628 46.341 <2e-16 ***
## education_requiredBachelor 0.0050083 0.0062774 0.798 0.425
## education_requiredMaster 0.0017885 0.0062947 0.284 0.776
## education_requiredPhD 0.0015523 0.0063250 0.245 0.806
## experience_levelmid 0.3336675 0.0071541 46.640 <2e-16 ***
## experience_levelsenior 0.6706293 0.0107411 62.436 <2e-16 ***
## experience_levelexecutive 1.1130338 0.0197449 56.371 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2731 on 14987 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6963
## F-statistic: 2867 on 12 and 14987 DF, p-value: < 2.2e-16
Interpretácia:
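- Adjusted R² = 0.696 → vyššie než v predchádzajúcom modeli (0.643); keďže sa však zmenila
závislá premenná, hodnoty R² nie sú priamo porovnateľné.
- years_experience už nie je významné (p = 0.123) → po logaritmickej transformácii
zaniká aj slabý negatívny efekt z lineárneho modelu.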
- company_size: medium ≈ +11% mzdy, large ≈ +29%.
- experience_level: mid ≈ +39%, senior ≈ +96%, executive ≈ +205%.
- Remote a benefity stále nevýznamné.
# Ramsey RESET test (lmtest) of the functional form of the log-linear model.
resettest(model_log_salary)
##
## RESET test
##
## data: model_log_salary
## RESET = 0.015648, df1 = 2, df2 = 14985, p-value = 0.9845
Komentár:
p = 0.984 → RESET test nezamieta správnosť funkčného tvaru log modelu. Toto je
hlavný argument, prečo je logaritmická špecifikácia najvhodnejšia.
# Log-linear model extended with exp2 (years_experience squared) to allow a
# non-linear experience profile.
model_log_quad <- lm(
log_salary ~ years_experience + exp2 + remote_factor +
benefits_score + company_size +
education_required + experience_level,
data = ai_jobs
)
# Coefficient table, residual summary and overall fit statistics.
summary(model_log_quad)
##
## Call:
## lm(formula = log_salary ~ years_experience + exp2 + remote_factor +
## benefits_score + company_size + education_required + experience_level,
## data = ai_jobs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62023 -0.20218 -0.01756 0.20967 0.70533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8832028 0.0138339 786.703 <2e-16 ***
## years_experience 0.0020347 0.0040375 0.504 0.614
## exp2 -0.0001689 0.0001570 -1.076 0.282
## remote_factorhybrid 0.0074380 0.0054423 1.367 0.172
## remote_factorremote 0.0057649 0.0054655 1.055 0.292
## benefits_score 0.0002843 0.0015376 0.185 0.853
## company_sizemedium 0.1082576 0.0054631 19.816 <2e-16 ***
## company_sizelarge 0.2531756 0.0054628 46.345 <2e-16 ***
## education_requiredBachelor 0.0050649 0.0062776 0.807 0.420
## education_requiredMaster 0.0017915 0.0062947 0.285 0.776
## education_requiredPhD 0.0015500 0.0063250 0.245 0.806
## experience_levelmid 0.3248897 0.0108504 29.943 <2e-16 ***
## experience_levelsenior 0.6525088 0.0199745 32.667 <2e-16 ***
## experience_levelexecutive 1.0924037 0.0275221 39.692 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2731 on 14986 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6963
## F-statistic: 2647 on 13 and 14986 DF, p-value: < 2.2e-16
Komentár:
Ani years_experience (p=0.61) ani exp2 (p=0.28) nie sú významné → žiadna
nelinearita.
# Nested-model F-test: does adding exp2 improve on the log model without it?
anova(model_log_salary, model_log_quad)
Komentár:
p = 0.28 → kvadratický člen model štatisticky významne nezlepšuje (pri jedinom pridanom
člene je F-test ekvivalentný t-testu pre exp2). Hypotéza H1 sa nepotvrdila.
# Ramsey RESET test (lmtest) of the functional form of the quadratic log model.
resettest(model_log_quad)
##
## RESET test
##
## data: model_log_quad
## RESET = 0.0073198, df1 = 2, df2 = 14984, p-value = 0.9927
Komentár:
p=0.99 → špecifikácia je OK aj s exp2, ale bez neho je to
jednoduchšie.
# Quadratic log model plus a remote_factor x benefits_score interaction.
# The * operator expands to both main effects and their interaction terms.
model_interact <- lm(
log_salary ~ years_experience + exp2 +
remote_factor * benefits_score +
company_size + education_required + experience_level,
data = ai_jobs
)
# Coefficient table, residual summary and overall fit statistics.
summary(model_interact)
##
## Call:
## lm(formula = log_salary ~ years_experience + exp2 + remote_factor *
## benefits_score + company_size + education_required + experience_level,
## data = ai_jobs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6203 -0.2023 -0.0177 0.2093 0.7070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.8680853 0.0213395 509.295 <2e-16 ***
## years_experience 0.0019917 0.0040379 0.493 0.622
## exp2 -0.0001681 0.0001570 -1.071 0.284
## remote_factorhybrid 0.0180911 0.0287292 0.630 0.529
## remote_factorremote 0.0407251 0.0288086 1.414 0.157
## benefits_score 0.0023014 0.0026564 0.866 0.386
## company_sizemedium 0.1082479 0.0054632 19.814 <2e-16 ***
## company_sizelarge 0.2531361 0.0054630 46.337 <2e-16 ***
## education_requiredBachelor 0.0051109 0.0062780 0.814 0.416
## education_requiredMaster 0.0017405 0.0062950 0.276 0.782
## education_requiredPhD 0.0015768 0.0063252 0.249 0.803
## experience_levelmid 0.3250364 0.0108523 29.951 <2e-16 ***
## experience_levelsenior 0.6527731 0.0199778 32.675 <2e-16 ***
## experience_levelexecutive 1.0927929 0.0275283 39.697 <2e-16 ***
## remote_factorhybrid:benefits_score -0.0014169 0.0037630 -0.377 0.707
## remote_factorremote:benefits_score -0.0046522 0.0037644 -1.236 0.217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2731 on 14984 degrees of freedom
## Multiple R-squared: 0.6966, Adjusted R-squared: 0.6963
## F-statistic: 2294 on 15 and 14984 DF, p-value: < 2.2e-16
Komentár:
Ani jedna interakcia nie je významná (p=0.71 a p=0.22).
→ Remote režim nemení vzťah medzi benefitmi a mzdou (nezvyšuje „hodnotu benefitov“). Hypotéza H2 sa nepotvrdila.
# Ramsey RESET test (lmtest) of the functional form of the interaction model.
resettest(model_interact)
##
## RESET test
##
## data: model_interact
## RESET = 0.014935, df1 = 2, df2 = 14982, p-value = 0.9852
Komentár:
p = 0.985 → model je dobre špecifikovaný, ale interakcie netreba.
# Collect three goodness-of-fit measures for a fitted model.
#
# model: a fitted model object accepted by summary(), AIC() and BIC();
#        every call in this file passes an lm fit.
# Returns a named numeric vector with elements R2_adj, AIC and BIC.
modelsummary <- function(model) {
  adj_r2 <- summary(model)[["adj.r.squared"]]
  info_criteria <- c(AIC = AIC(model), BIC = BIC(model))
  c(R2_adj = adj_r2, info_criteria)
}
# Comparison table: one row per fitted model, columns R2_adj / AIC / BIC.
# Row names come from the names of the model list.
do.call(
  rbind,
  lapply(
    list(
      linear_salary = model_lin_salary,
      log_salary = model_log_salary,
      log_salary_quad = model_log_quad,
      log_salary_interact = model_interact
    ),
    modelsummary
  )
)
## R2_adj AIC BIC
## linear_salary 0.6426850 357339.271 357445.893
## log_salary 0.6963222 3646.941 3753.562
## log_salary_quad 0.6963253 3647.782 3762.019
## log_salary_interact 0.6963173 3650.177 3779.646
Komentár:
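- Adjusted R² je vo všetkých troch log modeloch prakticky rovnaké (~0.696); rozšírené modely ho nezvyšujú.
- Najnižšie AIC aj BIC má základný log model (AIC/BIC lineárneho modelu nie sú priamo porovnateľné, pretože má inú závislú premennú).
- Kvadratické ani interakčné prvky model nijako nezlepšujú.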
Mzdy v AI vysvetľuje najmä seniorita a
veľkosť firmy.
Remote režim, benefity a samotné roky praxe majú minimálny dopad po
zohľadnení ostatných premenných.