library(tidyverse)
library(car)
# Switch to the dataset folder so the CSV can be opened by its bare file name.
# NOTE(review): this absolute path is machine-specific; a project-relative
# path would make the script portable.
setwd("C:/Users/Lenovo/Documents/School/Mgr/R - Ekonometria/Cvicenia/My dataset")
# read.csv() already defaults to header = TRUE, sep = "," and dec = ".".
ai <- read.csv(file = "ai_job_dataset.csv")
# Keep only the response and the three candidate predictors, then discard
# every row with a missing value in any of them.
model_columns <- c(
  "salary_usd",
  "years_experience",
  "job_description_length",
  "benefits_score"
)
udaje <- drop_na(dplyr::select(ai, dplyr::all_of(model_columns)))
# Show the column types and the leading values of the analysis data.
str(object = udaje)
## 'data.frame': 15000 obs. of 4 variables:
## $ salary_usd : int 90376 61895 152626 80215 54624 123574 79670 70640 160710 102557 ...
## $ years_experience : int 9 1 2 7 0 7 3 0 7 5 ...
## $ job_description_length: int 1076 1268 1974 1345 1989 819 1936 1286 551 2340 ...
## $ benefits_score : num 5.9 5.2 9.4 8.6 6.6 5.9 6.3 7.6 9.3 5.8 ...
# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(udaje)
## salary_usd years_experience job_description_length benefits_score
## Min. : 32519 Min. : 0.000 Min. : 500 Min. : 5.000
## 1st Qu.: 70180 1st Qu.: 2.000 1st Qu.:1004 1st Qu.: 6.200
## Median : 99705 Median : 5.000 Median :1512 Median : 7.500
## Mean :115349 Mean : 6.253 Mean :1503 Mean : 7.504
## 3rd Qu.:146409 3rd Qu.:10.000 3rd Qu.:2000 3rd Qu.: 8.800
## Max. :399095 Max. :19.000 Max. :2499 Max. :10.000
# Baseline linear model: salary_usd regressed on the three predictors in
# their original (unscaled) units.
model <- lm(salary_usd ~ years_experience +
job_description_length +
benefits_score,
data = udaje)
# Print the coefficient table together with the residual and fit statistics.
summary(model)
##
## Call:
## lm(formula = salary_usd ~ years_experience + job_description_length +
## benefits_score, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128732 -24846 -5943 18906 253426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63808.8073 1987.7034 32.102 <2e-16 ***
## years_experience 8014.5768 59.9201 133.754 <2e-16 ***
## job_description_length -0.3749 0.5768 -0.650 0.516
## benefits_score 264.7793 229.0359 1.156 0.248
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40700 on 14996 degrees of freedom
## Multiple R-squared: 0.544, Adjusted R-squared: 0.544
## F-statistic: 5964 on 3 and 14996 DF, p-value: < 2.2e-16
# Predictor-only data frame used for the collinearity diagnostics.
predictor_names <- c(
  "years_experience",
  "job_description_length",
  "benefits_score"
)
xvars <- udaje[predictor_names]
# Pairwise correlation matrix of the predictors, rounded to three decimals.
round(cor(xvars), digits = 3)
## years_experience job_description_length benefits_score
## years_experience 1.000 -0.008 -0.007
## job_description_length -0.008 1.000 0.007
## benefits_score -0.007 0.007 1.000
# Scatterplot matrix of the predictors, drawn with small solid points.
pairs(
  x = xvars,
  pch = 16,
  cex = 0.5,
  main = "Scatterplotová matica – vysvetľujúce premenné"
)
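- Viditeľné pásy vznikajú preto, že premenné nadobúdajú diskrétne hodnoty; body v grafoch nevykazujú žiadny trend, teda premenné nie sú medzi sebou lineárne prepojené.
- Nepozorujeme žiadny vzťah medzi vysvetľujúcimi premennými.
- Graficky potvrdené: žiadna multikolinearita.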
# Variance inflation factor of each predictor in the baseline model
# (vif() presumably comes from the car package loaded at the top).
vif(model)
## years_experience job_description_length benefits_score
## 1.000109 1.000101 1.000098
# Condition number of the raw predictor matrix: the square root of the ratio
# of the largest to the smallest eigenvalue of X'X.
design_matrix <- model.matrix(model)
X <- design_matrix[, -1]  # drop the first (intercept) column
XtX <- t(X) %*% X
eig <- eigen(XtX)
eig_range <- range(eig$values)  # c(smallest, largest)
condition_number <- sqrt(eig_range[2] / eig_range[1])
condition_number
## [1] 570.4547
# Append a centred, unit-variance copy of each predictor, suffixed "_s".
# scale() returns a one-column matrix, so [, 1] turns it back into a vector.
udaje_scaled <- udaje %>%
  mutate(
    across(
      all_of(c(
        "years_experience",
        "job_description_length",
        "benefits_score"
      )),
      function(column) scale(column)[, 1],
      .names = "{.col}_s"
    )
  )
# Same regression as the baseline model, but on the standardized predictors
# (the "_s" columns); the response stays in its original units.
model_scaled <- lm(salary_usd ~ years_experience_s +
job_description_length_s +
benefits_score_s,
data = udaje_scaled)
# Print the coefficient table together with the residual and fit statistics.
summary(model_scaled)
##
## Call:
## lm(formula = salary_usd ~ years_experience_s + job_description_length_s +
## benefits_score_s, data = udaje_scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128732 -24846 -5943 18906 253426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115349.0 332.3 347.150 <2e-16 ***
## years_experience_s 44447.0 332.3 133.754 <2e-16 ***
## job_description_length_s -216.0 332.3 -0.650 0.516
## benefits_score_s 384.2 332.3 1.156 0.248
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40700 on 14996 degrees of freedom
## Multiple R-squared: 0.544, Adjusted R-squared: 0.544
## F-statistic: 5964 on 3 and 14996 DF, p-value: < 2.2e-16
# Variance inflation factors for the model fitted on standardized predictors.
vif(model_scaled)
## years_experience_s job_description_length_s benefits_score_s
## 1.000109 1.000101 1.000098
# Condition number of the standardized predictor matrix: the square root of
# the ratio of the largest to the smallest eigenvalue of X_s'X_s.
design_matrix_s <- model.matrix(model_scaled)
X_s <- design_matrix_s[, -1]  # drop the first (intercept) column
XtX_s <- t(X_s) %*% X_s
eig_s <- eigen(XtX_s)
eig_range_s <- range(eig_s$values)  # c(smallest, largest)
condition_number_s <- sqrt(eig_range_s[2] / eig_range_s[1])
condition_number_s
## [1] 1.011031
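Model nepreukázal žiadnu multikolinearitu: hodnoty VIF sú približne 1 a Condition Number po škálovaní je približne 1,01. Vysoká hodnota Condition Number pred škálovaním (≈ 570) je dôsledkom rozdielnych mierok a necentrovaných hodnôt premenných, nie lineárnej závislosti medzi nimi. Roky skúseností sú štatisticky významným determinantom mzdy (p < 2e-16); dĺžka popisu pozície a skóre benefitov štatisticky významné nie sú (p = 0,516 a p = 0,248).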