# Load the AI job postings dataset and print its structure.
# NOTE(review): the working directory is a machine-specific absolute path,
# so the script runs unchanged only where this folder exists.
setwd("C:/Users/Lenovo/Documents/School/Mgr/R - Ekonometria/Cvicenia/My dataset")
ai <- read.csv(file = "ai_job_dataset.csv", stringsAsFactors = TRUE)

# Convert whichever of the two date columns are present to class Date.
for (date_col in c("posting_date", "application_deadline")) {
  if (date_col %in% names(ai)) {
    ai[[date_col]] <- as.Date(ai[[date_col]])
  }
}
str(ai)
## 'data.frame': 15000 obs. of 19 variables:
## $ job_id : Factor w/ 15000 levels "AI00001","AI00002",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ job_title : Factor w/ 20 levels "AI Architect",..: 4 5 6 17 2 1 18 17 9 5 ...
## $ salary_usd : int 90376 61895 152626 80215 54624 123574 79670 70640 160710 102557 ...
## $ salary_currency : Factor w/ 3 levels "EUR","GBP","USD": 3 3 3 3 1 1 2 1 3 3 ...
## $ experience_level : Factor w/ 4 levels "EN","EX","MI",..: 4 1 3 4 1 4 3 1 4 4 ...
## $ employment_type : Factor w/ 4 levels "CT","FL","FT",..: 1 1 2 2 4 1 2 2 1 4 ...
## $ company_location : Factor w/ 20 levels "Australia","Austria",..: 8 3 18 9 6 7 19 6 15 2 ...
## $ company_size : Factor w/ 3 levels "L","M","S": 2 2 1 2 3 2 3 1 1 2 ...
## $ employee_residence : Factor w/ 20 levels "Australia","Austria",..: 8 10 16 9 15 7 19 6 15 2 ...
## $ remote_ratio : int 50 100 0 50 100 50 0 0 0 0 ...
## $ required_skills : Factor w/ 13663 levels "AWS, Azure, GCP, Docker",..: 12974 1780 5168 10832 6997 1652 9724 8621 3954 6942 ...
## $ education_required : Factor w/ 4 levels "Associate","Bachelor",..: 2 3 1 4 3 1 1 3 4 3 ...
## $ years_experience : int 9 1 2 7 0 7 3 0 7 5 ...
## $ industry : Factor w/ 15 levels "Automotive","Consulting",..: 1 10 3 2 10 8 6 8 7 7 ...
## $ posting_date : Date, format: "2024-10-18" "2024-11-20" ...
## $ application_deadline : Date, format: "2024-11-07" "2025-01-11" ...
## $ job_description_length: int 1076 1268 1974 1345 1989 819 1936 1286 551 2340 ...
## $ benefits_score : num 5.9 5.2 9.4 8.6 6.6 5.9 6.3 7.6 9.3 5.8 ...
## $ company_name : Factor w/ 16 levels "Advanced Robotics",..: 15 16 4 10 1 12 7 5 14 5 ...
# Print a per-column summary of the full data frame.
summary(ai)
## job_id job_title salary_usd
## AI00001: 1 Machine Learning Researcher: 808 Min. : 32519
## AI00002: 1 AI Software Engineer : 784 1st Qu.: 70180
## AI00003: 1 Autonomous Systems Engineer: 777 Median : 99705
## AI00004: 1 Machine Learning Engineer : 772 Mean :115349
## AI00005: 1 AI Architect : 771 3rd Qu.:146409
## AI00006: 1 Head of AI : 765 Max. :399095
## (Other):14994 (Other) :10323
## salary_currency experience_level employment_type company_location
## EUR: 2314 EN:3718 CT:3721 Germany : 814
## GBP: 729 EX:3760 FL:3758 Denmark : 778
## USD:11957 MI:3781 FT:3812 Canada : 769
## SE:3741 PT:3709 France : 769
## Austria : 765
## Singapore: 764
## (Other) :10341
## company_size employee_residence remote_ratio
## L:4998 Sweden : 790 Min. : 0.00
## M:4995 France : 781 1st Qu.: 0.00
## S:5007 Denmark: 777 Median : 50.00
## Austria: 776 Mean : 49.48
## India : 772 3rd Qu.:100.00
## Germany: 769 Max. :100.00
## (Other):10335
## required_skills education_required
## Python, TensorFlow, PyTorch : 17 Associate:3785
## Python, TensorFlow, Tableau : 9 Bachelor :3789
## Python, TensorFlow, Data Visualization: 7 Master :3748
## Python, TensorFlow, Linux : 7 PhD :3678
## Python, TensorFlow, MLOps : 7
## Linux, Python, TensorFlow : 6
## (Other) :14947
## years_experience industry posting_date application_deadline
## Min. : 0.000 Retail :1063 Min. :2024-01-01 Min. :2024-01-16
## 1st Qu.: 2.000 Media :1045 1st Qu.:2024-04-29 1st Qu.:2024-06-13
## Median : 5.000 Automotive :1020 Median :2024-08-28 Median :2024-10-12
## Mean : 6.253 Consulting :1020 Mean :2024-08-29 Mean :2024-10-11
## 3rd Qu.:10.000 Technology :1011 3rd Qu.:2024-12-29 3rd Qu.:2025-02-10
## Max. :19.000 Real Estate:1007 Max. :2025-04-30 Max. :2025-07-11
## (Other) :8834
## job_description_length benefits_score company_name
## Min. : 500 Min. : 5.000 TechCorp Inc : 980
## 1st Qu.:1004 1st Qu.: 6.200 Cognitive Computing : 972
## Median :1512 Median : 7.500 AI Innovations : 964
## Mean :1503 Mean : 7.504 Digital Transformation LLC: 961
## 3rd Qu.:2000 3rd Qu.: 8.800 Future Systems : 960
## Max. :2499 Max. :10.000 Quantum Computing Inc : 960
## (Other) :9203
Komentár k štruktúre a základnému súhrnu:
- salary_usd – ročný plat (int),
- remote_ratio – podiel remote práce v %,
- years_experience – požadované roky praxe,
- job_description_length – dĺžka popisu (500–2499),
- benefits_score – skóre benefitov (5–10).
num_candidates <- c("salary_usd",
"remote_ratio",
"years_experience",
"job_description_length",
"benefits_score")
# Keep only the candidate columns that actually exist in `ai`,
# then summarise that numeric subset.
num_vars <- num_candidates[num_candidates %in% names(ai)]
ai_num <- ai[num_vars]
summary(ai_num)
## salary_usd remote_ratio years_experience job_description_length
## Min. : 32519 Min. : 0.00 Min. : 0.000 Min. : 500
## 1st Qu.: 70180 1st Qu.: 0.00 1st Qu.: 2.000 1st Qu.:1004
## Median : 99705 Median : 50.00 Median : 5.000 Median :1512
## Mean :115349 Mean : 49.48 Mean : 6.253 Mean :1503
## 3rd Qu.:146409 3rd Qu.:100.00 3rd Qu.:10.000 3rd Qu.:2000
## Max. :399095 Max. :100.00 Max. :19.000 Max. :2499
## benefits_score
## Min. : 5.000
## 1st Qu.: 6.200
## Median : 7.500
## Mean : 7.504
## 3rd Qu.: 8.800
## Max. :10.000
Komentár k numerickým premenným:
- salary_usd má široké rozpätie, čo je dôležité pre regresiu a zhlukovanie.
- years_experience má rozumné rozdelenie (0–19), takže model vie zachytiť rozdiely medzi juniorom a seniorom.
- job_description_length je relatívne rovnomerne rozložená v intervale 500–2499 – inzeráty sú väčšinou dlhé.
- benefits_score je sústredené medzi 6 a 9 – extrémne zlé benefity v datasete prakticky nie sú.
if(ncol(ai_num) > 1){
cor_matrix <- cor(ai_num, use = "complete.obs")
cor_matrix
} else {
cor_matrix <- NULL
"Nedá sa vypočítať korelačná matica – je iba jedna numerická premenná."
}
## salary_usd remote_ratio years_experience
## salary_usd 1.0000000000 0.013726826 0.737555909
## remote_ratio 0.0137268260 1.000000000 0.015320983
## years_experience 0.7375559087 0.015320983 1.000000000
## job_description_length -0.0090922171 0.004474897 -0.007526152
## benefits_score 0.0009852305 0.003139244 -0.007274574
## job_description_length benefits_score
## salary_usd -0.009092217 0.0009852305
## remote_ratio 0.004474897 0.0031392438
## years_experience -0.007526152 -0.0072745740
## job_description_length 1.000000000 0.0067435618
## benefits_score 0.006743562 1.0000000000
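Komentár ku koreláciám:
Korelácie s premennou salary_usd (z korelačnej matice vyššie):
- years_experience ≈ 0.738,
- remote_ratio ≈ 0.014,
- job_description_length ≈ −0.009,
- benefits_score ≈ 0.001.

Cieľ: rozdeliť pracovné ponuky na prirodzené skupiny podľa číselných vlastností.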
# Numeric features considered for clustering; only those present in `ai`
# are kept.
cluster_candidates <- c(
  "salary_usd", "remote_ratio", "years_experience",
  "job_description_length", "benefits_score"
)
cluster_vars <- cluster_candidates[cluster_candidates %in% names(ai)]
ai_clust <- ai[cluster_vars]

# Drop rows with missing values, pass the result through scale(), and
# report how many rows remain.
ai_clust_complete <- na.omit(ai_clust)
ai_scaled <- scale(ai_clust_complete)
nrow(ai_scaled)
## [1] 15000
Komentár k príprave:
# Elbow plot: total within-cluster sum of squares for k = 1..max_k.
set.seed(123)
max_k <- 6
wss <- numeric(max_k)
for (k in seq_len(max_k)) {
  km <- kmeans(x = ai_scaled, centers = k, nstart = 20)
  wss[k] <- km[["tot.withinss"]]
}
plot(
  x = seq_len(max_k),
  y = wss,
  type = "b",
  xlab = "Počet klastrov",
  ylab = "WSS",
  main = "Elbow graf"
)
Komentár k Elbow grafu:
# Final k-means fit with three clusters; the seed is set again immediately
# before the fit.
set.seed(123)
k <- 3
km3 <- kmeans(x = ai_scaled, centers = k, nstart = 50)

# Attach the cluster label to the unscaled data and count rows per cluster.
ai_clusters <- ai_clust_complete
ai_clusters[["cluster"]] <- factor(km3[["cluster"]])
table(ai_clusters[["cluster"]])
##
## 1 2 3
## 5555 4076 5369
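Komentár k veľkosti klastrov:
Veľkosti klastrov (výstup funkcie table) sú:
- klaster 1: 5555 ponúk,
- klaster 2: 4076 ponúk,
- klaster 3: 5369 ponúk.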
# Mean of every clustering variable (original, unscaled units) per cluster.
cluster_means <- aggregate(
  x = ai_clust_complete,
  by = list(cluster = km3[["cluster"]]),
  FUN = mean
)
cluster_means
## cluster salary_usd remote_ratio years_experience job_description_length
## 1 1 86928.19 15.90459 3.358596 1501.716
## 2 2 189678.00 49.84053 13.825810 1491.696
## 3 3 88325.74 83.95418 3.499162 1513.789
## benefits_score
## 1 7.085203
## 2 7.481183
## 3 7.955392
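Komentár k priemerom v klastroch:
Z tabuľky cluster_means vidíme:
- klaster 2 má výrazne najvyššie priemerné hodnoty (salary_usd ≈ 189 678 aj years_experience ≈ 13.8); klastre 1 a 3 majú priemerný plat okolo 87–88 tisíc a približne 3.4–3.5 roka praxe,
- benefits_score: 7.09 (klaster 1), 7.48 (klaster 2), 7.96 (klaster 3),
- job_description_length sa medzi klastrami líši len mierne – nie je hlavný faktor, ktorý zhluky odlišuje,
- remote_ratio sa naopak líši výrazne (≈ 16 % v klastri 1, ≈ 50 % v klastri 2, ≈ 84 % v klastri 3), takže klastre 1 a 3 odlišuje najmä podiel remote práce.
fviz_cluster(km3,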
data = ai_scaled,
geom = "point",
ellipse.type = "norm",
main = "K-means zhlukovanie (k = 3)")
Komentár k grafu klastrov:
# Columns for the regression data set: the response, four numeric
# predictors, and the posting date (used below to order the rows).
reg_candidates <- c(
  "salary_usd", "years_experience", "remote_ratio",
  "job_description_length", "benefits_score", "posting_date"
)
reg_vars <- reg_candidates[reg_candidates %in% names(ai)]
reg_data <- ai[reg_vars]

# Sort by posting date when that column is available, then drop rows
# containing missing values.
if ("posting_date" %in% reg_vars) {
  reg_data <- arrange(reg_data, posting_date)
}
reg_data <- na.omit(reg_data)
summary(reg_data)
## salary_usd years_experience remote_ratio job_description_length
## Min. : 32519 Min. : 0.000 Min. : 0.00 Min. : 500
## 1st Qu.: 70180 1st Qu.: 2.000 1st Qu.: 0.00 1st Qu.:1004
## Median : 99705 Median : 5.000 Median : 50.00 Median :1512
## Mean :115349 Mean : 6.253 Mean : 49.48 Mean :1503
## 3rd Qu.:146409 3rd Qu.:10.000 3rd Qu.:100.00 3rd Qu.:2000
## Max. :399095 Max. :19.000 Max. :100.00 Max. :2499
## benefits_score posting_date
## Min. : 5.000 Min. :2024-01-01
## 1st Qu.: 6.200 1st Qu.:2024-04-29
## Median : 7.500 Median :2024-08-28
## Mean : 7.504 Mean :2024-08-29
## 3rd Qu.: 8.800 3rd Qu.:2024-12-29
## Max. :10.000 Max. :2025-04-30
Komentár k regresným dátam:
- Dáta sú zoradené podľa posting_date, aby malo testovanie autokorelácie rezíduí nejakú časovú logiku.
- Rozdelenie premenných salary_usd, years_experience, remote_ratio, job_description_length a benefits_score je rovnaké ako v úvodnej deskriptíve.
if(!("salary_usd" %in% names(reg_data))){
stop("V dátach chýba salary_usd – uprav model.")
}
# Fit a linear model of salary_usd on whichever of the four numeric
# predictors are present in reg_data, then print the model summary.
x_vars <- intersect(
  c(
    "years_experience", "remote_ratio",
    "job_description_length", "benefits_score"
  ),
  names(reg_data)
)
# Without this guard an empty predictor set would build the string
# "salary_usd ~ " and fail with an unhelpful parse error.
if (length(x_vars) == 0) {
  stop("V dátach nie je žiadna vysvetľujúca premenná – uprav model.",
       call. = FALSE)
}
# reformulate() builds the formula from the term labels directly instead
# of pasting and re-parsing a string.
reg_formula <- reformulate(x_vars, response = "salary_usd")
model <- lm(reg_formula, data = reg_data)
summary(model)
##
## Call:
## lm(formula = reg_formula, data = reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -128907 -24849 -5950 18892 253606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63638.4712 2025.2099 31.423 <2e-16 ***
## years_experience 8014.1719 59.9288 133.728 <2e-16 ***
## remote_ratio 3.5778 8.1430 0.439 0.660
## job_description_length -0.3760 0.5768 -0.652 0.514
## benefits_score 264.4552 229.0432 1.155 0.248
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40700 on 14995 degrees of freedom
## Multiple R-squared: 0.544, Adjusted R-squared: 0.5439
## F-statistic: 4473 on 4 and 14995 DF, p-value: < 2.2e-16
Komentár k výsledkom regresie:
Z výstupu:
\[ \widehat{salary\_usd} = 63\,638.47 + 8\,014.17 \cdot years\_experience + 3.58 \cdot remote\_ratio - 0.38 \cdot job\_description\_length + 264.46 \cdot benefits\_score \]
# Store fitted values and residuals next to the data, then plot actual
# salary against the prediction with the y = x reference line in red.
reg_data[["fitted"]] <- fitted(model)
reg_data[["residual"]] <- resid(model)
plot(
  x = reg_data[["fitted"]],
  y = reg_data[["salary_usd"]],
  xlab = "Predikovaný plat",
  ylab = "Skutočný plat",
  main = "Skutočný vs. predikovaný salary_usd"
)
abline(a = 0, b = 1, col = "red")
Komentár k grafu:
# Plot the autocorrelation function of the regression residuals up to lag 20.
acf(reg_data$residual,
lag.max = 20,
main = "ACF rezíduí")
Komentár k ACF grafu:
# Durbin-Watson test on the fitted model; per the printed output the
# alternative hypothesis is autocorrelation greater than 0.
# NOTE(review): dwtest() is presumably lmtest::dwtest, loaded outside this
# view - confirm.
dwtest(model)
##
## Durbin-Watson test
##
## data: model
## DW = 1.9778, p-value = 0.08728
## alternative hypothesis: true autocorrelation is greater than 0
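Konkrétny výsledok:
DW = 1.9778, p-hodnota = 0.08728 (alternatívna hypotéza: autokorelácia väčšia ako 0).
Interpretácia:
Na 5 % hladine významnosti nulovú hypotézu o neprítomnosti kladnej autokorelácie rezíduí nezamietame (p > 0.05); na 10 % hladine by bola zamietnutá.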
# Breusch-Godfrey test for serial correlation of order up to 4 on the
# fitted model.
# NOTE(review): bgtest() is presumably lmtest::bgtest, loaded outside this
# view - confirm.
bgtest(model, order = 4)
##
## Breusch-Godfrey test for serial correlation of order up to 4
##
## data: model
## LM test = 4.8091, df = 4, p-value = 0.3074
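Konkrétny výsledok:
LM = 4.8091, df = 4, p-hodnota = 0.3074.
Interpretácia:
Keďže p > 0.05, nulovú hypotézu o neprítomnosti sériovej korelácie rezíduí do rádu 4 nezamietame.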