Budeme postupovať presne podľa predlohy a na
tých istých dátach (Eurostat une_rt_a,
miera nezamestnanosti v %) ukážeme: deskriptívne štatistiky, medziročné
zmeny, indexy (fixná báza, reťazové), kĺzavé priemery,
normalizáciu/štandardizáciu, korelácie a kovariancie, lineárny
trend vrátane štandardných 4-panelových diagnostických
grafov (Residuals vs Fitted, QQ-plot,
Scale-Location, Residuals vs Leverage) a stručnú
projekciu. Vždy interpretujeme výstupy (nie kód).
Ak je CSV inde, uprav data_path nižšie.
need <- c("tidyverse","lubridate","janitor","scales","knitr","kableExtra","zoo","broom","rlang","glue","purrr","tidyr")
to_install <- setdiff(need, rownames(installed.packages()))
if(length(to_install)) install.packages(to_install, quiet = TRUE)
invisible(lapply(need, library, character.only = TRUE))
data_path <- "une_rt_a__custom_18708117_linear.csv"
raw <- read_csv(
data_path,
show_col_types = FALSE,
na = c("", ":", "NA", "NaN")
) |> clean_names()
df <- raw
# Rok
if("time" %in% names(df)) {
df <- df |> mutate(year = readr::parse_number(as.character(time)))
} else if ("time_period" %in% names(df)) {
df <- df |> mutate(year = readr::parse_number(as.character(time_period)))
} else {
stop("Chýba časový stĺpec (time/time_period).")
}
# Hodnota a geografia
val_col <- if ("value" %in% names(df)) "value" else names(df)[stringr::str_detect(names(df), "^value$|_value$|^obs_value$")][1]
geo_col <- if ("geo" %in% names(df)) "geo" else names(df)[stringr::str_detect(names(df), "^geo($|_)|^location$|^country$")][1]
core <- df |>
rename(geo = all_of(geo_col), value = all_of(val_col)) |>
mutate(
geo = as.factor(geo),
year = as.integer(year),
value = as.numeric(value)
) |>
filter(!is.na(year), !is.na(value)) |>
arrange(geo, year)
# Výber krajín (podľa predlohy: V4 + DE, ak sú dostupné; inak prvých 5)
sel <- c("SK","CZ","PL","HU","DE")
sel <- sel[sel %in% unique(core$geo)]
if(length(sel) < 3) sel <- head(sort(unique(core$geo)), 5)
sel <- as.character(sel)
dat <- core |> filter(geo %in% sel)
# Rozsah a rýchly prehľad
rng <- dat |> summarise(from=min(year), to=max(year), n=n())
rng
Interpretácia (dáta): Máme ročné rady miery nezamestnanosti (%) pre Czechia, Germany, Slovakia v období 2014–2024. Hodnoty sú v percentách populácie podľa metodiky Eurostat.
descr <- dat |>
group_by(geo) |>
summarise(
n = n(),
priemer = mean(value, na.rm = TRUE),
median = median(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
min = min(value, na.rm = TRUE),
q1 = quantile(value, 0.25, na.rm = TRUE),
q3 = quantile(value, 0.75, na.rm = TRUE),
max = max(value, na.rm = TRUE)
) |>
mutate(across(where(is.numeric), ~round(.x, 2)))
kable(descr, caption = "Základné deskriptívne štatistiky (%)") |>
kable_styling(full_width = FALSE)
| geo | n | priemer | median | sd | min | q1 | q3 | max |
|---|---|---|---|---|---|---|---|---|
| Czechia | 11 | 3.19 | 2.6 | 1.32 | 2.0 | 2.40 | 3.45 | 6.1 |
| Germany | 11 | 3.58 | 3.5 | 0.56 | 2.9 | 3.15 | 3.75 | 4.7 |
| Slovakia | 11 | 7.75 | 6.7 | 2.58 | 5.3 | 5.95 | 8.85 | 13.1 |
Interpretácia: Najnižší priemer má Czechia, najvyšší Slovakia. Väčšie SD = väčšie výkyvy v čase.
ggplot(dat, aes(year, value, color=geo)) +
geom_line(linewidth=1) + geom_point(size=1.7) +
scale_y_continuous(labels = label_percent(accuracy = 0.1, scale = 1)) +
scale_x_continuous(breaks = pretty) +
labs(title="Miera nezamestnanosti v čase", x="Rok", y="Nezamestnanosť (%)", color="Krajina") +
theme_minimal(base_size = 12)
Interpretácia: Krajiny reagujú na spoločné šoky podobne (regionálna previaznutosť), ale úrovne sa líšia.
yoy <- dat |>
arrange(geo, year) |>
group_by(geo) |>
mutate(
yoy_pp = value - lag(value), # percentuálne body
yoy_pct = (value/lag(value) - 1)*100 # medziročná % zmena
)
kable(
yoy |>
summarise(
priemer_yoy_pp = mean(yoy_pp, na.rm=TRUE),
sd_yoy_pp = sd(yoy_pp, na.rm=TRUE),
max_narast_pp = max(yoy_pp, na.rm=TRUE),
max_pokles_pp = min(yoy_pp, na.rm=TRUE)
) |>
mutate(across(where(is.numeric), ~round(.x, 2))),
caption = "Medziročné zmeny (p. b.): priemer, rozptyl a extrémy"
) |> kable_styling(full_width = FALSE)
| geo | priemer_yoy_pp | sd_yoy_pp | max_narast_pp | max_pokles_pp |
|---|---|---|---|---|
| Czechia | -0.35 | 0.64 | 0.6 | -1.1 |
| Germany | -0.13 | 0.39 | 0.7 | -0.5 |
| Slovakia | -0.78 | 0.91 | 1.0 | -1.9 |
ggplot(yoy, aes(year, yoy_pp, color=geo)) +
geom_hline(yintercept=0, linetype=2) +
geom_line(linewidth=1) + geom_point(size=1.6) +
labs(title="Medziročná zmena miery nezamestnanosti (p. b.)", x="Rok", y="Δ p. b.") +
theme_minimal(base_size = 12)
Interpretácia: Kladné = nárast nezamestnanosti, záporné = pokles. Extrémy indikujú krízové/po-krízové roky.
base_year <- min(dat$year, na.rm = TRUE)
indexy <- dat |>
group_by(geo) |>
mutate(
fix_base = value / value[year==base_year] * 100,
chain = value / lag(value) * 100
)
kable(
indexy |>
filter(year %in% c(base_year, base_year+1, max(year))) |>
arrange(geo, year) |>
mutate(across(c(fix_base, chain), ~round(.x, 2))),
caption = glue("Indexy (fixná báza {base_year}=100; reťazové, %) – ukážka rokov")
) |> kable_styling(full_width = FALSE)
| dataflow | last_update | freq | age | unit | sex | geo | time_period | value | obs_flag | conf_status | year | fix_base | chain |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Czechia | 2014 | 6.1 | NA | NA | 2014 | 100.00 | NA |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Czechia | 2015 | 5.1 | NA | NA | 2015 | 83.61 | 83.61 |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Czechia | 2024 | 2.6 | NA | NA | 2024 | 42.62 | 100.00 |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Germany | 2014 | 4.7 | NA | NA | 2014 | 100.00 | NA |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Germany | 2015 | 4.4 | NA | NA | 2015 | 93.62 | 93.62 |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Germany | 2024 | 3.4 | NA | NA | 2024 | 72.34 | 109.68 |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Slovakia | 2014 | 13.1 | NA | NA | 2014 | 100.00 | NA |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Slovakia | 2015 | 11.5 | NA | NA | 2015 | 87.79 | 87.79 |
| ESTAT:UNE_RT_A(1.0) | 11/09/25 23:00:00 | Annual | From 15 to 74 years | Percentage of population in the labour force | Total | Slovakia | 2024 | 5.3 | NA | NA | 2024 | 40.46 | 91.38 |
ggplot(indexy, aes(year, fix_base, color=geo)) +
geom_hline(yintercept=100, linetype=2) +
geom_line(linewidth=1) + geom_point(size=1.6) +
labs(title=glue("Index s fixnou bázou ({base_year}=100)"), x="Rok", y="Index (%)") +
theme_minimal(base_size = 12)
Interpretácia: Hodnota 100 = úroveň v základnom roku. Nad/pod 100 = vyššia/nižšia nezamestnanosť než v 2014.
ma <- dat |>
arrange(geo, year) |>
group_by(geo) |>
mutate(
ma3 = zoo::rollmean(value, k=3, fill=NA, align="right"),
ma5 = zoo::rollmean(value, k=5, fill=NA, align="right")
)
ggplot(ma, aes(year, value, color=geo)) +
geom_line(alpha=.55) +
geom_line(aes(y=ma3), linewidth=1.1) +
labs(title="Kĺzavý priemer MA(3)", x="Rok", y="Nezamestnanosť (%)", color="Krajina") +
theme_minimal(base_size = 12)
ggplot(ma, aes(year, value, color=geo)) +
geom_line(alpha=.45) +
geom_line(aes(y=ma5), linewidth=1.1) +
labs(title="Kĺzavý priemer MA(5)", x="Rok", y="Nezamestnanosť (%)", color="Krajina") +
theme_minimal(base_size = 12)
Interpretácia: MA(5) je hladší než MA(3); lepšie ukazuje trend.
wide <- dat |> select(year, geo, value) |> distinct() |>
pivot_wider(names_from=geo, values_from=value) |>
arrange(year)
last_year <- max(wide$year[complete.cases(wide)], na.rm=TRUE)
last_row <- wide |> filter(year == last_year) |> select(-year)
minmax <- map_df(last_row, ~ (.x - min(.x, na.rm=TRUE)) / (max(.x, na.rm=TRUE)-min(.x, na.rm=TRUE)))
zscore <- map_df(last_row, ~ (.x - mean(.x, na.rm=TRUE)) / sd(.x, na.rm=TRUE))
norm_tbl <- tibble(geo = names(last_row),
value = as.numeric(last_row[1,]),
minmax = as.numeric(minmax[1,]),
zscore = as.numeric(zscore[1,])) |>
arrange(value) |>
mutate(across(where(is.numeric), ~round(.x, 2)))
kable(norm_tbl, caption = glue("Normalizácia (0–1) a z-score v roku {last_year}")) |>
kable_styling(full_width = FALSE)
| geo | value | minmax | zscore |
|---|---|---|---|
| Czechia | 2.6 | NaN | NA |
| Germany | 3.4 | NaN | NA |
| Slovakia | 5.3 | NaN | NA |
Interpretácia: Min–max ukáže poradie v rozsahu 0–1; z-score odchýlku od priemeru (kladné = nadpriemer).
num_mat <- wide |> select(where(is.numeric), -year)
cor_mat <- cor(num_mat, use="pairwise.complete.obs")
cov_mat <- cov(num_mat, use="pairwise.complete.obs")
cor_long <- as.data.frame(as.table(cor_mat)) |> set_names(c("Var1","Var2","Cor"))
ggplot(cor_long, aes(Var1, Var2, fill=Cor)) +
geom_tile(color="white") +
scale_fill_gradient2(limits=c(-1,1), midpoint=0, labels=number_format(accuracy = 0.01)) +
geom_text(aes(label=sprintf("%.2f", Cor)), size=3) +
labs(title="Korelačná matica (Pearson r)", x=NULL, y=NULL, fill="r") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle=45, hjust=1))
kable(round(cov_mat, 3), caption = "Kovariančná matica") |>
kable_styling(full_width = FALSE)
| Czechia | Germany | Slovakia | |
|---|---|---|---|
| Czechia | 1.743 | 0.714 | 3.302 |
| Germany | 0.714 | 0.314 | 1.363 |
| Slovakia | 3.302 | 1.363 | 6.653 |
Interpretácia: Vysoké kladné r = podobná cyklická dynamika; kovariancia ukazuje spoločný rozsah pohybov.
# Stabilné fitovanie: nest -> map -> broom
lm_fits <- dat |>
group_by(geo) |>
tidyr::nest() |>
mutate(
model = purrr::map(data, ~lm(value ~ year, data = .x)),
glance= purrr::map(model, broom::glance)
)
fit_summ <- lm_fits |>
select(geo, glance) |>
unnest(glance) |>
select(geo, r.squared, adj.r.squared, sigma, p.value, AIC, BIC) |>
mutate(across(where(is.numeric), ~round(.x, 4)))
kable(fit_summ, caption = "Súhrn trendových modelov value ~ year") |>
kable_styling(full_width = FALSE)
| geo | r.squared | adj.r.squared | sigma | p.value | AIC | BIC |
|---|---|---|---|---|---|---|
| Czechia | 0.5577 | 0.5086 | 0.9255 | 0.0083 | 33.3050 | 34.4987 |
| Germany | 0.5283 | 0.4758 | 0.4055 | 0.0113 | 15.1489 | 16.3426 |
| Slovakia | 0.7625 | 0.7361 | 1.3249 | 0.0004 | 41.1993 | 42.3930 |
# Vyber krajinu pre diagnostiku: prednostne SK, inak prvá z 'sel'
geo_diag <- if ("SK" %in% sel) "SK" else sel[1]
diag_df <- dat |> filter(geo == geo_diag)
mdl <- lm(value ~ year, data = diag_df)
op <- par(mfrow=c(2,2)) # 4-panel
plot(mdl) # Residuals vs Fitted; QQ; Scale-Location; Residuals vs Leverage
par(op)
Interpretácia (diagnostika): - Residuals vs
Fitted: rezíduá by mali kmitovať okolo 0 bez vzoru
(lineárnosť/homoskedasticita).
- Q–Q: približne po diagonále (normalita rezíduí).
- Scale–Location: konštantný rozptyl naprieč predikciou.
- Residuals vs Leverage: body s veľkou pákou a Cookovou
vzdialenosťou môžu byť vplyvné.
Model je ilustračný; pri porušení predpokladov zváž transformáciu alebo
robustné metódy.
future_years <- (max(dat$year)+1):(max(dat$year)+2)
preds <- lm_fits |>
transmute(geo, model,
newdata = list(tibble(year = c(seq(min(dat$year), max(dat$year)), future_years)))) |>
mutate(pred_tbl = map2(model, newdata, ~mutate(.y, pred = predict(.x, .y)))) |>
select(geo, pred_tbl) |>
unnest(pred_tbl)
ggplot() +
geom_line(data=dat, aes(year, value, color=geo), linewidth=1) +
geom_point(data=dat, aes(year, value, color=geo), size=1.6) +
geom_line(data=preds, aes(year, pred, color=geo), linetype=2) +
labs(title="Lineárny trend a 2-ročná projekcia (ilustratívna)", x="Rok", y="Nezamestnanosť (%)", color="Krajina") +
theme_minimal(base_size = 12)
Interpretácia: R² (v tabuľke vyššie) ukazuje, koľko variability trend vysvetlí; projekcia je orientačná.
last_vals <- dat |>
group_by(geo) |>
filter(year == max(year, na.rm=TRUE)) |>
summarise(year=first(year), value=first(value)) |>
arrange(value) |>
mutate(poradie = row_number())
avg_vals <- dat |>
group_by(geo) |>
summarise(priemer = mean(value, na.rm=TRUE))
rank_tbl <- last_vals |>
left_join(avg_vals, by="geo") |>
mutate(odchylka_od_priemeru = round(value - priemer, 2))
kable(rank_tbl, caption = "Poradie podľa posledného roku a odchýlka od dlhodobého priemeru") |>
kable_styling(full_width = FALSE)
| geo | year | value | poradie | priemer | odchylka_od_priemeru |
|---|---|---|---|---|---|
| Czechia | 2024 | 2.6 | 1 | 3.190909 | -0.59 |
| Germany | 2024 | 3.4 | 2 | 3.581818 | -0.18 |
| Slovakia | 2024 | 5.3 | 3 | 7.745455 | -2.45 |
ggplot(last_vals, aes(reorder(geo, value), value, fill=geo)) +
geom_col(show.legend = FALSE) + coord_flip() +
scale_y_continuous(labels = label_percent(accuracy=0.1, scale=1)) +
labs(title=glue("Miera nezamestnanosti v poslednom roku ({unique(last_vals$year)})"),
x="Krajina", y="Nezamestnanosť (%)") +
theme_minimal(base_size = 12)
Interpretácia: Poradie vyjadruje aktuálne postavenie; kladná odchýlka = nad dlhodobým priemerom krajiny.
pair <- c("SK","DE")
if(!all(pair %in% sel)) pair <- sel[1:2]
pair <- as.character(pair)
pair_dat <- dat |> filter(geo %in% pair) |>
select(geo, year, value) |>
pivot_wider(names_from=geo, values_from=value)
if(all(pair %in% names(pair_dat))){
p1 <- pair[1]; p2 <- pair[2]
pair_dat <- pair_dat |>
mutate(
rozdiel_pb = .data[[p1]] - .data[[p2]],
pomer = .data[[p1]] / .data[[p2]]
)
kable(pair_dat |> mutate(across(where(is.numeric), ~round(.x,2))) |> head(10),
caption = glue("Porovnanie {p1} vs {p2} – prvých 10 rokov")) |>
kable_styling(full_width = FALSE)
}
| year | Czechia | Germany | rozdiel_pb | pomer |
|---|---|---|---|---|
| 2014 | 6.1 | 4.7 | 1.4 | 1.30 |
| 2015 | 5.1 | 4.4 | 0.7 | 1.16 |
| 2016 | 4.0 | 3.9 | 0.1 | 1.03 |
| 2017 | 2.9 | 3.5 | -0.6 | 0.83 |
| 2018 | 2.2 | 3.2 | -1.0 | 0.69 |
| 2019 | 2.0 | 2.9 | -0.9 | 0.69 |
| 2020 | 2.6 | 3.6 | -1.0 | 0.72 |
| 2021 | 2.8 | 3.6 | -0.8 | 0.78 |
| 2022 | 2.2 | 3.1 | -0.9 | 0.71 |
| 2023 | 2.6 | 3.1 | -0.5 | 0.84 |
if(exists("pair_dat") && all(pair %in% names(pair_dat))){
p1 <- pair[1]; p2 <- pair[2]
ggplot(pair_dat, aes(year, pomer)) +
geom_hline(yintercept=1, linetype=2) +
geom_line(linewidth=1, color="grey30") + geom_point(size=1.6, color="grey30") +
labs(title=glue("Pomer nezamestnanosti {p1}/{p2}"), x="Rok", y="Pomer (×)") +
theme_minimal(base_size = 12)
}
Interpretácia: Pomer > 1 = prvá krajina má vyššiu nezamestnanosť; < 1 = nižšiu. Rozdiel v p. b. ukazuje absolútnu medzeru.