Databáza: Eurostat – Unemployment rate (annual
series), UNE_RT_A (linear export).
Cieľ: základné spracovanie časového radu (ročné
údaje).
Súbor:
une_rt_a__custom_18696257_linear.csv (nahrať do projektu,
ideálne do data/).
RPubs: doplniť po publikovaní.
# Locate the Eurostat CSV export.
# Search order: (1) the working directory, (2) a few conventional
# sub-folders, (3) a recursive search below the working directory.
# Stops with an error if the file cannot be found anywhere.
csv_name <- "une_rt_a__custom_18696257_linear.csv"
file_path <- csv_name

if (!file.exists(file_path)) {
  candidates <- file.path(
    c("data", "Data", "datasets", "Downloads"),
    csv_name
  )
  hit <- candidates[file.exists(candidates)]
  if (length(hit) > 0) file_path <- hit[1]
}

if (!file.exists(file_path)) {
  # Sys.glob() has no recursive "**" wildcard (it behaves like "*", i.e. one
  # directory level only), so walk the tree with list.files() instead.
  rec <- list.files(
    path = ".",
    pattern = utils::glob2rx(csv_name),
    recursive = TRUE,
    full.names = TRUE
  )
  if (length(rec) > 0) file_path <- rec[1]
}

if (!file.exists(file_path)) {
  stop(
    paste0(
      "CSV sa nenašlo v pracovnom priečinku: ", getwd(),
      "\nNahraj súbor do koreňa projektu alebo do priečinka data/ a skús znova."
    ),
    call. = FALSE
  )
}
# Read the CSV located above and pass it through clean_names().
raw <- clean_names(readr::read_csv(file_path, show_col_types = FALSE))

# Interactive preview of the first 10 rows of the untouched data.
DT::datatable(
  head(raw, 10),
  options = list(pageLength = 5),
  caption = "Náhľad pôvodných dát"
)

# Expected column names in the Eurostat linear export after clean_names():
# geo, time_period, obs_value, age, sex, unit, freq, obs_flag
# The first three are mandatory; abort early if any of them is missing.
stopifnot(all(c("geo","time_period","obs_value") %in% names(raw)))

# Keep the mandatory columns plus whichever optional dimensions are present.
df <- select(
  raw,
  geo, time_period, obs_value,
  any_of(c("age", "sex", "unit", "freq"))
)
# Helper: choose a single representative level from a vector of labels.
#
# Arguments:
#   x       - vector of labels (coerced to character; NAs are ignored).
#   prefer  - optional exact label to use when it occurs in `x`.
#   pattern - optional regular expression (matched case-insensitively).
#
# Returns a single string, chosen in this order of precedence:
#   1. the most frequent level matching `pattern`,
#   2. `prefer`, if it is one of the levels,
#   3. the most frequent level overall.
# Returns NA_character_ when `x` holds no non-missing values.
pick_level <- function(x, prefer = NULL, pattern = NULL) {
  values <- as.character(x)
  values <- values[!is.na(values)]
  if (length(values) == 0) {
    return(NA_character_)
  }

  # Distinct levels ordered from most to least frequent.
  levels_by_freq <- names(sort(table(values), decreasing = TRUE))

  if (!is.null(pattern)) {
    matched <- grep(pattern, levels_by_freq, ignore.case = TRUE, value = TRUE)
    if (length(matched) > 0) {
      return(matched[1])
    }
  }

  if (!is.null(prefer) && prefer %in% levels_by_freq) {
    return(prefer)
  }

  levels_by_freq[1]
}
# Choose one level per optional dimension; NA means "column absent, do not
# filter on it". pick_level() matches its pattern case-insensitively and falls
# back to the most frequent level when nothing matches.

# NOTE(review): the alternatives "15-24" and "y15" are already covered by
# "15", so this selects the most frequent age level containing "15" -
# confirm that this is the intended age group.
age_pick <- if ("age" %in% names(df)) {
  pick_level(df$age, pattern = "15|15-24|y15")
} else {
  NA_character_
}

# Anchored so that only a leading "T" / "total" / "both" word qualifies; an
# unanchored, case-insensitive "T" would match any label containing a "t".
sex_pick <- if ("sex" %in% names(df)) {
  if ("T" %in% df$sex) {
    "T"
  } else {
    pick_level(df$sex, pattern = "^(T|total|both)\\b")
  }
} else {
  NA_character_
}

unit_pick <- if ("unit" %in% names(df)) {
  pick_level(df$unit, pattern = "^PC|PCT|percent")
} else {
  NA_character_
}

# Anchored for the same reason: an unanchored, case-insensitive "A" would
# also match labels such as "Quarterly".
freq_pick <- if ("freq" %in% names(df)) {
  if ("A" %in% df$freq) {
    "A"
  } else {
    pick_level(df$freq, pattern = "^(A|annual)\\b")
  }
} else {
  NA_character_
}

# Report the chosen filters.
message("Použité filtre -> ",
        "age='", age_pick, "', ",
        "sex='", sex_pick, "', ",
        "unit='", unit_pick, "', ",
        "freq='", freq_pick, "'")
# Build the analysis table: apply each dimension filter only when a level was
# picked for it, then reduce to one value per country and year.
dat <- df
if (!is.na(age_pick)) {
  dat <- filter(dat, age == age_pick)
}
if (!is.na(sex_pick)) {
  dat <- filter(dat, sex == sex_pick)
}
if (!is.na(unit_pick)) {
  dat <- filter(dat, unit == unit_pick)
}
if (!is.na(freq_pick)) {
  dat <- filter(dat, freq == freq_pick)
}

dat <- dat %>%
  transmute(
    geo = factor(geo),
    year = as.integer(time_period),
    # Non-numeric observations become NA; the coercion warning is silenced.
    value = suppressWarnings(as.numeric(obs_value))
  ) %>%
  group_by(geo, year) %>%
  # Rows sharing the same country and year are averaged.
  summarise(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
  arrange(geo, year)
# If the values look like shares (0-1), convert them to percentages.
# The finite-value guard matters: with no usable values max(..., na.rm = TRUE)
# returns -Inf (with a warning), which would wrongly trigger the conversion
# and its message.
if (any(is.finite(dat$value)) && max(dat$value, na.rm = TRUE) <= 1.5) {
  dat <- mutate(dat, value = value * 100)
  message("Poznámka: hodnoty boli v podiele (0–1) → prepočítané na percentá.")
}

# Values outside 0-100 are replaced with NA, and the user is told how many.
n_bad <- sum(dat$value < 0 | dat$value > 100, na.rm = TRUE)
if (n_bad > 0) {
  message("Upozornenie: ", n_bad, " hodnôt mimo 0–100 % → nahradené NA.")
  dat <- dat %>%
    mutate(value = ifelse(value < 0 | value > 100, NA_real_, value))
}
# Overview of the countries and the span of years left after filtering.
dat %>%
  group_by(geo) %>%
  summarise(
    min_year = min(year, na.rm = TRUE),
    max_year = max(year, na.rm = TRUE),
    n_years = dplyr::n(),
    .groups = "drop"
  ) %>%
  kbl(caption = "Rozsah rokov v dátach po filtrovaní") %>%
  kable_classic(full_width = FALSE)
| geo | min_year | max_year | n_years |
|---|---|---|---|
| Germany | 2014 | 2024 | 11 |
| Slovakia | 2014 | 2024 | 11 |
# Per-country descriptive statistics of `value`. Every statistic ignores
# missing values; `n` is the plain row count of the group.
desc <- summarise(
  group_by(dat, geo),
  n      = dplyr::n(),
  min    = min(value, na.rm = TRUE),
  q1     = quantile(value, 0.25, na.rm = TRUE),
  mean   = mean(value, na.rm = TRUE),
  median = median(value, na.rm = TRUE),
  q3     = quantile(value, 0.75, na.rm = TRUE),
  max    = max(value, na.rm = TRUE),
  sd     = sd(value, na.rm = TRUE),
  .groups = "drop"
)
# Render the descriptive statistics, with numeric columns rounded to 2 places.
desc %>%
  mutate(across(where(is.numeric), function(col) round(col, 2))) %>%
  kbl(caption = "Deskriptívne štatistiky – miera nezamestnanosti (%)") %>%
  kable_classic(full_width = FALSE)
| geo | n | min | q1 | mean | median | q3 | max | sd |
|---|---|---|---|---|---|---|---|---|
| Germany | 11 | 5.9 | 6.3 | 6.96 | 7 | 7.6 | 8.2 | 0.80 |
| Slovakia | 11 | 15.8 | 19.5 | 21.35 | 20 | 22.0 | 31.0 | 4.46 |
# Prefer Slovakia/Germany when they occur in the data (under any of the
# spellings below); otherwise plot every country.
preferred <- c("Slovakia","Germany","SK","DE","Slovensko","Deutschland")
geos_in <- intersect(preferred, unique(as.character(dat$geo)))
if (length(geos_in) > 0) {
  dat_plot <- filter(dat, geo %in% geos_in)
} else {
  dat_plot <- dat
}

# Drop rows with a missing year or value and make sure something is left.
dat_plot <- filter(dat_plot, !is.na(year), !is.na(value))
if (nrow(dat_plot) == 0) {
  stop("Po filtrovaní nezostali žiadne hodnoty na vykreslenie.")
}
# Line chart of the yearly values, one coloured series per country, with
# points marking the individual observations.
ggplot(data = dat_plot, mapping = aes(x = year, y = value, color = geo)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  # Y axis labelled with one decimal place and a percent sign.
  scale_y_continuous(
    labels = scales::label_number(accuracy = 0.1, suffix = "%")
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  theme_minimal(base_size = 12) +
  labs(
    title = "Nezamestnanosť (ročné hodnoty)",
    subtitle = "Zdroj: Eurostat UNE_RT_A | mierka: percentá",
    x = "Rok",
    y = "Miera nezamestnanosti",
    color = "Krajina"
  )