Tento report načíta súbor Databaza.csv (stĺpce: Okres, Obec, 2024M12, …, 2024M01), prevedie údaje do „long“ formátu a následne vytvorí základné grafy, vypočíta opisné štatistiky, vykoná testy hypotéz a odhadne lineárnu regresiu.

Balíčky a dáta

suppressPackageStartupMessages({
  library(dplyr)
  library(readr)
  library(tidyr)
  library(ggplot2)
  library(knitr)
  library(kableExtra)
  library(broom)
  library(stringr)
})
# ---- Load and reshape the data ----
# Databaza.csv is semicolon-delimited.
udaje_raw <- read_delim("Databaza.csv", delim = ";", show_col_types = FALSE)

# Validation: both identifier columns must be present. Report exactly which
# ones are missing instead of the generic stopifnot() message.
chybajuce_cols <- setdiff(c("Okres", "Obec"), names(udaje_raw))
if (length(chybajuce_cols) > 0) {
  stop(
    "V súbore Databaza.csv chýbajú povinné stĺpce: ",
    paste(chybajuce_cols, collapse = ", "),
    call. = FALSE
  )
}

# Month columns are the ones named 2024Mxx (e.g. 2024M12, 2024M01).
mesiac_cols <- grep("^2024M\\d{2}$", names(udaje_raw), value = TRUE)
if (length(mesiac_cols) == 0) {
  stop("Nenašiel som žiadne stĺpce typu 2024Mxx (napr. 2024M12, 2024M01).")
}

# Coerce every month column to numeric while the table is still wide.
# pivot_longer() has to stack all month columns into one vector and errors
# when they do not share a type (e.g. one column read as character, the rest
# as double), so coercing only after the pivot would be too late.
# Values that cannot be parsed become NA; the coercion warning is suppressed.
udaje_num <- udaje_raw |>
  mutate(across(all_of(mesiac_cols), ~ suppressWarnings(as.numeric(.x))))

# Long format: one row per original row and month, with the year and month
# number parsed out of the column name (characters 1-4 and 6-7 of "2024Mxx").
udaje_long <- udaje_num |>
  pivot_longer(
    cols = all_of(mesiac_cols),
    names_to = "YM",
    values_to = "Hodnota"
  ) |>
  mutate(
    Rok    = as.integer(str_sub(YM, 1, 4)),
    Mesiac = as.integer(str_sub(YM, 6, 7))
  )

# Wide subset for the simple models: identifiers plus whichever of the four
# listed months exist (any_of() silently skips absent ones).
udaje_dec <- udaje_num |>
  dplyr::select(Okres, Obec, any_of(c("2024M12", "2024M11", "2024M06", "2024M01")))

Vizualizácie

Scatter plot: 2024M11 vs 2024M12

# Scatter plot of the 2024M12 column against the 2024M11 column of udaje_dec,
# coloured by the Okres column. When either month column is absent, an empty
# base-graphics canvas carrying an explanatory title is drawn instead.
if (!all(c("2024M12", "2024M11") %in% names(udaje_dec))) {
  plot.new(); title("Na scatter plot chýbajú 2024M11 a/alebo 2024M12")
} else {
  ggplot(data = udaje_dec, mapping = aes(`2024M11`, `2024M12`, colour = Okres)) +
    geom_point(alpha = 0.8) +
    labs(
      title  = "Vzťah medzi 2024M11 a 2024M12 naprieč obcami",
      x      = "Hodnota v 2024M11",
      y      = "Hodnota v 2024M12",
      colour = "Okres"
    ) +
    theme_minimal()
}

Boxplot: rozdelenie hodnôt podľa mesiacov

# Box plot of Hodnota for each month number; Mesiac is converted to a factor
# so that every month gets its own box on a discrete x axis.
udaje_long |>
  ggplot(aes(factor(Mesiac), Hodnota)) +
  theme_minimal() +
  geom_boxplot(colour = "darkblue", fill = "lightblue") +
  labs(
    title = "Rozdelenie hodnôt podľa mesiacov (rok 2024)",
    x     = "Mesiac",
    y     = "Hodnota"
  )

Základné štatistiky

# Descriptive statistics of Hodnota per month number.
# n counts the non-missing values only: every other statistic below is
# computed with na.rm = TRUE, so counting all rows (n()) would overstate the
# sample size behind the mean/sd/quantiles whenever a month contains NAs.
stat_tbl <- udaje_long |>
  group_by(Mesiac) |>
  summarise(
    n      = sum(!is.na(Hodnota)),
    mean   = mean(Hodnota, na.rm = TRUE),
    sd     = sd(Hodnota, na.rm = TRUE),
    min    = min(Hodnota, na.rm = TRUE),
    q25    = quantile(Hodnota, 0.25, na.rm = TRUE),
    median = median(Hodnota, na.rm = TRUE),
    q75    = quantile(Hodnota, 0.75, na.rm = TRUE),
    max    = max(Hodnota, na.rm = TRUE),
    .groups = "drop"
  )

# Plain knitr table, rounded to two decimals.
kable(stat_tbl, digits = 2, caption = "Základné štatistiky podľa mesiacov (2024)")
Základné štatistiky podľa mesiacov (2024)
Mesiac n mean sd min q25 median q75 max
1 200 4912.38 11845.18 102 880.50 1701.5 3250.50 112794
2 200 4913.83 11845.32 103 878.75 1702.5 3260.50 112740
3 200 4915.00 11844.50 103 881.75 1703.5 3257.00 112686
4 200 4916.87 11844.14 103 879.75 1707.5 3264.00 112661
5 200 4919.06 11846.72 103 882.00 1707.0 3266.00 112584
6 200 4921.61 11848.82 103 886.50 1710.0 3261.50 112553
7 200 4923.44 11853.09 101 887.75 1713.0 3267.50 112528
8 200 4925.03 11853.64 101 890.00 1717.0 3269.25 112469
9 200 4926.24 11854.74 100 889.50 1719.0 3268.25 112449
10 200 4927.26 11854.32 100 886.75 1717.5 3273.75 112436
11 200 4929.35 11856.14 101 892.00 1719.0 3276.50 112400
12 200 4930.90 11862.18 101 891.50 1720.0 3279.00 112447

# Styled kableExtra version of the monthly summary table.
# The extra header row spans the first two columns with a blank label and the
# remaining seven columns with "Štatistiky hodnôt"; the first column and the
# header row are set in bold.
kable(stat_tbl, caption = "Základné štatistiky podľa mesiacov (2024)", digits = 2) |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) |>
  column_spec(column = 1, bold = TRUE) |>
  row_spec(row = 0, bold = TRUE, background = "#f2f2f2") |>
  add_header_above(header = c(" " = 2, "Štatistiky hodnôt" = 7))
Základné štatistiky podľa mesiacov (2024)
Štatistiky hodnôt
Mesiac n mean sd min q25 median q75 max
1 200 4912.38 11845.18 102 880.50 1701.5 3250.50 112794
2 200 4913.83 11845.32 103 878.75 1702.5 3260.50 112740
3 200 4915.00 11844.50 103 881.75 1703.5 3257.00 112686
4 200 4916.87 11844.14 103 879.75 1707.5 3264.00 112661
5 200 4919.06 11846.72 103 882.00 1707.0 3266.00 112584
6 200 4921.61 11848.82 103 886.50 1710.0 3261.50 112553
7 200 4923.44 11853.09 101 887.75 1713.0 3267.50 112528
8 200 4925.03 11853.64 101 890.00 1717.0 3269.25 112469
9 200 4926.24 11854.74 100 889.50 1719.0 3268.25 112449
10 200 4927.26 11854.32 100 886.75 1717.5 3273.75 112436
11 200 4929.35 11856.14 101 892.00 1719.0 3276.50 112400
12 200 4930.90 11862.18 101 891.50 1720.0 3279.00 112447

Testovanie hypotéz

t-test: porovnanie priemerov (2024M12 vs 2024M06)

# Compare the mean of 2024M12 with the mean of 2024M06.
# Both vectors are taken from udaje_long, which is pivoted from a single wide
# table, so element i of each vector belongs to the same source row (the same
# Okres/Obec). The two samples are therefore paired, not independent, and a
# paired t-test is used: it tests the mean within-row difference instead of
# letting the between-row spread swamp it, as an unpaired Welch test does.
# NOTE: any previously rendered output of this chunk came from the unpaired
# test; re-knit the report to refresh it.
if (all(c("2024M12", "2024M06") %in% names(udaje_raw))) {
  t.test.result <- t.test(
    udaje_long$Hodnota[udaje_long$YM == "2024M12"],
    udaje_long$Hodnota[udaje_long$YM == "2024M06"],
    paired = TRUE
  )
  t.test.result
} else {
  "Na t-test chýba 2024M12 a/alebo 2024M06."
}

    Welch Two Sample t-test

data:  udaje_long$Hodnota[udaje_long$YM == "2024M12"] and udaje_long$Hodnota[udaje_long$YM == "2024M06"]
t = 0.007836, df = 398, p-value = 0.9938
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -2321.433  2340.013
sample estimates:
mean of x mean of y 
  4930.90   4921.61 

ANOVA: Hodnota ~ Mesiac

# One-way ANOVA of Hodnota with the month number treated as a categorical
# factor; summary() prints the ANOVA table.
# NOTE(review): udaje_long appears to hold one value per Obec for every month,
# so observations in different month groups are probably not independent —
# a repeated-measures or mixed model may be more appropriate; confirm.
anova.result <- aov(Hodnota ~ factor(Mesiac), data = udaje_long)
summary(anova.result)
                 Df    Sum Sq   Mean Sq F value Pr(>F)
factor(Mesiac)   11 8.663e+04      7875       0      1
Residuals      2388 3.354e+11 140439858               

Lineárna regresia

Predikujeme 2024M12 pomocou 2024M11 a 2024M01 (ak sú dostupné).

# Ordinary least squares fit of the 2024M12 column on the 2024M11 and 2024M01
# columns of udaje_dec. When any of the three columns is missing, `model` is
# set to NULL (the later chunks test for that) and a message is shown instead
# of the model summary.
if (!all(c("2024M12", "2024M11", "2024M01") %in% names(udaje_dec))) {
  model <- NULL
  "Na regresiu chýbajú niektoré z 2024M12, 2024M11, 2024M01."
} else {
  model <- lm(`2024M12` ~ `2024M11` + `2024M01`, data = udaje_dec)
  summary(model)
}

Call:
lm(formula = `2024M12` ~ `2024M11` + `2024M01`, data = udaje_dec)

Residuals:
    Min      1Q  Median      3Q     Max 
-51.294  -2.214   0.940   3.381  31.350 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.778536   0.633074  -2.809  0.00546 ** 
`2024M11`    1.066640   0.006654 160.291  < 2e-16 ***
`2024M01`   -0.066194   0.006661  -9.938  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 8.195 on 197 degrees of freedom
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 2.085e+08 on 2 and 197 DF,  p-value: < 2.2e-16

Koeficienty a intervaly spoľahlivosti

# Coefficient table for the fitted model: estimates, standard errors,
# t statistics, p-values, 95% confidence intervals and significance markers.
# Skipped with a short message when no model was fitted (model is NULL).
if (!is.null(model)) {
  coef.tbl <- broom::tidy(model, conf.int = TRUE) |>
    dplyr::mutate(
      # Replace the raw term names with readable labels; the month terms are
      # matched including the backticks around the non-syntactic column names.
      term = dplyr::recode(term,
                           "(Intercept)" = "Intercept",
                           "`2024M11`"   = "Hodnota v 2024M11",
                           "`2024M01`"   = "Hodnota v 2024M01"),
      stars = dplyr::case_when(
        p.value < 0.001 ~ "***",
        p.value < 0.01  ~ "**",
        p.value < 0.05  ~ "*",
        p.value < 0.1   ~ "·",
        TRUE            ~ ""
      )
    ) |>
    dplyr::transmute(
      Term         = term,
      Estimate     = estimate,
      `Std. Error` = std.error,
      `t value`    = statistic,
      `p value`    = p.value,
      # Fixed three-decimal formatting: pasting round()-ed numbers drops
      # trailing zeros (e.g. "1.08" next to "1.054"), which makes the interval
      # bounds look as if they had different precision.
      `95% CI`     = sprintf("[%.3f, %.3f]", conf.low, conf.high),
      Sig          = stars
    )

  coef.tbl |>
    kable(digits = 3, caption = "OLS koeficienty: 2024M12 ~ 2024M11 + 2024M01") |>
    kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover","condensed")) |>
    column_spec(1, bold = TRUE) |>
    row_spec(0, bold = TRUE, background = "#f2f2f2") |>
    footnote(
      general = "Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.",
      threeparttable = TRUE
    )
} else {
  cat("Koeficienty nie sú k dispozícii (model sa nefitol).")
}
OLS koeficienty: 2024M12 ~ 2024M11 + 2024M01
Term Estimate Std. Error t value p value 95% CI Sig
Intercept -1.779 0.633 -2.809 0.005 [-3.027, -0.53] **
Hodnota v 2024M11 1.067 0.007 160.291 0.000 [1.054, 1.08] ***
Hodnota v 2024M01 -0.066 0.007 -9.938 0.000 [-0.079, -0.053] ***
Note:
Signif. codes: *** p<0.001, ** p<0.01, * p<0.05, · p<0.1.

Štatistiky kvality modelu

# One-row table of model-level fit statistics taken from broom::glance():
# R-squared, adjusted R-squared, the F statistic with its p-value, AIC, BIC
# and the number of observations. Prints a short message when no model exists.
if (is.null(model)) {
  cat("Fit štatistiky nie sú k dispozícii (model sa nefitol).")
} else {
  # select() with new = old pairs picks and renames the columns in one step.
  fit.tbl <- dplyr::select(
    broom::glance(model),
    `R-squared`      = r.squared,
    `Adj. R-squared` = adj.r.squared,
    `F-statistic`    = statistic,
    `F p-value`      = p.value,
    `AIC`            = AIC,
    `BIC`            = BIC,
    `Num. obs.`      = nobs
  )

  kable(fit.tbl, caption = "Model Fit Statistics", digits = 3) |>
    kable_styling(bootstrap_options = c("condensed"), full_width = FALSE)
}
Model Fit Statistics
R-squared Adj. R-squared F-statistic F p-value AIC BIC Num. obs.
1 1 208498789 0 1413.939 1427.133 200
