# Global chunk options for the whole document: echo the code, but keep
# package messages and warnings out of the rendered output.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

Práca s údajmi

Tradičná práca s databázou

Príklad

Majme údaje o žiakoch, ktoré predstavujú tri premenné - Meno, Priezvisko a známka:

# Working with data frames

# Three parallel vectors, one element per pupil: first name, surname, grade.
Meno <- c("Diana", "Simona", "Katka")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová")
známka <- c(2, 1, 3)

# Combine the vectors into one table; each vector becomes a column that is
# named after the vector.
udaje <- data.frame(Meno, Priezvisko, známka)
print(udaje)
print(udaje$Priezvisko) # `$` addresses a single variable (column) of the data.frame
[1] "Spálová"   "Hrušovská" "Vančová"  
print(mean(udaje$známka))               # average grade; mean() needs a numeric column
[1] 2
print(udaje[udaje$Meno == "Simona", ])  # whole row(s) selected by a condition on the table's own column
print(udaje[3, ])                       # another way to address a whole row: by position
print(udaje[, 2:3])                     # print the second and third columns of the table
print(udaje[1, 1])                      # print a single cell of the table
[1] "Diana"
summary(udaje)                   # basic descriptive statistics of the whole table
     Meno            Priezvisko            známka   
 Length:3           Length:3           Min.   :1.0  
 Class :character   Class :character   1st Qu.:1.5  
 Mode  :character   Mode  :character   Median :2.0  
                                       Mean   :2.0  
                                       3rd Qu.:2.5  
                                       Max.   :3.0  

Pridanie stĺpca

# Logical flag, TRUE for each of the three existing rows
Absolvoval <- rep(TRUE, times = 3)

# Attach the vector as a new right-most column and show the result
udaje <- cbind(udaje, Absolvoval)
print(udaje)

Pridanie riadku

# New record as a one-row data frame; its column names and types have to
# line up with the existing table so that rbind() can stack it underneath.
novy.riadok <- data.frame(
  Meno = "Andrej",
  Priezvisko = "Varga",
  známka = 5,
  Absolvoval = FALSE
)

# Append the record as the last row and show the result
udaje <- rbind(udaje, novy.riadok)
print(udaje)
NA

Tabuľky pomocou balíka kableExtra

library(knitr)
library(kableExtra)

# Data: four parallel vectors, one element per student
Meno <- c("Diana", "Simona", "Katka", "Andrej")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová", "Varga")
Znamka <- c(2, 1, 3, 5)
Absolvoval <- c(TRUE, TRUE, TRUE, FALSE)

udaje <- data.frame(Meno, Priezvisko, Znamka, Absolvoval)

# Table: kable() builds it, kable_styling() then applies the bootstrap look
kable_styling(
  kable(
    udaje,
    digits = 1,
    align = c("l", "l", "c", "c"),
    caption = "Toto je tabuľka"
  ),
  bootstrap_options = c("striped", "hover", "condensed", "responsive"),
  full_width = FALSE,
  position = "center"
)
Toto je tabuľka
Meno Priezvisko Znamka Absolvoval
Diana Spálová 2 TRUE
Simona Hrušovská 1 TRUE
Katka Vančová 3 TRUE
Andrej Varga 5 FALSE
NA
NA
NA
library(knitr)
library(kableExtra)
library(dplyr)

# Data: four parallel vectors, one element per student
Meno <- c("Diana", "Simona", "Katka", "Andrej")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová", "Varga")
Znamka <- c(2, 1, 3, 5)
Absolvoval <- c(TRUE, TRUE, TRUE, FALSE)

udaje <- data.frame(Meno, Priezvisko, Znamka, Absolvoval)

# Pink-themed table: the header (row 0), the rows of students who did not
# pass and the rows of those who passed each get their own background.
kable(
  udaje,
  digits = 1,
  align = c("l", "l", "c", "c"),
  caption = "🌷 Výsledky študentov"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center",
    font_size = 14
  ) %>%
  row_spec(0, bold = TRUE, background = "#f8d7da", color = "#6f1a2e") %>% # header
  row_spec(
    which(!udaje[["Absolvoval"]]),
    background = "#f3bcc8",
    color = "black"
  ) %>% # did not pass
  row_spec(
    which(udaje[["Absolvoval"]]),
    background = "#fde2e4",
    color = "black"
  ) %>% # passed
  column_spec(3, bold = TRUE, color = "#b30059") %>% # grade
  column_spec(4, bold = TRUE)
🌷 Výsledky študentov
Meno Priezvisko Znamka Absolvoval
Diana Spálová 2 TRUE
Simona Hrušovská 1 TRUE
Katka Vančová 3 TRUE
Andrej Varga 5 FALSE
NA

Tidyverse - moderná práca s údajmi

Tidyverse je súbor knižníc, ktoré majú zjednodušiť prácu s údajmi. Majú jednotný komunikačný štandard, vzájomne sa doplňujú.

# Load tidyverse
library(tidyverse)

dplyr - pre manipuláciu s údajmi

Výber a triedenie

udaje <- data.frame(Meno, Priezvisko, Znamka, Absolvoval)

# Selection followed by sorting, rendered as a styled table
udaje %>%
  filter(Absolvoval) %>% # keep only the students who passed
  arrange(Znamka) %>% # order by grade, ascending
  kable(
    caption = "Študenti, ktorí absolvovali",
    align = c("l", "l", "c", "c")
  ) %>%
  kable_styling(
    full_width = FALSE,
    position = "center",
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Študenti, ktorí absolvovali
Meno Priezvisko Znamka Absolvoval
Simona Hrušovská 1 TRUE
Diana Spálová 2 TRUE
Katka Vančová 3 TRUE

Zoskupenie a sumarizácia

udaje <- data.frame(Meno, Priezvisko, Znamka, Absolvoval)

# Grouping and summarising by the pass flag; the displayed headers come from
# col.names, so the intermediate column names never reach the output.
udaje %>%
  group_by(Absolvoval) %>% # one group per value of the pass flag
  summarise(
    priemer = mean(Znamka), # mean grade within the group
    pocet = n() # number of students in the group
  ) %>%
  kable(
    col.names = c("Absolvoval", "Priemer známka", "Počet"),
    caption = "Priemerná známka podľa absolvovania",
    align = "c"
  ) %>%
  kable_styling(
    full_width = FALSE,
    position = "center",
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Priemerná známka podľa absolvovania
Absolvoval Priemer známka Počet
FALSE 5 1
TRUE 2 3

Vytváranie novej premennej

# Create a new variable: a verbal rating derived from the numeric grade.
# Grades other than 1-5 match no branch and therefore become NA.
udaje <- mutate(
  udaje,
  Hodnotenie = case_when(
    Znamka == 1 ~ "Výborný",
    Znamka == 2 ~ "Chválitebný",
    Znamka == 3 ~ "Dobrý",
    Znamka == 4 ~ "Dostatočný",
    Znamka == 5 ~ "Nedostatočný"
  )
)

# Show the extended table
kable_styling(
  kable(
    udaje,
    align = c("l","l","c","c","l"),
    caption = "Študenti s novou premennou Hodnotenie"
  ),
  bootstrap_options = c("striped", "hover", "condensed", "responsive"),
  full_width = FALSE,
  position = "center"
)
Študenti s novou premennou Hodnotenie
Meno Priezvisko Znamka Absolvoval Hodnotenie
Diana Spálová 2 TRUE Chválitebný
Simona Hrušovská 1 TRUE Výborný
Katka Vančová 3 TRUE Dobrý
Andrej Varga 5 FALSE Nedostatočný

Import údajov

Import údajov z .csv alebo .xls

file.exists("C:/Users/Admin/Documents/R_dataset/student_exam_scores.csv") # TRUE when the data file is present at this path

library(readr)

# Read the CSV file; the name is resolved relative to the working directory
# and has to match the uploaded file exactly.
udaje2 <- read_csv(file = "student_exam_scores.csv")

# Peek at the first few rows
head(udaje2)
NA

Grafy

ggplot2 - knižnica pre grafy

# Scatter plot of exam score against hours studied; the point colour encodes
# attendance on a light-to-deep pink gradient.
ggplot(
  data = udaje2,
  mapping = aes(x = hours_studied, y = exam_score, color = attendance_percent)
) +
  geom_point(size = 4) +
  scale_color_gradient(low = "#FFD1DC", high = "#FF69B4") + # pastel pink shades
  labs(
    title = "Vzťah medzi študijnými hodinami a výsledkom skúšky",
    x = "Hodiny strávené štúdiom",
    y = "Výsledok skúšky",
    color = "Dochádzka (%)"
  ) +
  theme_minimal(base_size = 13)

Scatter plot

library(ggplot2)

# Scatter plot with two extra encodings: colour shows attendance and point
# size shows hours of sleep.
ggplot(
  data = udaje2,
  mapping = aes(
    x = hours_studied,
    y = exam_score,
    color = attendance_percent,
    size = sleep_hours
  )
) +
  geom_point(alpha = 0.8) +
  scale_color_gradientn(
    colors = c("#FFB3BA", "#BAE1FF", "#B3FFBA", "#FFF3BA") # pastel colours
  ) +
  labs(
    title = "Vzťah medzi hodinami štúdia, výsledkom skúšky a spánkom",
    x = "Hodiny štúdia",
    y = "Výsledok skúšky",
    color = "Dochádzka (%)",
    size = "Hodiny spánku"
  ) +
  theme_minimal(base_size = 13)

NA
NA
NA

Boxplot

library(ggplot2)
library(dplyr)

# Bin hours_studied into three equal-width intervals and label them
udaje2 <- udaje2 %>%
  mutate(
    hours_category = cut(
      hours_studied,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  )

# Box plot of exam score per study-hours category, filled with blue shades.
# The outline thickness is set with `linewidth`; using `size` for line widths
# is deprecated since ggplot2 3.4.0.
ggplot(udaje2, aes(x = hours_category, y = exam_score, fill = hours_category)) +
  geom_boxplot(color = "#0D1B2A", linewidth = 1) + # dark, high-contrast outline
  scale_fill_manual(values = c(
    "Low" = "#6CA0DC", # light, saturated blue
    "Medium" = "#89CFF0", # pastel blue
    "High" = "#A2D2FF" # soft light blue
  )) +
  labs(
    title = "Výsledky skúšky podľa kategórie hodín štúdia",
    x = "Hodiny štúdia",
    y = "Výsledok skúšky",
    fill = "Kategória hodín"
  ) +
  theme_minimal(base_size = 13)

NA
NA

Základné štatistiky.

knitr - tabuľka

library(dplyr)
library(knitr)

# Basic descriptive statistics for five numeric columns of udaje2:
# row count plus mean, sd, min, lower quartile, median, upper quartile and
# max per column, all computed with missing values removed.
udaje2.stats <- udaje2 %>%
  # Shorten the column names first so that the generated statistic names
  # follow the "<prefix>_<statistic>" pattern (hours_mean, sleep_sd, ...).
  select(
    hours = hours_studied,
    sleep = sleep_hours,
    attendance = attendance_percent,
    previous = previous_scores,
    exam = exam_score
  ) %>%
  summarise(
    n = n(),
    across(
      c(hours, sleep, attendance, previous, exam),
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE),
        min = ~ min(.x, na.rm = TRUE),
        q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )

# Render the one-row summary as a table
kable(udaje2.stats, digits = 2, caption = "Basic statistics for udaje2")
Basic statistics for udaje2
n hours_mean hours_sd hours_min hours_q25 hours_median hours_q75 hours_max sleep_mean sleep_sd sleep_min sleep_q25 sleep_median sleep_q75 sleep_max attendance_mean attendance_sd attendance_min attendance_q25 attendance_median attendance_q75 attendance_max previous_mean previous_sd previous_min previous_q25 previous_median previous_q75 previous_max exam_mean exam_sd exam_min exam_q25 exam_median exam_q75 exam_max
200 6.33 3.23 1 3.5 6.15 9 12 6.62 1.5 4 5.3 6.7 8.03 9 74.83 14.25 50.3 62.2 75.25 87.43 100 66.8 15.66 40 54 67.5 80 95 33.95 6.79 17.1 29.5 34.05 38.75 51.3
NA
NA

t-test: Porovnanie priemerného výsledku skúšky medzi skupinami s nízkym a vysokým počtom hodín štúdia

library(dplyr)

# Split the students into two groups at the median of hours studied.
# na.rm = TRUE keeps the cut-off defined when hours_studied contains missing
# values; without it a single NA would turn every group label into NA.
udaje2 <- udaje2 %>%
  mutate(
    hours_group = ifelse(
      hours_studied <= median(hours_studied, na.rm = TRUE),
      "Low",
      "High"
    )
  )

# Two-sample t-test of exam_score between the Low and High groups
# (t.test() defaults to the Welch variant, i.e. unequal variances)
t.test.result <- t.test(
  exam_score ~ hours_group,
  data = udaje2
)

print(t.test.result)

    Welch Two Sample t-test

data:  exam_score by hours_group
t = 13.84, df = 197.91, p-value < 2.2e-16
alternative hypothesis: true difference in means between group High and group Low is not equal to 0
95 percent confidence interval:
  8.144613 10.851387
sample estimates:
mean in group High  mean in group Low 
            38.704             29.206 

ANOVA: Porovnanie výsledkov skúšky podľa skupiny hodín štúdia

# One-way ANOVA of exam_score across the study-hours groups
anova.result <- aov(
  exam_score ~ hours_group,
  data = udaje2
)

# ANOVA table: degrees of freedom, sums of squares, F value and p-value
summary(anova.result)
             Df Sum Sq Mean Sq F value Pr(>F)    
hours_group   1   4511    4511   191.5 <2e-16 ***
Residuals   198   4663      24                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Lineárna regresia: Predikcia výsledku skúšky

# Linear model on udaje2: exam score explained by hours studied, hours of
# sleep and attendance
model <- lm(
  exam_score ~ hours_studied + sleep_hours + attendance_percent,
  data = udaje2
)

# Coefficient table, residual summary and goodness-of-fit measures
summary(model)

Call:
lm(formula = exam_score ~ hours_studied + sleep_hours + attendance_percent, 
    data = udaje2)

Residuals:
    Min      1Q  Median      3Q     Max 
-8.5534 -2.7064 -0.1704  3.1321  7.6393 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        10.90021    1.95832   5.566 8.49e-08 ***
hours_studied       1.62964    0.08502  19.168  < 2e-16 ***
sleep_hours         0.57941    0.18318   3.163  0.00181 ** 
attendance_percent  0.11907    0.01920   6.202 3.24e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.857 on 196 degrees of freedom
Multiple R-squared:  0.6821,    Adjusted R-squared:  0.6773 
F-statistic: 140.2 on 3 and 196 DF,  p-value: < 2.2e-16
