Majme údaje o žiakoch, ktoré predstavujú tri premenné - Meno, Priezvisko a známka:
# Working with data frames
# Three parallel vectors, one element per pupil: first name, surname, grade.
# `<-` is used for assignment, matching the rest of the file.
Meno <- c("Diana", "Simona", "Katka")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová")
známka <- c(2, 1, 3)
[1] "Spálová" "Hrušovská" "Vančová"
[1] NA
[1] "Diana"
Meno Priezvisko známka
Length:3 Length:3 Min. :1.0
Class :character Class :character 1st Qu.:1.5
Mode :character Mode :character Median :2.0
Mean :2.0
3rd Qu.:2.5
Max. :3.0
# One additional pupil, built as a single-row data frame.
# rbind() on data frames pairs columns up by name, so the column names used
# here have to be the same as the ones in `udaje`.
# NOTE(review): `udaje` is created outside this snippet; this assumes it
# already has an `Absolvoval` column -- confirm.
novy.riadok <- data.frame(
  Meno = "Andrej",
  Priezvisko = "Varga",
  známka = 5,
  Absolvoval = FALSE
)

# Attach the new row at the bottom and show the result.
udaje <- rbind(udaje, novy.riadok)
print(udaje)
library(knitr)
library(kableExtra)
# Data: one vector per column, then combined into a data frame
Meno <- c("Diana", "Simona", "Katka", "Andrej")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová", "Varga")
Znamka <- c(2, 1, 3, 5)
Absolvoval <- c(TRUE, TRUE, TRUE, FALSE)
udaje <- data.frame(
  Meno = Meno,
  Priezvisko = Priezvisko,
  Znamka = Znamka,
  Absolvoval = Absolvoval
)

# Table: pipe the data frame into kable(), then apply Bootstrap styling
udaje %>%
  kable(
    caption = "Toto je tabuľka",
    align = c("l", "l", "c", "c"),
    digits = 1
  ) %>%
  kable_styling(
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Meno | Priezvisko | Znamka | Absolvoval |
---|---|---|---|
Diana | Spálová | 2 | TRUE |
Simona | Hrušovská | 1 | TRUE |
Katka | Vančová | 3 | TRUE |
Andrej | Varga | 5 | FALSE |
library(knitr)
library(kableExtra)
library(dplyr)
# Data: one vector per column, then combined into a data frame
Meno <- c("Diana", "Simona", "Katka", "Andrej")
Priezvisko <- c("Spálová", "Hrušovská", "Vančová", "Varga")
Znamka <- c(2, 1, 3, 5)
Absolvoval <- c(TRUE, TRUE, TRUE, FALSE)
udaje <- data.frame(
  Meno = Meno,
  Priezvisko = Priezvisko,
  Znamka = Znamka,
  Absolvoval = Absolvoval
)

# Table with a pink colour scheme
kable(
  udaje,
  caption = "🌷 Výsledky študentov",
  align = c("l", "l", "c", "c"),
  digits = 1
) %>%
  kable_styling(
    font_size = 14,
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  # Header row
  row_spec(0, color = "#6f1a2e", background = "#f8d7da", bold = TRUE) %>%
  # Rows of students who did not pass
  row_spec(
    which(!udaje[["Absolvoval"]]),
    color = "black",
    background = "#f3bcc8"
  ) %>%
  # Rows of students who passed
  row_spec(
    which(udaje[["Absolvoval"]]),
    color = "black",
    background = "#fde2e4"
  ) %>%
  # Grade column
  column_spec(3, color = "#b30059", bold = TRUE) %>%
  column_spec(4, bold = TRUE)
Meno | Priezvisko | Znamka | Absolvoval |
---|---|---|---|
Diana | Spálová | 2 | TRUE |
Simona | Hrušovská | 1 | TRUE |
Katka | Vančová | 3 | TRUE |
Andrej | Varga | 5 | FALSE |
Tidyverse je súbor knižníc, ktoré majú zjednodušiť prácu s údajmi. Majú jednotný komunikačný štandard a vzájomne sa dopĺňajú.
udaje <- data.frame(
  Meno = Meno,
  Priezvisko = Priezvisko,
  Znamka = Znamka,
  Absolvoval = Absolvoval
)

# Selection followed by sorting
udaje %>%
  # Absolvoval is already logical, so it serves as the filter condition itself:
  # only students who passed are kept
  filter(Absolvoval) %>%
  # Order by grade, ascending
  arrange(Znamka) %>%
  kable(
    caption = "Študenti, ktorí absolvovali",
    align = c("l", "l", "c", "c")
  ) %>%
  kable_styling(
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Meno | Priezvisko | Znamka | Absolvoval |
---|---|---|---|
Simona | Hrušovská | 1 | TRUE |
Diana | Spálová | 2 | TRUE |
Katka | Vančová | 3 | TRUE |
udaje <- data.frame(
  Meno = Meno,
  Priezvisko = Priezvisko,
  Znamka = Znamka,
  Absolvoval = Absolvoval
)

# Grouping and summarising by pass status
udaje %>%
  # One group per value of Absolvoval
  group_by(Absolvoval) %>%
  # The summary column names are only intermediate; the displayed headers
  # come from `col.names` below
  summarise(
    priemer = mean(Znamka), # mean grade within the group
    pocet = n() # number of students in the group
  ) %>%
  kable(
    col.names = c("Absolvoval", "Priemer známka", "Počet"),
    align = "c",
    caption = "Priemerná známka podľa absolvovania"
  ) %>%
  kable_styling(
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Absolvoval | Priemer známka | Počet |
---|---|---|
FALSE | 5 | 1 |
TRUE | 2 | 3 |
# Creating a new variable: a verbal rating looked up from the numeric grade.
# match() returns the position of each grade in 1:5, which indexes the label
# vector; any grade that is not exactly 1, 2, 3, 4 or 5 yields NA.
udaje <- udaje %>%
  mutate(
    Hodnotenie = c(
      "Výborný",
      "Chválitebný",
      "Dobrý",
      "Dostatočný",
      "Nedostatočný"
    )[match(Znamka, 1:5)]
  )

# Display the table
udaje %>%
  kable(
    caption = "Študenti s novou premennou Hodnotenie",
    align = c("l", "l", "c", "c", "l")
  ) %>%
  kable_styling(
    position = "center",
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Meno | Priezvisko | Znamka | Absolvoval | Hodnotenie |
---|---|---|---|---|
Diana | Spálová | 2 | TRUE | Chválitebný |
Simona | Hrušovská | 1 | TRUE | Výborný |
Katka | Vančová | 3 | TRUE | Dobrý |
Andrej | Varga | 5 | FALSE | Nedostatočný |
# Attach ggplot2 before the first plot so that ggplot() is available here
library(ggplot2)

# Scatter plot of exam score against hours studied; point colour encodes
# attendance
ggplot(udaje2, aes(x = hours_studied, y = exam_score, color = attendance_percent)) +
  geom_point(size = 4) +
  scale_color_gradient(low = "#FFD1DC", high = "#FF69B4") + # pastel pink shades
  labs(
    title = "Vzťah medzi študijnými hodinami a výsledkom skúšky",
    x = "Hodiny strávené štúdiom",
    y = "Výsledok skúšky",
    color = "Dochádzka (%)"
  ) +
  theme_minimal(base_size = 13)
library(ggplot2)
# Scatter plot of exam score against hours studied; point colour encodes
# attendance and point size encodes hours of sleep
ggplot(
  data = udaje2,
  mapping = aes(
    x = hours_studied,
    y = exam_score,
    color = attendance_percent,
    size = sleep_hours
  )
) +
  geom_point(alpha = 0.8) +
  # Pastel colours
  scale_color_gradientn(
    colors = c("#FFB3BA", "#BAE1FF", "#B3FFBA", "#FFF3BA")
  ) +
  labs(
    x = "Hodiny štúdia",
    y = "Výsledok skúšky",
    color = "Dochádzka (%)",
    size = "Hodiny spánku",
    title = "Vzťah medzi hodinami štúdia, výsledkom skúšky a spánkom"
  ) +
  theme_minimal(base_size = 13)
library(ggplot2)
library(dplyr)
# Split hours_studied into three categories; with `breaks = 3`, cut() divides
# the observed range into three equal-width intervals
udaje2 <- udaje2 %>%
  mutate(
    hours_category = cut(
      hours_studied,
      breaks = 3,
      labels = c("Low", "Medium", "High")
    )
  )

# Box plot of exam score per category, in shades of blue.
# `linewidth` sets the outline thickness; using `size` for line widths is
# deprecated since ggplot2 3.4.0.
ggplot(udaje2, aes(x = hours_category, y = exam_score, fill = hours_category)) +
  geom_boxplot(color = "#0D1B2A", linewidth = 1) + # dark contrasting outline
  scale_fill_manual(values = c(
    "Low" = "#6CA0DC", # light, saturated blue
    "Medium" = "#89CFF0", # pastel blue
    "High" = "#A2D2FF" # soft light blue
  )) +
  labs(
    title = "Výsledky skúšky podľa kategórie hodín štúdia",
    x = "Hodiny štúdia",
    y = "Výsledok skúšky",
    fill = "Kategória hodín"
  ) +
  theme_minimal(base_size = 13)
library(dplyr)
library(knitr)
# Summarise basic statistics for udaje2.
# Each of the five variables gets the same seven statistics (mean, sd, min,
# lower quartile, median, upper quartile, max), all ignoring missing values.
# The statistics are declared once and applied with across(); the result is a
# single row with `n` followed by columns named <variable>_<statistic>, e.g.
# hours_mean, hours_sd, ..., exam_max.
udaje2.stats <- udaje2 %>%
  # Short variable names become the prefixes of the output columns
  rename(
    hours = hours_studied,
    sleep = sleep_hours,
    attendance = attendance_percent,
    previous = previous_scores,
    exam = exam_score
  ) %>%
  summarise(
    n = n(),
    across(
      # Columns are listed explicitly so that `n` is not summarised as well
      c(hours, sleep, attendance, previous, exam),
      list(
        mean = function(x) mean(x, na.rm = TRUE),
        sd = function(x) sd(x, na.rm = TRUE),
        min = function(x) min(x, na.rm = TRUE),
        q25 = function(x) quantile(x, 0.25, na.rm = TRUE),
        median = function(x) median(x, na.rm = TRUE),
        q75 = function(x) quantile(x, 0.75, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )

# Build the table
kable(udaje2.stats, digits = 2, caption = "Basic statistics for udaje2")
n | hours_mean | hours_sd | hours_min | hours_q25 | hours_median | hours_q75 | hours_max | sleep_mean | sleep_sd | sleep_min | sleep_q25 | sleep_median | sleep_q75 | sleep_max | attendance_mean | attendance_sd | attendance_min | attendance_q25 | attendance_median | attendance_q75 | attendance_max | previous_mean | previous_sd | previous_min | previous_q25 | previous_median | previous_q75 | previous_max | exam_mean | exam_sd | exam_min | exam_q25 | exam_median | exam_q75 | exam_max |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
200 | 6.33 | 3.23 | 1 | 3.5 | 6.15 | 9 | 12 | 6.62 | 1.5 | 4 | 5.3 | 6.7 | 8.03 | 9 | 74.83 | 14.25 | 50.3 | 62.2 | 75.25 | 87.43 | 100 | 66.8 | 15.66 | 40 | 54 | 67.5 | 80 | 95 | 33.95 | 6.79 | 17.1 | 29.5 | 34.05 | 38.75 | 51.3 |
library(dplyr)
# Split the students into two groups at the median of hours studied.
# na.rm = TRUE keeps the cut-off defined when hours_studied contains missing
# values, in line with the na.rm = TRUE used for the summary statistics;
# rows whose hours_studied is NA get an NA group.
udaje2 <- udaje2 %>%
  mutate(
    hours_group = ifelse(
      hours_studied <= median(hours_studied, na.rm = TRUE),
      "Low",
      "High"
    )
  )

# Two-sample t-test of exam_score between the Low and High groups
# (t.test() defaults to the Welch variant, i.e. unequal variances)
t.test.result <- t.test(
  exam_score ~ hours_group,
  data = udaje2
)
print(t.test.result)
Welch Two Sample t-test
data: exam_score by hours_group
t = 13.84, df = 197.91, p-value < 2.2e-16
alternative hypothesis: true difference in means between group High and group Low is not equal to 0
95 percent confidence interval:
8.144613 10.851387
sample estimates:
mean in group High mean in group Low
38.704 29.206
# One-way ANOVA of exam_score across the study-hours groups
anova.result <- aov(
  formula = exam_score ~ hours_group,
  data = udaje2
)
summary(anova.result)
Df Sum Sq Mean Sq F value Pr(>F)
hours_group 1 4511 4511 191.5 <2e-16 ***
Residuals 198 4663 24
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Linear model on udaje2: exam score explained by hours studied, hours of
# sleep and attendance
model <- lm(
  formula = exam_score ~ hours_studied + sleep_hours + attendance_percent,
  data = udaje2
)
summary(model)
Call:
lm(formula = exam_score ~ hours_studied + sleep_hours + attendance_percent,
data = udaje2)
Residuals:
Min 1Q Median 3Q Max
-8.5534 -2.7064 -0.1704 3.1321 7.6393
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.90021 1.95832 5.566 8.49e-08 ***
hours_studied 1.62964 0.08502 19.168 < 2e-16 ***
sleep_hours 0.57941 0.18318 3.163 0.00181 **
attendance_percent 0.11907 0.01920 6.202 3.24e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.857 on 196 degrees of freedom
Multiple R-squared: 0.6821, Adjusted R-squared: 0.6773
F-statistic: 140.2 on 3 and 196 DF, p-value: < 2.2e-16