Vybrala som si súbor dát, v ktorom sú údaje o HDP krajín za roky 1975 – 2025. Skúšala som základné prvky práce s datasetom; s vecami, ktoré som nevedela sama, mi pomohol ChatGPT.
# readr is attached here; note that the file below is actually read with
# base R's read.csv(), not with a readr function.
library(readr)
# Show the working directory and its contents to confirm the CSV is reachable.
getwd()
## [1] "/cloud/project"
list.files()
## [1] "GDP_1975_2025_uploaded.csv" "Práca s údajmi.Rmd"
## [3] "Práca-s-údajmi.html" "Práca-s-údajmi.Rmd"
## [5] "project.Rproj" "rsconnect"
# check.names = FALSE keeps the year headers ("1975", "1976", ...) exactly as
# they are written in the file.
data <- read.csv(file = "GDP_1975_2025_uploaded.csv", check.names = FALSE)
# Descriptive summary of every column in the dataset.
summary(object = data)
## Country 1975 1976 1977
## Length:205 Min. : 4 Min. : 4 Min. : 4.0
## Class :character 1st Qu.: 585 1st Qu.: 685 1st Qu.: 701.5
## Mode :character Median : 2610 Median : 2814 Median : 3139.0
## Mean : 36715 Mean : 39632 Mean : 44606.0
## 3rd Qu.: 13654 3rd Qu.: 15626 3rd Qu.: 18762.5
## Max. :1623400 Max. :1809100 Max. :2013600.0
## NA's :30 NA's :30 NA's :30
## 1978 1979 1980 1981
## Min. : 4 Min. : 4 Min. : 32 Min. : 30
## 1st Qu.: 800 1st Qu.: 870 1st Qu.: 1988 1st Qu.: 1695
## Median : 3020 Median : 3709 Median : 7327 Median : 7578
## Mean : 52869 Mean : 60831 Mean : 78015 Mean : 79502
## 3rd Qu.: 21096 3rd Qu.: 24343 3rd Qu.: 41865 3rd Qu.: 48156
## Max. :2276000 Max. :2543500 Max. :2857325 Max. :3207025
## NA's :30 NA's :30 NA's :61 NA's :60
## 1982 1983 1984 1985
## Min. : 33 Min. : 31 Min. : 34 Min. : 26
## 1st Qu.: 1548 1st Qu.: 1767 1st Qu.: 1579 1st Qu.: 1808
## Median : 8170 Median : 7898 Median : 7891 Median : 6634
## Mean : 78010 Mean : 80101 Mean : 82957 Mean : 86010
## 3rd Qu.: 49535 3rd Qu.: 48401 3rd Qu.: 46358 3rd Qu.: 45498
## Max. :3343800 Max. :3634025 Max. :4037650 Max. :4339000
## NA's :60 NA's :60 NA's :60 NA's :59
## 1986 1987 1988 1989
## Min. : 26 Min. : 30 Min. : 38 Min. : 38
## 1st Qu.: 2008 1st Qu.: 2256 1st Qu.: 2330 1st Qu.: 2336
## Median : 6746 Median : 8149 Median : 7605 Median : 8550
## Mean : 100656 Mean : 115877 Mean : 130296 Mean : 137087
## 3rd Qu.: 46584 3rd Qu.: 51422 3rd Qu.: 60043 3rd Qu.: 58424
## Max. :4579625 Max. :4855250 Max. :5236425 Max. :5641600
## NA's :58 NA's :58 NA's :58 NA's :58
## 1990 1991 1992 1993
## Min. : 37 Min. : 41 Min. : 62 Min. : 59
## 1st Qu.: 2518 1st Qu.: 2528 1st Qu.: 2015 1st Qu.: 2164
## Median : 7494 Median : 6984 Median : 7158 Median : 7349
## Mean : 148936 Mean : 154708 Mean : 150045 Mean : 152550
## 3rd Qu.: 61364 3rd Qu.: 53519 3rd Qu.: 52275 3rd Qu.: 56058
## Max. :5963125 Max. :6158125 Max. :6520325 Max. :6858550
## NA's :53 NA's :52 NA's :36 NA's :34
## 1994 1995 1996 1997
## Min. : 67 Min. : 69 Min. : 81 Min. : 80
## 1st Qu.: 2164 1st Qu.: 2365 1st Qu.: 2874 1st Qu.: 2550
## Median : 8004 Median : 8544 Median : 8794 Median : 9970
## Mean : 162124 Mean : 175698 Mean : 179601 Mean : 178008
## 3rd Qu.: 57517 3rd Qu.: 67803 3rd Qu.: 71986 3rd Qu.: 80572
## Max. :7287250 Max. :7639750 Max. :8073125 Max. :8577550
## NA's :32 NA's :27 NA's :26 NA's :25
## 1998 1999 2000 2001
## Min. : 73 Min. : 77 Min. : 15 Min. : 14
## 1st Qu.: 2585 1st Qu.: 2646 1st Qu.: 1963 1st Qu.: 1992
## Median : 10251 Median : 10502 Median : 9448 Median : 9424
## Mean : 176149 Mean : 182420 Mean : 178796 Mean : 176624
## 3rd Qu.: 75304 3rd Qu.: 75596 3rd Qu.: 61936 3rd Qu.: 68519
## Max. :9062825 Max. :9631175 Max. :10250950 Max. :10581925
## NA's :24 NA's :24 NA's :14 NA's :13
## 2002 2003 2004 2005
## Min. : 17 Min. : 20 Min. : 23 Min. : 23
## 1st Qu.: 2301 1st Qu.: 2854 1st Qu.: 3180 1st Qu.: 3400
## Median : 9972 Median : 11429 Median : 13590 Median : 15434
## Mean : 181505 Mean : 203827 Mean : 228248 Mean : 247334
## 3rd Qu.: 70295 3rd Qu.: 82408 3rd Qu.: 94230 3rd Qu.: 107327
## Max. :10929100 Max. :11456450 Max. :12217175 Max. :13039200
## NA's :12 NA's :12 NA's :11 NA's :11
## 2006 2007 2008 2009
## Min. : 24 Min. : 25 Min. : 32 Min. : 28
## 1st Qu.: 4012 1st Qu.: 4785 1st Qu.: 5704 1st Qu.: 5497
## Median : 18032 Median : 21732 Median : 26161 Median : 24834
## Mean : 267961 Mean : 302552 Mean : 332171 Mean : 314692
## 3rd Qu.: 120486 3rd Qu.: 143066 3rd Qu.: 180204 3rd Qu.: 166425
## Max. :13815600 Max. :14474250 Max. :14769850 Max. :14478050
## NA's :11 NA's :11 NA's :11 NA's :11
## 2010 2011 2012 2013
## Min. : 33 Min. : 40 Min. : 39 Min. : 39
## 1st Qu.: 6888 1st Qu.: 6834 1st Qu.: 7623 1st Qu.: 8098
## Median : 27008 Median : 29524 Median : 30939 Median : 33823
## Mean : 344314 Mean : 380419 Mean : 387838 Mean : 399313
## 3rd Qu.: 175921 3rd Qu.: 205589 3rd Qu.: 213518 3rd Qu.: 228191
## Max. :15048975 Max. :15599725 Max. :16253950 Max. :16880675
## NA's :11 NA's :10 NA's :10 NA's :10
## 2014 2015 2016 2017
## Min. : 39 Min. : 37 Min. : 41 Min. : 45
## 1st Qu.: 8652 1st Qu.: 8202 1st Qu.: 7562 1st Qu.: 8598
## Median : 36396 Median : 33241 Median : 33885 Median : 37205
## Mean : 410172 Mean : 387278 Mean : 393934 Mean : 419054
## 3rd Qu.: 225777 3rd Qu.: 190979 3rd Qu.: 194696 3rd Qu.: 213554
## Max. :17608125 Max. :18295000 Max. :18804900 Max. :19612100
## NA's :10 NA's :10 NA's :10 NA's :10
## 2018 2019 2020 2021
## Min. : 48 Min. : 54 Min. : 52 Min. : 62
## 1st Qu.: 8954 1st Qu.: 9860 1st Qu.: 9588 1st Qu.: 11142
## Median : 39568 Median : 37925 Median : 35334 Median : 37719
## Mean : 444983 Mean : 451423 Mean : 437889 Mean : 504350
## 3rd Qu.: 236196 3rd Qu.: 238412 3rd Qu.: 207481 3rd Qu.: 254613
## Max. :20656525 Max. :21539975 Max. :21354125 Max. :23681175
## NA's :10 NA's :10 NA's :9 NA's :11
## 2022 2023 2024 2025
## Min. : 61 Min. : 63 Min. : 65 Min. : 65
## 1st Qu.: 12650 1st Qu.: 13604 1st Qu.: 13500 1st Qu.: 14214
## Median : 41568 Median : 43631 Median : 47136 Median : 47829
## Mean : 525506 Mean : 548617 Mean : 575687 Mean : 599141
## 3rd Qu.: 274631 3rd Qu.: 285036 3rd Qu.: 291511 3rd Qu.: 303293
## Max. :26006900 Max. :27720725 Max. :29184900 Max. :30507217
## NA's :11 NA's :11 NA's :13 NA's :16
Zaujímalo ma, ktorých 5 krajín malo najvyššie HDP v roku 1975 a ktorých 5 v roku 2025.
# Read the GDP table. check.names = FALSE keeps the year columns named
# "1975", "1976", ... instead of letting R rewrite them into syntactic names.
gdp <- read.csv("GDP_1975_2025_uploaded.csv", check.names = FALSE)

# Return the `n` countries with the highest value in the column `year`.
#   df   - data frame with a "Country" column and one column per year
#   year - name of the year column as a string, e.g. "1975"
#   n    - number of rows to return (default 5)
# Rows with a missing value for that year are sorted last. When `df` has fewer
# than `n` rows, only the available rows are returned (no all-NA padding rows).
# Stops with an error if `df` lacks the "Country" or `year` column.
top_countries <- function(df, year, n = 5) {
  stopifnot(is.data.frame(df), all(c("Country", year) %in% names(df)))
  ranked <- df[order(-df[[year]], na.last = TRUE), c("Country", year)]
  head(ranked, n)
}

# Top 5 countries by GDP in 1975
top_1975 <- top_countries(gdp, "1975")
print(top_1975)
## Country 1975
## 196 United States 1623400
## 171 Soviet Union 685972
## 88 Japan 512861
## 66 Germany 474785
## 62 France 355617
# Top 5 countries by GDP in 2025
top_2025 <- top_countries(gdp, "2025")
print(top_2025)
## Country 2025
## 196 United States 30507217
## 37 China 19231705
## 66 Germany 4744804
## 79 India 4187017
## 88 Japan 4186431
# Načítam knižnice, ktoré mi pomôžu vykresliť pekné grafy
library(ggplot2)
library(reshape2)
# Keep only the USA and China rows and drop the first (Country) column, so the
# remaining columns are the yearly values.
usa <- gdp[gdp$Country == "United States", -1]
china <- gdp[gdp$Country == "China", -1]
# The remaining column names are the years; convert them to numbers for the
# x-axis.
roky <- as.numeric(names(usa))
usa_hdp <- as.numeric(usa[1, ])
china_hdp <- as.numeric(china[1, ])
# Draw the USA series. The y-axis limits are taken from BOTH series: by
# default plot() scales the axis to the first series only, which would clip
# every China value lying outside the USA range.
plot(roky, usa_hdp, type = "l", col = "blue", lwd = 2,
     ylim = range(c(usa_hdp, china_hdp), na.rm = TRUE),
     xlab = "Rok", ylab = "HDP (v miliónoch USD)",
     main = "Vývoj HDP: USA vs. Čína (1975–2025)")
# Add the China series to the same plot.
lines(roky, china_hdp, col = "red", lwd = 2)
# Legend so the two lines can be told apart.
legend("topleft", legend = c("USA", "Čína"),
       col = c("blue", "red"), lwd = 2)
Ako posledný krok si chcem pripraviť peknú tabuľku z údajov s použitím knižníc knitr, dplyr a kableExtra
# Načítanie potrebných knižníc
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(textshaping)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Basic descriptive statistics for the two selected years (1975 and 2025).
# na.rm = TRUE makes every statistic skip missing values. Note that n is the
# number of rows in the table, so it also counts countries whose value for a
# given year is missing.
gdp.stats <- summarise(
  select(gdp, Country, `1975`, `2025`),
  n = n(),
  mean_1975 = mean(`1975`, na.rm = TRUE),
  sd_1975 = sd(`1975`, na.rm = TRUE),
  min_1975 = min(`1975`, na.rm = TRUE),
  max_1975 = max(`1975`, na.rm = TRUE),
  mean_2025 = mean(`2025`, na.rm = TRUE),
  sd_2025 = sd(`2025`, na.rm = TRUE),
  min_2025 = min(`2025`, na.rm = TRUE),
  max_2025 = max(`2025`, na.rm = TRUE)
)
print(gdp.stats)
## n mean_1975 sd_1975 min_1975 max_1975 mean_2025 sd_2025 min_2025 max_2025
## 1 205 36714.87 146931.1 4 1623400 599141.4 2693421 65 30507217
# Formatted table of the five largest economies in 2025.
# row.names = FALSE: top_2025 still carries the row indices of the full data
# frame (196, 37, ...). Without this argument they are rendered as an
# unlabeled first column, and column_spec(1, ...) bolds those indices instead
# of the country names.
top_2025 %>%
  kable(
    row.names = FALSE,
    digits = 2,
    caption = "Top 5 krajín v roku 2025"
  ) %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  # Column 1 is now Country.
  column_spec(1, bold = TRUE) %>%
  # Row 0 is the header row.
  row_spec(0, bold = TRUE, background = "#f2f2f2")
| Country | 2025 |
|---|---|
| United States | 30507217 |
| China | 19231705 |
| Germany | 4744804 |
| India | 4187017 |
| Japan | 4186431 |
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ kableExtra::group_rows() masks dplyr::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the data. check.names = FALSE keeps the year columns named "1975", ...
# With the default, R renames them to "X1975", ..., and the numeric conversion
# of the year labels further down yields only NA.
gdp <- read.csv("GDP_1975_2025_uploaded.csv", check.names = FALSE)
# Transpose the table: Country becomes the row names before t(), so after the
# transpose each column is one country and each row name is one year label.
gdp_t <- gdp %>%
  column_to_rownames("Country") %>%
  t() %>%
  as.data.frame() %>%
  mutate(across(everything(), as.numeric))
# Per-country series with missing values removed
austria <- na.omit(gdp_t$Austria)
australia <- na.omit(gdp_t$Australia)
canada <- na.omit(gdp_t$Canada)
# Year/GDP pairs for Austria; rows where either value is missing are dropped,
# so the years stay aligned with the GDP values.
df_reg <- data.frame(
  year = as.numeric(rownames(gdp_t)),
  gdp = gdp_t$Austria
) %>%
  na.omit()
# 1. Welch two-sample t-test (var.equal = FALSE, i.e. no equal-variance
# assumption) comparing Austria's GDP values with Australia's.
# NOTE(review): both samples are yearly values taken from the same table, so
# they are probably not independent - confirm that an unpaired test is intended.
t_test_result <- t.test(x = austria, y = australia, var.equal = FALSE)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: austria and australia
## t = -5.6102, df = 57.234, p-value = 6.131e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -637043.7 -301926.6
## sample estimates:
## mean of x mean of y
## 262454.0 731939.2
# 2. One-way ANOVA: the three country series are stacked into one vector of
# values, with a factor recording which country each value belongs to.
samples <- list(Austria = austria, Australia = australia, Canada = canada)
values <- unlist(samples, use.names = FALSE)
group <- factor(rep(names(samples), times = lengths(samples)))
anova_result <- aov(values ~ group)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 1.47e+13 7.350e+12 27.7 5.77e-11 ***
## Residuals 150 3.98e+13 2.653e+11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the same values, one box per country
boxplot(
  values ~ group,
  main = "ANOVA: Comparison of GDP",
  ylab = "GDP",
  col = c("lightblue", "lightgreen", "orange")
)
# Years and GDP for Austria, with rows containing missing values removed.
# The row names of gdp_t are the year labels. When the CSV was read without
# check.names = FALSE they carry an "X" prefix ("X1975"), which made
# as.numeric() return NA for every year, so na.omit() then dropped every row.
# Stripping the prefix first works for both "X1975" and "1975".
df_reg <- data.frame(
  year = as.numeric(sub("^X", "", rownames(gdp_t))),
  gdp = gdp_t$Austria
) %>%
  na.omit()
# 4. Correlation analysis
# Correlate the year-aligned columns of gdp_t instead of vectors that had
# their NAs removed independently: if the two countries were missing different
# years, those vectors would pair up values from different years or differ in
# length. use = "complete.obs" drops only the years where either value is NA.
correlation <- cor(gdp_t$Austria, gdp_t$Australia, use = "complete.obs")
print(paste("Korelačný koeficient:", round(correlation, 3)))
## [1] "Korelačný koeficient: 0.963"
# Scatterplot of the same year-aligned pairs
plot(gdp_t$Austria, gdp_t$Australia,
     main = "Correlation: Austria vs Australia GDP",
     xlab = "Austria GDP", ylab = "Australia GDP", pch = 19, col = "purple")
library(tidyverse)
# Load the data. check.names = FALSE matches the first reads of this file:
# without it, `gdp` is overwritten with year columns renamed to "X1975", ...,
# and earlier code that indexes gdp[["1975"]] stops working when re-run.
gdp <- read.csv("GDP_1975_2025_uploaded.csv", check.names = FALSE)
# Transpose the table: Country becomes the row names before t(), so after the
# transpose each column is one country and each row name is one year label.
gdp_t <- gdp %>%
  column_to_rownames("Country") %>%
  t() %>%
  as.data.frame() %>%
  mutate(across(everything(), as.numeric))
# Correlation matrix of the selected countries; "pairwise.complete.obs" uses,
# for each pair of countries, all years in which both have a value.
countries <- c("Austria", "Australia", "Canada", "Albania")
gdp_sel <- gdp_t[, countries]
cor_matrix <- cor(gdp_sel, use = "pairwise.complete.obs")
print(cor_matrix)
## Austria Australia Canada Albania
## Austria 1.0000000 0.9632496 0.9815865 0.9051433
## Australia 0.9632496 1.0000000 0.9908720 0.9447691
## Canada 0.9815865 0.9908720 1.0000000 0.9416496
## Albania 0.9051433 0.9447691 0.9416496 1.0000000
# Draw the correlation matrix as a heatmap with the coefficients as labels
library(ggcorrplot)
ggcorrplot(cor_matrix, lab = TRUE, title = "Korelačná matica HDP")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.