library(readxl)
# Read the city data from the Excel workbook and store it as a plain
# data.frame.
podatki <- as.data.frame(read_xlsx("./Mesta.xlsx"))
# Preview the first four rows.
head(podatki, n = 4)
## ID Mesto Drzava Evropa BDP Prebivalstvo Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 1 1 Aarhus Denmark Sever 74958 336411 4.015 6.31175 9.6165 8.704333 5.3665 9.7385
## 2 2 Amsterdam Netherlands Zahod 73317 821752 3.824 6.11850 8.5035 7.907333 6.1800 8.3680
## 3 3 Athens Greece Jug 39864 665780 6.500 5.21975 6.7585 7.153000 3.1625 6.2680
## 4 4 Barcelona Spain Jug 50472 1620000 6.074 5.69925 7.4075 8.403333 5.0290 7.4715
Opis spremenljivk:
# Encode the region as a factor with a fixed level order; the labels are the
# same as the levels, so they do not need to be listed separately.
podatki[["Evropa"]] <- factor(
  podatki[["Evropa"]],
  levels = c("Sever", "Zahod", "Jug", "Jugovzhod", "Srednja")
)
# Descriptive statistics for columns 5 to 12.
summary(podatki[5:12])
## BDP Prebivalstvo Stroski Povezljivost Varnost Zdravje Izobrazevanje
## Min. : 16916 Min. : 79934 Min. :1.000 Min. :2.294 Min. :5.252 Min. :4.763 Min. :1.399
## 1st Qu.: 50472 1st Qu.: 314819 1st Qu.:4.648 1st Qu.:5.352 1st Qu.:7.282 1st Qu.:7.460 1st Qu.:4.243
## Median : 57801 Median : 548808 Median :5.343 Median :5.666 Median :7.982 Median :8.099 Median :4.665
## Mean : 61598 Mean : 993733 Mean :5.508 Mean :5.591 Mean :7.932 Mean :7.837 Mean :4.758
## 3rd Qu.: 66036 3rd Qu.: 851664 3rd Qu.:6.712 3rd Qu.:5.931 3rd Qu.:8.651 3rd Qu.:8.394 3rd Qu.:5.278
## Max. :143304 Max. :13000000 Max. :9.343 Max. :6.729 Max. :9.617 Max. :9.321 Max. :9.027
## Strpnost
## Min. :3.475
## 1st Qu.:6.718
## Median :7.477
## Mean :7.382
## 3rd Qu.:8.325
## Max. :9.739
# Standardise the clustering variables (columns 7 to 12) to z-scores.
podatki_clu_std <- as.data.frame(scale(podatki[7:12]))
# Outlier screening: Euclidean norm of each row of standardised values, i.e.
# the square root of the row-wise sum of squares over all clustering variables.
podatki$Razlicnost <- unname(sqrt(rowSums(podatki_clu_std^2)))
# Show the ten rows with the largest dissimilarity score, largest first.
head(
  podatki[order(-podatki$Razlicnost), c("ID", "Mesto", "Drzava", "Razlicnost")],
  n = 10
)
## ID Mesto Drzava Razlicnost
## 49 49 Lille France 6.078523
## 62 62 Moscow Russia 5.112621
## 22 22 Chisinau Moldova 4.724679
## 77 77 Saint Petersburg Russia 4.642885
## 53 53 London United Kingdom 3.928648
## 79 79 Sofia Bulgaria 3.843641
## 98 98 Zurich Switzerland 3.713133
## 23 23 Cluj-Napoca Romania 3.682526
## 18 18 Bucharest Romania 3.680305
## 96 96 Wroclaw Poland 3.581514
# dplyr supplies %>% and filter(); it has to be attached before this point
# because the next visible library(dplyr) call comes much later in the script.
library(dplyr)
# Remove the four cities that were identified as outliers.
podatki <- podatki %>%
  filter(!Mesto %in% c("Lille", "Moscow", "Chisinau", "Saint Petersburg"))
# Renumber the IDs so that they run consecutively over the remaining rows.
podatki$ID <- seq_len(nrow(podatki))
# Standardise the clustering variables (columns 7 to 12) again on the reduced
# data set.
podatki_clu_std <- as.data.frame(scale(podatki[7:12]))
# Inspect the standardised values of rows 34, 44 and 78.
podatki_clu_std[c(34, 44, 78), ]
## Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 34 1.5316149 0.4839479 0.09577858 -1.3860185 -0.1265902 -2.4939968
## 44 1.7467200 -0.0443300 0.61571284 -1.3881952 -0.1257037 -2.3992258
## 78 0.8581443 0.6467577 1.59288660 -0.7190876 0.4881969 0.3115614
# Graphical display of the distance matrix
library(factoextra)
# Use the documented spelling "euclidean", as the other distance calls in
# this script do, instead of relying on the misspelt "euclidian" being
# tolerated.
Razdalje <- get_dist(podatki_clu_std,
                     method = "euclidean")
fviz_dist(Razdalje,
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))
library(factoextra)
# Assess clustering tendency (Hopkins statistic) without drawing the plot.
get_clust_tendency(
  data = podatki_clu_std,
  n = nrow(podatki_clu_std) - 1,
  graph = FALSE
)
## $hopkins_stat
## [1] 0.6819882
##
## $plot
## NULL
library(factoextra)
library(NbClust)
# kmeans() starts from randomly chosen centres, so fix the RNG state to make
# both diagnostic plots reproducible between runs.
set.seed(1)
# Elbow plot (total within-cluster sum of squares).
fviz_nbclust(podatki_clu_std, kmeans, method = "wss") +
  labs(subtitle = "Metoda preloma")
# Average silhouette width plot.
fviz_nbclust(podatki_clu_std, kmeans, method = "silhouette") +
  labs(subtitle = "Metoda silhuete")
library(NbClust)
# Compare all NbClust indices for k-means solutions with 2 to 10 clusters.
NbClust(
  data = podatki_clu_std,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans",
  index = "all"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 5 proposed 2 as the best number of clusters
## * 13 proposed 3 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 4 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB Silhouette
## 2 0.0702 25.1805 32.7664 -2.1421 123.0754 178440683067 8119.7468 438.0933 3.2303 1.2737 0.3527 1.9100 0.2017
## 3 10.3971 33.0939 10.9700 -0.4376 245.8541 108747667210 3554.7398 323.0404 6.2220 1.7273 0.4309 1.4257 0.2552
## 4 0.3983 28.0670 11.5615 -0.9813 307.4061 100442554201 2675.7037 288.2874 7.7969 1.9356 0.4339 1.4495 0.1880
## 5 1.1170 26.3488 9.7836 -0.7429 386.3749 67746926615 2387.3010 255.4694 9.9699 2.1842 0.4046 1.4426 0.2035
## 6 3.6096 25.0680 3.5122 -0.3528 437.6957 56512208587 1809.1622 230.1676 11.4482 2.4243 0.3812 1.4493 0.1977
## 7 0.1871 22.0556 10.2882 -1.4175 481.7489 48139654088 1803.9156 221.3339 13.6645 2.5211 0.3733 1.5519 0.1832
## 8 0.7561 22.3503 12.5085 -0.3249 548.9523 30760747561 1335.2465 197.9278 16.6808 2.8192 0.3549 1.4171 0.2033
## 9 2.4412 23.6859 6.8515 1.4041 605.3390 21369125853 891.3093 172.7951 18.6747 3.2293 0.4503 1.3153 0.2167
## 10 0.8231 23.2358 7.5277 1.7314 640.7703 18096881198 794.9581 159.9059 20.1894 3.4896 0.4647 1.2991 0.2173
## Duda Pseudot2 Beale Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 0.8314 10.3420 0.7639 0.2539 219.0466 0.3240 -0.2172 0.8074 0.0963 0.0032 2.0289 2.0020 1.8371
## 3 1.5694 -10.8839 -1.2961 0.3682 107.6801 0.5120 1.2127 1.1282 0.1806 0.0050 1.6730 1.7514 1.0155
## 4 2.4644 -24.3634 -2.1517 0.3430 72.0719 0.4640 0.2594 1.7570 0.1436 0.0052 1.7268 1.6536 0.5992
## 5 1.2389 -6.1700 -0.7081 0.3270 51.0939 0.4714 0.5601 2.1607 0.1560 0.0059 1.8437 1.5604 0.5219
## 6 1.1101 -2.0822 -0.3590 0.3114 38.3613 0.4411 1.2276 2.9453 0.1909 0.0061 1.7379 1.4753 0.4784
## 7 2.5648 -12.8123 -2.1339 0.2923 31.6191 0.4071 -0.0543 3.6473 0.1214 0.0063 2.0677 1.4395 0.4536
## 8 2.6901 -18.2196 -2.2157 0.2829 24.7410 0.4268 0.0395 3.6650 0.1214 0.0067 1.8442 1.3637 0.3916
## 9 1.9006 -5.2123 -1.4584 0.2765 19.1995 0.4411 0.3528 3.8720 0.2477 0.0071 1.8405 1.2898 0.3503
## 10 2.0715 -11.8969 -1.7910 0.2667 15.9906 0.4204 0.0532 4.6029 0.2668 0.0075 1.8223 1.2384 0.3330
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6484 27.6503 0.5989
## 3 0.4503 36.6282 1.0000
## 4 0.4889 42.8595 1.0000
## 5 0.5356 27.7430 1.0000
## 6 0.4889 21.9524 1.0000
## 7 0.3979 31.7777 1.0000
## 8 0.4174 40.4854 1.0000
## 9 0.1924 46.1593 1.0000
## 10 0.3758 38.1962 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW Friedman Rubin Cindex DB
## Number_clusters 3.0000 3.0000 3.0000 10.0000 3.0000 3 3.000 3.0000 8.0000 3.0000 2.0000 10.0000
## Value_Index 10.3971 33.0939 21.7963 1.7314 122.7787 61387902848 4565.007 80.2999 3.0163 -0.2454 0.3527 1.2991
## Silhouette Duda PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert SDindex
## Number_clusters 3.0000 2.0000 2.000 2.0000 3.0000 3.0000 3.000 1 2.0000 10.0000 0 3.000
## Value_Index 0.2552 0.8314 10.342 0.7639 0.3682 111.3665 0.512 NA 0.8074 0.2668 0 1.673
## Dindex SDbw
## Number_clusters 0 10.000
## Value_Index 0 0.333
##
## $Best.partition
## [1] 3 3 1 2 2 3 2 3 3 2 2 2 1 3 2 3 2 1 1 3 3 1 2 3 3 2 2 2 3 3 2 2 3 1 3 2 2 2 2 3 3 3 2 1 3 2 2 3 2 3 2 3 2 2 2 2 2 2
## [59] 2 3 2 2 2 1 2 3 2 3 3 3 1 2 3 3 1 2 3 3 3 3 3 1 2 2 3 2 3 2 3 1 1 1 1 3
# Fix the RNG state so that the random starting centres, and with them the
# numbering of the clusters, are the same on every run. Without a seed the
# labels depend on whatever random draws earlier calls happened to make.
set.seed(1)
Razvrstitev <- kmeans(podatki_clu_std,
                      centers = 3,  # number of clusters
                      nstart = 25)  # number of random sets of starting centres
Razvrstitev
## K-means clustering with 3 clusters of sizes 15, 38, 41
##
## Cluster means:
## Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 1 1.3169977 -0.3896841 0.3595290 -1.7465327 -1.1942306 -0.7308649
## 2 -0.3687227 0.5618624 0.7840130 0.1658194 0.3052982 0.8345383
## 3 -0.1400854 -0.3781831 -0.8581812 0.4852891 0.1539543 -0.5060849
##
## Clustering vector:
## [1] 2 2 1 3 3 2 3 2 2 3 3 3 1 2 3 2 3 1 1 2 2 1 3 2 2 3 3 3 2 2 3 3 2 1 2 3 3 3 3 2 2 2 3 1 2 3 3 2 3 2 3 2 3 3 3 3 3 3
## [59] 3 2 3 3 3 1 3 2 3 2 2 2 1 3 2 2 1 3 2 2 2 2 2 1 3 3 2 3 2 3 2 1 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 59.42509 128.94251 134.67283
## (between_SS / total_SS = 42.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault"
# Use the city names as row names of the standardised data.
rownames(podatki_clu_std) <- podatki[["Mesto"]]
library(factoextra)
# Plot the k-means solution.
fviz_cluster(
  Razvrstitev,
  data = podatki_clu_std,
  palette = "Set1",
  repel = TRUE,
  ggtheme = theme_bw()
)
library(dplyr)
library(factoextra)
# Hierarchical clustering: Euclidean distances between the standardised
# observations are passed to hclust() with method "ward.D2".
WARD <- podatki_clu_std %>%
get_dist(method = "euclidean") %>%
hclust(method = "ward.D2")
# Print the summary of the fitted object (call, method, distance, size).
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 94
library(factoextra)
# Draw the dendrogram of the Ward solution.
fviz_dend(x = WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Cut the Ward tree into three groups and keep the membership in the data set.
podatki[["RazvrstitevWARD"]] <- cutree(tree = WARD, k = 3)
# Show the membership of the first rows.
head(podatki[, c("ID", "RazvrstitevWARD")])
## ID RazvrstitevWARD
## 1 1 1
## 2 2 1
## 3 3 2
## 4 4 1
## 5 5 2
## 6 6 1
library(factoextra)
# Hierarchical k-means with three clusters. The metric uses the documented
# spelling "euclidean", like the other distance calls in this script, rather
# than the misspelt "euclidian".
K_MEANS <- hkmeans(podatki_clu_std,
                   k = 3,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")
K_MEANS
## Hierarchical K-means clustering with 3 clusters of sizes 38, 41, 15
##
## Cluster means:
## Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 1 -0.3687227 0.5618624 0.7840130 0.1658194 0.3052982 0.8345383
## 2 -0.1400854 -0.3781831 -0.8581812 0.4852891 0.1539543 -0.5060849
## 3 1.3169977 -0.3896841 0.3595290 -1.7465327 -1.1942306 -0.7308649
##
## Clustering vector:
## Aarhus Amsterdam Athens Barcelona Belfast Bergen Berlin Bern Bilbao
## 1 1 3 2 2 1 2 1 1
## Birmingham Bologna Bordeaux Bratislava Brighton Bristol Brno Brussels Bucharest
## 2 2 2 3 1 2 1 2 3
## Budapest Cambridge Cardiff Cluj-Napoca Cologne Copenhagen Cork Dresden Dublin
## 3 1 1 3 2 1 1 2 2
## Dusseldorf Edinburgh Eindhoven Florence Frankfurt Galway Gdansk Geneva Glasgow
## 2 1 1 2 2 1 3 1 2
## Gothenburg Grenoble Hamburg Hannover Helsinki Innsbruck Karlsruhe Krakow Lausanne
## 2 2 2 1 1 1 2 3 1
## Leeds Leipzig Lisbon Liverpool Ljubljana London Luxembourg Lyon Madrid
## 2 2 1 2 1 2 1 2 2
## Malaga Malmo Manchester Marseille Milan Munich Nantes Naples Nice
## 2 2 2 2 2 1 2 2 2
## Nicosia Oslo Oxford Paris Porto Prague Reykjavik Riga Rome
## 3 2 1 2 1 1 1 3 2
## Rotterdam Seville Sofia Stockholm Stuttgart Tallinn Tampere Tartu The Hague
## 1 1 3 2 1 1 1 1 1
## Thessaloniki Toulouse Turin Turku Uppsala Utrecht Valencia Vienna Vilnius
## 3 2 2 1 2 1 2 1 3
## Warsaw Wroclaw Zagreb Zurich
## 3 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 128.94251 134.67283 59.42509
## (between_SS / total_SS = 42.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault" "data" "hclust"
library(factoextra)
# Plot the hierarchical k-means solution.
fviz_cluster(
  K_MEANS,
  data = podatki_clu_std,
  palette = "Set1",
  repel = TRUE,
  ggtheme = theme_bw()
)
# Cluster centres: mean values of the clustering variables in each group,
# which are the basis for describing the groups.
Povprecja <- Razvrstitev[["centers"]]
Povprecja
## Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 1 1.3169977 -0.3896841 0.3595290 -1.7465327 -1.1942306 -0.7308649
## 2 -0.3687227 0.5618624 0.7840130 0.1658194 0.3052982 0.8345383
## 3 -0.1400854 -0.3781831 -0.8581812 0.4852891 0.1539543 -0.5060849
# Profile plot of the cluster centres: one line per cluster across the
# clustering variables.
Slika <- as.data.frame(Povprecja)
Slika$id <- seq_len(nrow(Slika))
library(tidyr)
# Long format: one row per (cluster, variable) pair; pivot_longer() stores the
# variable in `name` and the centre in `value`.
Slika <- pivot_longer(
  Slika,
  cols = c("Stroski", "Povezljivost", "Varnost",
           "Zdravje", "Izobrazevanje", "Strpnost")
)
# Cluster number as a factor; the levels follow the ids present in the data.
Slika$Skupina <- factor(Slika$id)
# Keep the variables in their original order on the x axis.
Slika$ImeF <- factor(
  Slika$name,
  levels = c("Stroski", "Povezljivost", "Varnost",
             "Zdravje", "Izobrazevanje", "Strpnost")
)
library(ggplot2)
ggplot(Slika, aes(x = ImeF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Skupina, col = Skupina), size = 3) +
  # Colour the lines by cluster as well, so that each profile matches its
  # points (the same mapping is used in the ten-cluster plot further down).
  geom_line(aes(group = id, col = Skupina), linewidth = 1) +
  ylab("Povprečje") +
  xlab("Razvrstitvene spremenljivke") +
  ylim(-2.2, 2.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
# Store each unit's cluster membership in the data set.
podatki[["Uvrstitev"]] <- Razvrstitev[["cluster"]]
# Analysis of variance of every clustering variable on cluster membership.
fit <- aov(
  cbind(Stroski, Povezljivost, Varnost, Zdravje, Izobrazevanje, Strpnost) ~
    as.factor(Uvrstitev),
  data = podatki
)
summary(fit)
## Response Stroski :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 100.88 50.441 23.855 4.681e-09 ***
## Residuals 91 192.41 2.114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Povezljivost :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 3.8839 1.94194 12.575 1.506e-05 ***
## Residuals 91 14.0525 0.15442
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Varnost :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 47.492 23.7462 67.316 < 2.2e-16 ***
## Residuals 91 32.101 0.3528
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Zdravje :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 33.100 16.5499 70.293 < 2.2e-16 ***
## Residuals 91 21.425 0.2354
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Izobrazevanje :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 32.965 16.4824 17.569 3.532e-07 ***
## Residuals 91 85.374 0.9382
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Strpnost :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 64.513 32.256 42.617 8.695e-14 ***
## Residuals 91 68.877 0.757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean BDP within each cluster.
aggregate(x = podatki[["BDP"]], by = list(podatki[["Uvrstitev"]]), FUN = mean)
## Group.1 x
## 1 1 45501.40
## 2 2 70439.11
## 3 3 61733.88
library(car)
# Levene's test of equal BDP variances across the clusters.
leveneTest(
  y = podatki[["BDP"]],
  group = as.factor(podatki[["Uvrstitev"]])
)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 5.9813 0.003626 **
## 91
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
library(rstatix)
# Shapiro-Wilk normality test of BDP, run separately within each cluster.
shapiro_test(group_by(podatki, Uvrstitev), BDP)
## # A tibble: 3 × 4
## Uvrstitev variable statistic p
## <int> <chr> <dbl> <dbl>
## 1 1 BDP 0.749 8.85e- 4
## 2 2 BDP 0.799 9.97e- 6
## 3 3 BDP 0.532 3.05e-10
# One-way ANOVA of BDP on cluster membership.
fit <- aov(BDP ~ as.factor(Uvrstitev), data = podatki)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 6.751e+09 3.375e+09 10.19 0.000101 ***
## Residuals 91 3.014e+10 3.312e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(onewaytests)
# Welch's heteroscedastic F test of BDP across the clusters.
welch.test(BDP ~ Uvrstitev, data = podatki)
##
## Welch's Heteroscedastic F Test (alpha = 0.05)
## -------------------------------------------------------------
## data : BDP and Uvrstitev
##
## statistic : 26.62726
## num df : 2
## denom df : 56.40861
## p.value : 7.194268e-09
##
## Result : Difference is statistically significant.
## -------------------------------------------------------------
# Kruskal-Wallis rank sum test of BDP across the clusters.
kruskal.test(BDP ~ Uvrstitev, data = podatki)
##
## Kruskal-Wallis rank sum test
##
## data: BDP by Uvrstitev
## Kruskal-Wallis chi-squared = 30.337, df = 2, p-value = 2.585e-07
# Effect size for the Kruskal-Wallis test of BDP.
kruskal_effsize(data = podatki, formula = BDP ~ Uvrstitev)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 BDP 94 0.311 eta2[H] large
# Mean Prebivalstvo within each cluster.
aggregate(
  x = podatki[["Prebivalstvo"]],
  by = list(podatki[["Uvrstitev"]]),
  FUN = mean
)
## Group.1 x
## 1 1 799401.8
## 2 2 477247.3
## 3 3 1168588.1
library(car)
# Levene's test of equal Prebivalstvo variances across the clusters.
leveneTest(
  y = podatki[["Prebivalstvo"]],
  group = as.factor(podatki[["Uvrstitev"]])
)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.4084 0.09568 .
## 91
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
library(rstatix)
# Shapiro-Wilk normality test of Prebivalstvo, run separately within each
# cluster.
shapiro_test(group_by(podatki, Uvrstitev), Prebivalstvo)
## # A tibble: 3 × 4
## Uvrstitev variable statistic p
## <int> <chr> <dbl> <dbl>
## 1 1 Prebivalstvo 0.816 5.91e- 3
## 2 2 Prebivalstvo 0.791 6.85e- 6
## 3 3 Prebivalstvo 0.571 9.56e-10
# One-way ANOVA of Prebivalstvo on cluster membership.
fit <- aov(Prebivalstvo ~ as.factor(Uvrstitev), data = podatki)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Uvrstitev) 2 9.443e+12 4.721e+12 4.173 0.0185 *
## Residuals 91 1.030e+14 1.131e+12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(onewaytests)
# Welch's heteroscedastic F test of Prebivalstvo across the clusters.
welch.test(Prebivalstvo ~ Uvrstitev, data = podatki)
##
## Welch's Heteroscedastic F Test (alpha = 0.05)
## -------------------------------------------------------------
## data : Prebivalstvo and Uvrstitev
##
## statistic : 5.373341
## num df : 2
## denom df : 36.81618
## p.value : 0.008963587
##
## Result : Difference is statistically significant.
## -------------------------------------------------------------
# Kruskal-Wallis rank sum test of Prebivalstvo across the clusters.
kruskal.test(Prebivalstvo ~ Uvrstitev, data = podatki)
##
## Kruskal-Wallis rank sum test
##
## data: Prebivalstvo by Uvrstitev
## Kruskal-Wallis chi-squared = 14.994, df = 2, p-value = 0.0005548
# Effect size for the Kruskal-Wallis test of Prebivalstvo.
kruskal_effsize(data = podatki, formula = Prebivalstvo ~ Uvrstitev)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Prebivalstvo 94 0.143 eta2[H] large
# Pearson chi-squared test of association between region (Evropa) and cluster
# membership. The argument expressions are kept exactly as written because
# they appear in the printed data name and in the table labels.
# NOTE(review): the approximation warning recorded below presumably comes from
# small expected counts; check the expected-frequency table before relying on
# the p-value.
hi_kvadrat <- chisq.test(podatki$Evropa, as.factor(podatki$Uvrstitev))
## Warning in chisq.test(podatki$Evropa, as.factor(podatki$Uvrstitev)): Chi-squared approximation may be incorrect
hi_kvadrat
##
## Pearson's Chi-squared test
##
## data: podatki$Evropa and as.factor(podatki$Uvrstitev)
## X-squared = 35.78, df = 8, p-value = 1.926e-05
# Observed frequencies with row and column totals.
addmargins(hi_kvadrat[["observed"]])
## as.factor(podatki$Uvrstitev)
## podatki$Evropa 1 2 3 Sum
## Sever 2 9 5 16
## Zahod 1 18 26 45
## Jug 4 4 10 18
## Jugovzhod 2 0 0 2
## Srednja 6 7 0 13
## Sum 15 38 41 94
# Expected frequencies, rounded to two decimals, with row and column totals.
addmargins(round(hi_kvadrat[["expected"]], digits = 2))
## as.factor(podatki$Uvrstitev)
## podatki$Evropa 1 2 3 Sum
## Sever 2.55 6.47 6.98 16
## Zahod 7.18 18.19 19.63 45
## Jug 2.87 7.28 7.85 18
## Jugovzhod 0.32 0.81 0.87 2
## Srednja 2.07 5.26 5.67 13
## Sum 14.99 38.01 41.00 94
# Pearson residuals, rounded to two decimals. The component is named in full
# instead of relying on `$` partially matching "res" to "residuals".
round(hi_kvadrat$residuals, 2)
## as.factor(podatki$Uvrstitev)
## podatki$Evropa 1 2 3
## Sever -0.35 1.00 -0.75
## Zahod -2.31 -0.04 1.44
## Jug 0.67 -1.21 0.77
## Jugovzhod 2.98 -0.90 -0.93
## Srednja 2.73 0.76 -2.38
library(effectsize)
# Cramér's V as the effect size for the region-by-cluster association.
effectsize::cramers_v(
  podatki[["Evropa"]],
  as.factor(podatki[["Uvrstitev"]])
)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.39 | [0.16, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
# Fix the RNG state so that the random starting centres, and with them the
# numbering of the ten clusters, are the same on every run.
set.seed(1)
Razvrstitev <- kmeans(podatki_clu_std,
                      centers = 10, # number of clusters
                      nstart = 25)  # number of random sets of starting centres
Razvrstitev
## K-means clustering with 10 clusters of sizes 14, 16, 9, 7, 3, 15, 6, 4, 4, 16
##
## Cluster means:
## Stroski Povezljivost Varnost Zdravje Izobrazevanje Strpnost
## 1 0.14022878 0.75890296 -0.4836462 0.28174414 -0.28429204 -0.25769138
## 2 0.31865686 -0.04144810 0.7085389 0.09890407 -0.29214389 0.88363226
## 3 1.32408031 -0.36874782 0.5432295 -2.11757230 -1.56232473 0.02414012
## 4 -2.41380309 0.16166262 0.4016086 -0.33531001 0.61445974 0.76334932
## 5 0.26500962 -1.88742041 -1.4784368 -0.67381291 -0.89695790 -0.87760007
## 6 -0.35406528 1.13454197 1.0177881 0.46484541 0.77267450 0.82090382
## 7 -0.60742387 -0.17089659 -1.1275803 -0.06035544 1.95505745 -0.82583084
## 8 -0.01452385 0.02768203 -2.1127260 0.92124906 -0.49459812 -1.57738924
## 9 1.63775969 -0.53859436 0.4376273 -1.40234353 -0.23562957 -2.55724715
## 10 -0.02589148 -1.00382621 -0.6019711 0.82585661 0.04408859 -0.26739811
##
## Clustering vector:
## Aarhus Amsterdam Athens Barcelona Belfast Bergen Berlin Bern Bilbao
## 6 6 5 1 10 4 7 4 2
## Birmingham Bologna Bordeaux Bratislava Brighton Bristol Brno Brussels Bucharest
## 10 10 10 3 2 10 2 7 3
## Budapest Cambridge Cardiff Cluj-Napoca Cologne Copenhagen Cork Dresden Dublin
## 3 6 2 3 10 6 2 10 7
## Dusseldorf Edinburgh Eindhoven Florence Frankfurt Galway Gdansk Geneva Glasgow
## 2 6 6 1 1 2 9 4 10
## Gothenburg Grenoble Hamburg Hannover Helsinki Innsbruck Karlsruhe Krakow Lausanne
## 1 8 1 1 6 6 10 9 4
## Leeds Leipzig Lisbon Liverpool Ljubljana London Luxembourg Lyon Madrid
## 10 1 2 10 2 7 2 10 10
## Malaga Malmo Manchester Marseille Milan Munich Nantes Naples Nice
## 1 8 10 8 10 6 1 5 1
## Nicosia Oslo Oxford Paris Porto Prague Reykjavik Riga Rome
## 3 4 6 7 2 2 4 3 5
## Rotterdam Seville Sofia Stockholm Stuttgart Tallinn Tampere Tartu The Hague
## 2 2 3 7 1 2 6 6 2
## Thessaloniki Toulouse Turin Turku Uppsala Utrecht Valencia Vienna Vilnius
## 1 10 8 6 1 6 1 6 3
## Warsaw Wroclaw Zagreb Zurich
## 9 9 3 4
##
## Within cluster sum of squares by cluster:
## [1] 18.946645 28.542440 22.241141 15.951866 2.766059 25.309381 13.495058 3.542595 3.359472 21.022889
## (between_SS / total_SS = 72.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault"
# Use the city names as row names of the standardised data.
rownames(podatki_clu_std) <- podatki[["Mesto"]]
library(factoextra)
# Plot the ten-cluster k-means solution.
fviz_cluster(
  Razvrstitev,
  data = podatki_clu_std,
  palette = "kelly",
  repel = TRUE,
  ggtheme = theme_bw()
)
# Profile plot of the cluster centres of the current k-means solution.
Povprecja <- Razvrstitev$centers
Slika <- as.data.frame(Povprecja)
Slika$id <- seq_len(nrow(Slika))
library(tidyr)
# Long format: one row per (cluster, variable) pair; pivot_longer() stores the
# variable in `name` and the centre in `value`.
Slika <- pivot_longer(
  Slika,
  cols = c("Stroski", "Povezljivost", "Varnost",
           "Zdravje", "Izobrazevanje", "Strpnost")
)
# Cluster number as a factor. The levels are taken from the ids in the data,
# so they follow the number of centres instead of a hard-coded list of ten.
Slika$Skupina <- factor(Slika$id)
# Keep the variables in their original order on the x axis.
Slika$ImeF <- factor(
  Slika$name,
  levels = c("Stroski", "Povezljivost", "Varnost",
             "Zdravje", "Izobrazevanje", "Strpnost")
)
library(ggplot2)
ggplot(Slika, aes(x = ImeF, y = value)) +
  geom_hline(yintercept = 0) +
  # One explicit point shape per cluster.
  scale_shape_manual(values = seq_len(nlevels(Slika$Skupina))) +
  theme_bw() +
  geom_point(aes(shape = Skupina, col = Skupina), size = 3) +
  geom_line(aes(group = id, col = Skupina), linewidth = 1) +
  ylab("Povprečje") +
  xlab("Razvrstitvene spremenljivke") +
  ylim(-3.2, 3.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))