Razvrščanje v skupine

library(readxl)
# Read the city data set from the Excel workbook and convert the result of
# read_xlsx() to a plain data frame in one step.
podatki <- as.data.frame(read_xlsx("./Mesta.xlsx"))

# Preview the first four rows.
head(podatki, n = 4)

##   ID     Mesto      Drzava Evropa   BDP Prebivalstvo Stroski Povezljivost Varnost  Zdravje Izobrazevanje Strpnost
## 1  1    Aarhus     Denmark  Sever 74958       336411   4.015      6.31175  9.6165 8.704333        5.3665   9.7385
## 2  2 Amsterdam Netherlands  Zahod 73317       821752   3.824      6.11850  8.5035 7.907333        6.1800   8.3680
## 3  3    Athens      Greece    Jug 39864       665780   6.500      5.21975  6.7585 7.153000        3.1625   6.2680
## 4  4 Barcelona       Spain    Jug 50472      1620000   6.074      5.69925  7.4075 8.403333        5.0290   7.4715

Opis spremenljivk:

ID: oznaka mesta
Mesto: ime mesta
Drzava: država, v kateri leži mesto
Evropa: geografski del Evrope (Sever, Zahod, Jug, Jugovzhod, Srednja)
BDP: bruto domači proizvod (PPP v USD)
Prebivalstvo: število prebivalcev
Stroski: lestvica 1–10; višja vrednost pomeni cenejše bivanje
Povezljivost: lestvica 1–10; višja vrednost pomeni boljšo povezljivost mesta
Varnost: lestvica 1–10; višja vrednost pomeni bolj varno mesto
Zdravje: lestvica 1–10; višja vrednost pomeni boljšo zdravstveno oskrbo
Izobrazevanje: lestvica 1–10; višja vrednost pomeni boljši izobraževalni sistem
Strpnost: lestvica 1–10; višja vrednost pomeni večjo strpnost

# Encode the European region as a factor with an explicit level order.
# The labels are the same strings as the levels, so only levels are supplied.
podatki$Evropa <- factor(
  podatki$Evropa,
  levels = c("Sever", "Zahod", "Jug", "Jugovzhod", "Srednja")
)

# Descriptive statistics for columns 5 to 12 (BDP through Strpnost).
summary(podatki[5:12])

##       BDP          Prebivalstvo         Stroski       Povezljivost      Varnost         Zdravje      Izobrazevanje  
##  Min.   : 16916   Min.   :   79934   Min.   :1.000   Min.   :2.294   Min.   :5.252   Min.   :4.763   Min.   :1.399  
##  1st Qu.: 50472   1st Qu.:  314819   1st Qu.:4.648   1st Qu.:5.352   1st Qu.:7.282   1st Qu.:7.460   1st Qu.:4.243  
##  Median : 57801   Median :  548808   Median :5.343   Median :5.666   Median :7.982   Median :8.099   Median :4.665  
##  Mean   : 61598   Mean   :  993733   Mean   :5.508   Mean   :5.591   Mean   :7.932   Mean   :7.837   Mean   :4.758  
##  3rd Qu.: 66036   3rd Qu.:  851664   3rd Qu.:6.712   3rd Qu.:5.931   3rd Qu.:8.651   3rd Qu.:8.394   3rd Qu.:5.278  
##  Max.   :143304   Max.   :13000000   Max.   :9.343   Max.   :6.729   Max.   :9.617   Max.   :9.321   Max.   :9.027  
##     Strpnost    
##  Min.   :3.475  
##  1st Qu.:6.718  
##  Median :7.477  
##  Mean   :7.382  
##  3rd Qu.:8.325  
##  Max.   :9.739

# Standardise the clustering variables (columns 7 to 12) to z-scores.
podatki_clu_std <- as.data.frame(scale(podatki[7:12]))

# Outlier screening: Euclidean distance of each unit from the centre of the
# standardised space, i.e. the square root of the row-wise sum of squared
# z-scores over all clustering variables.
podatki$Razlicnost <- sqrt(rowSums(podatki_clu_std^2))

# Show the ten units with the largest dissimilarity.
head(
  podatki[order(-podatki$Razlicnost), c("ID", "Mesto", "Drzava", "Razlicnost")],
  10
)

##    ID            Mesto         Drzava Razlicnost
## 49 49            Lille         France   6.078523
## 62 62           Moscow         Russia   5.112621
## 22 22         Chisinau        Moldova   4.724679
## 77 77 Saint Petersburg         Russia   4.642885
## 53 53           London United Kingdom   3.928648
## 79 79            Sofia       Bulgaria   3.843641
## 98 98           Zurich    Switzerland   3.713133
## 23 23      Cluj-Napoca        Romania   3.682526
## 18 18        Bucharest        Romania   3.680305
## 96 96          Wroclaw         Poland   3.581514

# dplyr supplies %>% and filter(); no earlier visible line attaches it, so
# attach it here before the first use.
library(dplyr)

# Remove the four units with the largest dissimilarity (treated as outliers).
podatki <- podatki %>%
  filter(!Mesto %in% c("Lille", "Moscow", "Chisinau", "Saint Petersburg"))

# Renumber the remaining units consecutively.
podatki$ID <- seq_len(nrow(podatki))

# Re-standardise the clustering variables (columns 7 to 12) on the reduced data.
podatki_clu_std <- as.data.frame(scale(podatki[7:12]))

# Inspect the standardised values in rows 34, 44 and 78.
podatki_clu_std[c(34, 44, 78), ]

##      Stroski Povezljivost    Varnost    Zdravje Izobrazevanje   Strpnost
## 34 1.5316149    0.4839479 0.09577858 -1.3860185    -0.1265902 -2.4939968
## 44 1.7467200   -0.0443300 0.61571284 -1.3881952    -0.1257037 -2.3992258
## 78 0.8581443    0.6467577 1.59288660 -0.7190876     0.4881969  0.3115614

# Graphical display of the distance matrix between units.
library(factoextra)
# Use the documented spelling "euclidean", consistent with the other distance
# calls in this script.
Razdalje <- get_dist(podatki_clu_std,
                     method = "euclidean")

fviz_dist(Razdalje,
          gradient = list(low = "darkred",
                          mid = "grey95",
                          high = "white"))

library(factoextra) 
# Hopkins statistic for clustering tendency, computed with n equal to the
# number of units minus one; the accompanying plot is not produced.
get_clust_tendency(
  data = podatki_clu_std,
  graph = FALSE,
  n = nrow(podatki_clu_std) - 1
)

## $hopkins_stat
## [1] 0.6819882
## 
## $plot
## NULL

library(factoextra)
library(NbClust)

# Elbow method: within-cluster sum of squares for k-means solutions.
fviz_nbclust(x = podatki_clu_std, FUNcluster = kmeans, method = "wss") +
  labs(subtitle = "Metoda preloma")

# Silhouette method for the same k-means solutions.
fviz_nbclust(x = podatki_clu_std, FUNcluster = kmeans, method = "silhouette") +
  labs(subtitle = "Metoda silhuete")

library(NbClust)
# Compare k-means solutions with 2 to 10 clusters using all NbClust indices
# on Euclidean distances.
NbClust(
  data = podatki_clu_std,
  method = "kmeans",
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 13 proposed 3 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 4 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

## $All.index
##         KL      CH Hartigan     CCC    Scott      Marriot    TrCovW   TraceW Friedman  Rubin Cindex     DB Silhouette
## 2   0.0702 25.1805  32.7664 -2.1421 123.0754 178440683067 8119.7468 438.0933   3.2303 1.2737 0.3527 1.9100     0.2017
## 3  10.3971 33.0939  10.9700 -0.4376 245.8541 108747667210 3554.7398 323.0404   6.2220 1.7273 0.4309 1.4257     0.2552
## 4   0.3983 28.0670  11.5615 -0.9813 307.4061 100442554201 2675.7037 288.2874   7.7969 1.9356 0.4339 1.4495     0.1880
## 5   1.1170 26.3488   9.7836 -0.7429 386.3749  67746926615 2387.3010 255.4694   9.9699 2.1842 0.4046 1.4426     0.2035
## 6   3.6096 25.0680   3.5122 -0.3528 437.6957  56512208587 1809.1622 230.1676  11.4482 2.4243 0.3812 1.4493     0.1977
## 7   0.1871 22.0556  10.2882 -1.4175 481.7489  48139654088 1803.9156 221.3339  13.6645 2.5211 0.3733 1.5519     0.1832
## 8   0.7561 22.3503  12.5085 -0.3249 548.9523  30760747561 1335.2465 197.9278  16.6808 2.8192 0.3549 1.4171     0.2033
## 9   2.4412 23.6859   6.8515  1.4041 605.3390  21369125853  891.3093 172.7951  18.6747 3.2293 0.4503 1.3153     0.2167
## 10  0.8231 23.2358   7.5277  1.7314 640.7703  18096881198  794.9581 159.9059  20.1894 3.4896 0.4647 1.2991     0.2173
##      Duda Pseudot2   Beale Ratkowsky     Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  0.8314  10.3420  0.7639    0.2539 219.0466     0.3240 -0.2172  0.8074 0.0963 0.0032  2.0289 2.0020 1.8371
## 3  1.5694 -10.8839 -1.2961    0.3682 107.6801     0.5120  1.2127  1.1282 0.1806 0.0050  1.6730 1.7514 1.0155
## 4  2.4644 -24.3634 -2.1517    0.3430  72.0719     0.4640  0.2594  1.7570 0.1436 0.0052  1.7268 1.6536 0.5992
## 5  1.2389  -6.1700 -0.7081    0.3270  51.0939     0.4714  0.5601  2.1607 0.1560 0.0059  1.8437 1.5604 0.5219
## 6  1.1101  -2.0822 -0.3590    0.3114  38.3613     0.4411  1.2276  2.9453 0.1909 0.0061  1.7379 1.4753 0.4784
## 7  2.5648 -12.8123 -2.1339    0.2923  31.6191     0.4071 -0.0543  3.6473 0.1214 0.0063  2.0677 1.4395 0.4536
## 8  2.6901 -18.2196 -2.2157    0.2829  24.7410     0.4268  0.0395  3.6650 0.1214 0.0067  1.8442 1.3637 0.3916
## 9  1.9006  -5.2123 -1.4584    0.2765  19.1995     0.4411  0.3528  3.8720 0.2477 0.0071  1.8405 1.2898 0.3503
## 10 2.0715 -11.8969 -1.7910    0.2667  15.9906     0.4204  0.0532  4.6029 0.2668 0.0075  1.8223 1.2384 0.3330
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6484            27.6503       0.5989
## 3          0.4503            36.6282       1.0000
## 4          0.4889            42.8595       1.0000
## 5          0.5356            27.7430       1.0000
## 6          0.4889            21.9524       1.0000
## 7          0.3979            31.7777       1.0000
## 8          0.4174            40.4854       1.0000
## 9          0.1924            46.1593       1.0000
## 10         0.3758            38.1962       1.0000
## 
## $Best.nc
##                      KL      CH Hartigan     CCC    Scott     Marriot   TrCovW  TraceW Friedman   Rubin Cindex      DB
## Number_clusters  3.0000  3.0000   3.0000 10.0000   3.0000           3    3.000  3.0000   8.0000  3.0000 2.0000 10.0000
## Value_Index     10.3971 33.0939  21.7963  1.7314 122.7787 61387902848 4565.007 80.2999   3.0163 -0.2454 0.3527  1.2991
##                 Silhouette   Duda PseudoT2  Beale Ratkowsky     Ball PtBiserial Frey McClain    Dunn Hubert SDindex
## Number_clusters     3.0000 2.0000    2.000 2.0000    3.0000   3.0000      3.000    1  2.0000 10.0000      0   3.000
## Value_Index         0.2552 0.8314   10.342 0.7639    0.3682 111.3665      0.512   NA  0.8074  0.2668      0   1.673
##                 Dindex   SDbw
## Number_clusters      0 10.000
## Value_Index          0  0.333
## 
## $Best.partition
##  [1] 3 3 1 2 2 3 2 3 3 2 2 2 1 3 2 3 2 1 1 3 3 1 2 3 3 2 2 2 3 3 2 2 3 1 3 2 2 2 2 3 3 3 2 1 3 2 2 3 2 3 2 3 2 2 2 2 2 2
## [59] 2 3 2 2 2 1 2 3 2 3 3 3 1 2 3 3 1 2 3 3 3 3 3 1 2 2 3 2 3 2 3 1 1 1 1 3

# k-means chooses its starting centres at random. Fix the RNG seed so that the
# partition and, in particular, the arbitrary cluster numbering used by the
# later group comparisons are reproducible between runs.
# NOTE: cluster numbers may differ from output rendered by earlier unseeded runs.
set.seed(1)
Razvrstitev <- kmeans(podatki_clu_std,
                      centers = 3,  # number of clusters
                      nstart = 25)  # number of random sets of starting centres

Razvrstitev

## K-means clustering with 3 clusters of sizes 15, 38, 41
## 
## Cluster means:
##      Stroski Povezljivost    Varnost    Zdravje Izobrazevanje   Strpnost
## 1  1.3169977   -0.3896841  0.3595290 -1.7465327    -1.1942306 -0.7308649
## 2 -0.3687227    0.5618624  0.7840130  0.1658194     0.3052982  0.8345383
## 3 -0.1400854   -0.3781831 -0.8581812  0.4852891     0.1539543 -0.5060849
## 
## Clustering vector:
##  [1] 2 2 1 3 3 2 3 2 2 3 3 3 1 2 3 2 3 1 1 2 2 1 3 2 2 3 3 3 2 2 3 3 2 1 2 3 3 3 3 2 2 2 3 1 2 3 3 2 3 2 3 2 3 3 3 3 3 3
## [59] 3 2 3 3 3 1 3 2 3 2 2 2 1 3 2 2 1 3 2 2 2 2 2 1 3 3 2 3 2 3 2 1 1 1 1 2
## 
## Within cluster sum of squares by cluster:
## [1]  59.42509 128.94251 134.67283
##  (between_SS / total_SS =  42.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
## [8] "iter"         "ifault"

# Use the city names as row names of the standardised data passed to the plot.
row.names(podatki_clu_std) <- podatki$Mesto

library(factoextra)
# Cluster plot of the k-means solution.
fviz_cluster(
  object = Razvrstitev,
  data = podatki_clu_std,
  repel = TRUE,
  palette = "Set1",
  ggtheme = theme_bw()
)

DODATNO

library(dplyr)
library(factoextra)

# Hierarchical clustering: Euclidean distances between units, agglomerated
# with Ward's method ("ward.D2"). The piped value is passed explicitly as `.`.
WARD <- podatki_clu_std %>%
  get_dist(x = ., method = "euclidean") %>%
  hclust(d = ., method = "ward.D2")

WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 94

library(factoextra)

# Dendrogram of the Ward hierarchical clustering.
fviz_dend(x = WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Cut the Ward tree into three groups and store the membership of each unit.
podatki$RazvrstitevWARD <- cutree(tree = WARD, k = 3)

# Preview the assignment for the first units.
head(podatki[c("ID", "RazvrstitevWARD")])

##   ID RazvrstitevWARD
## 1  1               1
## 2  2               1
## 3  3               2
## 4  4               1
## 5  5               2
## 6  6               1

library(factoextra)
# Hierarchical k-means with three clusters: the hierarchical step uses
# Euclidean distances and Ward's method. The metric uses the documented
# spelling "euclidean", consistent with the other distance calls in this script.
K_MEANS <- hkmeans(podatki_clu_std,
                   k = 3,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS

## Hierarchical K-means clustering with 3 clusters of sizes 38, 41, 15
## 
## Cluster means:
##      Stroski Povezljivost    Varnost    Zdravje Izobrazevanje   Strpnost
## 1 -0.3687227    0.5618624  0.7840130  0.1658194     0.3052982  0.8345383
## 2 -0.1400854   -0.3781831 -0.8581812  0.4852891     0.1539543 -0.5060849
## 3  1.3169977   -0.3896841  0.3595290 -1.7465327    -1.1942306 -0.7308649
## 
## Clustering vector:
##       Aarhus    Amsterdam       Athens    Barcelona      Belfast       Bergen       Berlin         Bern       Bilbao 
##            1            1            3            2            2            1            2            1            1 
##   Birmingham      Bologna     Bordeaux   Bratislava     Brighton      Bristol         Brno     Brussels    Bucharest 
##            2            2            2            3            1            2            1            2            3 
##     Budapest    Cambridge      Cardiff  Cluj-Napoca      Cologne   Copenhagen         Cork      Dresden       Dublin 
##            3            1            1            3            2            1            1            2            2 
##   Dusseldorf    Edinburgh    Eindhoven     Florence    Frankfurt       Galway       Gdansk       Geneva      Glasgow 
##            2            1            1            2            2            1            3            1            2 
##   Gothenburg     Grenoble      Hamburg     Hannover     Helsinki    Innsbruck    Karlsruhe       Krakow     Lausanne 
##            2            2            2            1            1            1            2            3            1 
##        Leeds      Leipzig       Lisbon    Liverpool    Ljubljana       London   Luxembourg         Lyon       Madrid 
##            2            2            1            2            1            2            1            2            2 
##       Malaga        Malmo   Manchester    Marseille        Milan       Munich       Nantes       Naples         Nice 
##            2            2            2            2            2            1            2            2            2 
##      Nicosia         Oslo       Oxford        Paris        Porto       Prague    Reykjavik         Riga         Rome 
##            3            2            1            2            1            1            1            3            2 
##    Rotterdam      Seville        Sofia    Stockholm    Stuttgart      Tallinn      Tampere        Tartu    The Hague 
##            1            1            3            2            1            1            1            1            1 
## Thessaloniki     Toulouse        Turin        Turku      Uppsala      Utrecht     Valencia       Vienna      Vilnius 
##            3            2            2            1            2            1            2            1            3 
##       Warsaw      Wroclaw       Zagreb       Zurich 
##            3            3            3            1 
## 
## Within cluster sum of squares by cluster:
## [1] 128.94251 134.67283  59.42509
##  (between_SS / total_SS =  42.1 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
##  [8] "iter"         "ifault"       "data"         "hclust"

library(factoextra)
# Cluster plot of the hierarchical k-means solution.
fviz_cluster(
  object = K_MEANS,
  data = podatki_clu_std,
  repel = TRUE,
  palette = "Set1",
  ggtheme = theme_bw()
)

DODATNO

# Cluster centres: the mean of each clustering variable within each cluster,
# which is the basis for describing the clusters.
Povprecja <- Razvrstitev[["centers"]]
print(Povprecja)

##      Stroski Povezljivost    Varnost    Zdravje Izobrazevanje   Strpnost
## 1  1.3169977   -0.3896841  0.3595290 -1.7465327    -1.1942306 -0.7308649
## 2 -0.3687227    0.5618624  0.7840130  0.1658194     0.3052982  0.8345383
## 3 -0.1400854   -0.3781831 -0.8581812  0.4852891     0.1539543 -0.5060849

# Profile plot of the cluster centres: one line per cluster across the
# clustering variables.
Slika <- as.data.frame(Povprecja)
Slika$id <- seq_len(nrow(Slika))  # cluster number; safe for zero rows

library(tidyr)
# Long format: one row per (cluster, variable); every column except id is a
# clustering variable.
Slika <- pivot_longer(Slika, cols = -id)

# Cluster as a factor; levels are taken from the data instead of being
# hard-coded, so the code works for any number of clusters.
Slika$Skupina <- factor(Slika$id)

# Keep the variables on the x axis in the column order of the centres matrix.
Slika$ImeF <- factor(Slika$name, levels = colnames(Povprecja))


library(ggplot2)
ggplot(Slika, aes(x = ImeF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Skupina, col = Skupina), size = 3) +
  # Colour each line by cluster so it matches its points, as in the
  # 10-cluster profile plot later in the script.
  geom_line(aes(group = id, col = Skupina), linewidth = 1) +
  ylab("Povprečje") +
  xlab("Razvrstitvene spremenljivke") +
  ylim(-2.2, 2.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))

# Store each unit's k-means cluster membership in the data set.
podatki[["Uvrstitev"]] <- Razvrstitev[["cluster"]]

# Analysis of variance of each clustering variable (taken from podatki) with
# cluster membership as the factor.
fit <- aov(
  cbind(Stroski, Povezljivost, Varnost, Zdravje, Izobrazevanje, Strpnost) ~
    as.factor(Uvrstitev),
  data = podatki
)

summary(fit)

##  Response Stroski :
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2 100.88  50.441  23.855 4.681e-09 ***
## Residuals            91 192.41   2.114                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Povezljivost :
##                      Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2  3.8839 1.94194  12.575 1.506e-05 ***
## Residuals            91 14.0525 0.15442                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Varnost :
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2 47.492 23.7462  67.316 < 2.2e-16 ***
## Residuals            91 32.101  0.3528                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Zdravje :
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2 33.100 16.5499  70.293 < 2.2e-16 ***
## Residuals            91 21.425  0.2354                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Izobrazevanje :
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2 32.965 16.4824  17.569 3.532e-07 ***
## Residuals            91 85.374  0.9382                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response Strpnost :
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(Uvrstitev)  2 64.513  32.256  42.617 8.695e-14 ***
## Residuals            91 68.877   0.757                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Mean BDP within each cluster.
aggregate(
  x = podatki[["BDP"]],
  by = list(podatki[["Uvrstitev"]]),
  FUN = mean
)

##   Group.1        x
## 1       1 45501.40
## 2       2 70439.11
## 3       3 61733.88

library(car)
# Levene's test for homogeneity of BDP variance across clusters.
leveneTest(y = podatki$BDP, group = as.factor(podatki$Uvrstitev))

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value   Pr(>F)   
## group  2  5.9813 0.003626 **
##       91                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(dplyr)
library(rstatix)
# Shapiro-Wilk normality test of BDP, run separately within each cluster.
shapiro_test(group_by(podatki, Uvrstitev), BDP)

## # A tibble: 3 × 4
##   Uvrstitev variable statistic        p
##       <int> <chr>        <dbl>    <dbl>
## 1         1 BDP          0.749 8.85e- 4
## 2         2 BDP          0.799 9.97e- 6
## 3         3 BDP          0.532 3.05e-10

# One-way ANOVA: BDP by cluster membership.
fit <- aov(BDP ~ as.factor(Uvrstitev), data = podatki)

# ANOVA table.
summary(fit)

##                      Df    Sum Sq   Mean Sq F value   Pr(>F)    
## as.factor(Uvrstitev)  2 6.751e+09 3.375e+09   10.19 0.000101 ***
## Residuals            91 3.014e+10 3.312e+08                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(onewaytests)
# Welch's heteroscedastic F test: BDP by cluster.
welch.test(formula = BDP ~ Uvrstitev, data = podatki)

## 
##   Welch's Heteroscedastic F Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : BDP and Uvrstitev 
## 
##   statistic  : 26.62726 
##   num df     : 2 
##   denom df   : 56.40861 
##   p.value    : 7.194268e-09 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------

# Kruskal-Wallis rank sum test: BDP by cluster.
kruskal.test(formula = BDP ~ Uvrstitev, data = podatki)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  BDP by Uvrstitev
## Kruskal-Wallis chi-squared = 30.337, df = 2, p-value = 2.585e-07

# Effect size (eta squared based on the H statistic) for the Kruskal-Wallis
# test of BDP by cluster.
kruskal_effsize(podatki, BDP ~ Uvrstitev)

## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 BDP      94   0.311 eta2[H] large

# Mean Prebivalstvo (population) within each cluster.
aggregate(
  x = podatki[["Prebivalstvo"]],
  by = list(podatki[["Uvrstitev"]]),
  FUN = mean
)

##   Group.1         x
## 1       1  799401.8
## 2       2  477247.3
## 3       3 1168588.1

library(car)
# Levene's test for homogeneity of Prebivalstvo variance across clusters.
leveneTest(y = podatki$Prebivalstvo, group = as.factor(podatki$Uvrstitev))

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  2  2.4084 0.09568 .
##       91                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(dplyr)
library(rstatix)
# Shapiro-Wilk normality test of Prebivalstvo, run separately within each cluster.
shapiro_test(group_by(podatki, Uvrstitev), Prebivalstvo)

## # A tibble: 3 × 4
##   Uvrstitev variable     statistic        p
##       <int> <chr>            <dbl>    <dbl>
## 1         1 Prebivalstvo     0.816 5.91e- 3
## 2         2 Prebivalstvo     0.791 6.85e- 6
## 3         3 Prebivalstvo     0.571 9.56e-10

# One-way ANOVA: Prebivalstvo by cluster membership.
fit <- aov(Prebivalstvo ~ as.factor(Uvrstitev), data = podatki)

# ANOVA table.
summary(fit)

##                      Df    Sum Sq   Mean Sq F value Pr(>F)  
## as.factor(Uvrstitev)  2 9.443e+12 4.721e+12   4.173 0.0185 *
## Residuals            91 1.030e+14 1.131e+12                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(onewaytests)
# Welch's heteroscedastic F test: Prebivalstvo by cluster.
welch.test(formula = Prebivalstvo ~ Uvrstitev, data = podatki)

## 
##   Welch's Heteroscedastic F Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : Prebivalstvo and Uvrstitev 
## 
##   statistic  : 5.373341 
##   num df     : 2 
##   denom df   : 36.81618 
##   p.value    : 0.008963587 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------

# Kruskal-Wallis rank sum test: Prebivalstvo by cluster.
kruskal.test(formula = Prebivalstvo ~ Uvrstitev, data = podatki)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Prebivalstvo by Uvrstitev
## Kruskal-Wallis chi-squared = 14.994, df = 2, p-value = 0.0005548

# Effect size (eta squared based on the H statistic) for the Kruskal-Wallis
# test of Prebivalstvo by cluster.
kruskal_effsize(podatki, Prebivalstvo ~ Uvrstitev)

## # A tibble: 1 × 5
##   .y.              n effsize method  magnitude
## * <chr>        <int>   <dbl> <chr>   <ord>    
## 1 Prebivalstvo    94   0.143 eta2[H] large

# Pearson chi-squared test of association between European region and cluster.
hi_kvadrat <- chisq.test(
  x = podatki$Evropa,
  y = as.factor(podatki$Uvrstitev)
)

## Warning in chisq.test(podatki$Evropa, as.factor(podatki$Uvrstitev)): Chi-squared approximation may be incorrect

# Show the test result.
print(hi_kvadrat)

## 
##  Pearson's Chi-squared test
## 
## data:  podatki$Evropa and as.factor(podatki$Uvrstitev)
## X-squared = 35.78, df = 8, p-value = 1.926e-05

# Observed frequencies with row and column totals.
addmargins(hi_kvadrat[["observed"]])

##               as.factor(podatki$Uvrstitev)
## podatki$Evropa  1  2  3 Sum
##      Sever      2  9  5  16
##      Zahod      1 18 26  45
##      Jug        4  4 10  18
##      Jugovzhod  2  0  0   2
##      Srednja    6  7  0  13
##      Sum       15 38 41  94

# Expected frequencies, rounded to two decimals, with row and column totals.
addmargins(round(hi_kvadrat[["expected"]], digits = 2))

##               as.factor(podatki$Uvrstitev)
## podatki$Evropa     1     2     3 Sum
##      Sever      2.55  6.47  6.98  16
##      Zahod      7.18 18.19 19.63  45
##      Jug        2.87  7.28  7.85  18
##      Jugovzhod  0.32  0.81  0.87   2
##      Srednja    2.07  5.26  5.67  13
##      Sum       14.99 38.01 41.00  94

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals. The component name is spelled out: `$res` only worked through
# partial matching of `residuals`.
round(hi_kvadrat$residuals, 2)

##               as.factor(podatki$Uvrstitev)
## podatki$Evropa     1     2     3
##      Sever     -0.35  1.00 -0.75
##      Zahod     -2.31 -0.04  1.44
##      Jug        0.67 -1.21  0.77
##      Jugovzhod  2.98 -0.90 -0.93
##      Srednja    2.73  0.76 -2.38

library(effectsize)
# Cramer's V as the effect size for the region-by-cluster association.
effectsize::cramers_v(
  x = podatki$Evropa,
  y = as.factor(podatki$Uvrstitev)
)

## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.39              | [0.16, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].

# k-means with ten clusters; this overwrites the earlier Razvrstitev object.
# The starting centres are random, so fix the RNG seed to make the partition
# and the cluster numbering reproducible between runs.
# NOTE: cluster numbers may differ from output rendered by earlier unseeded runs.
set.seed(1)
Razvrstitev <- kmeans(podatki_clu_std,
                      centers = 10,  # number of clusters
                      nstart = 25)   # number of random sets of starting centres

Razvrstitev

## K-means clustering with 10 clusters of sizes 14, 16, 9, 7, 3, 15, 6, 4, 4, 16
## 
## Cluster means:
##        Stroski Povezljivost    Varnost     Zdravje Izobrazevanje    Strpnost
## 1   0.14022878   0.75890296 -0.4836462  0.28174414   -0.28429204 -0.25769138
## 2   0.31865686  -0.04144810  0.7085389  0.09890407   -0.29214389  0.88363226
## 3   1.32408031  -0.36874782  0.5432295 -2.11757230   -1.56232473  0.02414012
## 4  -2.41380309   0.16166262  0.4016086 -0.33531001    0.61445974  0.76334932
## 5   0.26500962  -1.88742041 -1.4784368 -0.67381291   -0.89695790 -0.87760007
## 6  -0.35406528   1.13454197  1.0177881  0.46484541    0.77267450  0.82090382
## 7  -0.60742387  -0.17089659 -1.1275803 -0.06035544    1.95505745 -0.82583084
## 8  -0.01452385   0.02768203 -2.1127260  0.92124906   -0.49459812 -1.57738924
## 9   1.63775969  -0.53859436  0.4376273 -1.40234353   -0.23562957 -2.55724715
## 10 -0.02589148  -1.00382621 -0.6019711  0.82585661    0.04408859 -0.26739811
## 
## Clustering vector:
##       Aarhus    Amsterdam       Athens    Barcelona      Belfast       Bergen       Berlin         Bern       Bilbao 
##            6            6            5            1           10            4            7            4            2 
##   Birmingham      Bologna     Bordeaux   Bratislava     Brighton      Bristol         Brno     Brussels    Bucharest 
##           10           10           10            3            2           10            2            7            3 
##     Budapest    Cambridge      Cardiff  Cluj-Napoca      Cologne   Copenhagen         Cork      Dresden       Dublin 
##            3            6            2            3           10            6            2           10            7 
##   Dusseldorf    Edinburgh    Eindhoven     Florence    Frankfurt       Galway       Gdansk       Geneva      Glasgow 
##            2            6            6            1            1            2            9            4           10 
##   Gothenburg     Grenoble      Hamburg     Hannover     Helsinki    Innsbruck    Karlsruhe       Krakow     Lausanne 
##            1            8            1            1            6            6           10            9            4 
##        Leeds      Leipzig       Lisbon    Liverpool    Ljubljana       London   Luxembourg         Lyon       Madrid 
##           10            1            2           10            2            7            2           10           10 
##       Malaga        Malmo   Manchester    Marseille        Milan       Munich       Nantes       Naples         Nice 
##            1            8           10            8           10            6            1            5            1 
##      Nicosia         Oslo       Oxford        Paris        Porto       Prague    Reykjavik         Riga         Rome 
##            3            4            6            7            2            2            4            3            5 
##    Rotterdam      Seville        Sofia    Stockholm    Stuttgart      Tallinn      Tampere        Tartu    The Hague 
##            2            2            3            7            1            2            6            6            2 
## Thessaloniki     Toulouse        Turin        Turku      Uppsala      Utrecht     Valencia       Vienna      Vilnius 
##            1           10            8            6            1            6            1            6            3 
##       Warsaw      Wroclaw       Zagreb       Zurich 
##            9            9            3            4 
## 
## Within cluster sum of squares by cluster:
##  [1] 18.946645 28.542440 22.241141 15.951866  2.766059 25.309381 13.495058  3.542595  3.359472 21.022889
##  (between_SS / total_SS =  72.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
## [8] "iter"         "ifault"

# Use the city names as row names of the standardised data passed to the plot.
row.names(podatki_clu_std) <- podatki$Mesto

library(factoextra)
# Cluster plot of the current k-means solution.
fviz_cluster(
  object = Razvrstitev,
  data = podatki_clu_std,
  repel = TRUE,
  palette = "kelly",
  ggtheme = theme_bw()
)

# Profile plot of the cluster centres of the current k-means solution:
# one line per cluster across the clustering variables.
Povprecja <- Razvrstitev$centers

Slika <- as.data.frame(Povprecja)
Slika$id <- seq_len(nrow(Slika))  # cluster number; safe for zero rows

library(tidyr)
# Long format: one row per (cluster, variable); every column except id is a
# clustering variable.
Slika <- pivot_longer(Slika, cols = -id)

# Cluster as a factor; levels are taken from the data instead of being
# hard-coded, so the code works for any number of clusters.
Slika$Skupina <- factor(Slika$id)

# Keep the variables on the x axis in the column order of the centres matrix.
Slika$ImeF <- factor(Slika$name, levels = colnames(Povprecja))


library(ggplot2)
ggplot(Slika, aes(x = ImeF, y = value)) +
  geom_hline(yintercept = 0) +
  # One distinct point shape per cluster.
  scale_shape_manual(values = seq_len(nlevels(Slika$Skupina))) +
  theme_bw() +
  geom_point(aes(shape = Skupina, col = Skupina), size = 3) +
  geom_line(aes(group = id, col = Skupina), linewidth = 1) +
  ylab("Povprečje") +
  xlab("Razvrstitvene spremenljivke") +
  ylim(-3.2, 3.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))