mydata <- read.table("./2016.csv", header=TRUE, sep=",", dec=".")  # read the 2016 World Happiness Report data
head(mydata)
## Country Region Happiness.Rank Happiness.Score
## 1 Denmark Western Europe 1 7.526
## 2 Switzerland Western Europe 2 7.509
## 3 Iceland Western Europe 3 7.501
## 4 Norway Western Europe 4 7.498
## 5 Finland Western Europe 5 7.413
## 6 Canada North America 6 7.404
## Lower.Confidence.Interval Upper.Confidence.Interval Economy..GDP.per.Capita.
## 1 7.460 7.592 1.44178
## 2 7.428 7.590 1.52733
## 3 7.333 7.669 1.42666
## 4 7.421 7.575 1.57744
## 5 7.351 7.475 1.40598
## 6 7.335 7.473 1.44015
## Family Health..Life.Expectancy. Freedom Trust..Government.Corruption.
## 1 1.16374 0.79504 0.57941 0.44453
## 2 1.14524 0.86303 0.58557 0.41203
## 3 1.18326 0.86733 0.56624 0.14975
## 4 1.12690 0.79579 0.59609 0.35776
## 5 1.13464 0.81091 0.57104 0.41004
## 6 1.09610 0.82760 0.57370 0.31329
## Generosity Dystopia.Residual
## 1 0.36171 2.73939
## 2 0.28083 2.69463
## 3 0.47678 2.83137
## 4 0.37895 2.66465
## 5 0.25492 2.82596
## 6 0.44834 2.70485
The units of observation are countries.
The sample size is 157.
Variables:
1. Country
2. Region
3. Happiness score
4. GDP per capita index
5. Family index
6. Life expectancy index
7. Freedom index
8. Trust in government index
9. Generosity index
The source is the Sustainable Development Solutions Network (via Kaggle).
mydata <- mydata[, -c(3,5,6,13)]  # drop Happiness.Rank, both confidence intervals, and Dystopia.Residual
head(mydata)
## Country Region Happiness.Score Economy..GDP.per.Capita. Family
## 1 Denmark Western Europe 7.526 1.44178 1.16374
## 2 Switzerland Western Europe 7.509 1.52733 1.14524
## 3 Iceland Western Europe 7.501 1.42666 1.18326
## 4 Norway Western Europe 7.498 1.57744 1.12690
## 5 Finland Western Europe 7.413 1.40598 1.13464
## 6 Canada North America 7.404 1.44015 1.09610
## Health..Life.Expectancy. Freedom Trust..Government.Corruption. Generosity
## 1 0.79504 0.57941 0.44453 0.36171
## 2 0.86303 0.58557 0.41203 0.28083
## 3 0.86733 0.56624 0.14975 0.47678
## 4 0.79579 0.59609 0.35776 0.37895
## 5 0.81091 0.57104 0.41004 0.25492
## 6 0.82760 0.57370 0.31329 0.44834
summary(mydata[, -c(1,2)])  # summary statistics of the numeric variables
## Happiness.Score Economy..GDP.per.Capita. Family
## Min. :2.905 Min. :0.0000 Min. :0.0000
## 1st Qu.:4.404 1st Qu.:0.6702 1st Qu.:0.6418
## Median :5.314 Median :1.0278 Median :0.8414
## Mean :5.382 Mean :0.9539 Mean :0.7936
## 3rd Qu.:6.269 3rd Qu.:1.2796 3rd Qu.:1.0215
## Max. :7.526 Max. :1.8243 Max. :1.1833
## Health..Life.Expectancy. Freedom Trust..Government.Corruption.
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.3829 1st Qu.:0.2575 1st Qu.:0.06126
## Median :0.5966 Median :0.3975 Median :0.10547
## Mean :0.5576 Mean :0.3710 Mean :0.13762
## 3rd Qu.:0.7299 3rd Qu.:0.4845 3rd Qu.:0.17554
## Max. :0.9528 Max. :0.6085 Max. :0.50521
## Generosity
## Min. :0.0000
## 1st Qu.:0.1546
## Median :0.2225
## Mean :0.2426
## 3rd Qu.:0.3119
## Max. :0.8197
The highest happiness score is 7.526. The average family index is 0.7936. Half of the countries have a generosity index of at most 0.2225 (the median).
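summary() does not report dispersion; standard deviations can be added with a one-liner (a small sketch, not part of the original script):
sapply(mydata[, -c(1, 2)], sd)  # standard deviation of each numeric variable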
2. Research Question: Can we form groups of countries based on their shared characteristics?
mydata_clu_std <- as.data.frame(scale(mydata[,-c(1,2,9)]))  # standardize the six clustering variables (drop Country, Region, Generosity)
I standardized the clustering variables; Generosity is excluded here and used later to validate the clusters.
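scale() centers each variable to mean 0 and rescales it to standard deviation 1; a quick sanity check (a sketch, not part of the original script):
round(colMeans(mydata_clu_std), 10)  # means are ~0 after scaling
sapply(mydata_clu_std, sd)           # standard deviations are 1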
# Euclidean distance of each standardized observation from the origin,
# i.e. from the "average country"
mydata$Dissimilarity <- sqrt(mydata_clu_std$Happiness.Score^2 + mydata_clu_std$Economy..GDP.per.Capita.^2 +
                             mydata_clu_std$Family^2 + mydata_clu_std$Health..Life.Expectancy.^2 +
                             mydata_clu_std$Freedom^2 + mydata_clu_std$Trust..Government.Corruption.^2)
head(mydata[order(-mydata$Dissimilarity), c("Country", "Dissimilarity")],15)
## Country Dissimilarity
## 157 Burundi 4.688593
## 152 Rwanda 4.343059
## 1 Denmark 4.196791
## 2 Switzerland 4.146734
## 36 Qatar 4.140066
## 154 Afghanistan 4.099026
## 22 Singapore 4.096498
## 155 Togo 4.088547
## 76 Somalia 4.038521
## 8 New Zealand 3.981442
## 5 Finland 3.889540
## 10 Sweden 3.867203
## 156 Syria 3.834757
## 4 Norway 3.831729
## 153 Benin 3.743631
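Since mydata_clu_std holds exactly the six standardized variables, the same measure can be computed more compactly; a sketch equivalent to the line above, not part of the original script:
mydata$Dissimilarity <- sqrt(rowSums(mydata_clu_std^2))  # distance from the overall mean profile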
mydata <- mydata[!(mydata$Country %in% c("Burundi", "Rwanda" )), ]
I removed Burundi and Rwanda because of potentially being outliers.
head(mydata[order(-mydata$Dissimilarity), c("Country", "Dissimilarity")])
## Country Dissimilarity
## 1 Denmark 4.196791
## 2 Switzerland 4.146734
## 36 Qatar 4.140066
## 154 Afghanistan 4.099026
## 22 Singapore 4.096498
## 155 Togo 4.088547
mydata_clu_std <- as.data.frame(scale(mydata[,-c(1,2,9)]))  # re-standardize after removing the outliers; note that the Dissimilarity column is not dropped here, so it also enters the clustering
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Distances <- get_dist(mydata_clu_std,
                      method = "euclidean")  # pairwise Euclidean distances
fviz_dist(Distances,
gradient = list(low = "blue",
mid = "grey95",
high = "white"))
At first sight this looks promising; roughly three clusters seem to be forming.
library(factoextra)
get_clust_tendency(mydata_clu_std,
n = nrow(mydata_clu_std) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.7166483
##
## $plot
## NULL
The Hopkins statistic is 0.72, well above 0.5, so the data show good clustering tendency.
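As a hedged baseline (not part of the original analysis), the Hopkins statistic on uniformly random data with the same ranges should come out near 0.5:
set.seed(1)  # arbitrary seed for reproducibility
random_data <- as.data.frame(apply(mydata_clu_std, 2,
                                   function(x) runif(length(x), min(x), max(x))))
get_clust_tendency(random_data, n = nrow(random_data) - 1, graph = FALSE)$hopkins_stat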
library(factoextra)
library(NbClust)
fviz_nbclust(mydata_clu_std, kmeans, method = "wss") +
labs(subtitle = "Elbow method")
fviz_nbclust(mydata_clu_std, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette analysis")
Based on the elbow and silhouette plots, I will use 3 clusters.
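As an additional check not in the original analysis, the gap statistic can be computed the same way; a minimal sketch:
fviz_nbclust(mydata_clu_std, kmeans, method = "gap_stat", nboot = 50) +
  labs(subtitle = "Gap statistic")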
library(NbClust)
NbClust(mydata_clu_std,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 1 proposed 2 as the best number of clusters
## * 14 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 2 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 1.1481 90.3061 80.8076 -1.3142 217.6133 2.512292e+13 8460.340 677.8869
## 3 5.4655 108.6895 24.6918 3.4124 468.0472 1.123471e+13 4518.234 443.5985
## 4 1.0910 91.8528 21.0962 3.8492 578.8829 9.769873e+12 3002.190 381.6079
## 5 1.0309 83.2333 19.4113 4.5615 700.4201 6.969053e+12 2682.260 334.8289
## 6 2.7640 78.5585 10.8786 5.4219 824.5589 4.505176e+12 2137.485 296.4639
## 7 6.4855 71.5746 6.3930 5.1814 892.3880 3.958723e+12 1877.039 276.2917
## 8 0.1601 64.4744 9.0721 4.4716 942.3991 3.744664e+12 1727.594 264.8512
## 9 2.6351 60.6156 5.9951 4.8175 979.0838 3.740512e+12 1511.785 249.4560
## 10 0.9367 56.3704 5.6416 4.4477 1026.8327 3.393583e+12 1402.957 239.6168
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale Ratkowsky
## 2 9.8948 1.5902 0.4243 1.2731 0.3261 0.6883 49.8238 2.0456 0.4002
## 3 15.0856 2.4301 0.4038 1.0547 0.3569 0.6104 37.6505 2.8773 0.4400
## 4 17.0455 2.8249 0.3738 1.2720 0.2833 1.8540 -28.5581 -2.0330 0.4014
## 5 19.8155 3.2196 0.3604 1.2328 0.2730 0.8227 9.0535 0.9596 0.3706
## 6 22.7456 3.6362 0.3569 1.4055 0.2573 1.2567 -8.7843 -0.9027 0.3471
## 7 24.5111 3.9017 0.3491 1.3922 0.2389 2.3037 -27.1636 -2.4948 0.3257
## 8 26.2633 4.0702 0.3108 1.3410 0.2216 0.9844 0.3968 0.0695 0.3068
## 9 26.6916 4.3214 0.3305 1.4137 0.2180 1.4056 -7.5022 -1.2351 0.2921
## 10 28.1269 4.4988 0.3257 1.3618 0.2196 2.1151 -16.8706 -2.2467 0.2788
## Ball Ptbiserial Frey McClain Dunn Hubert SDindex Dindex SDbw
## 2 338.9435 0.5002 -0.0028 0.6118 0.1385 0.0017 1.6620 1.9736 0.6941
## 3 147.8662 0.6536 1.8181 0.8537 0.1608 0.0024 1.3547 1.6120 0.4784
## 4 95.4020 0.5498 1.5439 1.5361 0.1041 0.0024 1.7416 1.4776 0.3962
## 5 66.9658 0.4956 0.2072 2.0766 0.1473 0.0026 1.7224 1.3753 0.3291
## 6 49.4107 0.4984 0.9916 2.4159 0.1618 0.0030 1.7659 1.3003 0.3222
## 7 39.4702 0.4752 1.1689 2.7504 0.0711 0.0031 1.7345 1.2476 0.2763
## 8 33.1064 0.4485 0.4113 3.1811 0.0647 0.0030 1.8136 1.2196 0.2693
## 9 27.7173 0.4346 0.7601 3.5264 0.0711 0.0032 1.8676 1.1875 0.2555
## 10 23.9617 0.4209 2.5856 3.8252 0.0876 0.0032 1.8785 1.1609 0.2392
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.7423 38.1937 0.0475
## 3 0.7303 21.7839 0.0059
## 4 0.6202 37.9712 1.0000
## 5 0.6579 21.8428 0.4611
## 6 0.6247 25.8300 1.0000
## 7 0.6154 29.9976 1.0000
## 8 0.5874 17.5622 0.9995
## 9 0.5190 24.1000 1.0000
## 10 0.5070 31.1219 1.0000
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 7.0000 3.0000 3.0000 6.0000 3.0000 3.000000e+00 3.000
## Value_Index 6.4855 108.6895 56.1158 5.4219 250.4339 1.242338e+13 3942.106
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 3.0000 3.0000 8.0000 3.0000 3.0000 4.000
## Value_Index 172.2978 5.1908 -0.4451 0.3108 1.0547 0.3569 1.854
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 4.0000 4.000 3.00 3.0000 3.0000 1 2.0000
## Value_Index -28.5581 -2.033 0.44 191.0773 0.6536 NA 0.6118
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 6.0000 0 3.0000 0 10.0000
## Value_Index 0.1618 0 1.3547 0 0.2392
##
## $Best.partition
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 3 3 1 1 1 1 3 3 3 1 3 1 3 1 3 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 3 1 1 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 2 2 1 1 2 1 1 1 2 2 1 1 2 2 1 1 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 2 1 2 2 2 2 2 1 2 2 2 2 1 2 2 1 2 2 2
## 141 142 143 144 145 146 147 148 149 150 151 153 154 155 156
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
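The index votes above can also be tallied programmatically if the NbClust result is stored; a small sketch (nbres is a name I introduce here, and the call must be re-run assigned):
nbres <- NbClust(mydata_clu_std,
                 distance = "euclidean",
                 min.nc = 2, max.nc = 10,
                 method = "kmeans",
                 index = "all")
table(nbres$Best.nc["Number_clusters", ])  # votes per proposed number of clusters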
Clustering <- kmeans(mydata_clu_std,
                     centers = 3,   # 3 clusters, as suggested above
                     nstart = 25)   # 25 random starts; the best solution is kept
Clustering
## K-means clustering with 3 clusters of sizes 31, 81, 43
##
## Cluster means:
## Happiness.Score Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1 1.2828063 1.1614866 0.9389192 1.0073633
## 2 0.1091515 0.2177787 0.2177600 0.2795357
## 3 -1.1304248 -1.2475851 -1.0870942 -1.2528060
## Freedom Trust..Government.Corruption. Dissimilarity
## 1 1.1080176 1.4550095 0.9223491
## 2 -0.1562082 -0.4921252 -0.7279853
## 3 -0.5045507 -0.1219339 0.7063719
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 1 1 2 2 2 2 1 1 1 2 1 2 1 2 1 2 2 2 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 3 3 2 2 3 2 2 2 3 3 2 2 3 3 2 2 3 3 3
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 3 2 3 3 3 3 3 2 3 3 3 3 2 3 3 2 3 3 3
## 141 142 143 144 145 146 147 148 149 150 151 153 154 155 156
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 75.06724 208.68279 159.84846
## (between_SS / total_SS = 58.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
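Because kmeans starts from random centers, individual runs can differ slightly; calling set.seed() beforehand (with any fixed value) makes the result reproducible. To list which countries ended up in each cluster, a minimal sketch:
split(mydata$Country, Clustering$cluster)  # country names grouped by cluster label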
library(factoextra)
fviz_cluster(Clustering,
palette = "Set1",
repel = FALSE,
ggtheme = theme_bw(),
data = mydata_clu_std)
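Before deciding which countries to drop, one can also quantify how far each country lies from its own cluster center; a hedged sketch using the objects above (dist_own is a name I introduce here):
centers_own <- Clustering$centers[Clustering$cluster, ]  # center of each country's cluster
dist_own <- sqrt(rowSums((as.matrix(mydata_clu_std) - centers_own)^2))
head(data.frame(Country = mydata$Country, dist_own)[order(-dist_own), ])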
mydata <- mydata[!(mydata$Country %in% c("Uzbekistan","Somalia","Turkmenistan")), ]
I removed 3 countries as they vary a lot comparing to their cluster.
mydata_clu_std <- as.data.frame(scale(mydata[,-c(1,2,9)]))
Clustering <- kmeans(mydata_clu_std,
centers = 3,
nstart = 25)
library(factoextra)
fviz_cluster(Clustering,
palette = "Set1",
repel = FALSE,
ggtheme = theme_bw(),
data = mydata_clu_std)
The clusters now look clean and well separated.
Clustering
## K-means clustering with 3 clusters of sizes 42, 80, 30
##
## Cluster means:
## Happiness.Score Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1 -1.1428543 -1.2510085 -1.0778561 -1.2593229
## 2 0.1117783 0.2015555 0.2148377 0.2726807
## 3 1.3019206 1.2139304 0.9360980 1.0359034
## Freedom Trust..Government.Corruption. Dissimilarity
## 1 -0.5320536 -0.1365420 0.6912625
## 2 -0.1391721 -0.4925998 -0.7214054
## 3 1.1160005 1.5047584 0.9559802
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 3 3 2 2 2 2 3 3 3 2 3 2 3 2 3 2 2 2 2
## 41 42 43 44 45 46 47 48 50 51 52 53 54 55 56 57 58 59 60 61
## 3 3 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2
## 62 63 64 66 67 68 69 70 71 72 73 74 75 77 78 79 80 81 82 83
## 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2
## 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
## 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 1
## 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
## 2 2 1 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 1 2
## 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
## 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 1 1 1
## 144 145 146 147 148 149 150 151 153 154 155 156
## 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 149.65679 205.88172 69.99393
## (between_SS / total_SS = 59.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Averages <- Clustering$centers
Averages #Average values of cluster variables to describe groups
## Happiness.Score Economy..GDP.per.Capita. Family Health..Life.Expectancy.
## 1 -1.1428543 -1.2510085 -1.0778561 -1.2593229
## 2 0.1117783 0.2015555 0.2148377 0.2726807
## 3 1.3019206 1.2139304 0.9360980 1.0359034
## Freedom Trust..Government.Corruption. Dissimilarity
## 1 -0.5320536 -0.1365420 0.6912625
## 2 -0.1391721 -0.4925998 -0.7214054
## 3 1.1160005 1.5047584 0.9559802
Figure <- as.data.frame(Averages)
Figure$Cluster <- 1:nrow(Figure)  # cluster number (row of the centers matrix)
library(tidyr)
Figure <- pivot_longer(Figure, cols = c("Happiness.Score", "Economy..GDP.per.Capita.", "Family", "Health..Life.Expectancy.", "Freedom", "Trust..Government.Corruption."))
Figure$Group <- factor(Figure$Cluster,
                       levels = c(1, 2, 3),
                       labels = c("1", "2", "3"))
Figure$NameF <- factor(Figure$name,
                       levels = c("Happiness.Score", "Economy..GDP.per.Capita.", "Family", "Health..Life.Expectancy.", "Freedom", "Trust..Government.Corruption."))  # fixes the variable order on the x-axis
library(ggplot2)
ggplot(Figure, aes(x = NameF, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Group, col = Group), size = 3) +
  geom_line(aes(group = Cluster), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables") +
  ylim(-2.2, 2.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))
Group 3 scores the highest on all variables on average. Group 2 is on average better than group 1 on everything except trust in government.
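The same comparison can be made in the original units by averaging the unstandardized variables per cluster; a minimal sketch (not part of the original script):
aggregate(mydata[, 3:8],  # the six clustering variables in raw units
          by = list(Cluster = Clustering$cluster),
          FUN = mean)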
mydata$Group <- Clustering$cluster  # attach cluster membership to the data
fit <- aov(cbind(Happiness.Score, Economy..GDP.per.Capita., Family, Health..Life.Expectancy., Freedom, Trust..Government.Corruption.) ~ as.factor(Group),
           data = mydata)  # one one-way ANOVA per response variable
summary(fit)
## Response Happiness.Score :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 136.520 68.26 179.48 < 2.2e-16 ***
## Residuals 149 56.669 0.38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Economy..GDP.per.Capita. :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 18.264 9.1320 223.02 < 2.2e-16 ***
## Residuals 149 6.101 0.0409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Family :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 5.3947 2.6973 81.257 < 2.2e-16 ***
## Residuals 149 4.9461 0.0332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Health..Life.Expectancy. :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 5.3866 2.69329 168.73 < 2.2e-16 ***
## Residuals 149 2.3784 0.01596
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Freedom :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 1.0329 0.51646 37.773 5.371e-14 ***
## Residuals 149 2.0372 0.01367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Trust..Government.Corruption. :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Group) 2 0.99742 0.49871 104.42 < 2.2e-16 ***
## Residuals 149 0.71165 0.00478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
All six clustering variables differ significantly across the clusters (p < 0.001).
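ANOVA only says that the group means differ somewhere; to see which pairs of clusters differ, a post-hoc test such as Tukey's HSD can be run per variable. A minimal sketch for the happiness score (not part of the original analysis):
TukeyHSD(aov(Happiness.Score ~ as.factor(Group), data = mydata))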
aggregate(mydata$Generosity,
by = list(mydata$Group),
FUN = mean)
## Group.1 x
## 1 1 0.2642038
## 2 2 0.1995018
## 3 3 0.3252460
library(car)
## Loading required package: carData
leveneTest(mydata$Generosity, as.factor(mydata$Group))
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.419 0.6585
## 149
I cannot reject the null hypothesis (p = 0.66), so the variances of Generosity are homogeneous across the groups.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
mydata %>%
group_by(as.factor(mydata$Group)) %>%
shapiro_test(Generosity)
## # A tibble: 3 × 4
## `as.factor(mydata$Group)` variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 1 Generosity 0.831 0.0000217
## 2 2 Generosity 0.943 0.00144
## 3 3 Generosity 0.968 0.481
Generosity is not normally distributed in clusters 1 and 2 (p < 0.05); normality is not rejected in cluster 3. I therefore use a non-parametric test.
kruskal.test(Generosity ~ as.factor(Group),
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Generosity by as.factor(Group)
## Kruskal-Wallis chi-squared = 22.568, df = 2, p-value = 1.257e-05
There is a significant difference in Generosity among the 3 clusters (p < 0.001).
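As with the ANOVA, a post-hoc test shows which cluster pairs differ; a hedged sketch using rstatix's Dunn test (the packages are already loaded above, and Holm adjustment is my choice here):
mydata %>%
  mutate(Group = as.factor(Group)) %>%
  dunn_test(Generosity ~ Group, p.adjust.method = "holm")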