data21 <- read.table("./players_21.csv", fill=TRUE, header=TRUE, sep=",", dec=";")
mydata <- data21[,c(3,5,9,13,14,15,18,34,35,36,37,38,39)]
head(mydata)
## short_name age nationality overall potential value_eur preferred_foot
## 1 L. Messi 33 Argentina 93 93 67500000 Left
## 2 Cristiano Ronaldo 35 Portugal 92 92 46000000 Right
## 3 J. Oblak 27 Slovenia 91 93 75000000 Right
## 4 R. Lewandowski 31 Poland 91 91 80000000 Right
## 5 Neymar Jr 28 Brazil 91 91 90000000 Right
## 6 K. De Bruyne 29 Belgium 91 91 87000000 Right
## pace shooting passing dribbling defending physic
## 1 85 92 91 95 38 65
## 2 89 93 81 89 35 77
## 3
## 4 78 91 78 85 43 82
## 5 91 85 86 94 36 59
## 6 76 86 93 88 64 78
mydata[mydata == ''] <- NA
#Dropping rows containing missing values
mydata <- mydata %>% drop_na()
#Convert character objects to integer objects / factors
mydata$short_name <- factor(mydata$short_name)
mydata$age <- as.integer(mydata$age)
mydata$overall <- as.integer(mydata$overall)
mydata$nationality <- factor(mydata$nationality)
mydata$potential <- as.integer(mydata$potential)
mydata$value_eur <- as.integer(mydata$value_eur)
mydata$preferred_foot <- factor(mydata$preferred_foot)
mydata$pace <- as.integer(mydata$pace)
mydata$shooting <- as.integer(mydata$shooting)
mydata$passing <- as.integer(mydata$passing)
mydata$dribbling <- as.integer(mydata$dribbling)
mydata$defending <- as.integer(mydata$defending)
mydata$physic <- as.integer(mydata$physic)
colnames(mydata) <- c("Name", "Age", "Nationality", "Overall", "Potential", "Value", "PreferredFoot", "Pace", "Shooting", "Passing", "Dribbling", "Defending", "Physic")
mydata$Value <- mydata$Value/1000000
# Selecting only player from Portugal (smaller sample size)
mydata <- subset(mydata, Nationality == "Portugal")
head(mydata)
## Name Age Nationality Overall Potential Value PreferredFoot
## 2 Cristiano Ronaldo 35 Portugal 92 92 46.0 Right
## 32 André Almeida 29 Portugal 80 80 12.0 Right
## 75 André Silva 24 Portugal 79 84 17.0 Right
## 110 André Gomes 26 Portugal 79 80 14.5 Right
## 111 Ricardo Horta 25 Portugal 79 80 15.0 Right
## 112 Paulinho 27 Portugal 79 80 14.5 Left
## Pace Shooting Passing Dribbling Defending Physic
## 2 89 93 81 89 35 77
## 32 66 63 76 75 80 81
## 75 71 79 64 79 43 75
## 110 58 69 78 77 70 76
## 111 78 77 76 82 42 58
## 112 67 78 64 74 45 81
Name: Name of a player
Age: Age of a player in years
Nationality: Player’s nationality
Overall: Overall rating (1-100)
Potential: Potential rating (1-100)
Value: Value of a player in million EUR
PreferredFoot: The natural preference of player’s left or right foot
Pace: Pace rating (1-100)
Shooting: Shooting rating (1-100)
Passing: Passing rating (1-100)
Dribbling: Dribbling rating (1-100)
Defending: Defending rating (1-100)
Physic: Physic rating (1-100)
Research Hypothesis Players in all age groups (A, B, C) have the same mean value of overall rating.
H0: µA = µB = µC
H1: One of the arithmetic means is different
# Looking for potential outliers
head(select(mydata[order(-mydata$Age),], Name, Age))
## Name Age
## 1254 Ricardo Costa 39
## 174 José Fonte 36
## 1110 João Pereira 36
## 1682 Tarantini 36
## 2 Cristiano Ronaldo 35
## 1285 Varela 35
head(select(mydata[order(mydata$Age),], Name, Age))
## Name Age
## 2490 Fábio Silva 17
## 5764 Pedro Brazão 17
## 6375 Matchoi Djaló 17
## 4289 Tiago Tomás 18
## 5778 Famana Quizera 18
## 6282 Tiago Almeida 18
# Adding categorical variable to the data frame based on age
mydata$AgeGroup <- as.factor(ifelse(mydata$Age<25, 'A',
ifelse(mydata$Age<30, 'B',
ifelse(mydata$Age<=40, 'C', 'D'))))
describeBy(x=mydata$Overall, group=mydata$AgeGroup)
##
## Descriptive statistics by group
## group: A
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 59 67.83 5.43 67 67.76 7.41 55 79 24 0.08 -0.84 0.71
## ------------------------------------------------------------
## group: B
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 58 71.41 4.81 72 71.5 5.93 59 80 21 -0.19 -0.77 0.63
## ------------------------------------------------------------
## group: C
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 46 71.63 4.57 72 71.26 2.97 63 92 29 1.8 6.45 0.67
Age Groups
H0: The units of observation (Overall Rating) are normally distributed.
H1: The units of observations (Overall Rating) are not normally distributed.
ggplot(mydata, aes(x = Overall))+
geom_histogram(binwidth = 3, fill = "darkgreen", colour = "black")+
xlab("Overall Rating")+
ylab("Frequency") +
facet_wrap(~AgeGroup, ncol = 1)
mydata %>%
group_by(AgeGroup) %>%
shapiro_test(Overall)
## # A tibble: 3 × 4
## AgeGroup variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 A Overall 0.961 0.0552
## 2 B Overall 0.970 0.155
## 3 C Overall 0.858 0.0000485
We reject H0 (p < 5%). Which means the units of observations (Overall Rating) are not normally distributed, within one group. Thhe normality assumtion is violated.
leveneTest(mydata$Overall, group=mydata$AgeGroup)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.5698 0.07971 .
## 160
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The homogeneity assumption of the variance is met, there is no significant difference between the variances (p > 0.05).
We cannot use parametric test, so we use non-parametric Kruskal-Wallis Rank Sum test.
H0: All distribution locations of points are the same.
H1: At least one points distribution location is different.
kruskal.test(Overall ~ AgeGroup,
data = mydata)
##
## Kruskal-Wallis rank sum test
##
## data: Overall by AgeGroup
## Kruskal-Wallis chi-squared = 16.856, df = 2, p-value = 0.0002187
mydata %>%
kruskal_effsize(Overall ~ AgeGroup)
## # A tibble: 1 × 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Overall 163 0.0928 eta2[H] moderate
Kruskal effect size is 0.093 (moderate effect).
library(rstatix)
groups_nonpar <- wilcox_test(Overall ~ AgeGroup,
paired = FALSE,
p.adjust.method = "bonferroni",
data = mydata)
groups_nonpar
## # A tibble: 3 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Overall A B 59 58 1072 0.000483 0.001 **
## 2 Overall A C 59 46 810. 0.000402 0.001 **
## 3 Overall B C 58 46 1353 0.903 1 ns
CONCLUSION
We found that the distribution of overall ratings differs for at least one of the age groups (𝜒2 = 16.9, p < 0.001), the effect size was moderate (𝜂2 = 0.093). Post-Hoc tests revealed differences for two pairs of groups A-B and A-C (p < 0.01), differences in pair of groups B-C is not significant (p = 0.903).
Research Hypothesis Players with different preferred foot have the same mean value of shooting rating.
H0: µR = µL
H1: µR ≠ µL
describeBy(x=mydata$Value, group=mydata$PreferredFoot)
##
## Descriptive statistics by group
## group: Left
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 42 3.05 3.16 1.4 2.5 1.24 0.45 14.5 14.05 1.57 2.28 0.49
## ------------------------------------------------------------
## group: Right
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 121 3.8 5.5 1.5 2.8 1.48 0.16 46 45.84 4.15 26.51 0.5
H0: The units of observation (Shooting Rating) are normally distributed.
H1: The units of observations (Shooting Rating) are not normally distributed.
ggplot(mydata, aes(x = Shooting))+
geom_histogram(binwidth = 3, fill = "darkred", colour = "black")+
xlab("Shooting Rating")+
ylab("Frequency") +
facet_wrap(~PreferredFoot, ncol = 1)
mydata %>%
group_by(PreferredFoot) %>%
shapiro_test(Shooting)
## # A tibble: 2 × 4
## PreferredFoot variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Left Shooting 0.931 0.0136
## 2 Right Shooting 0.952 0.000267
We reject H0 (p < 5%). Which means the units of observations (Overall Rating) are not normally distributed within each group.
We cannot use parametric test, because assumption of normal distributed is not met, so we use non-parametric Wilcoxon Rank Sum test.
H0: Distribution locations are the same
H1: Distribution locations are NOT the same
library(stats)
wilcox.test(mydata$Shooting ~ mydata$PreferredFoot,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon rank sum test
##
## data: mydata$Shooting by mydata$PreferredFoot
## W = 2392, p-value = 0.5716
## alternative hypothesis: true location shift is not equal to 0
We cannot reject H0 (p = 0.572).
effectsize(wilcox.test(mydata$Shooting ~ mydata$PreferredFoot,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided"))
## r (rank biserial) | 95% CI
## ---------------------------------
## -0.06 | [-0.26, 0.14]
interpret_rank_biserial(-4.18e-04)
## [1] "tiny"
## (Rules: funder2019)
Effect size is tiny (Funder, 2019).
CONCLUSION
Based on the sample data, we find that players who prefer different foot, differ in the shooting rating (p > 0.05), the difference in distribution is tiny.
data15 <- read.table("./players_15.csv", fill=TRUE, header=TRUE, sep=",", dec=";", quote = "")
data15$nationality <- factor(data15$nationality)
data15$potential <- as.integer(data15$potential)
data15 <- subset(data15, nationality == "Portugal")
mean15 <- mean(data15$potential)
median15 <- round(median(data15$potential),0)
mean21 <- mean(mydata$Potential)
median21 <- round(median(mydata$Potential),0)
H0: μ21 = μ15
H0: μ21 ≠ μ15
ggplot(mydata, aes(x = Potential))+
geom_histogram(binwidth = 3, fill = "darkblue", colour = "black")+
xlab("Potential")+
ylab("Frequency")
shapiro_test(mydata$Potential)
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 mydata$Potential 0.986 0.115
We cannot reject null hypothesis at p = 0.115. There is normal distribution.
identify_outliers(data = mydata, Potential)
## Name Age Nationality Overall Potential Value PreferredFoot Pace
## 1 Cristiano Ronaldo 35 Portugal 92 92 46.00 Right 89
## 2 Fábio Fortes 28 Portugal 59 59 0.16 Right 55
## Shooting Passing Dribbling Defending Physic AgeGroup is.outlier is.extreme
## 1 93 81 89 35 77 C TRUE FALSE
## 2 58 45 59 25 57 B TRUE FALSE
There are some outliers.
We cannot use parametric test, because assumption of no outliers is not met, so we use non-parametric Wilcoxon Signed Rank test.
H0: Me21 = Me15
H0: Me21 ≠ Me15
H0: True location is equal to median.
H1: True location is not equal to median.
wilcox.test(mydata$Potential,
mu=median15,
correct=FALSE)
##
## Wilcoxon signed rank test
##
## data: mydata$Potential
## V = 9791, p-value = 2.04e-11
## alternative hypothesis: true location is not equal to 71
We can reject null hypothesis (p < 0.001).
effectsize(wilcox.test(mydata$Potential,
mu=median15,
correct=FALSE))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.62 | [0.50, 0.72]
##
## - Deviation from a difference of 71.
interpret_rank_biserial(0.62, rules = "funder2019")
## [1] "very large"
## (Rules: funder2019)
CONCLUSION
Based on the sample data, we found that the median potential in Fifa21 was 74 and has increased compared to Fifa15 (p < 0.001, r = 0.62 – very large effect).