library(readxl)
# Import the player data from the Excel workbook in the working directory.
# read_excel() returns a tibble (see the "# A tibble" header in the preview).
mydata <- read_excel(path = "./Football.xlsx")
# Preview the first six rows to check how the columns were read.
head(mydata, n = 6L)
## # A tibble: 6 × 10
## Rank Name Position Age Value Club Games_played Goals Assists Card_yellow
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Kylia… 4 22 144 Pari… 16 7 11 3
## 2 2 Erlin… 4 21 135 Boru… 10 13 4 1
## 3 3 Harry… 4 28 108 Tott… 16 7 2 2
## 4 4 Jack … 1 26 90 Manc… 15 2 3 1
## 5 5 Moham… 2 29 90 Live… 15 15 6 1
## 6 6 Romel… 4 28 90 Chel… 11 4 1 0
# Convert the tibble to a plain base-R data frame and preview it again;
# the printout below shows the base data.frame layout.
mydata <- as.data.frame(x = mydata)
head(mydata, n = 6L)
## Rank Name Position Age Value Club Games_played Goals
## 1 1 Kylian Mbappe 4 22 144 Paris Saint-Germain 16 7
## 2 2 Erling Haaland 4 21 135 Borussia Dortmund 10 13
## 3 3 Harry Kane 4 28 108 Tottenham Hotspur 16 7
## 4 4 Jack Grealish 1 26 90 Manchester City 15 2
## 5 5 Mohamed Salah 2 29 90 Liverpool FC 15 15
## 6 6 Romelu Lukaku 4 28 90 Chelsea FC 11 4
## Assists Card_yellow
## 1 11 3
## 2 4 1
## 3 2 2
## 4 3 1
## 5 6 1
## 6 1 0
## Display the frequency distribution of the players' positions.
## First convert Position to a factor, mapping the numeric codes (1, 2, 3, 4, ...) to position names (for example defender, striker, midfielder).
# Recode the numeric position codes 1-5 as a labelled factor. Codes and labels
# pair up by order: 1 = Midfielder, 2 = Winger, 3 = Defender, 4 = Striker,
# 5 = Goalkeeper.
mydata[["Position"]] <- factor(
  x = mydata[["Position"]],
  levels = 1:5,
  labels = c("Midfielder", "Winger", "Defender", "Striker", "Goalkeeper")
)
library(ggplot2)
# Bar chart of the number of players per position; each bar is labelled with
# its count (computed by the "count" stat).
ggplot(data = mydata, mapping = aes(x = Position)) +
  geom_bar(fill = "lightgray") +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    colour = "black",
    vjust = 1.5
  ) +
  labs(y = "Frequency") +
  theme_dark()
# The same frequencies as a plain table.
table(mydata[["Position"]])
##
## Midfielder Winger Defender Striker Goalkeeper
## 48 9 23 14 6
### Draw a scatterplot of the number of games played against the number of goals scored, using ggplot2
library(ggplot2)
# Goals against games played: one point per row of the data, overlaid with a
# straight-line (lm) fit and its standard-error band.
ggplot(data = mydata, mapping = aes(x = Games_played, y = Goals)) +
  # Scatterplot points
  geom_point(colour = "blue", size = 2, alpha = 0.8) +
  # Linear regression line
  geom_smooth(method = "lm", formula = y ~ x, colour = "red", se = TRUE) +
  ggtitle("Scatterplot with Regression Line") +
  xlab("Number of Games Played") +
  ylab("Number of Goals") +
  theme_minimal()
##Estimate the avg number of yellow cards for defenders.
# Mean number of yellow cards among defenders.
mean(mydata$Card_yellow[mydata$Position == "Defender"])
## [1] 1.913043
# The same statistic for strikers, for comparison.
mean(mydata$Card_yellow[mydata$Position == "Striker"])
## [1] 1.642857
###FROM CHATGPT
# Welch two-sample t-test comparing the mean number of yellow cards of
# defenders and strikers (two-sided). `subset` keeps only those two positions,
# so the grouping variable has exactly two groups. The columns are referenced
# as mydata$... inside the formula, which is why the printed "data:" line reads
# "mydata$Card_yellow by mydata$Position".
t.test(mydata$Card_yellow ~ mydata$Position, subset = mydata$Position %in% c("Defender", "Striker"))
##
## Welch Two Sample t-test
##
## data: mydata$Card_yellow by mydata$Position
## t = 0.63088, df = 26.949, p-value = 0.5334
## alternative hypothesis: true difference in means between group Defender and group Striker is not equal to 0
## 95 percent confidence interval:
## -0.6086206 1.1489933
## sample estimates:
## mean in group Defender mean in group Striker
## 1.913043 1.642857
###FROM PROFESSOR
# Benchmark value: the defenders' mean number of yellow cards.
mean(mydata[mydata$Position == "Defender" , ]$Card_yellow)
## [1] 1.913043
## One-sample t-test
# One-sided test on the strikers' yellow cards:
# H0: mean = 1.913 against H1: mean < 1.913 (alternative = "less").
# NOTE(review): mu is the defenders' mean printed above, rounded to three
# decimals and typed in by hand; the test treats it as a fixed constant.
t.test(mydata[mydata$Position == "Striker", ]$Card_yellow,
mu = 1.913,
alternative = "less")
##
## One Sample t-test
##
## data: mydata[mydata$Position == "Striker", ]$Card_yellow
## t = -0.79125, df = 13, p-value = 0.2215
## alternative hypothesis: true mean is less than 1.913
## 95 percent confidence interval:
## -Inf 2.247475
## sample estimates:
## mean of x
## 1.642857
## H0: the arithmetic mean = 1.913
## H1: the arithmetic mean < 1.913
## We cannot reject H0: the p-value (0.2215) is above 5%, and it must be 5% or less to reject H0.
## Not enough strikers: we have only 14.