library(readxl)

# Read the player table from the Excel workbook in the working directory.
mydata <- read_excel(path = "./Football.xlsx")
# Preview the first six rows of the imported table.
head(x = mydata, n = 6L)
## # A tibble: 6 × 10
##    Rank Name   Position   Age Value Club  Games_played Goals Assists Card_yellow
##   <dbl> <chr>     <dbl> <dbl> <dbl> <chr>        <dbl> <dbl>   <dbl>       <dbl>
## 1     1 Kylia…        4    22   144 Pari…           16     7      11           3
## 2     2 Erlin…        4    21   135 Boru…           10    13       4           1
## 3     3 Harry…        4    28   108 Tott…           16     7       2           2
## 4     4 Jack …        1    26    90 Manc…           15     2       3           1
## 5     5 Moham…        2    29    90 Live…           15    15       6           1
## 6     6 Romel…        4    28    90 Chel…           11     4       1           0
# Convert the imported tibble to a plain base-R data frame.
mydata <- as.data.frame(x = mydata)

# Preview the first six rows again, now in data-frame print layout.
head(x = mydata, n = 6L)
##   Rank           Name Position Age Value                Club Games_played Goals
## 1    1  Kylian Mbappe        4  22   144 Paris Saint-Germain           16     7
## 2    2 Erling Haaland        4  21   135   Borussia Dortmund           10    13
## 3    3     Harry Kane        4  28   108   Tottenham Hotspur           16     7
## 4    4  Jack Grealish        1  26    90     Manchester City           15     2
## 5    5  Mohamed Salah        2  29    90        Liverpool FC           15    15
## 6    6  Romelu Lukaku        4  28    90          Chelsea FC           11     4
##   Assists Card_yellow
## 1      11           3
## 2       4           1
## 3       2           2
## 4       3           1
## 5       6           1
## 6       1           0

## Display the frequency distribution of the players' positions.
## First convert Position to a factor, mapping the numeric codes (1, 2, 3, 4, ...) to position names (e.g. defender, striker, midfielder).

# Replace the numeric position codes 1-5 with a labelled factor so that
# tables and plots show position names instead of numbers.
mydata[["Position"]] <- factor(
  x = mydata[["Position"]],
  levels = 1:5,
  labels = c("Midfielder", "Winger", "Defender", "Striker", "Goalkeeper")
)

library(ggplot2)
# Bar chart of the number of players in each position; every bar is
# annotated with its count.
ggplot(data = mydata, mapping = aes(x = Position)) +
  geom_bar(fill = "lightgray") +
  # vjust = 1.5 shifts each count label below the top edge of its bar
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    vjust = 1.5,
    color = "black"
  ) +
  labs(y = "Frequency") +
  theme_dark()

# Frequency table: number of rows at each level of Position.
table(mydata[["Position"]])
## 
## Midfielder     Winger   Defender    Striker Goalkeeper 
##         48          9         23         14          6

### Draw a scatterplot of the number of games played against the number of goals scored using ggplot2

library(ggplot2)

# Scatterplot of goals against games played, overlaid with a fitted
# straight line.
ggplot(data = mydata, mapping = aes(x = Games_played, y = Goals)) +
  # One slightly transparent blue dot per row
  geom_point(colour = "blue", size = 2, alpha = 0.8) +
  # Linear-model fit (y ~ x) drawn in red, with its confidence band (se = TRUE)
  geom_smooth(method = "lm", formula = y ~ x, colour = "red", se = TRUE) +
  ggtitle("Scatterplot with Regression Line") +
  xlab("Number of Games Played") +
  ylab("Number of Goals") +
  theme_minimal()

## Estimate the average number of yellow cards for defenders.

# Sample mean of yellow cards over the rows whose Position is "Defender".
with(mydata, mean(Card_yellow[Position == "Defender"]))
## [1] 1.913043

## Can you say that the average number of yellow cards for strikers is lower?

# Sample mean of yellow cards over the rows whose Position is "Striker".
mean(mydata$Card_yellow[mydata$Position == "Striker"])
## [1] 1.642857

###FROM CHATGPT

# Welch two-sample t-test of Card_yellow between the Defender and Striker
# groups; `subset` restricts the rows to those two positions.
# The test is two-sided (no `alternative` is given), so it asks whether the
# two means differ, not whether the strikers' mean is lower.
# NOTE(review): for the directional question a one-sided `alternative` may be
# more appropriate - confirm the intended hypothesis.
# The `data:` label in the printed result is built from the formula text, so
# rewording the formula changes the output.
t.test(mydata$Card_yellow ~ mydata$Position, subset = mydata$Position %in% c("Defender", "Striker"))
## 
##  Welch Two Sample t-test
## 
## data:  mydata$Card_yellow by mydata$Position
## t = 0.63088, df = 26.949, p-value = 0.5334
## alternative hypothesis: true difference in means between group Defender and group Striker is not equal to 0
## 95 percent confidence interval:
##  -0.6086206  1.1489933
## sample estimates:
## mean in group Defender  mean in group Striker 
##               1.913043               1.642857

###FROM PROFESSOR

# Defenders' mean yellow-card count, used as the reference value for the
# one-sample test that follows.
mean(mydata[["Card_yellow"]][mydata[["Position"]] == "Defender"])
## [1] 1.913043

##Ttest

# One-sample, one-sided t-test on the strikers' yellow-card counts.
# H0: the strikers' mean equals 1.913; H1: it is less than 1.913.
# 1.913 is the defenders' sample mean (1.913043) rounded to three decimals,
# entered here as a fixed constant rather than computed from the data.
# The `data:` label in the printed result is the literal text of the first
# argument, so rewording it changes the output.
t.test(mydata[mydata$Position == "Striker", ]$Card_yellow,
       mu = 1.913,
       alternative = "less")
## 
##  One Sample t-test
## 
## data:  mydata[mydata$Position == "Striker", ]$Card_yellow
## t = -0.79125, df = 13, p-value = 0.2215
## alternative hypothesis: true mean is less than 1.913
## 95 percent confidence interval:
##      -Inf 2.247475
## sample estimates:
## mean of x 
##  1.642857

## H0: The arithmetic mean of yellow cards for strikers = 1.913
## H1: The arithmetic mean of yellow cards for strikers < 1.913
## We cannot reject H0 (p-value = 0.2215).
## The p-value must be 5% or less to reject H0.
## There are not enough strikers: we have only 14.