#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
# Load the match dataset from disk and list the columns it provides.
data <- read.csv(file = "C:/Downloads/final_dataset.csv")
names(data)
## [1] "X" "Date" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
Alpha Level (α)=0.05 (common significance level).
Power Level (1 - β)=0.80 (common power level).
Minimum Effect Size: Let’s assume a minimum effect size of 0.5 goals per game.
# Keep only the away (FTAG) and home (FTHG) goal columns.
data <- select(data, FTAG, FTHG)

# Average goals per match for each side.
home_team_mean <- mean(data[["FTHG"]])
away_team_mean <- mean(data[["FTAG"]])

print(home_team_mean)
## [1] 1.527485
print(away_team_mean)
## [1] 1.130263
# Two-sided Welch two-sample t-test comparing mean home goals (FTHG) with mean
# away goals (FTAG). With no `paired` or `var.equal` arguments, t.test() treats
# the two columns as independent samples with unequal variances.
# NOTE(review): both columns come from the same rows of `data`, so the samples
# look paired per match -- confirm whether `paired = TRUE` is more appropriate.
t_test_result1 <- t.test(data$FTHG, data$FTAG, alternative = "two.sided")
t_test_result1
##
## Welch Two Sample t-test
##
## data: data$FTHG and data$FTAG
## t = 19.13, df = 13406, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.3565204 0.4379240
## sample estimates:
## mean of x mean of y
## 1.527485 1.130263
The p-value (< 2.2e-16) is far smaller than the alpha level (0.05). This indicates an extremely statistically significant difference between the mean goals scored by the Home Team and the Away Team.
Since the p-value is less than alpha, we reject the null hypothesis.
#Visualization
# Side-by-side boxplots of goals scored by the home and away teams.
# The data is reshaped to long format so that the side (home/away) is a real
# variable mapped to the x axis and fill colour, instead of hard-coding
# factor(1)/factor(2) positions and relabelling them by position afterwards.
data %>%
  select(FTHG, FTAG) %>%
  pivot_longer(
    cols = c(FTHG, FTAG),
    names_to = "side",
    values_to = "goals"
  ) %>%
  mutate(
    # Explicit levels fix the left-to-right order: home first, then away.
    side = factor(
      side,
      levels = c("FTHG", "FTAG"),
      labels = c("Home Team", "Away Team")
    )
  ) %>%
  ggplot(aes(x = side, y = goals, fill = side)) +
  # The x axis already names each box, so the fill legend is suppressed.
  geom_boxplot(alpha = 0.5, show.legend = FALSE) +
  scale_fill_manual(values = c("Home Team" = "blue", "Away Team" = "red")) +
  labs(x = "", y = "Goals") +
  theme_minimal()
Based on the analysis and the small p-value, we can conclude the following:
There is extremely strong statistical evidence to suggest that the Home Team scores more goals on average than the Away Team.
Alpha level (α) = 0.05 (common significance level).
Power level (1 - β) = 0.80 (common power level).
Minimum effect size: let’s assume an effect size of 1.
# Reload the full dataset (the earlier goal-only subset replaced `data`) and
# keep just the home and away three-game win-streak columns.
data <- read.csv("C:/Downloads/final_dataset.csv") %>%
  select(HTWinStreak3, ATWinStreak3)
# Two-sided Welch two-sample t-test comparing the mean of HTWinStreak3 with the
# mean of ATWinStreak3, treating the two columns as independent samples.
# NOTE(review): the sample means (~0.06) suggest these are 0/1 indicators, and
# both columns come from the same rows -- confirm that an unpaired t-test is
# the intended comparison.
t_test_result2 <- t.test(data$HTWinStreak3, data$ATWinStreak3, alternative = "two.sided")
t_test_result2
##
## Welch Two Sample t-test
##
## data: data$HTWinStreak3 and data$ATWinStreak3
## t = -0.1412, df = 13678, p-value = 0.8877
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.008703139 0.007533549
## sample estimates:
## mean of x mean of y
## 0.0622807 0.0628655
Since the p-value (0.8877) is much greater than the alpha level (0.05), we fail to reject the null hypothesis.
This means that there is not enough evidence to conclude that the Home Team’s win streak (HTWinStreak3) is significantly different from the Away Team’s win streak (ATWinStreak3).
#Visualization
# Overlaid histograms of the home and away three-game win-streak columns.
# Reshaping to long format lets the team be mapped to the fill aesthetic, so
# the plot gets a legend identifying which colour is home and which is away
# (the previous two hard-coded layers were indistinguishable to a reader).
data %>%
  select(HTWinStreak3, ATWinStreak3) %>%
  pivot_longer(
    cols = c(HTWinStreak3, ATWinStreak3),
    names_to = "team",
    values_to = "win_streak"
  ) %>%
  mutate(
    # Home is the first level so it is drawn first, with away layered on top.
    team = factor(
      team,
      levels = c("HTWinStreak3", "ATWinStreak3"),
      labels = c("Home Team", "Away Team")
    )
  ) %>%
  ggplot(aes(x = win_streak, fill = team)) +
  # position = "identity" overlays the two distributions instead of stacking.
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  scale_fill_manual(
    values = c("Home Team" = "yellow", "Away Team" = "red"),
    name = NULL
  ) +
  labs(
    title = "Distribution of Home Team and Away Team Win Streaks (Last 3 Games)",
    x = "Win Streak",
    y = "Frequency"
  ) +
  theme_minimal()
In summary, based on the results of the t-test there is no significant difference in win streaks between the Home Team and the Away Team.