Intro to R:Data Dive 6
Dataset:English Premier League
#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)
Loading Dataset
data <-read.csv('C:/Downloads/final_dataset.csv')
colnames(data)
## [1] "X" "Date" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
Creating new columns using mutate
#New column 1
data <- data %>%
mutate(TotalGoals = FTHG + FTAG)
#New column 2
data <- data %>%
mutate(GoalDifference = FTHG - FTAG)
#New column 3
data <- data %>%
mutate(AvgPointsHT = HTP / MW,
AvgPointsAT = ATP / MW)
head(data)
## X Date HomeTeam AwayTeam FTHG FTAG FTR HTGS ATGS HTGC ATGC HTP ATP
## 1 0 19/08/00 Charlton Man City 4 0 H 0 0 0 0 0 0
## 2 1 19/08/00 Chelsea West Ham 4 2 H 0 0 0 0 0 0
## 3 2 19/08/00 Coventry Middlesbrough 1 3 NH 0 0 0 0 0 0
## 4 3 19/08/00 Derby Southampton 2 2 NH 0 0 0 0 0 0
## 5 4 19/08/00 Leeds Everton 2 0 H 0 0 0 0 0 0
## 6 5 19/08/00 Leicester Aston Villa 0 0 NH 0 0 0 0 0 0
## HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW HTFormPtsStr ATFormPtsStr
## 1 M M M M M M M M M M 1 MMMMM MMMMM
## 2 M M M M M M M M M M 1 MMMMM MMMMM
## 3 M M M M M M M M M M 1 MMMMM MMMMM
## 4 M M M M M M M M M M 1 MMMMM MMMMM
## 5 M M M M M M M M M M 1 MMMMM MMMMM
## 6 M M M M M M M M M M 1 MMMMM MMMMM
## HTFormPts ATFormPts HTWinStreak3 HTWinStreak5 HTLossStreak3 HTLossStreak5
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## ATWinStreak3 ATWinStreak5 ATLossStreak3 ATLossStreak5 HTGD ATGD DiffPts
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## DiffFormPts TotalGoals GoalDifference AvgPointsHT AvgPointsAT
## 1 0 4 4 0 0
## 2 0 6 2 0 0
## 3 0 4 -2 0 0
## 4 0 4 0 0 0
## 5 0 2 2 0 0
## 6 0 0 0 0 0
Finding outliers in the Total Goals Column
data <- data %>%
mutate(TotalGoals = FTHG + FTAG)
Q1 <- quantile(data$TotalGoals, 0.25)
Q3 <- quantile(data$TotalGoals, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Find outliers
outliers <- data[data$TotalGoals < lower_bound | data$TotalGoals > upper_bound, ]
# Print the outliers
print(outliers)
## X Date HomeTeam AwayTeam FTHG FTAG FTR HTGS ATGS HTGC
## 1647 1646 13/11/04 Tottenham Arsenal 4 5 NH 8 32 11
## 2732 2731 29/09/07 Portsmouth Reading 7 4 H 8 5 8
## 2855 2854 29/12/07 Tottenham Reading 6 4 H 35 25 32
## 3035 3034 11/05/08 Middlesbrough Man City 8 1 H 35 44 52
## 3547 3546 22/11/09 Tottenham Wigan 9 1 H 23 13 17
## 3621 3620 16/01/10 Chelsea Sunderland 7 2 H 45 28 16
## 4206 4205 28/08/11 Man United Arsenal 8 2 H 5 0 1
## 4749 4748 29/12/12 Arsenal Newcastle 7 3 H 32 20 18
## 4938 4937 19/05/13 West Brom Man United 5 5 NH 48 81 52
## 5095 5094 14/12/13 Man City Arsenal 6 3 H 41 30 15
## 5232 5231 22/03/14 Cardiff Liverpool 3 6 NH 22 76 50
## 5342 5341 30/08/14 Everton Chelsea 3 6 NH 4 5 4
## 5924 5923 23/01/16 Norwich Liverpool 4 5 NH 24 25 38
## 6206 6205 26/11/16 Swansea Crystal Palace 5 4 H 11 17 22
## 6313 6312 04/02/17 Everton Bournemouth 6 3 H 34 32 24
## 6534 6533 14/10/17 Man City Stoke 7 2 H 22 7 2
## 6839 6838 13/05/18 Tottenham Leicester 5 4 H 69 52 32
## ATGC HTP ATP HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW
## 1647 13 1.0000000 2.0769231 L L L L W D D L W W 13
## 2732 11 1.1250000 0.8750000 W D L L W W L L L W 8
## 2855 36 1.0500000 1.1000000 W L W W L D W D W D 20
## 3035 45 1.0263158 1.4473684 W L L D D L L W W L 38
## 3547 22 1.6923077 1.0769231 W L L W D D L W D L 13
## 3621 31 2.1428571 1.0952381 W D D W D D D L L D 21
## 4206 2 2.0000000 0.3333333 W W M M M L D M M M 3
## 4749 26 1.5789474 1.0526316 W W W L D W L L W L 19
## 4938 38 1.2631579 2.3157895 L L L W D W L D W D 38
## 5095 11 1.8125000 2.1875000 D W W W L D W W W L 16
## 5232 35 0.8333333 2.0666667 W L L D L W W W W W 30
## 5342 1 0.6666667 2.0000000 D D M M M W W M M M 3
## 5924 28 1.0000000 1.3478261 L L W W L L D L W W 23
## 6206 21 0.4615385 0.8461538 D L L D L L L L L L 13
## 6313 41 1.5416667 1.0833333 D W W W D L D L D W 24
## 6534 11 2.3750000 1.0000000 W W W W W W L L D D 8
## 6839 55 1.9473684 1.2368421 W L W D L W L L D L 38
## HTFormPtsStr ATFormPtsStr HTFormPts ATFormPts HTWinStreak3 HTWinStreak5
## 1647 LLLLW DDLWW 3 8 0 0
## 2732 WDLLW WLLLW 7 6 0 0
## 2855 WLWWL DWDWD 9 9 0 0
## 3035 WLLDD LLWWL 5 6 0 0
## 3547 WLLWD DLWDL 7 5 0 0
## 3621 WDDWD DDLLD 9 3 0 0
## 4206 WWMMM LDMMM 6 1 0 0
## 4749 WWWLD WLLWL 10 6 0 0
## 4938 LLLWD WLDWD 4 8 0 0
## 5095 DWWWL DWWWL 10 10 0 0
## 5232 WLLDL WWWWW 4 15 0 0
## 5342 DDMMM WWMMM 2 6 0 0
## 5924 LLWWL LDLWW 6 7 0 0
## 6206 DLLDL LLLLL 2 0 0 0
## 6313 DWWWD LDLDW 11 5 0 0
## 6534 WWWWW WLLDD 15 5 1 1
## 6839 WLWDL WLLDL 7 4 0 0
## HTLossStreak3 HTLossStreak5 ATWinStreak3 ATWinStreak5 ATLossStreak3
## 1647 0 0 0 0 0
## 2732 0 0 0 0 0
## 2855 0 0 0 0 0
## 3035 0 0 0 0 0
## 3547 0 0 0 0 0
## 3621 0 0 0 0 0
## 4206 0 0 0 0 0
## 4749 0 0 0 0 0
## 4938 0 0 0 0 0
## 5095 0 0 0 0 0
## 5232 0 0 1 1 0
## 5342 0 0 0 0 0
## 5924 0 0 0 0 0
## 6206 0 0 0 0 1
## 6313 0 0 0 0 0
## 6534 0 0 0 0 0
## 6839 0 0 0 0 0
## ATLossStreak5 HTGD ATGD DiffPts DiffFormPts TotalGoals
## 1647 0 -0.2307692 1.46153846 -1.0769231 -0.38461538 9
## 2732 0 0.0000000 -0.75000000 0.2500000 0.12500000 11
## 2855 0 0.1500000 -0.55000000 -0.0500000 0.00000000 10
## 3035 0 -0.4473684 -0.02631579 -0.4210526 -0.02631579 9
## 3547 0 0.4615385 -0.69230769 0.6153846 0.15384615 10
## 3621 0 1.3809524 -0.14285714 1.0476190 0.28571429 9
## 4206 0 1.3333333 -0.66666667 1.6666667 1.66666667 10
## 4749 0 0.7368421 -0.31578947 0.5263158 0.21052632 10
## 4938 0 -0.1052632 1.13157895 -1.0526316 -0.10526316 10
## 5095 0 1.6250000 1.18750000 -0.3750000 0.00000000 9
## 5232 0 -0.9333333 1.36666667 -1.2333333 -0.36666667 9
## 5342 0 0.0000000 1.33333333 -1.3333333 -1.33333333 9
## 5924 0 -0.6086957 -0.13043478 -0.3478261 -0.04347826 9
## 6206 1 -0.8461538 -0.30769231 -0.3846154 0.15384615 9
## 6313 0 0.4166667 -0.37500000 0.4583333 0.25000000 9
## 6534 0 2.5000000 -0.50000000 1.3750000 1.25000000 9
## 6839 0 0.9736842 -0.07894737 0.7105263 0.07894737 9
## GoalDifference AvgPointsHT AvgPointsAT
## 1647 -1 0.07692308 0.15976331
## 2732 3 0.14062500 0.10937500
## 2855 2 0.05250000 0.05500000
## 3035 7 0.02700831 0.03808864
## 3547 8 0.13017751 0.08284024
## 3621 5 0.10204082 0.05215420
## 4206 6 0.66666667 0.11111111
## 4749 4 0.08310249 0.05540166
## 4938 0 0.03324100 0.06094183
## 5095 3 0.11328125 0.13671875
## 5232 -3 0.02777778 0.06888889
## 5342 -3 0.22222222 0.66666667
## 5924 -1 0.04347826 0.05860113
## 6206 1 0.03550296 0.06508876
## 6313 3 0.06423611 0.04513889
## 6534 5 0.29687500 0.12500000
## 6839 1 0.05124654 0.03254848
Plotting the outliers
library(ggplot2)
ggplot(data, aes(x = Date, y = TotalGoals)) +
geom_point(aes(color = ifelse(TotalGoals < lower_bound | TotalGoals > upper_bound, "Outlier", "Not Outlier")),
size = 3) +
labs(title = "Outliers observation",
x = "Date",
y = "TotalGoals") +
scale_color_manual(values = c("Outlier" = "green", "Not Outlier" = "yellow")) +
theme_minimal()

Finding the mean of the Goal Difference column created
data <- data %>%
mutate(GoalDifference = FTHG - FTAG)
# Calculate the mean of the GoalDifference column
mean_GoalDifference <- mean(data$GoalDifference)
# Print the mean
cat("Mean GoalDifference:", mean_GoalDifference, "\n")
## Mean GoalDifference: 0.3972222
ggplot(NULL, aes(x = "", y = mean_GoalDifference)) +
geom_bar(stat = "identity", fill = "red") +
labs(title = "Mean Goal Difference",
y = "Mean Value") +
theme_minimal() +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())

#New column 3
data <- data %>%
mutate(AvgPointsHT = HTP / MW,
AvgPointsAT = ATP / MW)
#Visualization for AvgPointsHT vs. FTHG
ggplot(data, aes(x = FTHG, y = AvgPointsHT)) +
geom_point() +
labs(title = "AvgPointsHT vs. FTHG", x = "FTHG", y = "AvgPointsHT")

#Visualization for AvgPointsHT vs. FTAG
ggplot(data, aes(x = FTAG, y = AvgPointsHT)) +
geom_point() +
labs(title = "AvgPointsHT vs. FTAG", x = "FTAG", y = "AvgPointsHT")

Finding correlation between the columns
correlation_HT_FTHG <- cor(data$AvgPointsHT, data$FTHG)
correlation_HT_FTAG <- cor(data$AvgPointsHT, data$FTAG)
correlation_AT_FTHG <- cor(data$AvgPointsAT, data$FTHG)
correlation_AT_FTAG <- cor(data$AvgPointsAT, data$FTAG)
print(correlation_HT_FTHG)
## [1] 0.05915268
HT and FTHG shows positve corelation
print(correlation_HT_FTAG)
## [1] -0.05918528
HT and FTAG shows negative coorealtion
print(correlation_AT_FTHG)
## [1] -0.06614754
AT and FT shows negative correlation
print(correlation_AT_FTAG)
## [1] 0.07310344
AT and FTAG shows positive correaltion
Creating confidence interval for HT and AT
confidence_interval_HT <- t.test(data$AvgPointsHT)$conf.int
confidence_interval_AT <- t.test(data$AvgPointsAT)$conf.int
print(confidence_interval_HT)
## [1] 0.08932622 0.09423252
## attr(,"conf.level")
## [1] 0.95
print(confidence_interval_AT)
## [1] 0.09357701 0.09891186
## attr(,"conf.level")
## [1] 0.95