Intro to R:Data Dive 6

Dataset:English Premier League

#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggrepel)

Loading Dataset

data <-read.csv('C:/Downloads/final_dataset.csv')
colnames(data)
##  [1] "X"             "Date"          "HomeTeam"      "AwayTeam"     
##  [5] "FTHG"          "FTAG"          "FTR"           "HTGS"         
##  [9] "ATGS"          "HTGC"          "ATGC"          "HTP"          
## [13] "ATP"           "HM1"           "HM2"           "HM3"          
## [17] "HM4"           "HM5"           "AM1"           "AM2"          
## [21] "AM3"           "AM4"           "AM5"           "MW"           
## [25] "HTFormPtsStr"  "ATFormPtsStr"  "HTFormPts"     "ATFormPts"    
## [29] "HTWinStreak3"  "HTWinStreak5"  "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3"  "ATWinStreak5"  "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD"          "ATGD"          "DiffPts"       "DiffFormPts"

Creating new columns using mutate

#New column 1
data <- data %>%
  mutate(TotalGoals = FTHG + FTAG)
#New column 2
data <- data %>%
  mutate(GoalDifference = FTHG - FTAG)
#New column 3
data <- data %>%
  mutate(AvgPointsHT = HTP / MW, 
         AvgPointsAT = ATP / MW)
head(data)
##   X     Date  HomeTeam      AwayTeam FTHG FTAG FTR HTGS ATGS HTGC ATGC HTP ATP
## 1 0 19/08/00  Charlton      Man City    4    0   H    0    0    0    0   0   0
## 2 1 19/08/00   Chelsea      West Ham    4    2   H    0    0    0    0   0   0
## 3 2 19/08/00  Coventry Middlesbrough    1    3  NH    0    0    0    0   0   0
## 4 3 19/08/00     Derby   Southampton    2    2  NH    0    0    0    0   0   0
## 5 4 19/08/00     Leeds       Everton    2    0   H    0    0    0    0   0   0
## 6 5 19/08/00 Leicester   Aston Villa    0    0  NH    0    0    0    0   0   0
##   HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW HTFormPtsStr ATFormPtsStr
## 1   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 2   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 3   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 4   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 5   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
## 6   M   M   M   M   M   M   M   M   M   M  1        MMMMM        MMMMM
##   HTFormPts ATFormPts HTWinStreak3 HTWinStreak5 HTLossStreak3 HTLossStreak5
## 1         0         0            0            0             0             0
## 2         0         0            0            0             0             0
## 3         0         0            0            0             0             0
## 4         0         0            0            0             0             0
## 5         0         0            0            0             0             0
## 6         0         0            0            0             0             0
##   ATWinStreak3 ATWinStreak5 ATLossStreak3 ATLossStreak5 HTGD ATGD DiffPts
## 1            0            0             0             0    0    0       0
## 2            0            0             0             0    0    0       0
## 3            0            0             0             0    0    0       0
## 4            0            0             0             0    0    0       0
## 5            0            0             0             0    0    0       0
## 6            0            0             0             0    0    0       0
##   DiffFormPts TotalGoals GoalDifference AvgPointsHT AvgPointsAT
## 1           0          4              4           0           0
## 2           0          6              2           0           0
## 3           0          4             -2           0           0
## 4           0          4              0           0           0
## 5           0          2              2           0           0
## 6           0          0              0           0           0

Finding outliers in the Total Goals Column

data <- data %>%
  mutate(TotalGoals = FTHG + FTAG)
Q1 <- quantile(data$TotalGoals, 0.25)
Q3 <- quantile(data$TotalGoals, 0.75)
IQR_value <- Q3 - Q1

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Find outliers
outliers <- data[data$TotalGoals < lower_bound | data$TotalGoals > upper_bound, ]

# Print the outliers
print(outliers)
##         X     Date      HomeTeam       AwayTeam FTHG FTAG FTR HTGS ATGS HTGC
## 1647 1646 13/11/04     Tottenham        Arsenal    4    5  NH    8   32   11
## 2732 2731 29/09/07    Portsmouth        Reading    7    4   H    8    5    8
## 2855 2854 29/12/07     Tottenham        Reading    6    4   H   35   25   32
## 3035 3034 11/05/08 Middlesbrough       Man City    8    1   H   35   44   52
## 3547 3546 22/11/09     Tottenham          Wigan    9    1   H   23   13   17
## 3621 3620 16/01/10       Chelsea     Sunderland    7    2   H   45   28   16
## 4206 4205 28/08/11    Man United        Arsenal    8    2   H    5    0    1
## 4749 4748 29/12/12       Arsenal      Newcastle    7    3   H   32   20   18
## 4938 4937 19/05/13     West Brom     Man United    5    5  NH   48   81   52
## 5095 5094 14/12/13      Man City        Arsenal    6    3   H   41   30   15
## 5232 5231 22/03/14       Cardiff      Liverpool    3    6  NH   22   76   50
## 5342 5341 30/08/14       Everton        Chelsea    3    6  NH    4    5    4
## 5924 5923 23/01/16       Norwich      Liverpool    4    5  NH   24   25   38
## 6206 6205 26/11/16       Swansea Crystal Palace    5    4   H   11   17   22
## 6313 6312 04/02/17       Everton    Bournemouth    6    3   H   34   32   24
## 6534 6533 14/10/17      Man City          Stoke    7    2   H   22    7    2
## 6839 6838 13/05/18     Tottenham      Leicester    5    4   H   69   52   32
##      ATGC       HTP       ATP HM1 HM2 HM3 HM4 HM5 AM1 AM2 AM3 AM4 AM5 MW
## 1647   13 1.0000000 2.0769231   L   L   L   L   W   D   D   L   W   W 13
## 2732   11 1.1250000 0.8750000   W   D   L   L   W   W   L   L   L   W  8
## 2855   36 1.0500000 1.1000000   W   L   W   W   L   D   W   D   W   D 20
## 3035   45 1.0263158 1.4473684   W   L   L   D   D   L   L   W   W   L 38
## 3547   22 1.6923077 1.0769231   W   L   L   W   D   D   L   W   D   L 13
## 3621   31 2.1428571 1.0952381   W   D   D   W   D   D   D   L   L   D 21
## 4206    2 2.0000000 0.3333333   W   W   M   M   M   L   D   M   M   M  3
## 4749   26 1.5789474 1.0526316   W   W   W   L   D   W   L   L   W   L 19
## 4938   38 1.2631579 2.3157895   L   L   L   W   D   W   L   D   W   D 38
## 5095   11 1.8125000 2.1875000   D   W   W   W   L   D   W   W   W   L 16
## 5232   35 0.8333333 2.0666667   W   L   L   D   L   W   W   W   W   W 30
## 5342    1 0.6666667 2.0000000   D   D   M   M   M   W   W   M   M   M  3
## 5924   28 1.0000000 1.3478261   L   L   W   W   L   L   D   L   W   W 23
## 6206   21 0.4615385 0.8461538   D   L   L   D   L   L   L   L   L   L 13
## 6313   41 1.5416667 1.0833333   D   W   W   W   D   L   D   L   D   W 24
## 6534   11 2.3750000 1.0000000   W   W   W   W   W   W   L   L   D   D  8
## 6839   55 1.9473684 1.2368421   W   L   W   D   L   W   L   L   D   L 38
##      HTFormPtsStr ATFormPtsStr HTFormPts ATFormPts HTWinStreak3 HTWinStreak5
## 1647        LLLLW        DDLWW         3         8            0            0
## 2732        WDLLW        WLLLW         7         6            0            0
## 2855        WLWWL        DWDWD         9         9            0            0
## 3035        WLLDD        LLWWL         5         6            0            0
## 3547        WLLWD        DLWDL         7         5            0            0
## 3621        WDDWD        DDLLD         9         3            0            0
## 4206        WWMMM        LDMMM         6         1            0            0
## 4749        WWWLD        WLLWL        10         6            0            0
## 4938        LLLWD        WLDWD         4         8            0            0
## 5095        DWWWL        DWWWL        10        10            0            0
## 5232        WLLDL        WWWWW         4        15            0            0
## 5342        DDMMM        WWMMM         2         6            0            0
## 5924        LLWWL        LDLWW         6         7            0            0
## 6206        DLLDL        LLLLL         2         0            0            0
## 6313        DWWWD        LDLDW        11         5            0            0
## 6534        WWWWW        WLLDD        15         5            1            1
## 6839        WLWDL        WLLDL         7         4            0            0
##      HTLossStreak3 HTLossStreak5 ATWinStreak3 ATWinStreak5 ATLossStreak3
## 1647             0             0            0            0             0
## 2732             0             0            0            0             0
## 2855             0             0            0            0             0
## 3035             0             0            0            0             0
## 3547             0             0            0            0             0
## 3621             0             0            0            0             0
## 4206             0             0            0            0             0
## 4749             0             0            0            0             0
## 4938             0             0            0            0             0
## 5095             0             0            0            0             0
## 5232             0             0            1            1             0
## 5342             0             0            0            0             0
## 5924             0             0            0            0             0
## 6206             0             0            0            0             1
## 6313             0             0            0            0             0
## 6534             0             0            0            0             0
## 6839             0             0            0            0             0
##      ATLossStreak5       HTGD        ATGD    DiffPts DiffFormPts TotalGoals
## 1647             0 -0.2307692  1.46153846 -1.0769231 -0.38461538          9
## 2732             0  0.0000000 -0.75000000  0.2500000  0.12500000         11
## 2855             0  0.1500000 -0.55000000 -0.0500000  0.00000000         10
## 3035             0 -0.4473684 -0.02631579 -0.4210526 -0.02631579          9
## 3547             0  0.4615385 -0.69230769  0.6153846  0.15384615         10
## 3621             0  1.3809524 -0.14285714  1.0476190  0.28571429          9
## 4206             0  1.3333333 -0.66666667  1.6666667  1.66666667         10
## 4749             0  0.7368421 -0.31578947  0.5263158  0.21052632         10
## 4938             0 -0.1052632  1.13157895 -1.0526316 -0.10526316         10
## 5095             0  1.6250000  1.18750000 -0.3750000  0.00000000          9
## 5232             0 -0.9333333  1.36666667 -1.2333333 -0.36666667          9
## 5342             0  0.0000000  1.33333333 -1.3333333 -1.33333333          9
## 5924             0 -0.6086957 -0.13043478 -0.3478261 -0.04347826          9
## 6206             1 -0.8461538 -0.30769231 -0.3846154  0.15384615          9
## 6313             0  0.4166667 -0.37500000  0.4583333  0.25000000          9
## 6534             0  2.5000000 -0.50000000  1.3750000  1.25000000          9
## 6839             0  0.9736842 -0.07894737  0.7105263  0.07894737          9
##      GoalDifference AvgPointsHT AvgPointsAT
## 1647             -1  0.07692308  0.15976331
## 2732              3  0.14062500  0.10937500
## 2855              2  0.05250000  0.05500000
## 3035              7  0.02700831  0.03808864
## 3547              8  0.13017751  0.08284024
## 3621              5  0.10204082  0.05215420
## 4206              6  0.66666667  0.11111111
## 4749              4  0.08310249  0.05540166
## 4938              0  0.03324100  0.06094183
## 5095              3  0.11328125  0.13671875
## 5232             -3  0.02777778  0.06888889
## 5342             -3  0.22222222  0.66666667
## 5924             -1  0.04347826  0.05860113
## 6206              1  0.03550296  0.06508876
## 6313              3  0.06423611  0.04513889
## 6534              5  0.29687500  0.12500000
## 6839              1  0.05124654  0.03254848

Plotting the outliers

library(ggplot2)
ggplot(data, aes(x = Date, y = TotalGoals)) +
  geom_point(aes(color = ifelse(TotalGoals < lower_bound | TotalGoals > upper_bound, "Outlier", "Not Outlier")), 
             size = 3) +
  labs(title = "Outliers observation",
       x = "Date",
       y = "TotalGoals") +
  scale_color_manual(values = c("Outlier" = "green", "Not Outlier" = "yellow")) +
  theme_minimal()

Finding the mean of the Goal Difference column created

data <- data %>%
  mutate(GoalDifference = FTHG - FTAG)
# Calculate the mean of the GoalDifference column
mean_GoalDifference <- mean(data$GoalDifference)

# Print the mean
cat("Mean GoalDifference:", mean_GoalDifference, "\n")
## Mean GoalDifference: 0.3972222
ggplot(NULL, aes(x = "", y = mean_GoalDifference)) +
  geom_bar(stat = "identity", fill = "red") +
  labs(title = "Mean Goal Difference",
       y = "Mean Value") +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

#New column 3
data <- data %>%
  mutate(AvgPointsHT = HTP / MW, 
         AvgPointsAT = ATP / MW)
#Visualization for AvgPointsHT vs. FTHG
ggplot(data, aes(x = FTHG, y = AvgPointsHT)) +
  geom_point() +
  labs(title = "AvgPointsHT vs. FTHG", x = "FTHG", y = "AvgPointsHT")

#Visualization for AvgPointsHT vs. FTAG
ggplot(data, aes(x = FTAG, y = AvgPointsHT)) +
  geom_point() +
  labs(title = "AvgPointsHT vs. FTAG", x = "FTAG", y = "AvgPointsHT")

Finding correlation between the columns

correlation_HT_FTHG <- cor(data$AvgPointsHT, data$FTHG)
correlation_HT_FTAG <- cor(data$AvgPointsHT, data$FTAG)
correlation_AT_FTHG <- cor(data$AvgPointsAT, data$FTHG)
correlation_AT_FTAG <- cor(data$AvgPointsAT, data$FTAG)
print(correlation_HT_FTHG)
## [1] 0.05915268

HT and FTHG shows positve corelation

print(correlation_HT_FTAG)
## [1] -0.05918528

HT and FTAG shows negative coorealtion

print(correlation_AT_FTHG) 
## [1] -0.06614754

AT and FT shows negative correlation

print(correlation_AT_FTAG) 
## [1] 0.07310344

AT and FTAG shows positive correaltion

Creating confidence interval for HT and AT

confidence_interval_HT <- t.test(data$AvgPointsHT)$conf.int
confidence_interval_AT <- t.test(data$AvgPointsAT)$conf.int
print(confidence_interval_HT)
## [1] 0.08932622 0.09423252
## attr(,"conf.level")
## [1] 0.95
print(confidence_interval_AT)
## [1] 0.09357701 0.09891186
## attr(,"conf.level")
## [1] 0.95