Intro to R: Data Dive 1

Dataset: English Premier League

library(tidyverse) #importing the required libraries
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Loading the Dataset

# Load the match data set into `data`.
# NOTE: the path is machine-specific; adjust it to where the CSV is stored.
# The file name must not carry trailing whitespace, otherwise the lookup
# targets a different (non-existent) file.
data <- read.csv("C:/Downloads/final_dataset.csv")

General Summary of data

# Per-column overview: quartiles/mean for numeric columns,
# length/class/mode for character columns.
summary(data)
##        X            Date             HomeTeam           AwayTeam        
##  Min.   :   0   Length:6840        Length:6840        Length:6840       
##  1st Qu.:1710   Class :character   Class :character   Class :character  
##  Median :3420   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3420                                                           
##  3rd Qu.:5129                                                           
##  Max.   :6839                                                           
##       FTHG            FTAG          FTR                 HTGS       
##  Min.   :0.000   Min.   :0.00   Length:6840        Min.   :  0.00  
##  1st Qu.:1.000   1st Qu.:0.00   Class :character   1st Qu.: 11.00  
##  Median :1.000   Median :1.00   Mode  :character   Median : 23.00  
##  Mean   :1.527   Mean   :1.13                      Mean   : 24.42  
##  3rd Qu.:2.000   3rd Qu.:2.00                      3rd Qu.: 35.00  
##  Max.   :9.000   Max.   :7.00                      Max.   :102.00  
##       ATGS             HTGC           ATGC            HTP        
##  Min.   :  0.00   Min.   : 0.0   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.: 11.00   1st Qu.:11.0   1st Qu.:11.00   1st Qu.:0.8889  
##  Median : 23.00   Median :23.0   Median :23.00   Median :1.1724  
##  Mean   : 24.51   Mean   :24.5   Mean   :24.35   Mean   :1.2090  
##  3rd Qu.: 35.00   3rd Qu.:36.0   3rd Qu.:36.00   3rd Qu.:1.5556  
##  Max.   :105.00   Max.   :85.0   Max.   :82.00   Max.   :2.7368  
##       ATP             HM1                HM2                HM3           
##  Min.   :0.0000   Length:6840        Length:6840        Length:6840       
##  1st Qu.:0.9062   Class :character   Class :character   Class :character  
##  Median :1.1923   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1.2268                                                           
##  3rd Qu.:1.5625                                                           
##  Max.   :2.7619                                                           
##      HM4                HM5                AM1                AM2           
##  Length:6840        Length:6840        Length:6840        Length:6840       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      AM3                AM4                AM5                  MW      
##  Length:6840        Length:6840        Length:6840        Min.   : 1.0  
##  Class :character   Class :character   Class :character   1st Qu.:10.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :19.5  
##                                                           Mean   :19.5  
##                                                           3rd Qu.:29.0  
##                                                           Max.   :38.0  
##  HTFormPtsStr       ATFormPtsStr         HTFormPts        ATFormPts     
##  Length:6840        Length:6840        Min.   : 0.000   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 4.000   1st Qu.: 4.000  
##  Mode  :character   Mode  :character   Median : 6.000   Median : 6.000  
##                                        Mean   : 6.243   Mean   : 6.414  
##                                        3rd Qu.: 9.000   3rd Qu.: 9.000  
##                                        Max.   :15.000   Max.   :15.000  
##   HTWinStreak3      HTWinStreak5     HTLossStreak3    HTLossStreak5    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.06228   Mean   :0.01798   Mean   :0.0576   Mean   :0.01433  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##   ATWinStreak3      ATWinStreak5     ATLossStreak3     ATLossStreak5    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.06287   Mean   :0.01652   Mean   :0.05102   Mean   :0.01023  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##       HTGD               ATGD             DiffPts          DiffFormPts      
##  Min.   :-3.00000   Min.   :-3.33333   Min.   :-2.36364   Min.   :-2.25000  
##  1st Qu.:-0.50000   1st Qu.:-0.46429   1st Qu.:-0.45161   1st Qu.:-0.17647  
##  Median :-0.07692   Median :-0.05000   Median : 0.00000   Median : 0.00000  
##  Mean   :-0.00969   Mean   : 0.01454   Mean   :-0.01775   Mean   :-0.01809  
##  3rd Qu.: 0.40000   3rd Qu.: 0.42105   3rd Qu.: 0.42857   3rd Qu.: 0.15385  
##  Max.   : 4.00000   Max.   : 3.50000   Max.   : 2.28571   Max.   : 2.25000

Column names in data

colnames(data) # column names of the data set
##  [1] "X"             "Date"          "HomeTeam"      "AwayTeam"     
##  [5] "FTHG"          "FTAG"          "FTR"           "HTGS"         
##  [9] "ATGS"          "HTGC"          "ATGC"          "HTP"          
## [13] "ATP"           "HM1"           "HM2"           "HM3"          
## [17] "HM4"           "HM5"           "AM1"           "AM2"          
## [21] "AM3"           "AM4"           "AM5"           "MW"           
## [25] "HTFormPtsStr"  "ATFormPtsStr"  "HTFormPts"     "ATFormPts"    
## [29] "HTWinStreak3"  "HTWinStreak5"  "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3"  "ATWinStreak5"  "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD"          "ATGD"          "DiffPts"       "DiffFormPts"

Dataset Information: English Premier League dataset

The data set contains 40 columns and 6,840 rows.

The main columns include the home team, away team, date, FTHG, FTAG, etc.

Goal/Purpose:

The goal is to observe and analyze the English Premier League data set and predict winners in different categories.

Basic Visualization

# Illustrative points totals for five clubs. These values are entered by
# hand here; they are not computed from the loaded match data.
# They live in their own object so the data set held in `data` is not
# overwritten by this example.
team_points <- data.frame(
  Team = c(
    "Liverpool", "Manchester City", "Manchester United", "Chelsea", "Arsenal"
  ),
  Points = c(99, 98, 85, 78, 68)
)


library(ggplot2)

# One bar per team, bar height taken directly from `Points`.
# geom_col() is the direct form of geom_bar(stat = "identity").
ggplot(team_points, aes(x = Team, y = Points, fill = Team)) +
  geom_col() +
  labs(
    title = "English Premier League Team Performance",
    x = "Team",
    y = "Total Points"
  ) +
  theme_minimal() +
  # Tilt the team names so longer labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Illustrative goals-scored totals for five clubs. These values are entered
# by hand here; they are not computed from the loaded match data.
# They live in their own object so the data set held in `data` is not
# overwritten by this example.
team_goals <- data.frame(
  Team = c(
    "Liverpool", "Manchester City", "Manchester United", "Chelsea", "Arsenal"
  ),
  GoalsScored = c(85, 95, 75, 70, 65)
)


# Base-graphics bar chart: one red bar per team.
barplot(
  team_goals$GoalsScored,
  names.arg = team_goals$Team,
  main = "Goals Scored by English Premier League Teams",
  xlab = "Team",
  ylab = "Goals Scored",
  col = "red",
  # Leave 10 units of headroom above the tallest bar.
  ylim = c(0, max(team_goals$GoalsScored) + 10)
)