Data Dive week 11

#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(dplyr)
library(ggplot2)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

Importing the dataset

# Read the match dataset from a CSV file into a data frame, then list the
# names of its columns.
data <- read.csv(file = "C:/Downloads/final_dataset.csv")
names(data)
##  [1] "X"             "Date"          "HomeTeam"      "AwayTeam"     
##  [5] "FTHG"          "FTAG"          "FTR"           "HTGS"         
##  [9] "ATGS"          "HTGC"          "ATGC"          "HTP"          
## [13] "ATP"           "HM1"           "HM2"           "HM3"          
## [17] "HM4"           "HM5"           "AM1"           "AM2"          
## [21] "AM3"           "AM4"           "AM5"           "MW"           
## [25] "HTFormPtsStr"  "ATFormPtsStr"  "HTFormPts"     "ATFormPts"    
## [29] "HTWinStreak3"  "HTWinStreak5"  "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3"  "ATWinStreak5"  "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD"          "ATGD"          "DiffPts"       "DiffFormPts"

Response variable: FTHG

Explanatory variables: HTGS, ATGS, HTGC, ATGC

# Fit a linear regression of FTHG on HTGS, ATGS, HTGC and ATGC using the
# columns of `data`, then show the standard model summary (coefficients,
# residual quantiles, R-squared and F-statistic).
lm_model <- lm(
  formula = FTHG ~ HTGS + ATGS + HTGC + ATGC,
  data = data
)
summary(lm_model)
## 
## Call:
## lm(formula = FTHG ~ HTGS + ATGS + HTGC + ATGC, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6912 -0.8723 -0.2631  0.6353  7.2087 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.465531   0.028950  50.623  < 2e-16 ***
## HTGS         0.023323   0.001588  14.688  < 2e-16 ***
## ATGS        -0.018265   0.001584 -11.529  < 2e-16 ***
## HTGC        -0.015730   0.001847  -8.519  < 2e-16 ***
## ATGC         0.013373   0.001861   7.185 7.44e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.24 on 6835 degrees of freedom
## Multiple R-squared:  0.0881, Adjusted R-squared:  0.08757 
## F-statistic: 165.1 on 4 and 6835 DF,  p-value: < 2.2e-16

Residual plots

# Draw the default lm diagnostic plots in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later plots are not squeezed into the same 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(lm_model)
par(old_par)

Checking non-linearity of the model

# Residuals vs. Fitted: the first of the lm diagnostic plots.
plot(x = lm_model, which = 1)

We can see that the residuals are scattered around a roughly linear trend, so there is no strong sign of non-linearity.

Checking for heteroscedasticity

# Scale-Location plot: sqrt(|standardized residuals|) against fitted values.
# A roughly flat trend with even spread suggests constant variance; a slope
# or funnel shape suggests heteroscedasticity. (which = 1 would only repeat
# the Residuals vs. Fitted plot already used for the linearity check.)
plot(lm_model, which = 3)

Checking if there are any influential outliers

# Flag influential observations using Cook's distance.
#
# influence() returns hat values, coefficient changes, sigma and weighted
# residuals; it has no `cooks.distance` component, so
# `influence_obj$cooks.distance` is NULL and the check below could never
# fire. cooks.distance() computes the per-observation distances directly
# from the fitted model.
influence_obj <- influence(lm_model)
cooks_distance <- cooks.distance(lm_model)

# Common rule of thumb: flag observations whose distance exceeds 4 / n.
cooks_threshold <- 4 / length(cooks_distance)
is_influential <- !is.na(cooks_distance) & cooks_distance > cooks_threshold

if (any(is_influential)) {
  cat("Influential Outliers Detected (Cook's Distance)\n")
  cat(sum(is_influential), "observation(s) exceed the 4 / n threshold\n")
} else {
  cat("No influential outliers detected (Cook's Distance)\n")
}

# Show only the largest distances instead of one value per observation.
head(sort(cooks_distance, decreasing = TRUE), 10)
## NULL

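No influential outliers were flagged. Note, however, that `cooks_distance` printed as `NULL` above, so this check did not actually evaluate any Cook's distances and the conclusion is not supported by the output.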

Coefficient summary

# Keep the model summary for later use and print its coefficient table
# (estimate, standard error, t value and p-value for each term).
coeff_summary <- summary(lm_model)
print(coeff_summary[["coefficients"]])
##                Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept)  1.46553099 0.028949659  50.623429 0.000000e+00
## HTGS         0.02332313 0.001587859  14.688418 4.116089e-48
## ATGS        -0.01826527 0.001584294 -11.528963 1.801378e-30
## HTGC        -0.01573029 0.001846583  -8.518593 1.965715e-17
## ATGC         0.01337301 0.001861277   7.184861 7.440387e-13

Interpretation of the coefficient: HTGS

# Pull the HTGS row out of the coefficient table and report, at the 5% level,
# whether it is statistically significant and, if so, the sign of its
# association with FTHG.
coefficient_of_interest <- coeff_summary[["coefficients"]]["HTGS", ]
if (coefficient_of_interest[["Pr(>|t|)"]] >= 0.05) {
  cat("The coefficient for HTGS is not statistically significant.\n")
} else {
  cat("The coefficient for HTGS is statistically significant.\n")
  if (coefficient_of_interest[["Estimate"]] <= 0) {
    cat("A one-unit increase in HTGS is associated with a decrease in FTHG.\n")
  } else {
    cat("A one-unit increase in HTGS is associated with an increase in FTHG.\n")
  }
}
## The coefficient for HTGS is statistically significant.
## A one-unit increase in HTGS is associated with an increase in FTHG.

The coefficient for “HTGS” is statistically significant, and a one-unit increase in “HTGS” is associated with an increase of about 0.023 in the number of goals scored by the home team (FTHG), holding the other predictors constant.