Data Dive week 11
#importing libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(dplyr)
library(ggplot2)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
Importing the dataset
# Read the dataset from the local CSV file into a data frame.
# NOTE(review): the absolute path only resolves on the author's machine.
data <- read.csv(file = "C:/Downloads/final_dataset.csv")
# List the available column names before choosing model variables.
names(data)
## [1] "X" "Date" "HomeTeam" "AwayTeam"
## [5] "FTHG" "FTAG" "FTR" "HTGS"
## [9] "ATGS" "HTGC" "ATGC" "HTP"
## [13] "ATP" "HM1" "HM2" "HM3"
## [17] "HM4" "HM5" "AM1" "AM2"
## [21] "AM3" "AM4" "AM5" "MW"
## [25] "HTFormPtsStr" "ATFormPtsStr" "HTFormPts" "ATFormPts"
## [29] "HTWinStreak3" "HTWinStreak5" "HTLossStreak3" "HTLossStreak5"
## [33] "ATWinStreak3" "ATWinStreak5" "ATLossStreak3" "ATLossStreak5"
## [37] "HTGD" "ATGD" "DiffPts" "DiffFormPts"
Response variable: FTHG
Explanatory variables: HTGS, ATGS, HTGC, ATGC
# Linear model: regress FTHG on HTGS, ATGS, HTGC and ATGC, then print the
# fit summary (coefficient table, residual standard error, R-squared, F-test).
lm_model <- lm(
  formula = FTHG ~ HTGS + ATGS + HTGC + ATGC,
  data = data
)
print(summary(lm_model))
##
## Call:
## lm(formula = FTHG ~ HTGS + ATGS + HTGC + ATGC, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6912 -0.8723 -0.2631 0.6353 7.2087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.465531 0.028950 50.623 < 2e-16 ***
## HTGS 0.023323 0.001588 14.688 < 2e-16 ***
## ATGS -0.018265 0.001584 -11.529 < 2e-16 ***
## HTGC -0.015730 0.001847 -8.519 < 2e-16 ***
## ATGC 0.013373 0.001861 7.185 7.44e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.24 on 6835 degrees of freedom
## Multiple R-squared: 0.0881, Adjusted R-squared: 0.08757
## F-statistic: 165.1 on 4 and 6835 DF, p-value: < 2.2e-16
Residual plots
# Draw the four default lm diagnostic plots together in one 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so that
# later single-panel plots are not squeezed into the leftover grid layout.
old_par <- par(mfrow = c(2, 2))
plot(lm_model)
par(old_par)

Checking non-linearity of the model
# Residuals vs. Fitted (diagnostic panel 1 of plot.lm).
plot(x = lm_model, which = 1L)

We can see that residuals are scattered in a linear pattern.
Checking for heteroscedasticity
# Scale-Location plot (diagnostic panel 3 of plot.lm): square root of the
# absolute standardized residuals against the fitted values. A roughly flat
# trend suggests constant variance; a rising or falling trend suggests
# heteroscedasticity. Panel 1 (Residuals vs. Fitted) is already shown in the
# non-linearity check, so it is not repeated here.
plot(lm_model, which = 3)

Checking if there are any influential outliers
# influence() returns the components hat, coefficients, sigma and wt.res; it
# has no `cooks.distance` element, so `influence_obj$cooks.distance` is NULL and
# a check based on it can never fire. Compute the distances with
# cooks.distance() instead, reusing the influence object for the hat values.
influence_obj <- influence(lm_model)
cooks_distance <- cooks.distance(lm_model, infl = influence_obj)

# Rule-of-thumb cutoff: flag observations whose distance exceeds 4 / n.
cooks_cutoff <- 4 / length(cooks_distance)
# which() drops NA/NaN distances instead of propagating them into the test.
influential_obs <- which(cooks_distance > cooks_cutoff)

if (length(influential_obs) > 0) {
  cat("Influential Outliers Detected (Cook's Distance)\n")
  cat("Observations above the 4/n cutoff:", length(influential_obs), "\n")
} else {
  cat("No observations exceed the 4/n Cook's distance cutoff.\n")
}

# Show only the largest distances rather than one value per observation.
head(sort(cooks_distance, decreasing = TRUE), n = 10)
## NULL
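No influential outliers were flagged, but this result is inconclusive: the `NULL` printed above shows that no Cook's distances were actually computed, because the object returned by `influence()` has no `cooks.distance` component. The distances have to be obtained with `cooks.distance(lm_model)` before any conclusion about influential outliers can be drawn.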
Coefficient summary
# Keep the model summary for the later interpretation step and print its
# coefficient table (estimate, standard error, t value and p-value per term).
coeff_summary <- summary(object = lm_model)
print(coeff_summary[["coefficients"]])
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.46553099 0.028949659 50.623429 0.000000e+00
## HTGS 0.02332313 0.001587859 14.688418 4.116089e-48
## ATGS -0.01826527 0.001584294 -11.528963 1.801378e-30
## HTGC -0.01573029 0.001846583 -8.518593 1.965715e-17
## ATGC 0.01337301 0.001861277 7.184861 7.440387e-13
Interpretation of the coefficient -HTGS
# Pull the HTGS row (estimate, standard error, t value, p-value) out of the
# coefficient table and report significance at the 5% level, followed by the
# direction of the association when the term is significant.
coefficient_of_interest <- coeff_summary[["coefficients"]]["HTGS", ]

if (coefficient_of_interest[["Pr(>|t|)"]] >= 0.05) {
  cat("The coefficient for HTGS is not statistically significant.\n")
} else {
  cat("The coefficient for HTGS is statistically significant.\n")
  # The sign of the estimate decides which direction is reported.
  if (coefficient_of_interest[["Estimate"]] > 0) {
    cat("A one-unit increase in HTGS is associated with an increase in FTHG.\n")
  } else {
    cat("A one-unit increase in HTGS is associated with a decrease in FTHG.\n")
  }
}
## The coefficient for HTGS is statistically significant.
## A one-unit increase in HTGS is associated with an increase in FTHG.
The coefficient for “HTGS” is statistically significant (p < 0.05).
Holding ATGS, HTGC and ATGC constant, a one-unit increase in “HTGS” is
associated with an increase of about 0.023 in the expected number of goals
scored by the home team (FTHG).