# Load the cleaned ball-by-ball data set (the path is specific to the author's machine).
my_data <- read.csv(file = 'C:/Users/dell/Downloads/Cleaned_Ball_By_Ball.csv')
# MatchDateSK arrives as a YYYYMMDD integer: render it as text, then parse
# that text into a proper Date column.
my_data[["MatchDateSK"]] <- as.Date(
  as.character(my_data[["MatchDateSK"]]),
  format = "%Y%m%d"
)
# Print the column types to verify that the conversion took effect.
str(my_data)
## 'data.frame': 150451 obs. of 46 variables:
## $ MatcH_id : int 598028 598028 598028 598028 598028 598028 598028 598028 598028 598028 ...
## $ Over_id : int 15 14 14 14 14 14 14 13 13 13 ...
## $ Ball_id : int 6 1 2 3 4 5 6 1 2 3 ...
## $ Innings_No : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Team_Batting : chr "5" "5" "5" "5" ...
## $ Team_Bowling : chr "2" "2" "2" "2" ...
## $ Striker_Batting_Position: num 6 5 3 5 3 3 3 5 3 3 ...
## $ Extra_Type : chr "No Extras" "No Extras" "No Extras" "No Extras" ...
## $ Runs_Scored : int 4 1 1 1 0 4 2 1 4 1 ...
## $ Extra_runs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Wides : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Legbyes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Byes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Noballs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Penalty : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bowler_Extras : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Out_type : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
## $ Caught : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bowled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Run_out : int 0 0 0 0 0 0 0 0 0 0 ...
## $ LBW : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Retired_hurt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Stumped : int 0 0 0 0 0 0 0 0 0 0 ...
## $ caught_and_bowled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hit_wicket : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ObstructingFeild : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bowler_Wicket : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Match_Date : chr "4/20/2013" "4/20/2013" "4/20/2013" "4/20/2013" ...
## $ Season : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ Striker : int 277 104 6 104 6 6 6 104 6 6 ...
## $ Non_Striker : int 104 6 104 6 104 104 104 6 104 104 ...
## $ Bowler : int 83 346 346 346 346 346 346 83 83 83 ...
## $ Striker_match_SK : int 20336 20333 20328 20333 20328 20328 20328 20333 20328 20328 ...
## $ StrikerSK : int 276 103 5 103 5 5 5 103 5 5 ...
## $ NonStriker_match_SK : int 20333 20328 20333 20328 20333 20333 20333 20328 20333 20333 ...
## $ NONStriker_SK : int 103 5 103 5 103 103 103 5 103 103 ...
## $ Fielder_match_SK : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ Fielder_SK : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ Bowler_match_SK : int 20343 20348 20348 20348 20348 20348 20348 20343 20343 20343 ...
## $ BOWLER_SK : int 82 345 345 345 345 345 345 82 82 82 ...
## $ PlayerOut_match_SK : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ BattingTeam_SK : int 4 4 4 4 4 4 4 4 4 4 ...
## $ BowlingTeam_SK : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Keeper_Catch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Player_out_sk : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MatchDateSK : Date, format: "2013-04-20" "2013-04-20" ...
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(ggplot2)
# Synthetic time index: one consecutive calendar day per row, starting at
# 1989-01-01. It is generated from the row count only and is not derived
# from MatchDateSK.
my_data[["date_col"]] <- seq(
  from = as.Date("1989-01-01"),
  by = "days",
  length.out = nrow(my_data)
)
I chose Extra_runs as the response variable because it directly reflects performance in the game. The date_col column is used as the time index.
Creating a Tsibble Object
Creating a tsibble object using date_col as the index and Extra_runs as the response variable.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Keep only the time index and the response variable, then promote the
# result to a tsibble indexed by date_col.
cricket_tsibble <- as_tsibble(
  select(my_data, date_col, Extra_runs),
  index = date_col
)
# Line chart of the per-row Extra_runs series against the date_col index.
# The title names the variable actually plotted (Extra_runs) rather than
# runs scored.
ggplot(cricket_tsibble, aes(x = date_col, y = Extra_runs)) +
  geom_line() +
  labs(
    title = "Extra Runs Over Time",
    x = "Date",
    y = "Extra Runs"
  ) +
  theme_minimal()
# Roll the series up to calendar months: every index value is floored to its
# month and Extra_runs is summed within each month.
cricket_monthly <- summarise(
  index_by(cricket_tsibble, Month = ~ floor_date(.x, "month")),
  Total_Runs = sum(Extra_runs)
)
# Line chart of the monthly aggregate. Total_Runs holds the monthly sum of
# Extra_runs, so the title and axis label say "Extra Runs" explicitly.
ggplot(cricket_monthly, aes(x = Month, y = Total_Runs)) +
  geom_line() +
  labs(
    title = "Monthly Total Extra Runs",
    x = "Month",
    y = "Total Extra Runs"
  ) +
  theme_minimal()
## Insights: This analysis helps identify significant trends or
patterns in Extra Runs over different time periods.
Do you need to subset the data for multiple trends? How strong are these trends?
# Trend analysis with a linear model of Extra_runs on date_col.
# - Direction: the sign of the date_col coefficient (positive = upward trend,
#   negative = downward trend), provided the coefficient is significant.
# - Strength: the R-squared value; a higher R-squared means a stronger trend.
# - Subsetting: to study each year separately, a Year column is derived from
#   date_col (which must be of Date type).

# Derive the calendar year of every observation.
cricket_tsibble$Year <- lubridate::year(cricket_tsibble$date_col)

# Overall trend: fit the model on the complete data set and print its summary.
model <- lm(Extra_runs ~ date_col, data = cricket_tsibble)
model %>%
  summary() %>%
  print()
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = cricket_tsibble)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0721 -0.0705 -0.0688 -0.0672 4.9341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.241e-02 1.927e-03 37.582 <2e-16 ***
## date_col -4.160e-08 2.073e-08 -2.007 0.0448 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3493 on 150449 degrees of freedom
## Multiple R-squared: 2.677e-05, Adjusted R-squared: 2.012e-05
## F-statistic: 4.027 on 1 and 150449 DF, p-value: 0.04477
# Analyze trends for each year separately, limited to the first 10 distinct
# years so the report stays readable.
unique_years <- unique(cricket_tsibble$Year)
# The loop variable is called `yr` (not `year`) so it neither shadows
# lubridate::year() nor leaves a misleading global behind; head() replaces
# the manual counter/break bookkeeping.
for (yr in head(unique_years, 10)) {
  # Restrict the series to the current year and fit the same linear trend
  # model that is used for the full data set.
  subset_data <- filter(cricket_tsibble, Year == yr)
  subset_model <- lm(Extra_runs ~ date_col, data = subset_data)
  print(paste("Year:", yr))
  print(summary(subset_model))
}
## [1] "Year: 1989"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11134 -0.08058 -0.04982 -0.02062 1.93185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.1579619 0.8411301 -2.566 0.01070 *
## date_col 0.0003107 0.0001181 2.631 0.00888 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2377 on 363 degrees of freedom
## Multiple R-squared: 0.01871, Adjusted R-squared: 0.01601
## F-statistic: 6.922 on 1 and 363 DF, p-value: 0.008877
##
## [1] "Year: 1990"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06822 -0.06390 -0.05966 -0.05564 2.94558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.668e-01 1.008e+00 -0.265 0.791
## date_col 4.368e-05 1.346e-04 0.324 0.746
##
## Residual standard error: 0.271 on 363 degrees of freedom
## Multiple R-squared: 0.0002898, Adjusted R-squared: -0.002464
## F-statistic: 0.1052 on 1 and 363 DF, p-value: 0.7458
##
## [1] "Year: 1991"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1042 -0.0969 -0.0893 -0.0820 4.9073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.064e-01 1.761e+00 -0.288 0.774
## date_col 7.601e-05 2.243e-04 0.339 0.735
##
## Residual standard error: 0.4515 on 363 degrees of freedom
## Multiple R-squared: 0.0003163, Adjusted R-squared: -0.002438
## F-statistic: 0.1148 on 1 and 363 DF, p-value: 0.7349
##
## [1] "Year: 1992"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04458 -0.04270 -0.04079 -0.03893 0.96257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.209e-01 8.082e-01 -0.15 0.881
## date_col 1.970e-05 9.835e-05 0.20 0.841
##
## Residual standard error: 0.1988 on 364 degrees of freedom
## Multiple R-squared: 0.0001103, Adjusted R-squared: -0.002637
## F-statistic: 0.04014 on 1 and 364 DF, p-value: 0.8413
##
## [1] "Year: 1993"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06706 -0.05923 -0.05123 -0.04315 1.93715
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.554e-01 1.000e+00 -0.655 0.513
## date_col 8.242e-05 1.166e-04 0.707 0.480
##
## Residual standard error: 0.2346 on 363 degrees of freedom
## Multiple R-squared: 0.001376, Adjusted R-squared: -0.001375
## F-statistic: 0.5001 on 1 and 363 DF, p-value: 0.4799
##
## [1] "Year: 1994"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0936 -0.0785 -0.0636 -0.0489 4.9332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4348169 1.5179755 0.945 0.345
## date_col -0.0001530 0.0001696 -0.902 0.368
##
## Residual standard error: 0.3415 on 363 degrees of freedom
## Multiple R-squared: 0.002236, Adjusted R-squared: -0.0005125
## F-statistic: 0.8135 on 1 and 363 DF, p-value: 0.3677
##
## [1] "Year: 1995"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.09472 -0.07598 -0.05743 -0.03926 0.97304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8230246 1.1002688 1.657 0.0984 .
## date_col -0.0001893 0.0001181 -1.602 0.1100
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2378 on 363 degrees of freedom
## Multiple R-squared: 0.007022, Adjusted R-squared: 0.004287
## F-statistic: 2.567 on 1 and 363 DF, p-value: 0.11
##
## [1] "Year: 1996"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.06970 -0.06601 -0.06239 -0.05870 1.94214
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.008e-01 1.269e+00 -0.237 0.813
## date_col 3.757e-05 1.311e-04 0.287 0.775
##
## Residual standard error: 0.2649 on 364 degrees of freedom
## Multiple R-squared: 0.0002257, Adjusted R-squared: -0.002521
## F-statistic: 0.08217 on 1 and 364 DF, p-value: 0.7745
##
## [1] "Year: 1997"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0899 -0.0887 -0.0875 -0.0863 4.9121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.116e-01 1.981e+00 0.107 0.915
## date_col -1.234e-05 1.972e-04 -0.063 0.950
##
## Residual standard error: 0.3969 on 363 degrees of freedom
## Multiple R-squared: 1.079e-05, Adjusted R-squared: -0.002744
## F-statistic: 0.003916 on 1 and 363 DF, p-value: 0.9501
##
## [1] "Year: 1998"
##
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.05722 -0.05595 -0.05462 -0.05338 1.94663
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.935e-01 1.241e+00 0.156 0.876
## date_col -1.333e-05 1.192e-04 -0.112 0.911
##
## Residual standard error: 0.24 on 363 degrees of freedom
## Multiple R-squared: 3.442e-05, Adjusted R-squared: -0.00272
## F-statistic: 0.0125 on 1 and 363 DF, p-value: 0.9111
These results suggest that while there may be a slight upward trend in Extra_runs in 1989, this trend does not persist in the following years. The low R-squared values in most cases indicate that other factors, not included in the model, might be influencing Extra_runs.
Can you illustrate the seasonality using ACF or PACF? This code will add a moving average to your cricket_tsibble and plot it alongside the original Extra_runs data. The moving average can help smooth out short-term fluctuations and reveal longer-term trends or patterns, including seasonality.
library(zoo)
## Warning: package 'zoo' was built under R version 4.3.2
##
## Attaching package: 'zoo'
## The following object is masked from 'package:tsibble':
##
## index
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Smooth the series with a rolling mean over a window of 30 observations
# (one observation per day in date_col); positions where the window is
# incomplete are filled with NA.
cricket_tsibble$Moving_Avg <- rollmean(
  cricket_tsibble$Extra_runs,
  k = 30,
  fill = NA
)
# Overlay the smoothed series (red) on the raw series (blue).
ggplot(cricket_tsibble, aes(x = date_col)) +
  geom_line(aes(y = Extra_runs), color = "blue") +
  geom_line(aes(y = Moving_Avg), color = "red") +
  ggtitle("Extra Runs with Moving Average") +
  xlab("Date") +
  ylab("Extra Runs") +
  theme_minimal()
## Warning: Removed 29 rows containing missing values (`geom_line()`).
ACF and PACF plots can help identify the presence of seasonality by showing autocorrelations at different lags.
# Autocorrelation function of the Extra_runs series
acf(x = cricket_tsibble[["Extra_runs"]], main = "ACF for Extra Runs")
# Partial autocorrelation function of the same series
pacf(x = cricket_tsibble[["Extra_runs"]], main = "PACF for Extra Runs")
## Insights
Smoothing: The moving average plot shows the underlying trend in Extra_runs. If there were seasonality, a regular pattern would appear in the moving average. No such pattern is observed, so there is no evidence of seasonality.
ACF Plot: Significant spikes in the ACF plot at specific lags can indicate seasonality. No such spikes are observed, hence no seasonality. PACF Plot: This plot helps differentiate between the direct effect of past values and the indirect effect mediated by intervening values. Significant spikes at specific lags can also indicate seasonality.
Some spikes are visible between lags 0 and 20, but they indicate only a very weak effect.