week12

# Load the cleaned ball-by-ball data set from the local Downloads folder
my_data <- read.csv(file = 'C:/Users/dell/Downloads/Cleaned_Ball_By_Ball.csv')

# MatchDateSK holds dates written as YYYYMMDD; render the values as text and
# parse them into a proper Date column
my_data[["MatchDateSK"]] <- as.Date(
  as.character(my_data[["MatchDateSK"]]),
  format = "%Y%m%d"
)

# Inspect the column types to verify that MatchDateSK is now a Date
str(my_data)

## 'data.frame':    150451 obs. of  46 variables:
##  $ MatcH_id                : int  598028 598028 598028 598028 598028 598028 598028 598028 598028 598028 ...
##  $ Over_id                 : int  15 14 14 14 14 14 14 13 13 13 ...
##  $ Ball_id                 : int  6 1 2 3 4 5 6 1 2 3 ...
##  $ Innings_No              : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Team_Batting            : chr  "5" "5" "5" "5" ...
##  $ Team_Bowling            : chr  "2" "2" "2" "2" ...
##  $ Striker_Batting_Position: num  6 5 3 5 3 3 3 5 3 3 ...
##  $ Extra_Type              : chr  "No Extras" "No Extras" "No Extras" "No Extras" ...
##  $ Runs_Scored             : int  4 1 1 1 0 4 2 1 4 1 ...
##  $ Extra_runs              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Wides                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Legbyes                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Byes                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Noballs                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Penalty                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bowler_Extras           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Out_type                : chr  "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
##  $ Caught                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bowled                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Run_out                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ LBW                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Retired_hurt            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Stumped                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ caught_and_bowled       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hit_wicket              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ObstructingFeild        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Bowler_Wicket           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Match_Date              : chr  "4/20/2013" "4/20/2013" "4/20/2013" "4/20/2013" ...
##  $ Season                  : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ Striker                 : int  277 104 6 104 6 6 6 104 6 6 ...
##  $ Non_Striker             : int  104 6 104 6 104 104 104 6 104 104 ...
##  $ Bowler                  : int  83 346 346 346 346 346 346 83 83 83 ...
##  $ Striker_match_SK        : int  20336 20333 20328 20333 20328 20328 20328 20333 20328 20328 ...
##  $ StrikerSK               : int  276 103 5 103 5 5 5 103 5 5 ...
##  $ NonStriker_match_SK     : int  20333 20328 20333 20328 20333 20333 20333 20328 20333 20333 ...
##  $ NONStriker_SK           : int  103 5 103 5 103 103 103 5 103 103 ...
##  $ Fielder_match_SK        : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ Fielder_SK              : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ Bowler_match_SK         : int  20343 20348 20348 20348 20348 20348 20348 20343 20343 20343 ...
##  $ BOWLER_SK               : int  82 345 345 345 345 345 345 82 82 82 ...
##  $ PlayerOut_match_SK      : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ BattingTeam_SK          : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ BowlingTeam_SK          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Keeper_Catch            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Player_out_sk           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MatchDateSK             : Date, format: "2013-04-20" "2013-04-20" ...

library(tsibble)

## Warning: package 'tsibble' was built under R version 4.3.2

## 
## Attaching package: 'tsibble'

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(ggplot2)

Choose a column of data to analyze over time. This should be a “response-like” variable that is of particular interest

# Attach a generated daily index: one consecutive calendar day per row,
# starting on 1989-01-01. These dates are produced by seq(), they are not
# taken from the match dates in the data.
my_data[["date_col"]] <- seq(
  from = as.Date("1989-01-01"),
  by = "days",
  length.out = nrow(my_data)
)

I chose Extra_runs as the response variable, as it directly reflects performance in the game. I will use the date_col column for the time aspect.

Creating a Tsibble Object

Creating a tsibble object using date_col as the index and Extra_runs as the response variable.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()       masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag()          masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Keep only the time index and the response variable, then build a tsibble
# that uses date_col as its index
cricket_tsibble <- as_tsibble(
  select(my_data, date_col, Extra_runs),
  index = date_col
)

# Line plot of Extra_runs against the date index.
# The labels name the variable that is actually plotted (extra runs).
ggplot(cricket_tsibble, aes(x = date_col, y = Extra_runs)) +
  geom_line() +
  labs(title = "Extra Runs Over Time",
       x = "Date",
       y = "Extra Runs") +
  theme_minimal()

# Roll the daily series up to calendar months: index_by() groups the rows by
# the month that floor_date() assigns to each date, and summarise() totals
# Extra_runs within each month
cricket_monthly <- summarise(
  index_by(cricket_tsibble, Month = ~ floor_date(.x, "month")),
  Total_Runs = sum(Extra_runs)
)

# Line plot of the monthly totals. Total_Runs is the per-month sum of
# Extra_runs, so the labels say "Extra Runs" rather than just "Runs".
ggplot(cricket_monthly, aes(x = Month, y = Total_Runs)) +
  geom_line() +
  labs(title = "Monthly Total Extra Runs",
       x = "Month",
       y = "Total Extra Runs") +
  theme_minimal()

## Insights: This analysis helps to identify significant trends or patterns in Extra_runs over different time periods.

Use linear regression to detect any upwards or downwards trends.

Do you need to subset the data for multiple trends? How strong are these trends?

# Trend detection with linear regression on the full series.
# How to read the fit:
#   - the sign of the date_col coefficient gives the direction of the trend
#     (positive = upward, negative = downward), provided it is significant;
#   - the R-squared value measures its strength (higher = stronger trend).
# A Year column is also derived so the data can be subset and the trend
# analyzed for each year separately.

# Calendar year of every observation (date_col is assumed to be a Date)
cricket_tsibble$Year <- lubridate::year(cricket_tsibble$date_col)

# Regress Extra_runs on the date index over the entire dataset
model <- lm(Extra_runs ~ date_col, data = cricket_tsibble)
model %>%
  summary() %>%
  print()

## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = cricket_tsibble)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.0721 -0.0705 -0.0688 -0.0672  4.9341 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.241e-02  1.927e-03  37.582   <2e-16 ***
## date_col    -4.160e-08  2.073e-08  -2.007   0.0448 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3493 on 150449 degrees of freedom
## Multiple R-squared:  2.677e-05,  Adjusted R-squared:  2.012e-05 
## F-statistic: 4.027 on 1 and 150449 DF,  p-value: 0.04477

# Analyze trends for each year separately, limited to the first 10 distinct
# years in order of appearance
unique_years <- unique(cricket_tsibble$Year)

# The loop variable is deliberately not called `year`: that name would shadow
# lubridate::year() and differ from the Year column only by case inside
# filter(). head() caps the iteration at 10 years without a manual counter.
for (current_year in head(unique_years, 10)) {
  # Rows belonging to the year being examined
  subset_data <- filter(cricket_tsibble, Year == current_year)

  # Same linear trend model as for the full series, fitted on this year only
  subset_model <- lm(Extra_runs ~ date_col, data = subset_data)

  print(paste("Year:", current_year))
  print(summary(subset_model))
}

## [1] "Year: 1989"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11134 -0.08058 -0.04982 -0.02062  1.93185 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -2.1579619  0.8411301  -2.566  0.01070 * 
## date_col     0.0003107  0.0001181   2.631  0.00888 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2377 on 363 degrees of freedom
## Multiple R-squared:  0.01871,    Adjusted R-squared:  0.01601 
## F-statistic: 6.922 on 1 and 363 DF,  p-value: 0.008877
## 
## [1] "Year: 1990"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06822 -0.06390 -0.05966 -0.05564  2.94558 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.668e-01  1.008e+00  -0.265    0.791
## date_col     4.368e-05  1.346e-04   0.324    0.746
## 
## Residual standard error: 0.271 on 363 degrees of freedom
## Multiple R-squared:  0.0002898,  Adjusted R-squared:  -0.002464 
## F-statistic: 0.1052 on 1 and 363 DF,  p-value: 0.7458
## 
## [1] "Year: 1991"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1042 -0.0969 -0.0893 -0.0820  4.9073 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.064e-01  1.761e+00  -0.288    0.774
## date_col     7.601e-05  2.243e-04   0.339    0.735
## 
## Residual standard error: 0.4515 on 363 degrees of freedom
## Multiple R-squared:  0.0003163,  Adjusted R-squared:  -0.002438 
## F-statistic: 0.1148 on 1 and 363 DF,  p-value: 0.7349
## 
## [1] "Year: 1992"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04458 -0.04270 -0.04079 -0.03893  0.96257 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.209e-01  8.082e-01   -0.15    0.881
## date_col     1.970e-05  9.835e-05    0.20    0.841
## 
## Residual standard error: 0.1988 on 364 degrees of freedom
## Multiple R-squared:  0.0001103,  Adjusted R-squared:  -0.002637 
## F-statistic: 0.04014 on 1 and 364 DF,  p-value: 0.8413
## 
## [1] "Year: 1993"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06706 -0.05923 -0.05123 -0.04315  1.93715 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.554e-01  1.000e+00  -0.655    0.513
## date_col     8.242e-05  1.166e-04   0.707    0.480
## 
## Residual standard error: 0.2346 on 363 degrees of freedom
## Multiple R-squared:  0.001376,   Adjusted R-squared:  -0.001375 
## F-statistic: 0.5001 on 1 and 363 DF,  p-value: 0.4799
## 
## [1] "Year: 1994"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.0936 -0.0785 -0.0636 -0.0489  4.9332 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.4348169  1.5179755   0.945    0.345
## date_col    -0.0001530  0.0001696  -0.902    0.368
## 
## Residual standard error: 0.3415 on 363 degrees of freedom
## Multiple R-squared:  0.002236,   Adjusted R-squared:  -0.0005125 
## F-statistic: 0.8135 on 1 and 363 DF,  p-value: 0.3677
## 
## [1] "Year: 1995"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.09472 -0.07598 -0.05743 -0.03926  0.97304 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  1.8230246  1.1002688   1.657   0.0984 .
## date_col    -0.0001893  0.0001181  -1.602   0.1100  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2378 on 363 degrees of freedom
## Multiple R-squared:  0.007022,   Adjusted R-squared:  0.004287 
## F-statistic: 2.567 on 1 and 363 DF,  p-value: 0.11
## 
## [1] "Year: 1996"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.06970 -0.06601 -0.06239 -0.05870  1.94214 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.008e-01  1.269e+00  -0.237    0.813
## date_col     3.757e-05  1.311e-04   0.287    0.775
## 
## Residual standard error: 0.2649 on 364 degrees of freedom
## Multiple R-squared:  0.0002257,  Adjusted R-squared:  -0.002521 
## F-statistic: 0.08217 on 1 and 364 DF,  p-value: 0.7745
## 
## [1] "Year: 1997"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.0899 -0.0887 -0.0875 -0.0863  4.9121 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.116e-01  1.981e+00   0.107    0.915
## date_col    -1.234e-05  1.972e-04  -0.063    0.950
## 
## Residual standard error: 0.3969 on 363 degrees of freedom
## Multiple R-squared:  1.079e-05,  Adjusted R-squared:  -0.002744 
## F-statistic: 0.003916 on 1 and 363 DF,  p-value: 0.9501
## 
## [1] "Year: 1998"
## 
## Call:
## lm(formula = Extra_runs ~ date_col, data = subset_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05722 -0.05595 -0.05462 -0.05338  1.94663 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.935e-01  1.241e+00   0.156    0.876
## date_col    -1.333e-05  1.192e-04  -0.112    0.911
## 
## Residual standard error: 0.24 on 363 degrees of freedom
## Multiple R-squared:  3.442e-05,  Adjusted R-squared:  -0.00272 
## F-statistic: 0.0125 on 1 and 363 DF,  p-value: 0.9111

Insights

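These results suggest that while there may be a slight upward trend in Extra_runs in 1989 (p ≈ 0.009), this trend does not persist in the following years: none of the slopes for 1990–1998 is statistically significant. The very low R-squared values (below 0.02 in every fit) indicate that date_col explains almost none of the variation, so other factors, not included in the model, might be influencing Extra_runs.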

Use smoothing to detect at least one season in your data, and interpret your results.

Can you illustrate the seasonality using ACF or PACF? This code will add a moving average to your cricket_tsibble and plot it alongside the original Extra_runs data. The moving average can help smooth out short-term fluctuations and reveal longer-term trends or patterns, including seasonality.

library(zoo)

## Warning: package 'zoo' was built under R version 4.3.2

## 
## Attaching package: 'zoo'

## The following object is masked from 'package:tsibble':
## 
##     index

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

# Rolling mean of Extra_runs over a 30-observation window (the series has one
# observation per day, so this is a 30-day window); positions without a
# complete window are filled with NA
cricket_tsibble$Moving_Avg <- zoo::rollmean(
  x = cricket_tsibble$Extra_runs,
  k = 30,
  fill = NA
)

# Overlay the raw series (blue) and its moving average (red)
ggplot(data = cricket_tsibble, mapping = aes(x = date_col)) +
  geom_line(mapping = aes(y = Extra_runs), color = "blue") +
  geom_line(mapping = aes(y = Moving_Avg), color = "red") +
  labs(
    title = "Extra Runs with Moving Average",
    x = "Date",
    y = "Extra Runs"
  ) +
  theme_minimal()

## Warning: Removed 29 rows containing missing values (`geom_line()`).

ACF and PACF

ACF and PACF plots can help identify the presence of seasonality by showing autocorrelations at different lags.

# Autocorrelation of Extra_runs at successive lags
acf(x = cricket_tsibble$Extra_runs, main = "ACF for Extra Runs")

# Partial autocorrelation of Extra_runs at successive lags
pacf(x = cricket_tsibble$Extra_runs, main = "PACF for Extra Runs")

## Insights

Smoothing: The moving average plot shows the underlying trend in Extra_runs. If there were seasonality, a regular pattern would be visible in the moving average. No such pattern was observed, so there is no evidence of seasonality.

ACF Plot: Significant spikes in the ACF plot at specific lags can indicate seasonality. No such spikes were observed, hence no seasonality.

PACF Plot: This plot helps differentiate between the direct effect of past values and the indirect effect mediated by intervening values. Significant spikes at specific lags can also indicate seasonality.

Some spikes are visible between lags 0 and 20, but they indicate only a small or very weak effect.