Check autocorrelation

Table check autocorrelation

Libraries

# Libraries to load 

pkgs <- c(
  "here",
  "tidyverse",
  "forecast",
  "lmtest",
  "urca",
  "glue",
  "ggpubr",
  "tseries",
  "ggrepel",
  "ggsci",
  "cowplot",
  "knitr"
)
## --------------------------------------------------
##  install any that are missing (thanks chatgpt!)
## --------------------------------------------------
to_install <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(to_install)) install.packages(to_install)

## --------------------------------------------------
##  attach them; TRUE/FALSE invisibly indicates success
## --------------------------------------------------
invisible(lapply(pkgs, library, character.only = TRUE))
here() starts at /Users/bkaplowitz/Developer/work/poker
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Loading required package: zoo


Attaching package: 'zoo'


The following objects are masked from 'package:base':

    as.Date, as.Date.numeric



Attaching package: 'ggpubr'


The following object is masked from 'package:forecast':

    gghistogram



Attaching package: 'cowplot'


The following object is masked from 'package:ggpubr':

    get_legend


The following object is masked from 'package:lubridate':

    stamp

Config

setwd(here::here())

Load data

#load data

winnings <-read_table("winnings.txt", col_names = FALSE, locale = locale())

── Column specification ────────────────────────────────────────────────────────
cols(
  X1 = col_double(),
  X2 = col_double()
)
# add column names
colnames(winnings)<-c("blinds", "winnings")

head(winnings)
# A tibble: 6 × 2
  blinds winnings
   <dbl>    <dbl>
1      0       50
2      0     -100
3      0     -100
4      0     -100
5      0     -100
6      0      200
x <- winnings %>% filter(blinds==1) %>% pull(winnings)
x_ts <- ts(x)
head(x_ts)
Time Series:
Start = 1 
End = 6 
Frequency = 1 
[1] -200  -50  100  100  100  100

A plain timeseries

ggplot(data.frame(idx = seq_along(x), x), aes(idx, x)) + geom_line(color="lightblue") 

Tests

Autocorrelation

First order differencing

x_diff<- diff(x_ts, differences=1L)
ggplot(data.frame(idx = seq_along(x_diff), x_diff), aes(idx, x_diff)) + geom_line(color="lightblue") 
Don't know how to automatically pick scale for object of type <ts>. Defaulting
to continuous.

Looks pretty unchanged.

ggAcf(x_ts, demean = TRUE, plot=TRUE, lag=50)

ggAcf(x_ts, demean =TRUE, plot=TRUE, lag=500)

There seems to be little auto-regression going on.

ggPacf(x_ts, demean = TRUE, lag=50)

ggPacf(x_ts, demean=TRUE, lag=500)

Box.test(x_ts, type="Ljung-Box")

    Box-Ljung test

data:  x_ts
X-squared = 0.49103, df = 1, p-value = 0.4835

Since the test statistic is ~0.48 well above 0.05, we fail to reject the null hypothesis that the data is uncorrelated.

Stationary tests

adf.test(x_ts, alternative="stationary")
Warning in adf.test(x_ts, alternative = "stationary"): p-value smaller than
printed p-value

    Augmented Dickey-Fuller Test

data:  x_ts
Dickey-Fuller = -66.549, Lag order = 66, p-value = 0.01
alternative hypothesis: stationary
pp.test(x_ts)
Warning in pp.test(x_ts): p-value smaller than printed p-value

    Phillips-Perron Unit Root Test

data:  x_ts
Dickey-Fuller Z(alpha) = -297180, Truncation lag parameter = 29,
p-value = 0.01
alternative hypothesis: stationary
# ADF will not go lower than 0.01 in adf.test so we use mckannon test as alternative test of cointegration/unit roots:

summary(ur.df(x_ts, type = "drift", lags = 0))

############################################### 
# Augmented Dickey-Fuller Test Unit Root Test # 
############################################### 

Test regression drift 


Call:
lm(formula = z.diff ~ z.lag.1 + 1)

Residuals:
     Min       1Q   Median       3Q      Max 
-20043.3   -317.1     82.1    235.2  20008.0 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept) 17.640634   3.476800    5.074  3.9e-07 ***
z.lag.1     -0.998718   0.001829 -546.086  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1901 on 298975 degrees of freedom
Multiple R-squared:  0.4994,    Adjusted R-squared:  0.4994 
F-statistic: 2.982e+05 on 1 and 298975 DF,  p-value: < 2.2e-16


Value of test-statistic is: -546.0858 149104.8 

Critical values for test statistics: 
      1pct  5pct 10pct
tau2 -3.43 -2.86 -2.57
phi1  6.43  4.59  3.78

All saying stationary

Normality

# cross-sectional distribution
# truncate outliers
keep         <- percent_rank(abs(x)) < 0.95
x_no_outliers <- x[keep]
ggplot(data.frame(x_no_outliers), aes(x_no_outliers)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40, fill = "skyblue", colour = "black") +
  stat_function(fun  = dnorm,
                args = list(mean = mean(x_no_outliers), sd = sd(x_no_outliers)),
                linewidth = 0.9) + xlab("winnings") + theme_minimal()

So even without outliers we can see we are pretty far from what we might expect

ggplot(data.frame(x), aes(sample = x))+
  stat_qq(color="blue") + stat_qq_line(color="black") +
  labs(title = "Normal Q–Q plot") +theme_minimal()

… Data is incredible non-normal and clear truncation. Even without this, extreme fat tails of data close to limit.

jarque.bera.test(x)

    Jarque Bera Test

data:  x
X-squared = 58317164, df = 2, p-value < 2.2e-16

p-value well below 1% level, rejects H0: normality of data.

Heteroskedasticity (identicalness over time)

t_idx <- seq_along(x)              
lm_mean  <- lm(x ~ 1+ t_idx)          

bptest(lm_mean)

    studentized Breusch-Pagan test

data:  lm_mean
BP = 0.61205, df = 1, p-value = 0.434

null is homoskedasticity. Hence we fail to reject null at 5% significance that it is homoskedastic, at least under simple trend model. What about just for first 100 samples?

x_first<- x[1:500]
t_idx_first <- seq_along(x_first)
lm_mean_first <-(x_first ~1 + t_idx_first )
bptest(lm_mean_first)

    studentized Breusch-Pagan test

data:  lm_mean_first
BP = 3.3544, df = 1, p-value = 0.06702

So pretty much unchanged. More heteroskedastic, but highly sample dependent. Other nearby values produce pretty wide ranging p-values, so not crazy.

tab <- sort(table(x), decreasing = TRUE)
head(data.frame(tab))
     x  Freq
1  100 62500
2  200 23685
3  -50 19510
4 -300 14716
5 -200 12963
6  300 12606

Note that by far most observations are +100 +200 -50 -300. Is this surprising? In that we cannot generate values off of 50 anyway, no?

x_small <- data.frame(x=x) |> filter(abs(x) <= 500)
ggplot(x_small, aes(x)) +
  geom_histogram(binwidth = 1,  
                 boundary = 0,
                 fill = "steelblue", colour = "black") +
  scale_x_continuous(breaks = seq(min(x), max(x), by = 100)) +
  labs(title = "Fine-grained histogram (bin = 1)",
       y = "Count",x = "winnings") + 
  theme_minimal()

This shows the dramatic over-concentration of values at 100

Other analysis

# fraction of values much larger than +/-5000
 large_values<-tibble(x) %>% filter(abs(x)>= 2000) 
 fraction_large<-count(large_values)/length(x) * 100
 str_glue('As big blind we get least +/-2000 winnings {fraction_large}% of the time')
As big blind we get least +/-2000 winnings 6.24828582705082% of the time
 x_hundred <- tibble(x) %>% filter(x==100)
 fraction_hundred <-count(x_hundred)/length(x) * 100
 str_glue('As big blind we make opponent fold: {fraction_hundred}% of the time.')
As big blind we make opponent fold: 20.9045481607342% of the time.