Check autocorrelation

# load data
library(readr)
library(ggplot2)
winnings <- read_table("./winnings.txt", col_names = FALSE, locale = locale())

── Column specification ────────────────────────────────────────────────────────
cols(
  X1 = col_double(),
  X2 = col_double()
)
# add column names
colnames(winnings) <- c("blinds", "winnings")

head(winnings)
# A tibble: 6 × 2
  blinds winnings
   <dbl>    <dbl>
1      0       50
2      0     -100
3      0     -100
4      0     -100
5      0     -100
6      0      200
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
x <- winnings %>% filter(blinds==1) %>% pull(winnings)
x_ts <- ts(x)
head(x_ts)
Time Series:
Start = 1 
End = 6 
Frequency = 1 
[1] -200  -50  100  100  100  100

A plain time series

ggplot(data.frame(idx = seq_along(x), x), aes(idx, x)) + geom_line(color="lightblue") 

Tests

Autocorrelation

First order differencing

x_diff <- diff(x_ts, differences = 1L)
# convert the ts object to a plain numeric vector so ggplot can pick a continuous scale
ggplot(data.frame(idx = seq_along(x_diff), x_diff = as.numeric(x_diff)), aes(idx, x_diff)) + geom_line(color = "lightblue")

The differenced series looks essentially the same as the original.

library(forecast)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
ggAcf(x_ts, demean = TRUE, plot = TRUE, lag.max = 50)

ggAcf(x_ts, demean = TRUE, plot = TRUE, lag.max = 500)

There seems to be very little autocorrelation going on.
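
To put a rough number on "little", one quick check (not in the original analysis) is the largest absolute autocorrelation over the first 50 lags, compared against the approximate 95% white-noise band:

acf_vals <- acf(x_ts, lag.max = 50, plot = FALSE)$acf[-1]  # drop lag 0
max(abs(acf_vals))
# approximate 95% band for white noise
1.96 / sqrt(length(x_ts))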

ggPacf(x_ts, demean = TRUE, lag.max = 50)

ggPacf(x_ts, demean = TRUE, lag.max = 500)

Box.test(x_ts, type="Ljung-Box")

    Box-Ljung test

data:  x_ts
X-squared = 0.49103, df = 1, p-value = 0.4835

Since the p-value (≈ 0.48) is well above 0.05, we fail to reject the null hypothesis that the data are uncorrelated.
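
The default Box.test uses only one lag. As a quick robustness check (not in the original analysis), the same test can be run at a handful of longer lags:

sapply(c(5, 10, 20, 50),
       function(k) Box.test(x_ts, lag = k, type = "Ljung-Box")$p.value)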

Stationarity tests

library(tseries)
library(ggpubr)

Attaching package: 'ggpubr'
The following object is masked from 'package:forecast':

    gghistogram
adf.test(x_ts, alternative="stationary")
Warning in adf.test(x_ts, alternative = "stationary"): p-value smaller than
printed p-value

    Augmented Dickey-Fuller Test

data:  x_ts
Dickey-Fuller = -66.549, Lag order = 66, p-value = 0.01
alternative hypothesis: stationary
pp.test(x_ts)
Warning in pp.test(x_ts): p-value smaller than printed p-value

    Phillips-Perron Unit Root Test

data:  x_ts
Dickey-Fuller Z(alpha) = -297180, Truncation lag parameter = 29,
p-value = 0.01
alternative hypothesis: stationary
# adf.test will not report p-values below 0.01, so as an alternative unit-root test we use urca::ur.df and compare the test statistic against the tabulated (MacKinnon) critical values:

library(urca)
summary(ur.df(x_ts, type = "drift", lags = 0))

############################################### 
# Augmented Dickey-Fuller Test Unit Root Test # 
############################################### 

Test regression drift 


Call:
lm(formula = z.diff ~ z.lag.1 + 1)

Residuals:
     Min       1Q   Median       3Q      Max 
-20043.3   -317.1     82.1    235.2  20008.0 

Coefficients:
             Estimate Std. Error  t value Pr(>|t|)    
(Intercept) 17.640634   3.476800    5.074  3.9e-07 ***
z.lag.1     -0.998718   0.001829 -546.086  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1901 on 298975 degrees of freedom
Multiple R-squared:  0.4994,    Adjusted R-squared:  0.4994 
F-statistic: 2.982e+05 on 1 and 298975 DF,  p-value: < 2.2e-16


Value of test-statistic is: -546.0858 149104.8 

Critical values for test statistics: 
      1pct  5pct 10pct
tau2 -3.43 -2.86 -2.57
phi1  6.43  4.59  3.78

All tests point to stationarity: the tau2 statistic of -546.1 is far below even the 1% critical value of -3.43, so we firmly reject the unit root.
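
For reference, the test statistics and critical values can also be pulled straight off the ur.df object (a small sketch; df_drift is just a throwaway name used here):

df_drift <- ur.df(x_ts, type = "drift", lags = 0)
df_drift@teststat  # tau2 and phi1 statistics
df_drift@cval      # 1% / 5% / 10% critical values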

# cross-sectional distribution
# truncate outliers
keep         <- percent_rank(abs(x)) < 0.95
x_no_outliers <- x[keep]
ggplot(data.frame(x_no_outliers), aes(x_no_outliers)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 40, fill = "skyblue", colour = "black") +
  stat_function(fun  = dnorm,
                args = list(mean = mean(x_no_outliers), sd = sd(x_no_outliers)),
                linewidth = 0.9) + xlab("winnings") + theme_minimal()

So even without the outliers we can see we are pretty far from normal.

Normality

ggplot(data.frame(x), aes(sample = x)) +
  stat_qq(color = "blue") + stat_qq_line(color = "black") +
  labs(title = "Normal Q–Q plot") + theme_minimal()

The data are incredibly non-normal, with clear truncation. Even setting the truncation aside, the tails are extremely fat as the values approach the limits.
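
To quantify those tails (a small check not in the original analysis), the sample skewness and excess kurtosis can be computed directly; the Jarque-Bera test below is built from exactly these two moments:

z <- (x - mean(x)) / sd(x)
c(skewness = mean(z^3), excess_kurtosis = mean(z^4) - 3)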

jarque.bera.test(x)

    Jarque Bera Test

data:  x
X-squared = 58317164, df = 2, p-value < 2.2e-16

The p-value is well below the 1% level, so we reject H0: normality of the data.

Heteroskedasticity (constancy of variance over time)

library(lmtest)         
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
t_idx <- seq_along(x)
lm_mean <- lm(x ~ 1 + t_idx)  # simple linear-trend model for the mean

bptest(lm_mean)

    studentized Breusch-Pagan test

data:  lm_mean
BP = 0.61205, df = 1, p-value = 0.434

The null is homoskedasticity, so we fail to reject at the 5% level that the series is homoskedastic, at least under this simple trend model. What about just the first 500 samples?

x_first <- x[1:500]
t_idx_first <- seq_along(x_first)
lm_mean_first <- lm(x_first ~ 1 + t_idx_first)
bptest(lm_mean_first)

    studentized Breusch-Pagan test

data:  lm_mean_first
BP = 3.3544, df = 1, p-value = 0.06702

So the conclusion is pretty much unchanged: somewhat more suggestive of heteroskedasticity, but highly sample dependent, and other nearby cut-offs produce wide-ranging p-values, so nothing dramatic.
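
To illustrate that sample dependence, here is a small sweep over a few cut-offs (bp_head is a hypothetical helper name, not from the original analysis):

# BP p-value for the first n observations under the linear-trend mean model
bp_head <- function(n) {
  idx <- seq_len(n)
  bptest(lm(x[idx] ~ idx))$p.value
}
sapply(c(250, 500, 1000, 2500, 5000), bp_head)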

Note, however, that this only tests whether the variance drifts with time; it does not, for example, check whether the variance is higher than expected at very large winnings.
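
One complementary check in that direction (not part of the original analysis, and testing volatility clustering rather than variance versus the level of winnings) is a Ljung-Box test on the squared, demeaned series, in the spirit of the McLeod-Li / ARCH tests:

Box.test((x_ts - mean(x_ts))^2, lag = 10, type = "Ljung-Box")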

library(lmtest)

t_idx <- seq_along(x)
lm_mean <- lm(x ~ 1 + abs(x))  # mean of winnings as a function of their magnitude (not used below)
tab <- sort(table(x), decreasing = TRUE)
data.frame(tab)
         x  Freq
1      100 62500
2      200 23685
3      -50 19510
4     -300 14716
5     -200 12963
6      300 12606
7     -600 11717
8      250 11215
9      600 11157
10    -250  7316
11    -100  7154
12     400  6234
13    -400  5628
14    -900  5224
15    1200  4623
16     900  4528
17   -1200  4381
18     500  3661
19    1800  3638
20    -500  3610
21   -1800  3546
22    -750  3500
23     750  3026
24     350  2999
25       0  2792
26    -350  2668
27    -450  2615
28     450  2337
29    1500  2048
30    -375  1990
31     800  1839
32   -1500  1705
33     375  1679
34    -800  1621
35   -3600  1398
36    3600  1398
37   -2700  1303
38    1000  1191
39    2700  1155
40   -2400  1119
41    2400  1097
42   -1000  1042
43    2250   868
44  -20000   866
45   -2250   853
46   -1350   832
47    1350   772
48   20000   731
49    5400   725
50   -5400   689
51    -525   635
52   -1125   608
53    3000   601
54   -3000   568
55    1125   519
56    4500   516
57     525   499
58    -700   478
59   -4500   452
60   -1050   424
61     700   366
62    1600   299
63    1050   292
64    -675   276
65   -1600   272
66    -562   263
67   -7200   241
68    7200   235
69     675   216
70    2000   206
71     562   204
72    4800   192
73   -2000   180
74   -4050   178
75    8100   174
76  -10800   164
77    4050   161
78   -4800   160
79    6750   160
80   10800   159
81   -8100   147
82    9000   141
83   -6750   133
84   -3375   127
85    6000   125
86   -6000   108
87    3375   104
88   -1575    94
89   -9000    92
90   -2100    91
91    1575    85
92    2100    82
93    -787    81
94    1400    79
95   -1400    70
96     787    68
97   -3150    60
98   -2025    58
99    2025    52
100  13500    52
101 -13500    49
102   3200    49
103   2300    46
104   3150    39
105  -1687    33
106  16200    32
107   1686    31
108  -3200    29
109  -2300    29
110  -1686    29
111  -3450    26
112  -4000    25
113   1687    25
114   4000    23
115 -16200    22
116   4200    21
117  -1124    20
118   1124    17
119   3450    17
120   4600    16
121  12150    16
122  -4200    15
123  -2361    15
124 -12000    12
125   6075    11
126  -9600    10
127  -6900    10
128  -4600    10
129   2361    10
130 -10350     8
131    843     8
132   1574     8
133   5062     8
134   6900     8
135  10125     8
136  10200     8
137  12000     8
138  -6300     7
139   1012     7
140   2800     7
141   6300     7
142   6800     7
143   9600     7
144  14400     7
145 -18000     6
146  -6075     6
147  -4725     6
148  -2800     6
149  -1012     6
150 -14400     5
151 -12150     5
152 -10125     5
153  -6800     5
154  -5062     4
155  -3374     4
156  -1574     4
157   -843     4
158   5250     4
159  -9450     3
160  -2625     3
161  -1875     3
162   1180     3
163   2500     3
164   3700     3
165   4725     3
166   5061     3
167   7300     3
168  18000     3
169 -13800     2
170 -13600     2
171 -10200     2
172 -10124     2
173  -7500     2
174  -6400     2
175  -4900     2
176  -3700     2
177  -2530     2
178  -2500     2
179  -1180     2
180   3300     2
181   5000     2
182   6100     2
183   6150     2
184   6400     2
185   8000     2
186  10500     2
187 -16500     1
188 -16000     1
189 -15750     1
190 -10900     1
191 -10500     1
192  -9200     1
193  -8000     1
194  -7300     1
195  -5175     1
196  -5061     1
197  -4700     1
198  -3750     1
199  -2362     1
200   1875     1
201   2362     1
202   3500     1
203   3750     1
204   4900     1
205   9100     1
206   9450     1
207  10350     1
208  10900     1
209  13600     1
210  16300     1
211  19400     1

Note that by far the most common observations are +100, +200, -50, and -300. Is this surprising? Given that results are essentially generated in units of the 50 blind anyway, probably not.
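
A quick way to probe that intuition (a one-line check, not in the original analysis) is to look at what share of results are exact multiples of the 50 blind:

mean(x %% 50 == 0)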

x_small <- data.frame(x = x) |> filter(abs(x) <= 500)
ggplot(x_small, aes(x)) +
  geom_histogram(binwidth = 1,
                 boundary = 0,
                 fill = "steelblue", colour = "black") +
  scale_x_continuous(breaks = seq(min(x_small$x), max(x_small$x), by = 100)) +
  labs(title = "Fine-grained histogram (bin = 1)",
       y = "Count") +
  theme_minimal()

This shows the dramatic over-concentration of values at +100.
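
For completeness, that concentration can be expressed as a share of all hands (a one-line check, not in the original analysis):

mean(x == 100)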