library(formatR)
Or
raises/ total hands played
## Warning: package 'XML' was built under R version 3.4.4
## Loading required package: RPostgreSQL
## Loading required package: DBI
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
tryCatch({
drv <- dbDriver("PostgreSQL")
print("Connecting to database")
conn <- con
print("Connected!")
}, error = function(cond)
{
print("Unable to connect to database.")
})
## [1] "Connecting to database"
## [1] "Connected!"
# query 6 has sn's
query_6 <- dbGetQuery(conn, "SELECT * FROM players")
# query 7 has stats
query_7 <- dbGetQuery(conn, "SELECT * FROM compiledplayerresults limit 1000000")
# Combine query 6 and 7 A
all_players <- query_6 %>%
select(., c(playername, player_id, cashhands))
player_ids <- all_players$player_id
all_player_stats <- query_7 %>%
filter(., player_id %in% player_ids)
all_players_db <- merge(all_player_stats, all_players, by = "player_id",
all = TRUE)
# Choosen columns
columns_for_analysis <- c("gametype_id", "player_id", "totalhands", "totalbbswon",
"totalrakeincents", "totalamountwonincents", "vpiphands", "pfrhands",
"couldcoldcall", "didcoldcall", "couldthreebet", "didthreebet", "couldsqueeze",
"didsqueeze", "facingtwopreflopraisers", "calledtwopreflopraisers",
"raisedtwopreflopraisers", "smallblindstealattempted", "smallblindstealdefended",
"smallblindstealreraised", "bigblindstealattempted", "bigblindstealdefended",
"bigblindstealreraised", "facedthreebetpreflop", "foldedtothreebetpreflop",
"calledthreebetpreflop", "raisedthreebetpreflop", "facedfourbetpreflop",
"foldedtofourbetpreflop", "calledfourbetpreflop", "raisedfourbetpreflop",
"sawflop", "wonhandwhensawflop", "sawshowdown", "wonshowdown", "flopcontinuationbetpossible",
"flopcontinuationbetmade", "turncontinuationbetpossible", "turncontinuationbetmade",
"rivercontinuationbetpossible", "rivercontinuationbetmade", "facingflopcontinuationbet",
"foldedtoflopcontinuationbet", "calledflopcontinuationbet", "raisedflopcontinuationbet",
"facingturncontinuationbet", "foldedtoturncontinuationbet", "calledturncontinuationbet",
"raisedturncontinuationbet", "facingrivercontinuationbet", "foldedtorivercontinuationbet",
"calledrivercontinuationbet", "raisedrivercontinuationbet", "playername",
"cashhands")
predictors <- c("vpip", "pfr", "threebet", "bb_per_100", "wwsf")
indexes <- c((1:31), 65)
# Create db with custom stats
all_players_finished_db <- all_players_db %>%
filter(., gametype_id %in% indexes) %>%
select(c(columns_for_analysis)) %>%
group_by(playername) %>%
summarize(vpip = round(sum(vpiphands)/sum(totalhands) * 100, 1),
pfr = round(sum(pfrhands)/sum(totalhands) * 100, 1),
total_hands = sum(totalhands),
money_won = sum(totalamountwonincents),
rake = sum(totalrakeincents),
threebet = round(sum(didthreebet)/sum(couldthreebet) * 100, 1),
bb_per_100 = round(sum(totalbbswon)/(sum(totalhands)), 2),
total_rake_100 = round((sum(totalrakeincents)/100)/sum(total_hands), 2),
money_won_100 = round((sum(totalamountwonincents)/100)/(sum(total_hands)/100)/100, 2),
wwsf = round(sum(wonhandwhensawflop)/sum(sawflop) * 100, 1))
# Print out of my statistics
all_players_finished_db %>% filter(playername == "RileyFreeman")
## # A tibble: 1 x 11
## playername vpip pfr total_hands money_won rake threebet bb_per_100
## <chr> <dbl> <dbl> <int> <int> <int> <dbl> <dbl>
## 1 RileyFree~ 22.4 17.3 565812 2166871 4.76e6 5.8 5.33
## # ... with 3 more variables: total_rake_100 <dbl>, money_won_100 <dbl>,
## # wwsf <dbl>
## vpip pfr threebet
## breaks Numeric,21 Numeric,21 Numeric,21
## counts Integer,20 Integer,20 Integer,20
## density Numeric,20 Numeric,20 Numeric,20
## mids Numeric,20 Numeric,20 Numeric,20
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,11 Numeric,21
## counts Integer,10 Integer,20
## density Numeric,10 Numeric,20
## mids Numeric,10 Numeric,20
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## vpip pfr threebet
## breaks Numeric,21 Numeric,16 Numeric,13
## counts Integer,20 Integer,15 Integer,12
## density Numeric,20 Numeric,15 Numeric,12
## mids Numeric,20 Numeric,15 Numeric,12
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,22 Numeric,21
## counts Integer,21 Integer,20
## density Numeric,21 Numeric,20
## mids Numeric,21 Numeric,20
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## vpip pfr threebet
## breaks Integer,20 Numeric,16 Numeric,12
## counts Integer,19 Integer,15 Integer,11
## density Numeric,19 Numeric,15 Numeric,11
## mids Numeric,19 Numeric,15 Numeric,11
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,23 Numeric,21
## counts Integer,22 Integer,20
## density Numeric,22 Numeric,20
## mids Numeric,22 Numeric,20
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## vpip pfr threebet
## breaks Integer,18 Numeric,13 Numeric,9
## counts Integer,17 Integer,12 Integer,8
## density Numeric,17 Numeric,12 Numeric,8
## mids Numeric,17 Numeric,12 Numeric,8
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,18 Integer,9
## counts Integer,17 Integer,8
## density Numeric,17 Numeric,8
## mids Numeric,17 Numeric,8
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## vpip pfr threebet
## breaks Integer,17 Numeric,10 Numeric,12
## counts Integer,16 Integer,9 Integer,11
## density Numeric,16 Numeric,9 Numeric,11
## mids Numeric,16 Numeric,9 Numeric,11
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,13 Integer,15
## counts Integer,12 Integer,14
## density Numeric,12 Numeric,14
## mids Numeric,12 Numeric,14
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## vpip pfr threebet
## breaks Numeric,21 Numeric,21 Numeric,21
## counts Integer,20 Integer,20 Integer,20
## density Numeric,20 Numeric,20 Numeric,20
## mids Numeric,20 Numeric,20 Numeric,20
## xname "dots[[1L]][[1L]]" "dots[[1L]][[2L]]" "dots[[1L]][[3L]]"
## equidist TRUE TRUE TRUE
## bb_per_100 wwsf
## breaks Numeric,11 Numeric,21
## counts Integer,10 Integer,20
## density Numeric,10 Numeric,20
## mids Numeric,10 Numeric,20
## xname "dots[[1L]][[4L]]" "dots[[1L]][[5L]]"
## equidist TRUE TRUE
## [1] "wwsf of 0 = 4465 wwsf of 50 = 2622 wwsf of 100 = 1898 total occurences = 23183"
x |
---|
these 3 values(0,50,100) make up 0.39 % of total occurences |
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 15363 | 39.68 | 14.44 | 38.20 | 38.80 | 14.68 | 1.5 | 100.0 | 98.5 | 0.57 | 0.22 | 0.12 |
pfr | 2 | 15363 | 13.43 | 7.75 | 12.70 | 12.92 | 7.71 | 0.0 | 73.4 | 73.4 | 0.89 | 2.04 | 0.06 |
threebet | 3 | 15362 | 4.84 | 4.74 | 3.90 | 4.15 | 4.00 | 0.0 | 60.0 | 60.0 | 2.09 | 8.70 | 0.04 |
bb_per_100 | 4 | 15363 | -29.87 | 94.78 | -22.21 | -26.55 | 61.56 | -1199.8 | 837.0 | 2036.8 | -0.62 | 6.95 | 0.76 |
wwsf | 5 | 15335 | 40.49 | 8.86 | 40.30 | 40.39 | 7.41 | 0.0 | 100.0 | 100.0 | 0.23 | 2.30 | 0.07 |
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 10635 | 39.93 | 13.66 | 38.50 | 39.06 | 14.08 | 7.40 | 100.00 | 92.60 | 0.59 | 0.17 | 0.13 |
pfr | 2 | 10635 | 13.59 | 7.42 | 13.00 | 13.16 | 7.26 | 0.00 | 73.40 | 73.40 | 0.81 | 1.90 | 0.07 |
threebet | 3 | 10634 | 4.85 | 4.11 | 4.00 | 4.35 | 3.26 | 0.00 | 52.20 | 52.20 | 2.03 | 9.07 | 0.04 |
bb_per_100 | 4 | 10635 | -27.89 | 71.41 | -20.77 | -24.72 | 50.01 | -611.37 | 431.61 | 1042.98 | -0.70 | 4.55 | 0.69 |
wwsf | 5 | 10631 | 40.67 | 6.95 | 40.50 | 40.57 | 6.08 | 0.00 | 100.00 | 100.00 | 0.25 | 1.84 | 0.07 |
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 4007 | 37.98 | 12.84 | 36.40 | 37.03 | 13.49 | 8.20 | 89.20 | 81.00 | 0.67 | 0.23 | 0.20 |
pfr | 2 | 4007 | 13.89 | 6.62 | 13.70 | 13.64 | 6.52 | 0.10 | 58.30 | 58.20 | 0.53 | 1.04 | 0.10 |
threebet | 3 | 4007 | 4.79 | 3.18 | 4.30 | 4.47 | 2.97 | 0.00 | 39.50 | 39.50 | 1.59 | 6.79 | 0.05 |
bb_per_100 | 4 | 4007 | -19.07 | 40.07 | -13.56 | -16.51 | 29.74 | -611.37 | 164.67 | 776.04 | -1.59 | 14.37 | 0.63 |
wwsf | 5 | 4007 | 40.74 | 4.85 | 40.60 | 40.66 | 4.60 | 25.40 | 63.00 | 37.60 | 0.18 | 0.29 | 0.08 |
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 2348 | 36.96 | 12.43 | 35.30 | 36.05 | 13.49 | 8.20 | 85.10 | 76.90 | 0.66 | 0.14 | 0.26 |
pfr | 2 | 2348 | 14.21 | 6.44 | 14.20 | 14.02 | 6.38 | 0.10 | 43.30 | 43.20 | 0.35 | 0.35 | 0.13 |
threebet | 3 | 2348 | 4.85 | 2.93 | 4.40 | 4.57 | 2.82 | 0.00 | 21.10 | 21.10 | 1.10 | 2.01 | 0.06 |
bb_per_100 | 4 | 2348 | -16.57 | 31.50 | -11.35 | -13.75 | 24.33 | -254.63 | 96.95 | 351.58 | -1.46 | 5.52 | 0.65 |
wwsf | 5 | 2348 | 40.81 | 4.43 | 40.80 | 40.76 | 4.30 | 25.50 | 58.10 | 32.60 | 0.12 | 0.22 | 0.09 |
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 1080 | 35.07 | 11.67 | 32.90 | 34.11 | 11.71 | 8.20 | 81.60 | 73.40 | 0.73 | 0.21 | 0.36 |
pfr | 2 | 1080 | 14.63 | 6.20 | 14.80 | 14.60 | 5.93 | 0.10 | 42.60 | 42.50 | 0.20 | 0.49 | 0.19 |
threebet | 3 | 1080 | 4.96 | 2.75 | 4.60 | 4.75 | 2.82 | 0.10 | 21.10 | 21.00 | 1.00 | 2.27 | 0.08 |
bb_per_100 | 4 | 1080 | -12.79 | 23.27 | -8.34 | -10.23 | 18.21 | -178.16 | 44.62 | 222.78 | -1.58 | 5.25 | 0.71 |
wwsf | 5 | 1080 | 40.81 | 4.06 | 40.90 | 40.76 | 4.00 | 26.90 | 52.40 | 25.50 | 0.07 | 0.11 | 0.12 |
## vpip pfr threebet bb_per_100 wwsf
## x Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080
## y Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080
## vpip pfr threebet bb_per_100 wwsf
## x Numeric,10635 Numeric,10635 Numeric,10635 Numeric,10635 Numeric,10635
## y Numeric,10635 Numeric,10635 Numeric,10635 Numeric,10635 Numeric,10635
## vpip pfr threebet bb_per_100 wwsf
## x Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007
## y Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007
## vpip pfr threebet bb_per_100 wwsf
## x Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007
## y Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007 Numeric,4007
## vpip pfr threebet bb_per_100 wwsf
## x Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080
## y Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080 Numeric,1080
over_2500_hands <- all_players_finished_db %>%
filter(., total_hands > 2500) %>%
select(., c(predictors))
## Create numeric column
over_2500_hands <- over_2500_hands %>% mutate(., vpip_pfr = vpip - pfr)
# Save this vector for use later
numerical_vpip_pfr <- over_2500_hands$vpip_pfr
# Create categorical factor column
over_2500_hands$vpip_pfr[over_2500_hands$vpip_pfr < 15.001] <- 1
over_2500_hands$vpip_pfr[over_2500_hands$vpip_pfr > 15.001] <- 0
my_vector <- str_replace(as.character(over_2500_hands$vpip_pfr), "0", "wide_gap")
my_vector <- str_replace(my_vector, "1", "narrow_gap")
over_2500_hands$vpip_pfr <- as_factor(my_vector)
# Display new df summary stats
kable(summary(over_2500_hands))
vpip | pfr | threebet | bb_per_100 | wwsf | vpip_pfr | |
---|---|---|---|---|---|---|
Min. : 8.20 | Min. : 0.10 | Min. : 0.100 | Min. :-178.160 | Min. :26.90 | wide_gap :645 | |
1st Qu.:25.90 | 1st Qu.:10.60 | 1st Qu.: 2.800 | 1st Qu.: -23.305 | 1st Qu.:38.00 | narrow_gap:435 | |
Median :32.90 | Median :14.80 | Median : 4.600 | Median : -8.335 | Median :40.90 | NA | |
Mean :35.07 | Mean :14.63 | Mean : 4.956 | Mean : -12.787 | Mean :40.81 | NA | |
3rd Qu.:42.50 | 3rd Qu.:18.52 | 3rd Qu.: 6.600 | 3rd Qu.: 2.335 | 3rd Qu.:43.40 | NA | |
Max. :81.60 | Max. :42.60 | Max. :21.100 | Max. : 44.620 | Max. :52.40 | NA |
y <- over_2500_hands$bb_per_100
vpip_pfr <- over_2500_hands$vpip_pfr
WWSF <- over_2500_hands$wwsf
VPIP <- over_2500_hands$vpip
PFR <- over_2500_hands$pfr
THREE_BET <- over_2500_hands$threebet
fit_1 <- lm(y ~ vpip_pfr + VPIP + THREE_BET + WWSF + PFR)
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(fit_1)
summary(fit_1)
##
## Call:
## lm(formula = y ~ vpip_pfr + VPIP + THREE_BET + WWSF + PFR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -119.843 -9.064 1.013 10.145 63.616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.54655 6.24926 2.328 0.0201 *
## vpip_pfrnarrow_gap 2.79640 1.87353 1.493 0.1358
## VPIP -1.12553 0.07442 -15.125 < 2e-16 ***
## THREE_BET -2.04711 0.30375 -6.739 2.59e-11 ***
## WWSF 0.18462 0.15109 1.222 0.2220
## PFR 0.93165 0.13942 6.682 3.77e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.25 on 1074 degrees of freedom
## Multiple R-squared: 0.3881, Adjusted R-squared: 0.3853
## F-statistic: 136.3 on 5 and 1074 DF, p-value: < 2.2e-16
anova(fit_1)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 118613 118613 356.2982 < 2.2e-16 ***
## VPIP 1 88708 88708 266.4668 < 2.2e-16 ***
## THREE_BET 1 2811 2811 8.4429 0.00374 **
## WWSF 1 1802 1802 5.4127 0.02018 *
## PFR 1 14865 14865 44.6515 3.773e-11 ***
## Residuals 1074 357539 333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_2 <- lm(y ~ vpip_pfr + VPIP + THREE_BET + PFR)
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(fit_2)
summary(fit_2)
##
## Call:
## lm(formula = y ~ vpip_pfr + VPIP + THREE_BET + PFR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -119.113 -9.040 1.161 10.255 63.411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.36649 2.81137 7.600 6.43e-14 ***
## vpip_pfrnarrow_gap 2.71664 1.87283 1.451 0.147
## VPIP -1.12511 0.07443 -15.116 < 2e-16 ***
## THREE_BET -1.98040 0.29887 -6.626 5.43e-11 ***
## PFR 0.95904 0.13764 6.968 5.61e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.25 on 1075 degrees of freedom
## Multiple R-squared: 0.3873, Adjusted R-squared: 0.385
## F-statistic: 169.9 on 4 and 1075 DF, p-value: < 2.2e-16
anova(fit_2)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 118613 118613 356.135 < 2.2e-16 ***
## VPIP 1 88708 88708 266.345 < 2.2e-16 ***
## THREE_BET 1 2811 2811 8.439 0.003748 **
## PFR 1 16170 16170 48.549 5.606e-12 ***
## Residuals 1075 358036 333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.4
## corrplot 0.84 loaded
# Add vpip-pfr numerical vector
corr_plot_db <- as_data_frame(cbind(over_2500_hands, numerical_vpip_pfr))
# Plot correlations
corrplot(cor(corr_plot_db[, -6]))
fit_3 <- lm(y ~ vpip_pfr + THREE_BET + PFR)
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(fit_3)
my_fit <- summary(fit_3)
anova(fit_3)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 118613 118613 293.9808 <2e-16 ***
## THREE_BET 1 29057 29057 72.0166 <2e-16 ***
## PFR 1 2531 2531 6.2735 0.0124 *
## Residuals 1076 434137 403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning: Setting row names on a tibble is deprecated.
vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
vpip | 1 | 1080 | 35.07 | 11.67 | 32.90 | 34.11 | 11.71 | 8.20 | 81.60 | 73.40 | 0.73 | 0.21 | 0.36 |
pfr | 2 | 1080 | 14.63 | 6.20 | 14.80 | 14.60 | 5.93 | 0.10 | 42.60 | 42.50 | 0.20 | 0.49 | 0.19 |
threebet | 3 | 1080 | 4.96 | 2.75 | 4.60 | 4.75 | 2.82 | 0.10 | 21.10 | 21.00 | 1.00 | 2.27 | 0.08 |
bb_per_100 | 4 | 1080 | -12.79 | 23.27 | -8.34 | -10.23 | 18.21 | -178.16 | 44.62 | 222.78 | -1.58 | 5.25 | 0.71 |
wwsf | 5 | 1080 | 40.81 | 4.06 | 40.90 | 40.76 | 4.00 | 26.90 | 52.40 | 25.50 | 0.07 | 0.11 | 0.12 |
my_stats <- all_players_finished_db %>% filter(playername == "RileyFreeman") %>%
select(., predictors)
for_comparison <- fit_3$coefficients[1] + fit_3$coefficients[2] + my_stats$pfr *
(fit_3$coefficients[4]) + my_stats$threebet * (fit_3$coefficients[3])
# Print prediction versus actual
paste("my actual win rate is", my_stats$bb_per_100, "model predicts ",
for_comparison)
## [1] "my actual win rate is 5.52 model predicts 0.119438972916067"
##
## Call:
## lm(formula = y ~ vpip_pfr + THREE_BET_2 + PFR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143.295 -9.489 2.207 11.539 69.712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.57131 1.61279 -12.755 < 2e-16 ***
## vpip_pfrnarrow_gap 23.06659 1.29429 17.822 < 2e-16 ***
## THREE_BET_2 -0.20567 0.02058 -9.994 < 2e-16 ***
## PFR 0.34878 0.12859 2.712 0.00679 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.75 on 1076 degrees of freedom
## Multiple R-squared: 0.2814, Adjusted R-squared: 0.2794
## F-statistic: 140.4 on 3 and 1076 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 118613 118613 303.942 < 2.2e-16 ***
## THREE_BET_2 1 42945 42945 110.045 < 2.2e-16 ***
## PFR 1 2871 2871 7.357 0.006787 **
## Residuals 1076 419908 390
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "my actual win rate is 5.52 model predicts 7.33004460602566"
##
## Call:
## lm(formula = y ~ vpip_pfr + THREE_BET_2 + PFR_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.786 -9.631 2.215 11.392 70.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.460608 1.028844 -16.971 <2e-16 ***
## vpip_pfrnarrow_gap 23.787548 1.266364 18.784 <2e-16 ***
## THREE_BET_2 -0.185957 0.021193 -8.774 <2e-16 ***
## PFR_2 0.004231 0.004029 1.050 0.294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.81 on 1076 degrees of freedom
## Multiple R-squared: 0.2772, Adjusted R-squared: 0.2752
## F-statistic: 137.6 on 3 and 1076 DF, p-value: < 2.2e-16
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 118613 118613 302.1874 <2e-16 ***
## THREE_BET_2 1 42945 42945 109.4099 <2e-16 ***
## PFR_2 1 433 433 1.1026 0.2939
## Residuals 1076 422347 393
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] "my actual win rate is 5.52 model predicts 5.28481828673124"
# Create test/train
set.seed(10)
train.idx <- sample(nrow(over_2500_hands), 0.7 * nrow(over_2500_hands),
replace = FALSE)
test.idx <- (1:nrow(over_2500_hands))[-train.idx]
# Extract target vector and rest of DF for LM
lm_target <- over_2500_hands[, "bb_per_100"]
lm_inputs <- over_2500_hands[, c("pfr", "threebet", "vpip_pfr")]
train_df <- over_2500_hands[train.idx, c("pfr", "threebet", "vpip_pfr",
"bb_per_100")]
test_df <- over_2500_hands[test.idx, c("pfr", "threebet", "vpip_pfr", "bb_per_100")]
THREE_BET_2 <- THREE_BET^2
PFR_2 <- PFR^2
fit_6 <- lm(bb_per_100 ~ vpip_pfr + (threebet^2) + (pfr^2), data = train_df)
fit_6 <- predict(fit_6, test_df)
actual <- over_2500_hands[test.idx, c("bb_per_100")]
error <- actual - fit_6
paste("my RMSE is", sqrt(mean(error^2)))
## [1] "my RMSE is 18.6350384277145"
## Create db
db_for_ci <- all_players_finished_db %>% filter(., total_hands > 2500)
## CUT vpip column and create sd column
db_for_ci$estimated_sd <- as.numeric(as.character(cut(db_for_ci$vpip, breaks = c(0,
10, 20, 30, 40, Inf), labels = c(60, 70, 80, 90, 110))))
## Create vectors to s-store sd(sample), n-samples, a-sample means point
## Estimate
sample_parameters <- describe(db_for_ci$bb_per_100)
s <- db_for_ci$estimated_sd
n <- db_for_ci$total_hands/100
a <- db_for_ci$bb_per_100
## Make confidence intervals
names <- as.character(db_for_ci$playername)
left <- round(a - qnorm(0.975) * s/sqrt(n), 2)
right <- round(a + qnorm(0.975) * s/sqrt(n), 2)
## Build df of overall intervals
df_with_intervals <- as_data_frame(cbind(names, left, right))
kable(head(df_with_intervals))
names | left | right |
---|---|---|
$HIPITLOSER | 5.89 | 83.35 |
10jqka155 | -45.92 | -6.68 |
1800G@mbIer | -25.8 | 18.84 |
1CEMAN1CEMAN | -49.63 | 7.11 |
41lax | -36.09 | 15.51 |
a._.a | -20.94 | 27.82 |
df_with_intervals[, 2:3] <- lapply(df_with_intervals[, 2:3], function(x)
{
as.numeric(x)
})
## Run the hpyothesis test and create a vector of names
confidence_intervals <- df_with_intervals %>% filter(., (right < 0 & left <
0) | (right > 0 & left > 0))
colnames(confidence_intervals) <- c("playername", "left", "right")
confidence_intervals <- plyr::join(confidence_intervals, db_for_ci)
## Joining by: playername
my_names_2 <- confidence_intervals$playername
## Cut a winner loser category into df
confidence_intervals$winner_loser <- cut(confidence_intervals$left, breaks = c(-Inf,
0, Inf), labels = c(0, 1))
confidence_intervals$winner_loser
## [1] 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## [36] 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0
## [71] 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0
## [106] 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
## [141] 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
## [176] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0
## [211] 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0
## [246] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0
## Levels: 0 1
# Create numeric column
confidence_intervals <- confidence_intervals %>% mutate(., vpip_pfr = vpip -
pfr)
# Create categorical factor column
confidence_intervals$vpip_pfr[confidence_intervals$vpip_pfr < 15.001] <- 1
confidence_intervals$vpip_pfr[confidence_intervals$vpip_pfr > 15.001] <- 0
my_vector <- str_replace(as.character(confidence_intervals$vpip_pfr), "0",
"wide_gap")
my_vector <- str_replace(my_vector, "1", "narrow_gap")
confidence_intervals$vpip_pfr <- as_factor(my_vector)
## Run LM model fit_7
confidence_intervals <- confidence_intervals %>% filter(., playername %in%
my_names_2) %>% select(c(vpip_pfr, wwsf, vpip, pfr, threebet, vpip_pfr,
bb_per_100, winner_loser))
y <- confidence_intervals$bb_per_100
vpip_pfr <- confidence_intervals$vpip_pfr
WWSF <- confidence_intervals$wwsf
VPIP <- confidence_intervals$vpip
PFR <- confidence_intervals$pfr
THREE_BET <- confidence_intervals$threebet
WIN_LOSE <- confidence_intervals$winner_loser
fit_7 <- lm(y ~ vpip_pfr + VPIP + THREE_BET + PFR + WIN_LOSE)
layout(matrix(c(1, 2, 3, 4), 2, 2))
plot(fit_7)
summary(fit_7)
##
## Call:
## lm(formula = y ~ vpip_pfr + VPIP + THREE_BET + PFR + WIN_LOSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.401 -8.253 1.023 11.145 45.621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5155 4.9277 0.713 0.476
## vpip_pfrnarrow_gap -0.6176 4.3977 -0.140 0.888
## VPIP -0.9560 0.1154 -8.288 5.68e-15 ***
## THREE_BET -1.9526 0.4861 -4.017 7.68e-05 ***
## PFR 0.2825 0.2328 1.213 0.226
## WIN_LOSE1 46.2839 4.2843 10.803 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.05 on 267 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6879
## F-statistic: 120.9 on 5 and 267 DF, p-value: < 2.2e-16
anova(fit_7)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## vpip_pfr 1 104485 104485 320.6613 < 2.2e-16 ***
## VPIP 1 47470 47470 145.6837 < 2.2e-16 ***
## THREE_BET 1 5877 5877 18.0374 2.994e-05 ***
## PFR 1 1133 1133 3.4766 0.06334 .
## WIN_LOSE 1 38027 38027 116.7055 < 2.2e-16 ***
## Residuals 267 87000 326
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
set.seed(20)
train.idx <- sample(nrow(confidence_intervals), 0.7 * nrow(confidence_intervals),
replace = FALSE)
test.idx <- (1:nrow(confidence_intervals))[-train.idx]
## Extract target vector and rest of DF for LM
lm_target <- confidence_intervals[, "bb_per_100"]
lm_inputs <- confidence_intervals[, c("pfr", "threebet", "vpip_pfr")]
train_df <- confidence_intervals[train.idx, c("pfr", "winner_loser", "threebet",
"vpip_pfr", "bb_per_100", "wwsf", "vpip")]
test_df <- confidence_intervals[test.idx, c("pfr", "winner_loser", "threebet",
"vpip_pfr", "bb_per_100", "wwsf", "vpip")]
fit_7 <- lm(bb_per_100 ~ I(threebet^2) + I(pfr^2) + I(vpip - pfr) + vpip +
wwsf + winner_loser, data = train_df)
summary(fit_7)
##
## Call:
## lm(formula = bb_per_100 ~ I(threebet^2) + I(pfr^2) + I(vpip -
## pfr) + vpip + wwsf + winner_loser, data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97.891 -9.750 -1.372 11.476 54.001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.25810 13.79787 1.758 0.0804 .
## I(threebet^2) -0.14969 0.02783 -5.379 2.25e-07 ***
## I(pfr^2) -0.02441 0.01842 -1.325 0.1867
## I(vpip - pfr) -1.35193 0.62488 -2.163 0.0318 *
## vpip 0.53103 0.65053 0.816 0.4154
## wwsf -0.97248 0.32560 -2.987 0.0032 **
## winner_loser1 44.74008 4.16868 10.732 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.43 on 184 degrees of freedom
## Multiple R-squared: 0.7274, Adjusted R-squared: 0.7185
## F-statistic: 81.84 on 6 and 184 DF, p-value: < 2.2e-16
fit_7 <- predict(fit_7, test_df)
actual <- confidence_intervals[test.idx, c("bb_per_100")]
error <- actual - fit_7
paste("my RMSE is", sqrt(mean(error^2)))
## [1] "my RMSE is 17.496161990825"
train_df <- confidence_intervals[train.idx, c("pfr", "winner_loser", "threebet",
"vpip_pfr", "wwsf", "vpip")]
test_df <- confidence_intervals[test.idx, c("pfr", "winner_loser", "threebet",
"vpip_pfr", "wwsf", "vpip")]
model <- glm(winner_loser ~ pfr + threebet + vpip_pfr, family = binomial(link = "logit"),
data = train_df)
summary(model)
##
## Call:
## glm(formula = winner_loser ~ pfr + threebet + vpip_pfr, family = binomial(link = "logit"),
## data = train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7059 -0.2227 -0.1775 -0.1065 2.8329
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.02944 0.98091 -4.108 3.99e-05 ***
## pfr 0.10199 0.06346 1.607 0.108
## threebet -0.27854 0.17054 -1.633 0.102
## vpip_pfrnarrow_gap 4.52577 0.74744 6.055 1.40e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 175.819 on 190 degrees of freedom
## Residual deviance: 89.655 on 187 degrees of freedom
## AIC: 97.655
##
## Number of Fisher Scoring iterations: 7
# Use model to make predictions
pred <- predict(model, test_df)
# Convert predictions to probabilities
probs <- exp(pred)/(1 + exp(pred))
probs
## 6 15 17 22 24 26
## 0.681644833 0.016099549 0.013031634 0.023282492 0.687293522 0.015737068
## 30 31 35 36 38 42
## 0.015157248 0.018886371 0.014453572 0.022091286 0.024755561 0.021047899
## 43 48 49 52 56 65
## 0.676678397 0.663588132 0.014225886 0.012199448 0.667329436 0.025709344
## 68 70 71 73 75 78
## 0.679427535 0.008083215 0.016650166 0.713165610 0.567514714 0.010484213
## 79 85 95 96 101 104
## 0.757724672 0.064039651 0.657611686 0.005997898 0.589823683 0.021161231
## 105 107 110 118 119 120
## 0.032976935 0.030436530 0.600768617 0.018172651 0.023968642 0.016281069
## 126 128 131 133 137 138
## 0.022528597 0.014459621 0.028509950 0.039079634 0.017923431 0.018246588
## 140 141 142 149 156 157
## 0.020378749 0.018812087 0.009376665 0.007627213 0.018417594 0.012215729
## 159 161 163 167 169 171
## 0.015435959 0.013153429 0.014985183 0.022344763 0.021962320 0.357945753
## 177 185 186 195 196 198
## 0.022553295 0.017892217 0.462352122 0.645588709 0.031046570 0.580070050
## 204 206 208 211 212 213
## 0.010540481 0.015602419 0.010928110 0.020286392 0.035302153 0.060512123
## 214 217 224 231 239 241
## 0.041867617 0.020223533 0.015271248 0.007756062 0.564819842 0.031451197
## 244 249 255 257 262 267
## 0.016966194 0.024413654 0.036626628 0.012786978 0.598589337 0.014009710
## 268 270 271 273
## 0.030774641 0.811349890 0.008849686 0.024620606
## Cut so that anything above .5= winner(1) below .5= loser(0)
convert_logistic_for_matrix <- cut(probs, breaks = c(0, 0.5, 1), labels = c(0,
1))
accuracy <- table(convert_logistic_for_matrix, test_df[, "winner_loser"])
accuracy
##
## convert_logistic_for_matrix 0 1
## 0 64 1
## 1 4 13
## accuracy rate
sum(diag(accuracy))/sum(accuracy)
## [1] 0.9390244
It appears that using a linear model to predict winrate may be much more difficult than I assumed. None of the models performed well enough. However, the logistic regression classification of winners and losers, performed extremely well. I believe it worked well for the same reason that the linear models may have worked so poorly. The categorical data I used of wide_gap narrow_gap(vpip_pfr) does a great job at identifying winning versus losing players. For simple classification, this works great. However when it comes to predicting, through a Linear Model, the coefficient of 1 *(narrow_gap) is to reductive to have value. Perhaps if I could have tuned the other variables better or created higher levels of factors for my vpip_pfr input, the linear models could have performed better. Overall it was helpful to dive into regression with a dataset I am familiar with. I hope to do some more tuning and to implement some of the ideas I mentioned above in the near future.