Chess Tournament

If the package aod is not already installed, you must install it:

install.packages(“aod”)

Load in the stringr library and the dataset TournamentInfo

library(stringr)
library(knitr)
TournamentInfo <- read.csv(url("https://raw.githubusercontent.com/AsherMeyers/DATA-607/master/tournamentinfo.txt"),
                           sep = "|", stringsAsFactors=FALSE,
                           col.names = c("Pair", "Name", "Total", "R1", "R2", "R3", "R4", "R5", "R6", "R7", ""), 
                           skip = 1)

TournamentInfo <- TournamentInfo[seq(-2,-194,-3),]
kable(head(TournamentInfo))

	Pair	Name	Total	R1	R2	R3	R4	R5	R6	R7	X
1	Num	USCF ID / Rtg (Pre->Post)	Pts	1	2	3	4	5	6	7	NA
3	1	GARY HUA	6.0	W 39	W 21	W 18	W 14	W 7	D 12	D 4	NA
4	ON	15445895 / R: 1794 ->1817	N:2	W	B	W	B	W	B	W	NA
6	2	DAKSHESH DARURI	6.0	W 63	W 58	L 4	W 17	W 16	W 20	W 7	NA
7	MI	14598900 / R: 1553 ->1663	N:2	B	W	B	W	B	W	B	NA
9	3	ADITYA BAJAJ	6.0	L 8	W 61	W 25	W 21	W 11	W 13	W 12	NA

Reorganize the TournamentInfo table so that each player’s info falls only on a single line.

JoinedRows <- data.frame(TournamentInfo[seq(2,2*194/3,2),],TournamentInfo[seq(3,2*194/3,2),])
kable(head(JoinedRows))

	Pair	Name	Total	R1	R2	R3	R4	R5	R6	R7	X	Pair.1	Name.1	Total.1	R1.1	R2.1	R3.1	R4.1	R5.1	R6.1	R7.1	X.1
3	1	GARY HUA	6.0	W 39	W 21	W 18	W 14	W 7	D 12	D 4	NA	ON	15445895 / R: 1794 ->1817	N:2	W	B	W	B	W	B	W	NA
6	2	DAKSHESH DARURI	6.0	W 63	W 58	L 4	W 17	W 16	W 20	W 7	NA	MI	14598900 / R: 1553 ->1663	N:2	B	W	B	W	B	W	B	NA
9	3	ADITYA BAJAJ	6.0	L 8	W 61	W 25	W 21	W 11	W 13	W 12	NA	MI	14959604 / R: 1384 ->1640	N:2	W	B	W	B	W	B	W	NA
12	4	PATRICK H SCHILLING	5.5	W 23	D 28	W 2	W 26	D 5	W 19	D 1	NA	MI	12616049 / R: 1716 ->1744	N:2	W	B	W	B	W	B	B	NA
15	5	HANSHI ZUO	5.5	W 45	W 37	D 12	D 13	D 4	W 14	W 17	NA	MI	14601533 / R: 1655 ->1690	N:2	B	W	B	W	B	W	B	NA
18	6	HANSEN SONG	5.0	W 34	D 29	L 11	W 35	D 10	W 27	W 21	NA	OH	15055204 / R: 1686 ->1687	N:3	W	B	W	B	B	W	B	NA

Pre-Rating Location: Identify the start and end positions of the player’s rating in a string, and then identify the content of that substring and save it as the player’s pre-tournament rating.

PreRatingLoc <- c(str_locate(JoinedRows[,13],"R: ")[1,2]+1, str_locate(JoinedRows[,13], "R: ")[1,2]+4)
PreRatings <- as.integer(str_sub(JoinedRows[,13],PreRatingLoc[1],PreRatingLoc[2]))

kable(head(PreRatings))

1794

1553

1384

1716

1655

1686

Listing of each player’s opponents

Opponents <- data.frame(1, as.list(as.numeric(str_extract(JoinedRows[1,4:10],"[0-9]{1,2}"))))
colnames(Opponents) <- c("Number", "Opp1","Opp2","Opp3","Opp4","Opp5","Opp6","Opp7")
for (i in 2:64) {
  Opponents <- rbind(Opponents, c(i, as.numeric(str_extract(JoinedRows[i,4:10],"[0-9]{1,2}"))))
}
kable(head(Opponents))

Number	Opp1	Opp2	Opp3	Opp4	Opp5	Opp6	Opp7
1	39	21	18	14	7	12	4
2	63	58	4	17	16	20	7
3	8	61	25	21	11	13	12
4	23	28	2	26	5	19	1
5	45	37	12	13	4	14	17
6	34	29	11	35	10	27	21

Create a listing of each player’s opponent’s ratings, along with a column for the the number of rounds not played (i.e. column NullCount), and an average rating; for the average, the denominator is the number of games played, which ranges from 2 to 7.

OppRatings <- data.frame(c(1:64), PreRatings, PreRatings[Opponents$Opp1[1:64]],
                         PreRatings[Opponents$Opp2[1:64]],
                         PreRatings[Opponents$Opp3[1:64]],
                         PreRatings[Opponents$Opp4[1:64]],
                         PreRatings[Opponents$Opp5[1:64]],
                         PreRatings[Opponents$Opp6[1:64]],
                         PreRatings[Opponents$Opp7[1:64]])
colnames(OppRatings) <- c("Player","PlayerRating", "Opp1R","Opp2R","Opp3R","Opp4R","Opp5R","Opp6R","Opp7R")
OppRatings$Average <- round(rowMeans(OppRatings[3:9],na.rm=TRUE), digits=0)
kable(head(OppRatings))

Player	PlayerRating	Opp1R	Opp2R	Opp3R	Opp4R	Opp5R	Opp6R	Opp7R	Average
1	1794	1436	1563	1600	1610	1649	1663	1716	1605
2	1553	1175	917	1716	1629	1604	1595	1649	1469
3	1384	1641	955	1745	1563	1712	1666	1663	1564
4	1716	1363	1507	1553	1579	1655	1564	1794	1574
5	1655	1242	980	1663	1666	1716	1610	1629	1501
6	1686	1399	1602	1712	1438	1365	1552	1563	1519

Now we have all the desired components of our table. Write a table with Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre-Chess Rating of Opponent

ChessTable <- data.frame(c(1:64), sapply(JoinedRows[2],str_trim),JoinedRows[12],JoinedRows[3], PreRatings, OppRatings$Average)
colnames(ChessTable) <- c("Number", "Name", "State", "Points", "PreRating","OpponentAverage")
kable(head(ChessTable))

	Number	Name	State	Points	PreRating	OpponentAverage
3	1	GARY HUA	ON	6.0	1794	1605
6	2	DAKSHESH DARURI	MI	6.0	1553	1469
9	3	ADITYA BAJAJ	MI	6.0	1384	1564
12	4	PATRICK H SCHILLING	MI	5.5	1716	1574
15	5	HANSHI ZUO	MI	5.5	1655	1501
18	6	HANSEN SONG	OH	5.0	1686	1519

First, we’ll compile the difference in ratings for each game, along with the outcome, in an easy to read table.

DiffRatings <- data.frame(c(1:64), OppRatings$PlayerRating-OppRatings$Opp1R,
                                   OppRatings$PlayerRating-OppRatings$Opp2R, 
                                   OppRatings$PlayerRating-OppRatings$Opp3R, 
                                   OppRatings$PlayerRating-OppRatings$Opp4R, 
                                   OppRatings$PlayerRating-OppRatings$Opp5R, 
                                   OppRatings$PlayerRating-OppRatings$Opp6R, 
                                   OppRatings$PlayerRating-OppRatings$Opp7R,
                                   str_sub(JoinedRows$R1, 1, 1),
                                   str_sub(JoinedRows$R2, 1, 1),
                                   str_sub(JoinedRows$R3, 1, 1),
                                   str_sub(JoinedRows$R4, 1, 1),
                                   str_sub(JoinedRows$R5, 1, 1),
                                   str_sub(JoinedRows$R6, 1, 1),
                                   str_sub(JoinedRows$R7, 1, 1), stringsAsFactors=FALSE)
colnames(DiffRatings) <- c("Player", "Opp1", "Opp2", "Opp3", "Opp4", "Opp5", "Opp6", "Opp7", "Score1", "Score2", "Score3", "Score4", "Score5","Score6", "Score7")

DiffRatings[DiffRatings == "W"] <- 1
DiffRatings[DiffRatings == "D"] <- NA_integer_
DiffRatings[DiffRatings == "L"] <- -1
DiffRatings[DiffRatings == "H"] <- NA_integer_
DiffRatings[DiffRatings == "U"] <- NA_integer_
DiffRatings[DiffRatings == "X"] <- NA_integer_
DiffRatings[DiffRatings == "B"] <- NA_integer_
kable(head(DiffRatings))

Player	Opp1	Opp2	Opp3	Opp4	Opp5	Opp6	Opp7	Score1	Score2	Score3	Score4	Score5	Score6	Score7
1	358	231	194	184	145	131	78	1	1	1	1	1	NA	NA
2	378	636	-163	-76	-51	-42	-96	1	1	-1	1	1	1	1
3	-257	429	-361	-179	-328	-282	-279	-1	1	1	1	1	1	1
4	353	209	163	137	61	152	-78	1	NA	1	1	NA	1	NA
5	413	675	-8	-11	-61	45	26	1	1	NA	NA	NA	1	1
6	287	84	-26	248	321	134	123	1	NA	-1	1	NA	1	1

Now, let’s reformat the table into a format that’s easier to analyze.

DiffRatingsData <- data.frame(rep.int(1:64,7), 
                              c(DiffRatings$Opp1, DiffRatings$Opp2, DiffRatings$Opp3, DiffRatings$Opp4,
                                DiffRatings$Opp5, DiffRatings$Opp6, DiffRatings$Opp7),
                              c(DiffRatings$Score1, DiffRatings$Score2, DiffRatings$Score3, DiffRatings$Score4,                                         DiffRatings$Score5, DiffRatings$Score6, DiffRatings$Score7))
DiffRatingsData <- na.omit(DiffRatingsData)
colnames(DiffRatingsData) <- c("Player", "ScoreDifference", "Outcome")
kable(head(DiffRatingsData))

Player	ScoreDifference	Outcome
1	358	1
2	378	1
3	-257	-1
4	353	1
5	413	1
6	287	1

Now we’ll do our logistic regression

library(ggplot2)
library(Rcpp)

DiffRatingsData$Outcome <- factor(DiffRatingsData$Outcome)
LogitRatings <- glm(Outcome ~ ScoreDifference, data = DiffRatingsData, family = "binomial")
summary(LogitRatings)

## 
## Call:
## glm(formula = Outcome ~ ScoreDifference, family = "binomial", 
##     data = DiffRatingsData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6202  -0.9197   0.0000   0.9197   2.6202  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.037e-16  1.201e-01   0.000        1    
## ScoreDifference 3.204e-03  4.203e-04   7.624 2.46e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 485.20  on 349  degrees of freedom
## Residual deviance: 405.86  on 348  degrees of freedom
## AIC: 409.86
## 
## Number of Fisher Scoring iterations: 4

LogitRatings

## 
## Call:  glm(formula = Outcome ~ ScoreDifference, family = "binomial", 
##     data = DiffRatingsData)
## 
## Coefficients:
##     (Intercept)  ScoreDifference  
##       2.037e-16        3.204e-03  
## 
## Degrees of Freedom: 349 Total (i.e. Null);  348 Residual
## Null Deviance:       485.2 
## Residual Deviance: 405.9     AIC: 409.9

We get a value of 3.204e-03 for the intercept, and zero (more or less) for the intercept. To calculate the odds ratios and probabilities of winning, given a range of score differences, from 300 less than one’s opponent to 300 more:

Diffs <- seq(-300, 300, 100)
Odds <- exp(Diffs*3.204e-03)
WinProb <- data.frame(Diffs, Odds, (Odds/ (Odds+1)))
colnames(WinProb) <- c("Score Difference","Odds Ratio","Probability of Victory")
kable(head(WinProb))

Score Difference	Odds Ratio	Probability of Victory
-300	0.3824337	0.2766380
-200	0.5268708	0.3450657
-100	0.7258586	0.4205783
0	1.0000000	0.5000000
100	1.3776787	0.5794217
200	1.8979987	0.6549343

Now let’s plot out a graph of the probabilities:

Diffs <- seq(min(DiffRatingsData$ScoreDifference), max(DiffRatingsData$ScoreDifference), 1)
Odds <- exp(Diffs*3.204e-03)
WinProb <- Odds/ (Odds + 1)
plot(Diffs, WinProb, xlab = "Player Rating Difference", ylab = "Probability of Victory", main = "The Probability of Winning as a Function of a Player's Rating Premium", col = "brown", cex = 0.5, axes = FALSE)
axis(1, at = seq(-1250, 1250, 250), pos = 0, las = 0)
axis(2, at = seq(0, 1, 0.25))

Of course, in the interest of full disclosure, there are two flaws with my procedure: I’ve excluded the games that ended in a draw, and I’ve double-counted each game that didn’t end in a draw. The latter does not affect the probability calculations, but it could have been avoided perhaps by randomly choosing from which perspective to score a game (ie each game is a 1 from the winning player’s perspective and a 0 from the loser’s perspective.)

As we’d expect, when players are evenly matched, the probability of winning is 50%, aka fifty-fifty.

Chess Tournament

Asher Meyers and Chris Martin

February 16, 2016

Number	Opp1	Opp2	Opp3	Opp4	Opp5	Opp6	Opp7
1	39	21	18	14	7	12	4
2	63	58	4	17	16	20	7
3	8	61	25	21	11	13	12
4	23	28	2	26	5	19	1
5	45	37	12	13	4	14	17
6	34	29	11	35	10	27	21

Number	Opp1	Opp2	Opp3	Opp4	Opp5	Opp6	Opp7
1	39	21	18	14	7	12	4
2	63	58	4	17	16	20	7
3	8	61	25	21	11	13	12
4	23	28	2	26	5	19	1
5	45	37	12	13	4	14	17
6	34	29	11	35	10	27	21

Number	Opp1	Opp2	Opp3	Opp4	Opp5	Opp6	Opp7
1	39	21	18	14	7	12	4
2	63	58	4	17	16	20	7
3	8	61	25	21	11	13	12
4	23	28	2	26	5	19	1
5	45	37	12	13	4	14	17
6	34	29	11	35	10	27	21