If the package aod is not already installed, you must install it:
install.packages(“aod”)
Load in the stringr library and the dataset TournamentInfo
library(stringr)
library(knitr)
TournamentInfo <- read.csv(url("https://raw.githubusercontent.com/AsherMeyers/DATA-607/master/tournamentinfo.txt"),
sep = "|", stringsAsFactors=FALSE,
col.names = c("Pair", "Name", "Total", "R1", "R2", "R3", "R4", "R5", "R6", "R7", ""),
skip = 1)
TournamentInfo <- TournamentInfo[seq(-2,-194,-3),]
kable(head(TournamentInfo))
| Pair | Name | Total | R1 | R2 | R3 | R4 | R5 | R6 | R7 | X | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | NA |
| 3 | 1 | GARY HUA | 6.0 | W 39 | W 21 | W 18 | W 14 | W 7 | D 12 | D 4 | NA |
| 4 | ON | 15445895 / R: 1794 ->1817 | N:2 | W | B | W | B | W | B | W | NA |
| 6 | 2 | DAKSHESH DARURI | 6.0 | W 63 | W 58 | L 4 | W 17 | W 16 | W 20 | W 7 | NA |
| 7 | MI | 14598900 / R: 1553 ->1663 | N:2 | B | W | B | W | B | W | B | NA |
| 9 | 3 | ADITYA BAJAJ | 6.0 | L 8 | W 61 | W 25 | W 21 | W 11 | W 13 | W 12 | NA |
Reorganize the TournamentInfo table so that each player’s info falls only on a single line.
JoinedRows <- data.frame(TournamentInfo[seq(2,2*194/3,2),],TournamentInfo[seq(3,2*194/3,2),])
kable(head(JoinedRows))
| Pair | Name | Total | R1 | R2 | R3 | R4 | R5 | R6 | R7 | X | Pair.1 | Name.1 | Total.1 | R1.1 | R2.1 | R3.1 | R4.1 | R5.1 | R6.1 | R7.1 | X.1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3 | 1 | GARY HUA | 6.0 | W 39 | W 21 | W 18 | W 14 | W 7 | D 12 | D 4 | NA | ON | 15445895 / R: 1794 ->1817 | N:2 | W | B | W | B | W | B | W | NA |
| 6 | 2 | DAKSHESH DARURI | 6.0 | W 63 | W 58 | L 4 | W 17 | W 16 | W 20 | W 7 | NA | MI | 14598900 / R: 1553 ->1663 | N:2 | B | W | B | W | B | W | B | NA |
| 9 | 3 | ADITYA BAJAJ | 6.0 | L 8 | W 61 | W 25 | W 21 | W 11 | W 13 | W 12 | NA | MI | 14959604 / R: 1384 ->1640 | N:2 | W | B | W | B | W | B | W | NA |
| 12 | 4 | PATRICK H SCHILLING | 5.5 | W 23 | D 28 | W 2 | W 26 | D 5 | W 19 | D 1 | NA | MI | 12616049 / R: 1716 ->1744 | N:2 | W | B | W | B | W | B | B | NA |
| 15 | 5 | HANSHI ZUO | 5.5 | W 45 | W 37 | D 12 | D 13 | D 4 | W 14 | W 17 | NA | MI | 14601533 / R: 1655 ->1690 | N:2 | B | W | B | W | B | W | B | NA |
| 18 | 6 | HANSEN SONG | 5.0 | W 34 | D 29 | L 11 | W 35 | D 10 | W 27 | W 21 | NA | OH | 15055204 / R: 1686 ->1687 | N:3 | W | B | W | B | B | W | B | NA |
Pre-Rating Location: Identify the start and end positions of the player’s rating in a string, and then identify the content of that substring and save it as the player’s pre-tournament rating.
PreRatingLoc <- c(str_locate(JoinedRows[,13],"R: ")[1,2]+1, str_locate(JoinedRows[,13], "R: ")[1,2]+4)
PreRatings <- as.integer(str_sub(JoinedRows[,13],PreRatingLoc[1],PreRatingLoc[2]))
kable(head(PreRatings))
| 1794 |
| 1553 |
| 1384 |
| 1716 |
| 1655 |
| 1686 |
Listing of each player’s opponents
Opponents <- data.frame(1, as.list(as.numeric(str_extract(JoinedRows[1,4:10],"[0-9]{1,2}"))))
colnames(Opponents) <- c("Number", "Opp1","Opp2","Opp3","Opp4","Opp5","Opp6","Opp7")
for (i in 2:64) {
Opponents <- rbind(Opponents, c(i, as.numeric(str_extract(JoinedRows[i,4:10],"[0-9]{1,2}"))))
}
kable(head(Opponents))
| Number | Opp1 | Opp2 | Opp3 | Opp4 | Opp5 | Opp6 | Opp7 |
|---|---|---|---|---|---|---|---|
| 1 | 39 | 21 | 18 | 14 | 7 | 12 | 4 |
| 2 | 63 | 58 | 4 | 17 | 16 | 20 | 7 |
| 3 | 8 | 61 | 25 | 21 | 11 | 13 | 12 |
| 4 | 23 | 28 | 2 | 26 | 5 | 19 | 1 |
| 5 | 45 | 37 | 12 | 13 | 4 | 14 | 17 |
| 6 | 34 | 29 | 11 | 35 | 10 | 27 | 21 |
Create a listing of each player’s opponent’s ratings, along with a column for the the number of rounds not played (i.e. column NullCount), and an average rating; for the average, the denominator is the number of games played, which ranges from 2 to 7.
OppRatings <- data.frame(c(1:64), PreRatings, PreRatings[Opponents$Opp1[1:64]],
PreRatings[Opponents$Opp2[1:64]],
PreRatings[Opponents$Opp3[1:64]],
PreRatings[Opponents$Opp4[1:64]],
PreRatings[Opponents$Opp5[1:64]],
PreRatings[Opponents$Opp6[1:64]],
PreRatings[Opponents$Opp7[1:64]])
colnames(OppRatings) <- c("Player","PlayerRating", "Opp1R","Opp2R","Opp3R","Opp4R","Opp5R","Opp6R","Opp7R")
OppRatings$Average <- round(rowMeans(OppRatings[3:9],na.rm=TRUE), digits=0)
kable(head(OppRatings))
| Player | PlayerRating | Opp1R | Opp2R | Opp3R | Opp4R | Opp5R | Opp6R | Opp7R | Average |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 1794 | 1436 | 1563 | 1600 | 1610 | 1649 | 1663 | 1716 | 1605 |
| 2 | 1553 | 1175 | 917 | 1716 | 1629 | 1604 | 1595 | 1649 | 1469 |
| 3 | 1384 | 1641 | 955 | 1745 | 1563 | 1712 | 1666 | 1663 | 1564 |
| 4 | 1716 | 1363 | 1507 | 1553 | 1579 | 1655 | 1564 | 1794 | 1574 |
| 5 | 1655 | 1242 | 980 | 1663 | 1666 | 1716 | 1610 | 1629 | 1501 |
| 6 | 1686 | 1399 | 1602 | 1712 | 1438 | 1365 | 1552 | 1563 | 1519 |
Now we have all the desired components of our table. Write a table with Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre-Chess Rating of Opponent
ChessTable <- data.frame(c(1:64), sapply(JoinedRows[2],str_trim),JoinedRows[12],JoinedRows[3], PreRatings, OppRatings$Average)
colnames(ChessTable) <- c("Number", "Name", "State", "Points", "PreRating","OpponentAverage")
kable(head(ChessTable))
| Number | Name | State | Points | PreRating | OpponentAverage | |
|---|---|---|---|---|---|---|
| 3 | 1 | GARY HUA | ON | 6.0 | 1794 | 1605 |
| 6 | 2 | DAKSHESH DARURI | MI | 6.0 | 1553 | 1469 |
| 9 | 3 | ADITYA BAJAJ | MI | 6.0 | 1384 | 1564 |
| 12 | 4 | PATRICK H SCHILLING | MI | 5.5 | 1716 | 1574 |
| 15 | 5 | HANSHI ZUO | MI | 5.5 | 1655 | 1501 |
| 18 | 6 | HANSEN SONG | OH | 5.0 | 1686 | 1519 |
First, we’ll compile the difference in ratings for each game, along with the outcome, in an easy to read table.
DiffRatings <- data.frame(c(1:64), OppRatings$PlayerRating-OppRatings$Opp1R,
OppRatings$PlayerRating-OppRatings$Opp2R,
OppRatings$PlayerRating-OppRatings$Opp3R,
OppRatings$PlayerRating-OppRatings$Opp4R,
OppRatings$PlayerRating-OppRatings$Opp5R,
OppRatings$PlayerRating-OppRatings$Opp6R,
OppRatings$PlayerRating-OppRatings$Opp7R,
str_sub(JoinedRows$R1, 1, 1),
str_sub(JoinedRows$R2, 1, 1),
str_sub(JoinedRows$R3, 1, 1),
str_sub(JoinedRows$R4, 1, 1),
str_sub(JoinedRows$R5, 1, 1),
str_sub(JoinedRows$R6, 1, 1),
str_sub(JoinedRows$R7, 1, 1), stringsAsFactors=FALSE)
colnames(DiffRatings) <- c("Player", "Opp1", "Opp2", "Opp3", "Opp4", "Opp5", "Opp6", "Opp7", "Score1", "Score2", "Score3", "Score4", "Score5","Score6", "Score7")
DiffRatings[DiffRatings == "W"] <- 1
DiffRatings[DiffRatings == "D"] <- NA_integer_
DiffRatings[DiffRatings == "L"] <- -1
DiffRatings[DiffRatings == "H"] <- NA_integer_
DiffRatings[DiffRatings == "U"] <- NA_integer_
DiffRatings[DiffRatings == "X"] <- NA_integer_
DiffRatings[DiffRatings == "B"] <- NA_integer_
kable(head(DiffRatings))
| Player | Opp1 | Opp2 | Opp3 | Opp4 | Opp5 | Opp6 | Opp7 | Score1 | Score2 | Score3 | Score4 | Score5 | Score6 | Score7 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 358 | 231 | 194 | 184 | 145 | 131 | 78 | 1 | 1 | 1 | 1 | 1 | NA | NA |
| 2 | 378 | 636 | -163 | -76 | -51 | -42 | -96 | 1 | 1 | -1 | 1 | 1 | 1 | 1 |
| 3 | -257 | 429 | -361 | -179 | -328 | -282 | -279 | -1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 4 | 353 | 209 | 163 | 137 | 61 | 152 | -78 | 1 | NA | 1 | 1 | NA | 1 | NA |
| 5 | 413 | 675 | -8 | -11 | -61 | 45 | 26 | 1 | 1 | NA | NA | NA | 1 | 1 |
| 6 | 287 | 84 | -26 | 248 | 321 | 134 | 123 | 1 | NA | -1 | 1 | NA | 1 | 1 |
Now, let’s reformat the table into a format that’s easier to analyze.
DiffRatingsData <- data.frame(rep.int(1:64,7),
c(DiffRatings$Opp1, DiffRatings$Opp2, DiffRatings$Opp3, DiffRatings$Opp4,
DiffRatings$Opp5, DiffRatings$Opp6, DiffRatings$Opp7),
c(DiffRatings$Score1, DiffRatings$Score2, DiffRatings$Score3, DiffRatings$Score4, DiffRatings$Score5, DiffRatings$Score6, DiffRatings$Score7))
DiffRatingsData <- na.omit(DiffRatingsData)
colnames(DiffRatingsData) <- c("Player", "ScoreDifference", "Outcome")
kable(head(DiffRatingsData))
| Player | ScoreDifference | Outcome |
|---|---|---|
| 1 | 358 | 1 |
| 2 | 378 | 1 |
| 3 | -257 | -1 |
| 4 | 353 | 1 |
| 5 | 413 | 1 |
| 6 | 287 | 1 |
Now we’ll do our logistic regression
library(ggplot2)
library(Rcpp)
DiffRatingsData$Outcome <- factor(DiffRatingsData$Outcome)
LogitRatings <- glm(Outcome ~ ScoreDifference, data = DiffRatingsData, family = "binomial")
summary(LogitRatings)
##
## Call:
## glm(formula = Outcome ~ ScoreDifference, family = "binomial",
## data = DiffRatingsData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6202 -0.9197 0.0000 0.9197 2.6202
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.037e-16 1.201e-01 0.000 1
## ScoreDifference 3.204e-03 4.203e-04 7.624 2.46e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 485.20 on 349 degrees of freedom
## Residual deviance: 405.86 on 348 degrees of freedom
## AIC: 409.86
##
## Number of Fisher Scoring iterations: 4
LogitRatings
##
## Call: glm(formula = Outcome ~ ScoreDifference, family = "binomial",
## data = DiffRatingsData)
##
## Coefficients:
## (Intercept) ScoreDifference
## 2.037e-16 3.204e-03
##
## Degrees of Freedom: 349 Total (i.e. Null); 348 Residual
## Null Deviance: 485.2
## Residual Deviance: 405.9 AIC: 409.9
We get a value of 3.204e-03 for the intercept, and zero (more or less) for the intercept. To calculate the odds ratios and probabilities of winning, given a range of score differences, from 300 less than one’s opponent to 300 more:
Diffs <- seq(-300, 300, 100)
Odds <- exp(Diffs*3.204e-03)
WinProb <- data.frame(Diffs, Odds, (Odds/ (Odds+1)))
colnames(WinProb) <- c("Score Difference","Odds Ratio","Probability of Victory")
kable(head(WinProb))
| Score Difference | Odds Ratio | Probability of Victory |
|---|---|---|
| -300 | 0.3824337 | 0.2766380 |
| -200 | 0.5268708 | 0.3450657 |
| -100 | 0.7258586 | 0.4205783 |
| 0 | 1.0000000 | 0.5000000 |
| 100 | 1.3776787 | 0.5794217 |
| 200 | 1.8979987 | 0.6549343 |
Now let’s plot out a graph of the probabilities:
Diffs <- seq(min(DiffRatingsData$ScoreDifference), max(DiffRatingsData$ScoreDifference), 1)
Odds <- exp(Diffs*3.204e-03)
WinProb <- Odds/ (Odds + 1)
plot(Diffs, WinProb, xlab = "Player Rating Difference", ylab = "Probability of Victory", main = "The Probability of Winning as a Function of a Player's Rating Premium", col = "brown", cex = 0.5, axes = FALSE)
axis(1, at = seq(-1250, 1250, 250), pos = 0, las = 0)
axis(2, at = seq(0, 1, 0.25))
Of course, in the interest of full disclosure, there are two flaws with my procedure: I’ve excluded the games that ended in a draw, and I’ve double-counted each game that didn’t end in a draw. The latter does not affect the probability calculations, but it could have been avoided perhaps by randomly choosing from which perspective to score a game (ie each game is a 1 from the winning player’s perspective and a 0 from the loser’s perspective.)
As we’d expect, when players are evenly matched, the probability of winning is 50%, aka fifty-fifty.