If the package aod is not already installed, you must install it:

install.packages(“aod”)

Load in the stringr library and the dataset TournamentInfo

library(stringr)
library(knitr)
TournamentInfo <- read.csv(url("https://raw.githubusercontent.com/AsherMeyers/DATA-607/master/tournamentinfo.txt"),
                           sep = "|", stringsAsFactors=FALSE,
                           col.names = c("Pair", "Name", "Total", "R1", "R2", "R3", "R4", "R5", "R6", "R7", ""), 
                           skip = 1)

TournamentInfo <- TournamentInfo[seq(-2,-194,-3),]
kable(head(TournamentInfo))
Pair Name Total R1 R2 R3 R4 R5 R6 R7 X
1 Num USCF ID / Rtg (Pre->Post) Pts 1 2 3 4 5 6 7 NA
3 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4 NA
4 ON 15445895 / R: 1794 ->1817 N:2 W B W B W B W NA
6 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20 W 7 NA
7 MI 14598900 / R: 1553 ->1663 N:2 B W B W B W B NA
9 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11 W 13 W 12 NA

Reorganize the TournamentInfo table so that each player’s info falls only on a single line.

JoinedRows <- data.frame(TournamentInfo[seq(2,2*194/3,2),],TournamentInfo[seq(3,2*194/3,2),])
kable(head(JoinedRows))
Pair Name Total R1 R2 R3 R4 R5 R6 R7 X Pair.1 Name.1 Total.1 R1.1 R2.1 R3.1 R4.1 R5.1 R6.1 R7.1 X.1
3 1 GARY HUA 6.0 W 39 W 21 W 18 W 14 W 7 D 12 D 4 NA ON 15445895 / R: 1794 ->1817 N:2 W B W B W B W NA
6 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17 W 16 W 20 W 7 NA MI 14598900 / R: 1553 ->1663 N:2 B W B W B W B NA
9 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21 W 11 W 13 W 12 NA MI 14959604 / R: 1384 ->1640 N:2 W B W B W B W NA
12 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26 D 5 W 19 D 1 NA MI 12616049 / R: 1716 ->1744 N:2 W B W B W B B NA
15 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13 D 4 W 14 W 17 NA MI 14601533 / R: 1655 ->1690 N:2 B W B W B W B NA
18 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35 D 10 W 27 W 21 NA OH 15055204 / R: 1686 ->1687 N:3 W B W B B W B NA

Pre-Rating Location: Identify the start and end positions of the player’s rating in a string, and then identify the content of that substring and save it as the player’s pre-tournament rating.

PreRatingLoc <- c(str_locate(JoinedRows[,13],"R: ")[1,2]+1, str_locate(JoinedRows[,13], "R: ")[1,2]+4)
PreRatings <- as.integer(str_sub(JoinedRows[,13],PreRatingLoc[1],PreRatingLoc[2]))

kable(head(PreRatings))
1794
1553
1384
1716
1655
1686

Listing of each player’s opponents

Opponents <- data.frame(1, as.list(as.numeric(str_extract(JoinedRows[1,4:10],"[0-9]{1,2}"))))
colnames(Opponents) <- c("Number", "Opp1","Opp2","Opp3","Opp4","Opp5","Opp6","Opp7")
for (i in 2:64) {
  Opponents <- rbind(Opponents, c(i, as.numeric(str_extract(JoinedRows[i,4:10],"[0-9]{1,2}"))))
}
kable(head(Opponents))
Number Opp1 Opp2 Opp3 Opp4 Opp5 Opp6 Opp7
1 39 21 18 14 7 12 4
2 63 58 4 17 16 20 7
3 8 61 25 21 11 13 12
4 23 28 2 26 5 19 1
5 45 37 12 13 4 14 17
6 34 29 11 35 10 27 21

Create a listing of each player’s opponent’s ratings, along with a column for the the number of rounds not played (i.e. column NullCount), and an average rating; for the average, the denominator is the number of games played, which ranges from 2 to 7.

OppRatings <- data.frame(c(1:64), PreRatings, PreRatings[Opponents$Opp1[1:64]],
                         PreRatings[Opponents$Opp2[1:64]],
                         PreRatings[Opponents$Opp3[1:64]],
                         PreRatings[Opponents$Opp4[1:64]],
                         PreRatings[Opponents$Opp5[1:64]],
                         PreRatings[Opponents$Opp6[1:64]],
                         PreRatings[Opponents$Opp7[1:64]])
colnames(OppRatings) <- c("Player","PlayerRating", "Opp1R","Opp2R","Opp3R","Opp4R","Opp5R","Opp6R","Opp7R")
OppRatings$Average <- round(rowMeans(OppRatings[3:9],na.rm=TRUE), digits=0)
kable(head(OppRatings))
Player PlayerRating Opp1R Opp2R Opp3R Opp4R Opp5R Opp6R Opp7R Average
1 1794 1436 1563 1600 1610 1649 1663 1716 1605
2 1553 1175 917 1716 1629 1604 1595 1649 1469
3 1384 1641 955 1745 1563 1712 1666 1663 1564
4 1716 1363 1507 1553 1579 1655 1564 1794 1574
5 1655 1242 980 1663 1666 1716 1610 1629 1501
6 1686 1399 1602 1712 1438 1365 1552 1563 1519

Now we have all the desired components of our table. Write a table with Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre-Chess Rating of Opponent

ChessTable <- data.frame(c(1:64), sapply(JoinedRows[2],str_trim),JoinedRows[12],JoinedRows[3], PreRatings, OppRatings$Average)
colnames(ChessTable) <- c("Number", "Name", "State", "Points", "PreRating","OpponentAverage")
kable(head(ChessTable))
Number Name State Points PreRating OpponentAverage
3 1 GARY HUA ON 6.0 1794 1605
6 2 DAKSHESH DARURI MI 6.0 1553 1469
9 3 ADITYA BAJAJ MI 6.0 1384 1564
12 4 PATRICK H SCHILLING MI 5.5 1716 1574
15 5 HANSHI ZUO MI 5.5 1655 1501
18 6 HANSEN SONG OH 5.0 1686 1519

First, we’ll compile the difference in ratings for each game, along with the outcome, in an easy to read table.

DiffRatings <- data.frame(c(1:64), OppRatings$PlayerRating-OppRatings$Opp1R,
                                   OppRatings$PlayerRating-OppRatings$Opp2R, 
                                   OppRatings$PlayerRating-OppRatings$Opp3R, 
                                   OppRatings$PlayerRating-OppRatings$Opp4R, 
                                   OppRatings$PlayerRating-OppRatings$Opp5R, 
                                   OppRatings$PlayerRating-OppRatings$Opp6R, 
                                   OppRatings$PlayerRating-OppRatings$Opp7R,
                                   str_sub(JoinedRows$R1, 1, 1),
                                   str_sub(JoinedRows$R2, 1, 1),
                                   str_sub(JoinedRows$R3, 1, 1),
                                   str_sub(JoinedRows$R4, 1, 1),
                                   str_sub(JoinedRows$R5, 1, 1),
                                   str_sub(JoinedRows$R6, 1, 1),
                                   str_sub(JoinedRows$R7, 1, 1), stringsAsFactors=FALSE)
colnames(DiffRatings) <- c("Player", "Opp1", "Opp2", "Opp3", "Opp4", "Opp5", "Opp6", "Opp7", "Score1", "Score2", "Score3", "Score4", "Score5","Score6", "Score7")

DiffRatings[DiffRatings == "W"] <- 1
DiffRatings[DiffRatings == "D"] <- NA_integer_
DiffRatings[DiffRatings == "L"] <- -1
DiffRatings[DiffRatings == "H"] <- NA_integer_
DiffRatings[DiffRatings == "U"] <- NA_integer_
DiffRatings[DiffRatings == "X"] <- NA_integer_
DiffRatings[DiffRatings == "B"] <- NA_integer_
kable(head(DiffRatings))
Player Opp1 Opp2 Opp3 Opp4 Opp5 Opp6 Opp7 Score1 Score2 Score3 Score4 Score5 Score6 Score7
1 358 231 194 184 145 131 78 1 1 1 1 1 NA NA
2 378 636 -163 -76 -51 -42 -96 1 1 -1 1 1 1 1
3 -257 429 -361 -179 -328 -282 -279 -1 1 1 1 1 1 1
4 353 209 163 137 61 152 -78 1 NA 1 1 NA 1 NA
5 413 675 -8 -11 -61 45 26 1 1 NA NA NA 1 1
6 287 84 -26 248 321 134 123 1 NA -1 1 NA 1 1

Now, let’s reformat the table into a format that’s easier to analyze.

DiffRatingsData <- data.frame(rep.int(1:64,7), 
                              c(DiffRatings$Opp1, DiffRatings$Opp2, DiffRatings$Opp3, DiffRatings$Opp4,
                                DiffRatings$Opp5, DiffRatings$Opp6, DiffRatings$Opp7),
                              c(DiffRatings$Score1, DiffRatings$Score2, DiffRatings$Score3, DiffRatings$Score4,                                         DiffRatings$Score5, DiffRatings$Score6, DiffRatings$Score7))
DiffRatingsData <- na.omit(DiffRatingsData)
colnames(DiffRatingsData) <- c("Player", "ScoreDifference", "Outcome")
kable(head(DiffRatingsData))
Player ScoreDifference Outcome
1 358 1
2 378 1
3 -257 -1
4 353 1
5 413 1
6 287 1

Now we’ll do our logistic regression

library(ggplot2)
library(Rcpp)

DiffRatingsData$Outcome <- factor(DiffRatingsData$Outcome)
LogitRatings <- glm(Outcome ~ ScoreDifference, data = DiffRatingsData, family = "binomial")
summary(LogitRatings)
## 
## Call:
## glm(formula = Outcome ~ ScoreDifference, family = "binomial", 
##     data = DiffRatingsData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6202  -0.9197   0.0000   0.9197   2.6202  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.037e-16  1.201e-01   0.000        1    
## ScoreDifference 3.204e-03  4.203e-04   7.624 2.46e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 485.20  on 349  degrees of freedom
## Residual deviance: 405.86  on 348  degrees of freedom
## AIC: 409.86
## 
## Number of Fisher Scoring iterations: 4
LogitRatings
## 
## Call:  glm(formula = Outcome ~ ScoreDifference, family = "binomial", 
##     data = DiffRatingsData)
## 
## Coefficients:
##     (Intercept)  ScoreDifference  
##       2.037e-16        3.204e-03  
## 
## Degrees of Freedom: 349 Total (i.e. Null);  348 Residual
## Null Deviance:       485.2 
## Residual Deviance: 405.9     AIC: 409.9

We get a value of 3.204e-03 for the intercept, and zero (more or less) for the intercept. To calculate the odds ratios and probabilities of winning, given a range of score differences, from 300 less than one’s opponent to 300 more:

Diffs <- seq(-300, 300, 100)
Odds <- exp(Diffs*3.204e-03)
WinProb <- data.frame(Diffs, Odds, (Odds/ (Odds+1)))
colnames(WinProb) <- c("Score Difference","Odds Ratio","Probability of Victory")
kable(head(WinProb))
Score Difference Odds Ratio Probability of Victory
-300 0.3824337 0.2766380
-200 0.5268708 0.3450657
-100 0.7258586 0.4205783
0 1.0000000 0.5000000
100 1.3776787 0.5794217
200 1.8979987 0.6549343

Now let’s plot out a graph of the probabilities:

Diffs <- seq(min(DiffRatingsData$ScoreDifference), max(DiffRatingsData$ScoreDifference), 1)
Odds <- exp(Diffs*3.204e-03)
WinProb <- Odds/ (Odds + 1)
plot(Diffs, WinProb, xlab = "Player Rating Difference", ylab = "Probability of Victory", main = "The Probability of Winning as a Function of a Player's Rating Premium", col = "brown", cex = 0.5, axes = FALSE)
axis(1, at = seq(-1250, 1250, 250), pos = 0, las = 0)
axis(2, at = seq(0, 1, 0.25))

Of course, in the interest of full disclosure, there are two flaws with my procedure: I’ve excluded the games that ended in a draw, and I’ve double-counted each game that didn’t end in a draw. The latter does not affect the probability calculations, but it could have been avoided perhaps by randomly choosing from which perspective to score a game (ie each game is a 1 from the winning player’s perspective and a 0 from the loser’s perspective.)

As we’d expect, when players are evenly matched, the probability of winning is 50%, aka fifty-fifty.