library(stringr)
library(DT)
# Read the raw tournament cross table into a character vector, one element
# per line of the text file. The path is hard-coded to the author's Desktop.
tournamentraw <- readLines("~/Desktop/tournamentinfo.txt")
## Warning in readLines("~/Desktop/tournamentinfo.txt"): incomplete final line
## found on '~/Desktop/tournamentinfo.txt'
# Peek at the first six raw lines (header, separators, first player record).
head(tournamentraw, n = 6L)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Peek at the last six raw lines to check how the file ends.
tail(tournamentraw, n = 6L)
## [1] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [2] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [5] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Dimensions of the raw input when treated as a one-column table:
# number of lines read, then number of columns.
NROW(x = tournamentraw)
## [1] 196
NCOL(x = tournamentraw)
## [1] 1
Primary objective: bring all of the data for each player into one row and extract the requested attributes.
# Total number of lines read; used below as the upper bound when selecting
# every third line of the file.
tournamentrows <- NROW(tournamentraw)
print(tournamentrows)
## [1] 196
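Start with the fifth row to account for the header and hyphens, then take every third row, since each player record spans three lines (name line, state line, separator).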
# Select the first line of every player record: line 5, then every third
# line after it up to the end of the file.
PlayerNameRows <- tournamentraw[seq(from = 5, to = tournamentrows, by = 3)]
head(PlayerNameRows, n = 6L)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
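Start with the sixth row and take every third row to collect the second line of each player record (state, USCF ID and ratings).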
# Select the second line of every player record: line 6, then every third
# line after it up to the end of the file.
PlayerStateRows <- tournamentraw[seq(from = 6, to = tournamentrows, by = 3)]
head(PlayerStateRows, n = 6L)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
From PlayerNameRows, extract the Player’s Name
# Extract each player's name as the complete second "|"-delimited field of
# the name line, with the surrounding padding trimmed. Taking the whole field
# (instead of matching two or three space-separated words) keeps names with
# four or more words, hyphens or apostrophes intact.
PlayerName <- str_trim(str_split_fixed(PlayerNameRows, "\\|", n = 3)[, 2])
head(PlayerName)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
From PlayerStateRows, extract the Player State by taking the first word in the string (the two-letter state or province code).
# The state code is the first run of word characters on each state line.
PlayerState <- str_extract(string = PlayerStateRows, pattern = "\\w+")
head(PlayerState, n = 6L)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
From the PlayerNameRows, extract the Total Points.
# Total points: the first decimal number (digits, dot, digits) on the name
# line, converted from text to numeric.
TotalPoints <- str_extract(string = PlayerNameRows, pattern = "\\d+\\.\\d+")
TotalPoints <- as.numeric(TotalPoints)
head(TotalPoints, n = 6L)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
From the PlayerStateRows, extract the Player’s Pre-Rating by first taking the string and then pulling out the number to remove the spaces.
# Pre-tournament rating: the run of digits that directly follows the "R:"
# label on the state line. Anchoring on the label avoids depending on the
# rating having exactly 3-4 digits or on no other 3-4 digit number appearing
# earlier on the line. The capture group is column 2 of the str_match()
# result matrix.
PlayerPreRating <- as.integer(str_match(PlayerStateRows, "R:\\s*(\\d+)")[, 2])
head(PlayerPreRating)
## [1] 1794 1553 1384 1716 1655 1686
From the PlayerNameRows, determine who the opponents of each player were.
# Opponent pair numbers for each player: every run of digits that sits
# directly before a "|" on the name line (rounds without an opponent have no
# digits there and contribute nothing). The digits are captured per row in
# one pass; str_match_all() returns one matrix per row whose second column
# is the capture group, so the result is a list of character vectors.
FindOpponents <- lapply(
  str_match_all(PlayerNameRows, "(\\d+)\\|"),
  function(round_matches) round_matches[, 2]
)
head(FindOpponents)
## [[1]]
## [1] "39" "21" "18" "14" "7" "12" "4"
##
## [[2]]
## [1] "63" "58" "4" "17" "16" "20" "7"
##
## [[3]]
## [1] "8" "61" "25" "21" "11" "13" "12"
##
## [[4]]
## [1] "23" "28" "2" "26" "5" "19" "1"
##
## [[5]]
## [1] "45" "37" "12" "13" "4" "14" "17"
##
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"
From the PlayerNameRows, extract the pair numbers for use in the opponent pre-rating calculation.
# Pair number: the first run of digits on each name line, as an integer.
Pair <- str_extract(string = PlayerNameRows, pattern = "\\d+")
Pair <- as.integer(Pair)
head(Pair, n = 6L)
## [1] 1 2 3 4 5 6
Run a loop to calculate the mean rating of the opponents of each player using the Pair numbers for all rows.
# Mean pre-rating of each player's opponents.
# For player i, FindOpponents[[i]] holds the opponents' pair numbers as text.
# match() translates those pair numbers into row positions via Pair, so the
# lookup does not rely on pair numbers coinciding with row order.
# vapply() over seq_along() yields a double vector of the right length
# (including length zero for empty input).
AveOpponentRating <- vapply(
  seq_along(Pair),
  function(i) {
    opponent_pairs <- as.integer(FindOpponents[[i]])
    mean(PlayerPreRating[match(opponent_pairs, Pair)])
  },
  numeric(1)
)
head(AveOpponentRating)
## [1] 1605.286 1469.286 1563.571 1573.571 1500.857 1518.714
# Round the opponent averages to whole rating points.
AveOpponentRating <- round(AveOpponentRating, digits = 0)
head(AveOpponentRating, n = 6L)
## [1] 1605 1469 1564 1574 1501 1519
Put together all of the extracted data into a final set.
# Assemble the extracted vectors into one data frame, one row per player.
# Column names are spelled out and match the source vector names.
FinalData <- data.frame(
  PlayerName = PlayerName,
  PlayerState = PlayerState,
  TotalPoints = TotalPoints,
  PlayerPreRating = PlayerPreRating,
  AveOpponentRating = AveOpponentRating
)
head(FinalData, n = 6L)
## PlayerName PlayerState TotalPoints PlayerPreRating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## AveOpponentRating
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
# Replace the working column names with human-readable headers.
names(FinalData) <- c(
  "Player's Name",
  "Player's State",
  "Total Number of Points",
  "Player's Pre-Rating",
  "Average Rating of Opponents"
)
head(FinalData, n = 6L)
## Player's Name Player's State Total Number of Points
## 1 GARY HUA ON 6.0
## 2 DAKSHESH DARURI MI 6.0
## 3 ADITYA BAJAJ MI 6.0
## 4 PATRICK H SCHILLING MI 5.5
## 5 HANSHI ZUO MI 5.5
## 6 HANSEN SONG OH 5.0
## Player's Pre-Rating Average Rating of Opponents
## 1 1794 1605
## 2 1553 1469
## 3 1384 1564
## 4 1716 1574
## 5 1655 1501
## 6 1686 1519
Export final data to a new CSV file.
# Write the final table to CSV in the working directory. row.names = FALSE
# keeps the file to the five data columns; by default write.csv() would add
# an extra unnamed column holding the row index.
write.csv(
  FinalData,
  file = "Chess Data Extracted Jill Anderson.csv",
  row.names = FALSE
)
Improve the data table for display.
# Render the final table as an interactive DT widget.
DT::datatable(data = FinalData)
Comparing the individual Players’ Pre-Ratings to the Average Opponents’ Pre-Ratings, one would expect the means to be nearly identical and the medians to be similar. Because each opponent value is itself an average over several players, the min of the individual Players’ Pre-Ratings should be lower, and the max higher, than the corresponding values of the Average Opponents’ Pre-Ratings; this would indicate that opponent pre-ratings were spread fairly evenly across all players.
# Five-number summary plus mean of the players' own pre-ratings.
summary(FinalData[["Player's Pre-Rating"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 377 1227 1407 1378 1583 1794
# Five-number summary plus mean of the average opponent ratings.
summary(FinalData[["Average Rating of Opponents"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1107 1310 1382 1379 1481 1605
# Histogram of player pre-ratings (30 suggested breaks).
hist(
  FinalData[["Player's Pre-Rating"]],
  breaks = 30,
  main = "Distribution of Player Ratings Pre-Tournament",
  xlab = "Player's Pre-Rating",
  ylab = "Count"
)
# Histogram of total points scored (10 suggested breaks).
hist(
  FinalData[["Total Number of Points"]],
  breaks = 10,
  main = "Distribution of Total Points",
  xlab = "Total Points per Player",
  ylab = "Count"
)
# One-row summary for players whose state is "MI": player count, and the
# rounded means of total points, pre-rating and average opponent rating.
# The filter uses FinalData's own "Player's State" column. FinalData has no
# column called PlayerState at this point, so a bare `PlayerState` condition
# would silently fall back to the global vector of that name.
FinalDataMI <- subset(FinalData, `Player's State` == "MI")
FinalDataMI_Num <- NROW(FinalDataMI)
FinalDataMI_MeanPoints <- round(mean(FinalDataMI$`Total Number of Points`))
FinalDataMI_PlayerPreRating <- round(mean(FinalDataMI$`Player's Pre-Rating`))
FinalDataMI_AveOpponent <- round(mean(FinalDataMI$`Average Rating of Opponents`))
FinalDataMISum <- data.frame(
  "MI",
  FinalDataMI_Num,
  FinalDataMI_MeanPoints,
  FinalDataMI_PlayerPreRating,
  FinalDataMI_AveOpponent
)
colnames(FinalDataMISum) <- c(
  "State",
  "Number of Players",
  "Average Total Number of Points",
  "Average Player Pre-Rating",
  "Average Rating of Opponents"
)
FinalDataMISum
## State Number of Players Average Total Number of Points
## 1 MI 55 3
## Average Player Pre-Rating Average Rating of Opponents
## 1 1362 1374
# One-row summary for players whose state is "ON": player count, and the
# rounded means of total points, pre-rating and average opponent rating.
# The filter uses FinalData's own "Player's State" column. FinalData has no
# column called PlayerState at this point, so a bare `PlayerState` condition
# would silently fall back to the global vector of that name.
FinalDataON <- subset(FinalData, `Player's State` == "ON")
FinalDataON_Num <- NROW(FinalDataON)
FinalDataON_MeanPoints <- round(mean(FinalDataON$`Total Number of Points`))
FinalDataON_PlayerPreRating <- round(mean(FinalDataON$`Player's Pre-Rating`))
FinalDataON_AveOpponent <- round(mean(FinalDataON$`Average Rating of Opponents`))
FinalDataONSum <- data.frame(
  "ON",
  FinalDataON_Num,
  FinalDataON_MeanPoints,
  FinalDataON_PlayerPreRating,
  FinalDataON_AveOpponent
)
colnames(FinalDataONSum) <- c(
  "State",
  "Number of Players",
  "Average Total Number of Points",
  "Average Player Pre-Rating",
  "Average Rating of Opponents"
)
FinalDataONSum
## State Number of Players Average Total Number of Points
## 1 ON 8 4
## Average Player Pre-Rating Average Rating of Opponents
## 1 1454 1396
# One-row summary for players whose state is "OH": player count, and the
# rounded means of total points, pre-rating and average opponent rating.
# The filter uses FinalData's own "Player's State" column. FinalData has no
# column called PlayerState at this point, so a bare `PlayerState` condition
# would silently fall back to the global vector of that name.
FinalDataOH <- subset(FinalData, `Player's State` == "OH")
FinalDataOH_Num <- NROW(FinalDataOH)
FinalDataOH_MeanPoints <- round(mean(FinalDataOH$`Total Number of Points`))
FinalDataOH_PlayerPreRating <- round(mean(FinalDataOH$`Player's Pre-Rating`))
FinalDataOH_AveOpponent <- round(mean(FinalDataOH$`Average Rating of Opponents`))
FinalDataOHSum <- data.frame(
  "OH",
  FinalDataOH_Num,
  FinalDataOH_MeanPoints,
  FinalDataOH_PlayerPreRating,
  FinalDataOH_AveOpponent
)
colnames(FinalDataOHSum) <- c(
  "State",
  "Number of Players",
  "Average Total Number of Points",
  "Average Player Pre-Rating",
  "Average Rating of Opponents"
)
FinalDataOHSum
## State Number of Players Average Total Number of Points
## 1 OH 1 5
## Average Player Pre-Rating Average Rating of Opponents
## 1 1686 1519
This comparison shows MI had the most representation with 55 players and OH had the least with 1. The one player from OH had a higher pre-rating than the average pre-ratings of the players from MI and ON, and also played opponents with a higher average rating than the MI and ON averages, though a single player is not enough to draw conclusions. Comparing MI and ON instead, the table shows ON has a higher average player pre-rating.
# Stack the three per-state summary rows (MI, ON, OH) into one table and
# render it as an interactive DT widget.
state_summaries <- list(FinalDataMISum, FinalDataONSum, FinalDataOHSum)
FinalDataSum <- do.call(rbind, state_summaries)
DT::datatable(data = FinalDataSum)