In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.
Excerpt from text file:
knitr::include_graphics('/Users/katieevers/Desktop/txtFileExcerpt.png')
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
tournamentInfo <- read.delim("tournamentinfo.txt", header = FALSE, stringsAsFactors = FALSE, sep = "|")
DT::datatable(tournamentInfo, editable = TRUE)
#Remove headers (first 4 rows):
tournamentInfo2 <- tournamentInfo[-c(1,2,3,4),]
#Create separate data frames by extracting every 3rd row to separate name and ranking rows
row1 <- tournamentInfo2[seq(1, nrow(tournamentInfo2), 3), ]
head(row1)
## V1 V2 V3 V4 V5 V6 V7
## 5 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 8 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 11 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 14 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 17 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 20 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## V8 V9 V10 V11
## 5 W 7 D 12 D 4 NA
## 8 W 16 W 20 W 7 NA
## 11 W 11 W 13 W 12 NA
## 14 D 5 W 19 D 1 NA
## 17 D 4 W 14 W 17 NA
## 20 D 10 W 27 W 21 NA
row2 <- tournamentInfo2[seq(2, nrow(tournamentInfo2), 3), ]
head(row2)
## V1 V2 V3 V4 V5 V6 V7
## 6 ON 15445895 / R: 1794 ->1817 N:2 W B W B
## 9 MI 14598900 / R: 1553 ->1663 N:2 B W B W
## 12 MI 14959604 / R: 1384 ->1640 N:2 W B W B
## 15 MI 12616049 / R: 1716 ->1744 N:2 W B W B
## 18 MI 14601533 / R: 1655 ->1690 N:2 B W B W
## 21 OH 15055204 / R: 1686 ->1687 N:3 W B W B
## V8 V9 V10 V11
## 6 W B W NA
## 9 B W B NA
## 12 W B W NA
## 15 W B B NA
## 18 B W B NA
## 21 B W B NA
#Combine the two dataframes so information for each player is on a single row
tournamentInfo3 <- cbind(row1, row2)
DT::datatable(tournamentInfo3, editable = TRUE)
#Remove columns with NA
tournamentInfo4 <- tournamentInfo3[ , colSums(is.na(tournamentInfo3)) == 0]
#Rename columns
names <- c("PairNum", "PlayerName", "TotalPts", "Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7","State","Rating")
names(tournamentInfo4) <- names
tournamentInfo5 <- tournamentInfo4[names]
#Isolate player's pre rating scores:
preRating1 <- gsub("-.*$", "",tournamentInfo4$Rating)
preRating2 <- sub("[^:]*", "",preRating1)
preRating3 <- gsub("P.*$", "",preRating2)
preRating <- str_replace_all(preRating3, "[[:punct:]]", "")
preRating
## [1] " 1794 " " 1553 " " 1384 " " 1716 " " 1655 " " 1686 "
## [7] " 1649 " " 1641" " 1411 " " 1365 " " 1712 " " 1663 "
## [13] " 1666 " " 1610 " " 1220" " 1604 " " 1629 " " 1600 "
## [19] " 1564 " " 1595 " " 1563" " 1555 " " 1363 " " 1229 "
## [25] " 1745 " " 1579 " " 1552 " " 1507 " " 1602" " 1522 "
## [31] " 1494 " " 1441 " " 1449 " " 1399 " " 1438 " " 1355 "
## [37] " 980" " 1423 " " 1436" " 1348 " " 1403" " 1332 "
## [43] " 1283 " " 1199 " " 1242 " " 377" " 1362 " " 1382 "
## [49] " 1291" " 1056 " " 1011 " " 935 " " 1393 " " 1270 "
## [55] " 1186 " " 1153 " " 1092 " " 917 " " 853 " " 967 "
## [61] " 955" " 1530 " " 1175 " " 1163 "
#Add preRating to the dataframe
tournamentInfo6 <- cbind(tournamentInfo5, preRating)
# Format the columns with each round result so only player id numbers remain
tournamentInfo4$Round1 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round1)
tournamentInfo4$Round1 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round1)
tournamentInfo4$Round2 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round2)
tournamentInfo4$Round2 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round2)
tournamentInfo4$Round3 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round3)
tournamentInfo4$Round3 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round3)
tournamentInfo4$Round4 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round4)
tournamentInfo4$Round4 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round4)
tournamentInfo4$Round5 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round5)
tournamentInfo4$Round5 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round5)
tournamentInfo4$Round6 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round6)
tournamentInfo4$Round6 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round6)
tournamentInfo4$Round7 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round7)
tournamentInfo4$Round7 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round7)
DT::datatable(tournamentInfo4, editable = TRUE)
# Create a data frame that only contains the rounds columns
opponentId <- tournamentInfo4[,c("Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7")]
#convert chr to num
opponentId[] <- lapply(opponentId, function(x) as.numeric(as.character(x)))
# Create a lookup table with player id and corresponding pre-rating
ratingLookup <- tournamentInfo6[,c("PairNum","preRating")]
ratingLookup[] <- lapply(ratingLookup, function(x) as.numeric(as.character(x)))
#Use a for loop to replace player id with rating
for (i in 1:46) {
for (j in 1:7) {
opponentId[i,j] <- ratingLookup[ratingLookup$PairNum == opponentId[i,j], 2][1]
}
}
#Replace NA with 0
opponentId[is.na(opponentId)] <- 0
DT::datatable(opponentId, editable = TRUE)
#calculate average opponents' ratings by taking mean for each row
averageOpponentsRating <- round(rowMeans(opponentId, na.rm=TRUE))
averageOpponentsRating<-averageOpponentsRating[1:46]
#Redo replacement for rows 47-64
replace <- opponentId[47:64,]
for (i in 1:18) {
for (j in 1:7) {
replace[i,j] <- ratingLookup[ratingLookup$PairNum == replace[i,j], 2][1]
}
}
#Replace NA with 0
replace[is.na(replace)] <- 0
#Calculate mean for each row
averageOpponentsRating1 <- round(rowMeans(replace, na.rm=TRUE))
averageOpponentsRating <- c(averageOpponentsRating, averageOpponentsRating1)
averageOpponentsRating
## 5 8 11 14 17 20 23 26 29 32 35 38 41 44 47
## 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1291 1498 1515 1484
## 50 53 56 59 62 65 68 71 74 77 80 83 86 89 92
## 990 1499 1480 1426 1411 1470 1115 1214 1357 1363 1507 1047 1522 1126 1144
## 95 98 101 104 107 110 113 116 119 122 125 128 131 134 137
## 1260 1379 1277 1375 1150 1190 989 1319 1430 1391 713 1150 1107 1137 1152
## 140 143 146 149 152 155 158 161 164 167 170 173 176 179 182
## 1358 1392 968 918 1111 1356 1495 577 1034 1205 1010 1168 1192 1131 950
## 185 188 191 194
## 1327 169 964 1263
tournamentInfo7 <- tournamentInfo6[,c("PlayerName", "State", "TotalPts", "preRating")]
finalTable <- cbind(tournamentInfo7, averageOpponentsRating)
DT::datatable(finalTable, editable = TRUE)
#Generate .csv file of final dataframe
write.csv(finalTable, file = "607Project1result.csv")