Contents

  1. Project overview
  2. Import data and load Library
  3. Clean and format data
  4. Calculate opponents’ average ratings
  5. Final dataframe

Project overview

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605

The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments.

Excerpt from text file:

knitr::include_graphics('/Users/katieevers/Desktop/txtFileExcerpt.png')

Import data and load library:

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  2.0.1     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
tournamentInfo <- read.delim("tournamentinfo.txt", header = FALSE, stringsAsFactors = FALSE, sep = "|")
DT::datatable(tournamentInfo, editable = TRUE)

Clean and format data:

#Remove headers (first 4 rows):
tournamentInfo2 <- tournamentInfo[-c(1,2,3,4),]

#Create separate data frames by extracting every 3rd row to separate name and ranking rows
row1 <- tournamentInfo2[seq(1, nrow(tournamentInfo2), 3), ]
head(row1)
##        V1                                V2    V3    V4    V5    V6    V7
## 5      1   GARY HUA                         6.0   W  39 W  21 W  18 W  14
## 8      2   DAKSHESH DARURI                  6.0   W  63 W  58 L   4 W  17
## 11     3   ADITYA BAJAJ                     6.0   L   8 W  61 W  25 W  21
## 14     4   PATRICK H SCHILLING              5.5   W  23 D  28 W   2 W  26
## 17     5   HANSHI ZUO                       5.5   W  45 W  37 D  12 D  13
## 20     6   HANSEN SONG                      5.0   W  34 D  29 L  11 W  35
##       V8    V9   V10 V11
## 5  W   7 D  12 D   4  NA
## 8  W  16 W  20 W   7  NA
## 11 W  11 W  13 W  12  NA
## 14 D   5 W  19 D   1  NA
## 17 D   4 W  14 W  17  NA
## 20 D  10 W  27 W  21  NA
row2 <- tournamentInfo2[seq(2, nrow(tournamentInfo2), 3), ]
head(row2)
##        V1                                V2    V3    V4    V5    V6    V7
## 6     ON   15445895 / R: 1794   ->1817      N:2   W     B     W     B    
## 9     MI   14598900 / R: 1553   ->1663      N:2   B     W     B     W    
## 12    MI   14959604 / R: 1384   ->1640      N:2   W     B     W     B    
## 15    MI   12616049 / R: 1716   ->1744      N:2   W     B     W     B    
## 18    MI   14601533 / R: 1655   ->1690      N:2   B     W     B     W    
## 21    OH   15055204 / R: 1686   ->1687      N:3   W     B     W     B    
##       V8    V9   V10 V11
## 6  W     B     W      NA
## 9  B     W     B      NA
## 12 W     B     W      NA
## 15 W     B     B      NA
## 18 B     W     B      NA
## 21 B     W     B      NA
#Combine the two dataframes so information for each player is on a single row
tournamentInfo3 <- cbind(row1, row2)
DT::datatable(tournamentInfo3, editable = TRUE)
#Remove columns with NA
tournamentInfo4 <- tournamentInfo3[ , colSums(is.na(tournamentInfo3)) == 0]

#Rename columns
names <- c("PairNum", "PlayerName", "TotalPts", "Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7","State","Rating")

names(tournamentInfo4) <- names
tournamentInfo5 <- tournamentInfo4[names]

#Isolate player's pre rating scores:
preRating1 <- gsub("-.*$", "",tournamentInfo4$Rating)
preRating2 <- sub("[^:]*", "",preRating1)
preRating3 <- gsub("P.*$", "",preRating2)
preRating <- str_replace_all(preRating3, "[[:punct:]]", "")
preRating
##  [1] " 1794   " " 1553   " " 1384   " " 1716   " " 1655   " " 1686   "
##  [7] " 1649   " " 1641"    " 1411   " " 1365   " " 1712   " " 1663   "
## [13] " 1666   " " 1610   " " 1220"    " 1604   " " 1629   " " 1600   "
## [19] " 1564   " " 1595   " " 1563"    " 1555   " " 1363   " " 1229   "
## [25] " 1745   " " 1579   " " 1552   " " 1507   " " 1602"    " 1522   "
## [31] " 1494   " " 1441   " " 1449   " " 1399   " " 1438   " " 1355   "
## [37] "  980"    " 1423   " " 1436"    " 1348   " " 1403"    " 1332   "
## [43] " 1283   " " 1199   " " 1242   " "  377"    " 1362   " " 1382   "
## [49] " 1291"    " 1056   " " 1011   " "  935   " " 1393   " " 1270   "
## [55] " 1186   " " 1153   " " 1092   " "  917   " "  853   " "  967   "
## [61] "  955"    " 1530   " " 1175   " " 1163   "
#Add preRating to the dataframe
tournamentInfo6 <- cbind(tournamentInfo5, preRating)

# Format the columns with each round result so only player id numbers remain
tournamentInfo4$Round1 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round1)
tournamentInfo4$Round1 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round1)

tournamentInfo4$Round2 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round2)
tournamentInfo4$Round2 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round2)

tournamentInfo4$Round3 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round3)
tournamentInfo4$Round3 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round3)

tournamentInfo4$Round4 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round4)
tournamentInfo4$Round4 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round4)

tournamentInfo4$Round5 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round5)
tournamentInfo4$Round5 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round5)

tournamentInfo4$Round6 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round6)
tournamentInfo4$Round6 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round6)

tournamentInfo4$Round7 <- sub("[[:alnum:]][[:space:]]+([[:digit:]]+)", "\\1", tournamentInfo4$Round7)
tournamentInfo4$Round7 <- sub("[[:upper:]]+", "0", tournamentInfo4$Round7)

DT::datatable(tournamentInfo4, editable = TRUE)

Calculate opponents’ average ratings:

# Create a data frame that only contains the rounds columns
opponentId <- tournamentInfo4[,c("Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7")]

#convert chr to num
opponentId[] <- lapply(opponentId, function(x) as.numeric(as.character(x)))

# Create a lookup table with player id and corresponding pre-rating
ratingLookup <- tournamentInfo6[,c("PairNum","preRating")]

ratingLookup[] <- lapply(ratingLookup, function(x) as.numeric(as.character(x)))

#Use a for loop to replace player id with rating
for (i in 1:46) {
  for (j in 1:7) {
    opponentId[i,j] <- ratingLookup[ratingLookup$PairNum == opponentId[i,j], 2][1]
  }
}

#Replace NA with 0
opponentId[is.na(opponentId)] <- 0

DT::datatable(opponentId, editable = TRUE)
#calculate average opponents' ratings by taking mean for each row
averageOpponentsRating <- round(rowMeans(opponentId, na.rm=TRUE))
averageOpponentsRating<-averageOpponentsRating[1:46]

#Redo replacement for rows 47-64
replace <- opponentId[47:64,]

for (i in 1:18) {
  for (j in 1:7) {
    replace[i,j] <- ratingLookup[ratingLookup$PairNum == replace[i,j], 2][1]
  }
}

#Replace NA with 0
replace[is.na(replace)] <- 0

#Calculate mean for each row
averageOpponentsRating1 <- round(rowMeans(replace, na.rm=TRUE))

averageOpponentsRating <- c(averageOpponentsRating, averageOpponentsRating1)
averageOpponentsRating
##    5    8   11   14   17   20   23   26   29   32   35   38   41   44   47 
## 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1291 1498 1515 1484 
##   50   53   56   59   62   65   68   71   74   77   80   83   86   89   92 
##  990 1499 1480 1426 1411 1470 1115 1214 1357 1363 1507 1047 1522 1126 1144 
##   95   98  101  104  107  110  113  116  119  122  125  128  131  134  137 
## 1260 1379 1277 1375 1150 1190  989 1319 1430 1391  713 1150 1107 1137 1152 
##  140  143  146  149  152  155  158  161  164  167  170  173  176  179  182 
## 1358 1392  968  918 1111 1356 1495  577 1034 1205 1010 1168 1192 1131  950 
##  185  188  191  194 
## 1327  169  964 1263

Create final dataframe with desired information

tournamentInfo7 <- tournamentInfo6[,c("PlayerName", "State", "TotalPts", "preRating")]

finalTable <- cbind(tournamentInfo7, averageOpponentsRating)
DT::datatable(finalTable, editable = TRUE)
#Generate .csv file of final dataframe
write.csv(finalTable, file = "607Project1result.csv")