In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605 1605 was calculated by using the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, 1716, and dividing by the total number of games played.
The original datafile will be hosted in Github for consumption.
url <- "https://raw.githubusercontent.com/sortega7878/DATA607PROJ1/master/tournamentinfo.txt"
download_file <- "tournamentinfo.txt" # name of the file on the local machine after download
# the file will be downloaded to the working directory
downloader::download(url, download_file)
# read the file into a vector
conn <- file(download_file, open="r")
tournament.info.raw <- readLines(conn, warn = FALSE)
close(conn)
tournament.info <- str_replace_all(str_replace_all(tournament.info.raw, "->", ">>"), "-{3,}", "")
tournament.info <- tournament.info[tournament.info != ""]
tournament.info <- tournament.info[-(1:2)]
# determine the format by looking at the contents of the first field in the record
tournament.format1 <- tournament.info[str_detect(str_sub(tournament.info, 1, 6), "[0-9]")]
tournament.format2 <- tournament.info[str_detect(str_sub(tournament.info, 1, 6), "[A-Z]{2,2}")]
Displaying the two different lines of records separated in two different vectors.
head(tournament.format1, 5)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
head(tournament.format2, 5)
## [1] " ON | 15445895 / R: 1794 >>1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 >>1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 >>1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 >>1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 >>1690 |N:2 |B |W |B |W |B |W |B |"
Now that we separate the two lines of the records we need to parse the contents of each with regular expressions.
First line:
format1.df <- data.frame(player.num = as.numeric(substr(tournament.format1, 1, 6)),
player.name = str_trim(substr(tournament.format1, 8, 40), side="both"),
totalpts = as.numeric(substr(tournament.format1, 42, 46)),
round1 = substr(tournament.format1, 48, 52),
round2 = substr(tournament.format1, 54, 58),
round3 = substr(tournament.format1, 60, 64),
round4 = substr(tournament.format1, 66, 70),
round5 = substr(tournament.format1, 72, 76),
round6 = substr(tournament.format1, 78, 82),
round7 = substr(tournament.format1, 84, 88), stringsAsFactors=FALSE)
Second line:
format2.df <- data.frame(player_state = str_trim(substr(tournament.format2, 1, 6), side="both"),
uscf.id = str_extract(substr(tournament.format2, 8, 40), "\\d+"),
prerating = as.numeric(str_extract(substr(tournament.format2, 8, 40), "(?<=R: ).\\d+(?=)")),
postrating = as.numeric(str_extract(substr(tournament.format2, 8, 40), "(?<=>>).\\d+(?=)")),
stringsAsFactors=FALSE)
Combining the results into a single Data Frame
results.df <- cbind(format1.df, format2.df)
head(results.df, 5)
## player.num player.name totalpts round1 round2 round3 round4
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 2 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 3 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 4 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 5 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## round5 round6 round7 player_state uscf.id prerating postrating
## 1 W 7 D 12 D 4 ON 15445895 1794 1817
## 2 W 16 W 20 W 7 MI 14598900 1553 1663
## 3 W 11 W 13 W 12 MI 14959604 1384 1640
## 4 D 5 W 19 D 1 MI 12616049 1716 1744
## 5 D 4 W 14 W 17 MI 14601533 1655 1690
Continuing parsing process
df.player <- select(results.df, player.num:totalpts, player_state:postrating)
head(df.player, 5)
## player.num player.name totalpts player_state uscf.id prerating
## 1 1 GARY HUA 6.0 ON 15445895 1794
## 2 2 DAKSHESH DARURI 6.0 MI 14598900 1553
## 3 3 ADITYA BAJAJ 6.0 MI 14959604 1384
## 4 4 PATRICK H SCHILLING 5.5 MI 12616049 1716
## 5 5 HANSHI ZUO 5.5 MI 14601533 1655
## postrating
## 1 1817
## 2 1663
## 3 1640
## 4 1744
## 5 1690
Chess Player Matches Normalized Dataset
The logic below creates a normalized dataset of the player’s matches, The data captured in the round 1 - 7 fields will be split to indicate the outcome of the round and the opponent’s player number.
player.matches <-
results.df %>% select(player.num, round1:round7) %>%
melt(id.var=c("player.num"), value.name="outopp") %>%
mutate( round = as.numeric(str_replace(variable, "round", "")),
outcome = str_extract(outopp, "^\\w+"),
opponent_num = as.numeric(str_extract(outopp, "\\d+$"))) %>%
select(-c(variable, outopp)) %>%
inner_join(select(df.player, player.num, prerating), c("opponent_num" = "player.num")) %>%
select(player.num, round, outcome, opponent_num, prerating) %>%
arrange(player.num, round)
# rename prerating to opponent_prerating
names(player.matches)[names(player.matches) == "prerating"] <- "opponent_prerating"
Transformed dataset
head(player.matches, 5)
## player.num round outcome opponent_num opponent_prerating
## 1 1 1 W 39 1436
## 2 1 2 W 21 1563
## 3 1 3 W 18 1600
## 4 1 4 W 14 1610
## 5 1 5 W 7 1649
Aggregate the normalized match dataset to create the value for opponents_avg_prerating. Note: In this calculation, only matches played with a Win, Loss, or Draw status are considered
df.player <-
player.matches %>%
group_by(player.num) %>% summarise(opponents_avg_prerating = round(mean(opponent_prerating))) %>%
inner_join(df.player, by="player.num") %>%
select(player.name, player_state, prerating, opponents_avg_prerating, totalpts)
head(df.player, 5)
## # A tibble: 5 × 5
## player.name player_state prerating opponents_avg_prerating
## <chr> <chr> <dbl> <dbl>
## 1 GARY HUA ON 1794 1605
## 2 DAKSHESH DARURI MI 1553 1469
## 3 ADITYA BAJAJ MI 1384 1564
## 4 PATRICK H SCHILLING MI 1716 1574
## 5 HANSHI ZUO MI 1655 1501
## # ... with 1 more variables: totalpts <dbl>
The final dataframe will be sent to a CSV file for further consumption.
write.csv(df.player, "Chess_Results.csv", row.names=FALSE)