The project task is to generate a .CSV file from a text file containing chess tournament results. The text file and resulting .CSV file are available here.
After loading the stringr package, I read the file directly from my GitHub repository as a character vector, with one element per line of text.
library(stringr)
# Pull the raw tournament report straight from GitHub; readLines() returns a
# character vector with one element per line of the text file.
chess <- readLines(
  con = "https://raw.githubusercontent.com/chrosemo/data607_fall19_project1/master/tournamentinfo.txt"
)
# Preview the first ten lines to see how the report is laid out.
head(x = chess, n = 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
I then remove the label rows as well as all lines consisting solely of “-” characters.
# Rows 1-3 are the banner and column labels, and every third row counting
# from row 1 is a dashed separator. Keep only the rows that are neither.
chess <- chess[setdiff(
  seq_along(chess),
  c(1:3, seq(from = 1, to = length(chess), by = 3))
)]
head(chess, n = 6)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [4] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [5] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [6] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
Noting how the data are structured, I create separate subset vectors for the first rows (number_line) and second rows (state_line), respectively.
# Each player occupies two consecutive rows: odd rows begin with the pair
# number, even rows begin with the state code.
number_line <- chess[seq(from = 1, to = length(chess), by = 2)]
state_line <- chess[seq(from = 2, to = length(chess), by = 2)]
head(number_line, n = 6)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
head(state_line, n = 6)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
I use the two subsets to extract values for each player’s number, name, state, tournament point total, and pre-tournament rating as well as the numbers of each of that player’s opponents.
# Parse the two rows that describe each player.
#
# The first row is a "|"-delimited record laid out as
#   pair number | player name | total points | one field per round
# Splitting on the delimiter and reading fields by position avoids
# whitelisting the characters a name may contain (a hyphenated or otherwise
# punctuated name would not match a letters-only pattern) and avoids
# depending on a fixed column offset for the round results.
# n = 4 keeps all round fields together in the last piece.
number_fields <- str_split_fixed(number_line, fixed("|"), n = 4)
number <- as.numeric(str_trim(number_fields[, 1]))
name <- str_trim(number_fields[, 2], side = "both")
point_total <- as.numeric(str_trim(number_fields[, 3]))
# Every run of digits in the round fields is an opponent's pair number;
# rounds without an opponent contribute nothing. "[0-9]+" places no limit on
# the number of digits, so pair numbers above 99 are not split apart.
opps <- lapply(str_extract_all(number_fields[, 4], "[0-9]+"), as.numeric)

# The second row starts with the two-letter state code.
state <- str_extract(state_line, "[A-Z]{2}")
# The pre-tournament rating is the first run of digits after "R:". Anchoring
# on the digits (rather than a fixed number of characters) tolerates varying
# padding and ignores any suffix that follows the rating.
pre_rating <- as.numeric(
  str_trim(str_extract(state_line, "(?<=R:)\\s*[0-9]+"))
)
head(number)
## [1] 1 2 3 4 5 6
tail(name)
## [1] "SEAN M MC CORMICK" "JULIA SHEN" "JEZZEL FARKAS"
## [4] "ASHWIN BALAJI" "THOMAS JOSEPH HOSMER" "BEN LI"
head(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
tail(point_total)
## [1] 2.0 1.5 1.5 1.0 1.0 1.0
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686
tail(opps)
## [[1]]
## [1] 41 9 40 43 54 44
##
## [[2]]
## [1] 33 34 45 42 24
##
## [[3]]
## [1] 32 3 54 47 42 30 37
##
## [[4]]
## [1] 55
##
## [[5]]
## [1] 2 48 49 43 45
##
## [[6]]
## [1] 22 30 31 49 46 42 54
To calculate the average pre-tournament rating of a player’s opponents, I first use the apply family of functions to look up the pre-tournament rating that corresponds to each of that player’s opponent numbers. I then compute the mean of those ratings for every player and store each average as a whole-number integer. Note that as.integer() on its own discards the fractional part (it truncates toward zero) rather than rounding to the nearest whole number.
# For every player, look up the pre-tournament rating of each opponent faced.
# match() translates opponent pair numbers into positions in `number`, so the
# whole lookup for one player is a single vectorised subset and always
# returns a numeric vector (length zero when a player had no opponents).
opp_ratings <- lapply(opps, function(opp_ids) {
  pre_rating[match(opp_ids, number)]
})
# Average each player's opponent ratings and report whole numbers.
# as.integer() by itself truncates toward zero (1563.57 would become 1563),
# so the means are rounded to the nearest whole number first.
# NOTE: output rendered before this change shows truncated averages, which
# can be 1 lower than the rounded values; regenerate it after re-running.
avg_opp_rating <- as.integer(round(vapply(opp_ratings, mean, numeric(1))))
head(opp_ratings)
## [[1]]
## [1] 1436 1563 1600 1610 1649 1663 1716
##
## [[2]]
## [1] 1175 917 1716 1629 1604 1595 1649
##
## [[3]]
## [1] 1641 955 1745 1563 1712 1666 1663
##
## [[4]]
## [1] 1363 1507 1553 1579 1655 1564 1794
##
## [[5]]
## [1] 1242 980 1663 1666 1716 1610 1629
##
## [[6]]
## [1] 1399 1602 1712 1438 1365 1552 1563
head(avg_opp_rating)
## [1] 1605 1469 1564 1574 1501 1519
I create a data frame called tournament_df using the name (labeled “player_name”), state (“player_state”), point_total (“player_point_total”), pre_rating (“player_pre_rating”), and avg_opp_rating (“avg_opp_pre_rating”) vectors.
# Assemble the per-player vectors into one table, one row per player, with
# descriptive column names for the exported file.
tournament_df <- data.frame(
  player_name = name,
  player_state = state,
  player_point_total = point_total,
  player_pre_rating = pre_rating,
  avg_opp_pre_rating = avg_opp_rating
)
head(tournament_df, n = 6)
## player_name player_state player_point_total player_pre_rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## avg_opp_pre_rating
## 1 1605
## 2 1469
## 3 1563
## 4 1573
## 5 1500
## 6 1518
Finally, I export the data frame to .CSV format.
# Write the table to the working directory; row.names = FALSE keeps R's
# automatic row numbers out of the CSV.
write.csv(
  x = tournament_df,
  file = "tournament_df.csv",
  row.names = FALSE
)
“Basic Regular Expressions in R, Cheat Sheet”. RStudio. Accessed 09/19/2019 from https://www.rstudio.com/wp-content/uploads/2016/09/RegExCheatsheet.pdf.
“Chapter 8”. Automated Data Collection with R. Accessed 09/19/2019 from http://kek.ksu.ru/eos/WM/AutDataCollectR.pdf.