Initially I attempted to use read.table and extract the necessary data from each column, but the data was far too messy to continue on this way.
# First attempt: read the report as a whitespace-delimited table.
# fill = TRUE lets rows with differing numbers of fields be read by
# padding the short ones with blank fields.
Chess_Tournament <- read.table(
  file = "/Users/Michele/Desktop/Chess_Tournament.txt",
  header = FALSE,
  fill = TRUE
)
# Keep column V1 and columns V3 to V13.
columns <- c(1, 3:13)
myvars <- paste0("V", columns)
less_Chess_Tournament <- Chess_Tournament[, myvars]
head(less_Chess_Tournament)
## V1
## 1 -----------------------------------------------------------------------------------------
## 2 Pair
## 3 Num
## 4 -----------------------------------------------------------------------------------------
## 5 1
## 6 ON
## V3 V4 V5 V6
## 1
## 2 Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## 3 USCF ID / Rtg
## 4
## 5 GARY HUA |6.0 |W
## 6 15445895 / R: 1794
## V7 V8 V9 V10 V11 V12 V13
## 1
## 2
## 3 (Pre->Post) | Pts | 1 | 2
## 4
## 5 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 6 ->1817 |N:2 |W |B |W |B |W
I decided to look into alternative methods for reading in the data and found readLines, which produced a much cleaner result.
# Read the report as raw text: readLines() returns a character vector with
# one element per line of the file.
Chess_Tournament_readlines <- readLines("/Users/Michele/Desktop/Chess_Tournament.txt")
## Warning in readLines("/Users/Michele/Desktop/Chess_Tournament.txt"):
## incomplete final line found on '/Users/Michele/Desktop/
## Chess_Tournament.txt'
# The warning above reports that the file's last line is not terminated by a
# newline.
# Show the first ten lines to inspect the layout.
head(Chess_Tournament_readlines, 10)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [7] "-----------------------------------------------------------------------------------------"
## [8] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [9] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [10] "-----------------------------------------------------------------------------------------"
From this, I noticed that there is a pattern to the data: every third row starting with row 5 holds a player's name and competitors, while every third row starting with row 6 holds the player's state and ratings.
# Name/round lines start on line 5 and repeat every 3 lines. The upper bound
# is taken from the number of lines actually read (instead of a hard-coded
# 196) so that a report with a different number of players is handled too.
stopifnot(length(Chess_Tournament_readlines) >= 5)
rows_with_names_rounds <- seq(5, length(Chess_Tournament_readlines), by = 3)
names_rounds <- Chess_Tournament_readlines[rows_with_names_rounds]
head(names_rounds)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# State/rating lines start on line 6 and repeat every 3 lines. The upper
# bound is taken from the number of lines actually read (instead of a
# hard-coded 196) so that a report with a different number of players is
# handled too.
stopifnot(length(Chess_Tournament_readlines) >= 6)
rows_with_state_ratings <- seq(6, length(Chess_Tournament_readlines), by = 3)
state_rating <- Chess_Tournament_readlines[rows_with_state_ratings]
head(state_rating)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
Used the stringr library to extract names, states, pre-rating, post-rating, and competitors. All of this data was placed into a dataframe called Chess_Data.
library(stringr)
# The player name is the second "|"-delimited field of each name line. Taking
# the whole field and trimming it works for names with any number of words;
# the earlier word-counting pattern kept a trailing space on four-word names
# and could not capture longer or punctuated names.
# NOTE: the variable `names` shadows base::names(); the name is kept because
# the data frame construction further down refers to it.
names <- str_trim(str_split_fixed(names_rounds, fixed("|"), 3)[, 2])
head(names)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
# The pattern matches the two-character state code together with the
# whitespace on either side of it, so the matches are trimmed to avoid
# storing padded values such as " ON ".
states <- str_trim(unlist(str_extract_all(state_rating, "\\s\\w{2}\\s")))
head(states)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
# Pre-tournament rating, extracted in two passes: first the colon, the
# whitespace after it and the digits that follow, then the digits alone.
pre_rating <- local({
  colon_and_digits <- unlist(
    str_extract_all(state_rating, "[:]\\s+[[:digit:]]+")
  )
  unlist(str_extract_all(colon_and_digits, "[[:digit:]]+"))
})
head(pre_rating)
## [1] "1794" "1553" "1384" "1716" "1655" "1686"
# Post-tournament rating, extracted in two passes: first the ">" with any
# whitespace and the digits that follow it, then the digits alone.
post_rating <- local({
  arrow_and_digits <- unlist(
    str_extract_all(state_rating, "[>]\\s*[[:digit:]]+")
  )
  unlist(str_extract_all(arrow_and_digits, "[[:digit:]]+"))
})
head(post_rating)
## [1] "1817" "1663" "1640" "1744" "1690" "1687"
# Each name line has the layout
#   pair | name | total | round 1 | ... | round 7 |
# so splitting on "|" puts the seven round results in fields 4 to 10. This
# replaces seven copies of a two-pass regex that relied on every line
# producing exactly ten matches.
round_fields <- str_split_fixed(
  names_rounds, fixed("|"), 11
)[, 4:10, drop = FALSE]
# Keep only the opponent's pair number from each field. A field without any
# digits (no opponent listed for that round) becomes NA.
opponent_numbers <- matrix(
  str_extract(as.vector(round_fields), "[[:digit:]]+"),
  nrow = nrow(round_fields)
)
# One character vector per round, in player order; they are converted with
# as.numeric() when the data frame is built.
round_1 <- opponent_numbers[, 1]
round_2 <- opponent_numbers[, 2]
round_3 <- opponent_numbers[, 3]
round_4 <- opponent_numbers[, 4]
round_5 <- opponent_numbers[, 5]
round_6 <- opponent_numbers[, 6]
round_7 <- opponent_numbers[, 7]
head(round_1)
## [1] "39" "63" "8" "23" "45" "34"
# One id per extracted player (instead of a hard-coded 64), so the id column
# always has the same length as the other columns.
id <- seq_along(names)
# Assemble the player table. Column names are written directly in the
# syntactic form (Pre.Rating, Round.1, ...) that data.frame() would otherwise
# derive from names containing spaces.
Chess_Data <- data.frame(
  id,
  names,
  states,
  Pre.Rating = as.numeric(pre_rating),
  Post.Rating = as.numeric(post_rating),
  Round.1 = as.numeric(round_1),
  Round.2 = as.numeric(round_2),
  Round.3 = as.numeric(round_3),
  Round.4 = as.numeric(round_4),
  Round.5 = as.numeric(round_5),
  Round.6 = as.numeric(round_6),
  Round.7 = as.numeric(round_7)
)
head(Chess_Data)
## id names states Pre.Rating Post.Rating Round.1 Round.2
## 1 1 GARY HUA ON 1794 1817 39 21
## 2 2 DAKSHESH DARURI MI 1553 1663 63 58
## 3 3 ADITYA BAJAJ MI 1384 1640 8 61
## 4 4 PATRICK H SCHILLING MI 1716 1744 23 28
## 5 5 HANSHI ZUO MI 1655 1690 45 37
## 6 6 HANSEN SONG OH 1686 1687 34 29
## Round.3 Round.4 Round.5 Round.6 Round.7
## 1 18 14 7 12 4
## 2 4 17 16 20 7
## 3 25 21 11 13 12
## 4 2 26 5 19 1
## 5 12 13 4 14 17
## 6 11 35 10 27 21
Finally, I used a for loop to grab the Pre.Rating of each person’s competitors and compute their average. This average was added to Chess_Data, and the columns of interest were then copied into the data frame Chess_Data_Final.
# For every player, average the Pre.Rating of everyone who lists that player
# as their opponent in any of the seven rounds. A row that lists the player's
# id in a round column is someone who played against that player, so those
# rows' Pre.Rating values are the ratings of the player's opponents.
round_columns <- paste0("Round.", 1:7)
listed_opponents <- as.matrix(Chess_Data[round_columns])
# Preallocate the result rather than growing it with c() on every iteration.
Average_Opponent_Pre_Rating <- numeric(nrow(Chess_Data))
for (k in seq_along(Chess_Data$id)) {
  # Row numbers of all cells equal to this player's id, round 1 first.
  # NA cells (rounds with no opponent) never match.
  faced <- which(
    listed_opponents == Chess_Data$id[k],
    arr.ind = TRUE
  )[, "row"]
  opponent_ratings <- Chess_Data$Pre.Rating[faced]
  # Missing ratings are dropped before averaging; a player with no matching
  # rows gets NaN.
  Average_Opponent_Pre_Rating[k] <- mean(
    opponent_ratings[!is.na(opponent_ratings)]
  )
}
Chess_Data["Average Opponent Pre Rating"] <- Average_Opponent_Pre_Rating
# Keep only the columns wanted in the exported table.
Chess_Data_Final <- subset(
  Chess_Data,
  select = c(
    "id", "names", "states", "Post.Rating", "Average Opponent Pre Rating"
  )
)
head(Chess_Data_Final)
## id names states Post.Rating Average Opponent Pre Rating
## 1 1 GARY HUA ON 1817 1605.286
## 2 2 DAKSHESH DARURI MI 1663 1469.286
## 3 3 ADITYA BAJAJ MI 1640 1563.571
## 4 4 PATRICK H SCHILLING MI 1744 1573.571
## 5 5 HANSHI ZUO MI 1690 1500.857
## 6 6 HANSEN SONG OH 1687 1518.714
Exporting dataframe to a csv.
# Write the final table to a CSV file in the current working directory.
# row.names is left at its default, so the row names are written as an
# additional leading column.
write.csv(
  x = Chess_Data_Final,
  file = "Chess Tournament.csv"
)