In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, Average Pre Chess Rating of Opponents
library(stringr)
library(readr)
library(dplyr)
theUrl <- 'https://raw.githubusercontent.com/tonyCUNY/tonyCUNY/main/tournamentinfo.txt'
# Reading the data again but skip the first 4 lines
df2 <- read.table(file=theUrl, skip = 4, sep=",")
# Inspect the data structure.
head(df2)
## V1
## 1 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## 2 ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 3 -----------------------------------------------------------------------------------------
## 4 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## 5 MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## 6 -----------------------------------------------------------------------------------------
Here are the steps I use to extract the information for just 1 player- GARY HUA:
The first 4 lines are not needed. They are removed.
Use strsplit to split elements according to matches “|” and space (\s*) around it.
Results are stored in sub_n1, sub_n2.
Transform sub_n1, sub_n2 as data.frame so each elements are stored in different rows.
Id, Name, State, Number of Points, Pre-Rating can be accessed individually from specific row
Use str_extract and regex to extract the information we need.
# split elements according to matches "|" and space (\\s*) around it
sub_n1 <- strsplit(df2[1, ], "\\s*\\|\\s*")
sub_n2 <- strsplit(df2[2, ], "\\s*\\|\\s*")
sub_n1
## [[1]]
## [1] " 1" "GARY HUA" "6.0" "W 39" "W 21" "W 18"
## [7] "W 14" "W 7" "D 12" "D 4"
sub_n2
## [[1]]
## [1] " ON" "15445895 / R: 1794 ->1817"
## [3] "N:2" "W"
## [5] "B" "W"
## [7] "B" "W"
## [9] "B" "W"
# Transform sub_n1, sub_n2 as data.frame
sub_n1_df <- as.data.frame(sub_n1)
sub_n2_df <- as.data.frame(sub_n2)
# Access Id, Name, State, Number of Points, Pre-Rating
# Player's ID - It will be used when calcuating the Average Pre Chess Rating
str_extract(sub_n1_df[1,], "\\d+")
## [1] "1"
# Player’s Name
sub_n1_df[2,]
## [1] "GARY HUA"
# Player’s State
str_extract(sub_n2_df[1, ], "\\b[A-Z]+\\b")
## [1] "ON"
# Total Number of Points
sub_n1_df[3,]
## [1] "6.0"
# Player’s Pre-Rating
str_extract(sub_n2_df[2,], "(?<=R: )\\d+")
## [1] "1794"
# Opponents ID 1 - 7
str_extract(sub_n1_df[4,], "\\d+")
## [1] "39"
str_extract(sub_n1_df[5,], "\\d+")
## [1] "21"
str_extract(sub_n1_df[6,], "\\d+")
## [1] "18"
str_extract(sub_n1_df[7,], "\\d+")
## [1] "14"
str_extract(sub_n1_df[8,], "\\d+")
## [1] "7"
str_extract(sub_n1_df[9,], "\\d+")
## [1] "12"
str_extract(sub_n1_df[10,], "\\d+")
## [1] "4"
Now we need to use a loop to repeat part 1 to extract information from all players.
Create a simple for loop to extract elements in a list and combine them into a new vector
test <- c()
for (i in 1:length(test_list)) {
test <- c(test, test_list[[i]])
}
Expand this loop to include all the vectors we need.
Function Seq() will tell the for loop which row to access.
item 1. Row contains ID, Name, Number of Points, Opponents ID repeat every 3 lines
item 2. Row contains State, Pre-Rating repeat every 3 lines
# Use loop to repeat part 1 and complete item 1
id = c()
name = c()
total_num_points = c()
o1 = c()
o2 = c()
o3 = c()
o4 = c()
o5 = c()
o6 = c()
o7 = c()
for (i in seq(1, nrow(df2), by = 3)) {
sub = as.data.frame(strsplit(df2[i, ], "\\s*\\|\\s*"))
id = c(id, as.numeric(str_extract(sub[1,], "\\d+")))
name = c(name, sub[2,])
total_num_points = c(total_num_points, sub[3,])
o1 = c(o1, as.numeric(str_extract(sub[4,], "\\d+")))
o2 = c(o2, as.numeric(str_extract(sub[5,], "\\d+")))
o3 = c(o3, as.numeric(str_extract(sub[6,], "\\d+")))
o4 = c(o4, as.numeric(str_extract(sub[7,], "\\d+")))
o5 = c(o5, as.numeric(str_extract(sub[8,], "\\d+")))
o6 = c(o6, as.numeric(str_extract(sub[9,], "\\d+")))
o7 = c(o7, as.numeric(str_extract(sub[10,], "\\d+")))
}
# Use loop to repeat part 1 and complete item 2
state = c()
pre_rating = c()
for (i in seq(2, nrow(df2), by = 3)) {
sub = as.data.frame(strsplit(df2[i, ], "\\s*\\|\\s*"))
state = c(state, str_extract(sub[1, ], "\\b[A-Z]+\\b"))
pre_rating = c(pre_rating, as.numeric(str_extract(sub[2,], "(?<=R: )\\d+")))
}
table = data.frame(id, state, name, total_num_points, pre_rating, o1, o2, o3, o4, o5, o6, o7 )
head(table)
## id state name total_num_points pre_rating o1 o2 o3 o4 o5 o6 o7
## 1 1 ON GARY HUA 6.0 1794 39 21 18 14 7 12 4
## 2 2 MI DAKSHESH DARURI 6.0 1553 63 58 4 17 16 20 7
## 3 3 MI ADITYA BAJAJ 6.0 1384 8 61 25 21 11 13 12
## 4 4 MI PATRICK H SCHILLING 5.5 1716 23 28 2 26 5 19 1
## 5 5 MI HANSHI ZUO 5.5 1655 45 37 12 13 4 14 17
## 6 6 OH HANSEN SONG 5.0 1686 34 29 11 35 10 27 21
Now we need to calculate Average Pre Chess Rating of Opponents and create a new column Avg_Chess_Rate
The values of o1, 2, 3, 4, 5, 6, 7 are referring to values of ID
They are used to call out the pre_rating value corresponding to the matched ID:
For example, table$pre_rating[table$id[table[1, 6]] = 39
!is.na is used to filter out NA value during the loop.
The count2 column under total4 is to get us correct denominator value.Because some players play less then 7
games.
col_index <- c(6, 7, 8, 9, 10, 11, 12)
total4 <- data.frame(total1 = rep(0, 64),
count2 = rep(0, 64))
for (i in 1:64) {
for (j in col_index) {
if (!is.na(table$pre_rating[table$id[table[i, j]]])) {
total4[i, 1] <- total4[i, 1] + table$pre_rating[table$id[table[i, j]]]
}
if (is.na(table[i, j])) {
total4[i, 2] <- total4[i, 2] + 1
}
}
}
Avg_Chess_Rate <- c()
for (i in 1:64) {
Avg_Chess_Rate[i] <- total4[i, 1] / (7 - total4[i, 2])
}
result_avg = as.data.frame(Avg_Chess_Rate)
head(result_avg)
## Avg_Chess_Rate
## 1 1605.286
## 2 1338.286
## 3 1427.143
## 4 1573.571
## 5 1360.857
## 6 1518.714
final_table <- cbind(table, result_avg)
head(final_table)
## id state name total_num_points pre_rating o1 o2 o3 o4 o5 o6 o7
## 1 1 ON GARY HUA 6.0 1794 39 21 18 14 7 12 4
## 2 2 MI DAKSHESH DARURI 6.0 1553 63 58 4 17 16 20 7
## 3 3 MI ADITYA BAJAJ 6.0 1384 8 61 25 21 11 13 12
## 4 4 MI PATRICK H SCHILLING 5.5 1716 23 28 2 26 5 19 1
## 5 5 MI HANSHI ZUO 5.5 1655 45 37 12 13 4 14 17
## 6 6 OH HANSEN SONG 5.0 1686 34 29 11 35 10 27 21
## Avg_Chess_Rate
## 1 1605.286
## 2 1338.286
## 3 1427.143
## 4 1573.571
## 5 1360.857
## 6 1518.714
final_table2 <- data.frame(final_table$name, final_table$state, final_table$pre_rating, final_table$Avg_Chess_Rate)
head(final_table2)
## final_table.name final_table.state final_table.pre_rating
## 1 GARY HUA ON 1794
## 2 DAKSHESH DARURI MI 1553
## 3 ADITYA BAJAJ MI 1384
## 4 PATRICK H SCHILLING MI 1716
## 5 HANSHI ZUO MI 1655
## 6 HANSEN SONG OH 1686
## final_table.Avg_Chess_Rate
## 1 1605.286
## 2 1338.286
## 3 1427.143
## 4 1573.571
## 5 1360.857
## 6 1518.714
write.csv(final_table, "C:\\Users\\tonyl\\Desktop\\CUNY\\final_table.csv", row.names = FALSE)
write.csv(final_table2, "C:\\Users\\tonyl\\Desktop\\CUNY\\final_table2.csv", row.names = FALSE)