library(stringr)
Getting the Chess data:
vars <- c("pair_num", "player_name", "total_points", "Round1", "Round2", "Round3", "Round4", "Round5", "Round6", "Round7", "LR")
data1 <- read.table("https://raw.githubusercontent.com/Riteshlohiya/Data607_Project1/master/tournamentinfo.txt", header = FALSE, skip = 4, sep = "|", fill = TRUE, stringsAsFactors = FALSE, col.names = vars)
head(data1)
## pair_num
## 1 1
## 2 ON
## 3 -----------------------------------------------------------------------------------------
## 4 2
## 5 MI
## 6 -----------------------------------------------------------------------------------------
## player_name total_points Round1 Round2 Round3
## 1 GARY HUA 6.0 W 39 W 21 W 18
## 2 15445895 / R: 1794 ->1817 N:2 W B W
## 3
## 4 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 5 14598900 / R: 1553 ->1663 N:2 B W B
## 6
## Round4 Round5 Round6 Round7 LR
## 1 W 14 W 7 D 12 D 4 NA
## 2 B W B W NA
## 3 NA
## 4 W 17 W 16 W 20 W 7 NA
## 5 W B W B NA
## 6 NA
Removing the hyphens and all the blank strings and also the Line return column(LR):
data2 <- subset(data1, !player_name == "", select = c(pair_num:Round7))
head(data2)
## pair_num player_name total_points Round1 Round2
## 1 1 GARY HUA 6.0 W 39 W 21
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B
## Round3 Round4 Round5 Round6 Round7
## 1 W 18 W 14 W 7 D 12 D 4
## 2 W B W B W
## 4 L 4 W 17 W 16 W 20 W 7
## 5 B W B W B
## 7 W 25 W 21 W 11 W 13 W 12
## 8 W B W B W
Each player information is spread across 2 lines. Frirst we need to trim and remove the spaces:
for (i in 1:length(data2)) {
data2[, i] <- str_trim(data2[, i])
}
head(data2)
## pair_num player_name total_points Round1 Round2 Round3
## 1 1 GARY HUA 6.0 W 39 W 21 W 18
## 2 ON 15445895 / R: 1794 ->1817 N:2 W B W
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4
## 5 MI 14598900 / R: 1553 ->1663 N:2 B W B
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25
## 8 MI 14959604 / R: 1384 ->1640 N:2 W B W
## Round4 Round5 Round6 Round7
## 1 W 14 W 7 D 12 D 4
## 2 B W B W
## 4 W 17 W 16 W 20 W 7
## 5 W B W B
## 7 W 21 W 11 W 13 W 12
## 8 B W B W
Now instead of 2 rows, arrange it in one row:
for (i in 1:nrow(data2)) {
data2$player_state[i] <- data2$pair_num[i + 1]
data2$player_rating[i] <- str_trim(str_extract(data2$player_name[i + 1], "[[:blank:]]{1}[[:digit:]]{3,4}"))
}
data2 <- subset(data2, !is.na(player_rating))
head(data2)
## pair_num player_name total_points Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 W 39 W 21 W 18 W 14
## 4 2 DAKSHESH DARURI 6.0 W 63 W 58 L 4 W 17
## 7 3 ADITYA BAJAJ 6.0 L 8 W 61 W 25 W 21
## 10 4 PATRICK H SCHILLING 5.5 W 23 D 28 W 2 W 26
## 13 5 HANSHI ZUO 5.5 W 45 W 37 D 12 D 13
## 16 6 HANSEN SONG 5.0 W 34 D 29 L 11 W 35
## Round5 Round6 Round7 player_state player_rating
## 1 W 7 D 12 D 4 ON 1794
## 4 W 16 W 20 W 7 MI 1553
## 7 W 11 W 13 W 12 MI 1384
## 10 D 5 W 19 D 1 MI 1716
## 13 D 4 W 14 W 17 MI 1655
## 16 D 10 W 27 W 21 OH 1686
The details for players playing against opponents:
for (i in 4:10) {
data2[, i] <- str_trim(str_extract(data2[, i], "[[:space:]]+[[:digit:]]{1,2}"))
}
head(data2)
## pair_num player_name total_points Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 39 21 18 14
## 4 2 DAKSHESH DARURI 6.0 63 58 4 17
## 7 3 ADITYA BAJAJ 6.0 8 61 25 21
## 10 4 PATRICK H SCHILLING 5.5 23 28 2 26
## 13 5 HANSHI ZUO 5.5 45 37 12 13
## 16 6 HANSEN SONG 5.0 34 29 11 35
## Round5 Round6 Round7 player_state player_rating
## 1 7 12 4 ON 1794
## 4 16 20 7 MI 1553
## 7 11 13 12 MI 1384
## 10 5 19 1 MI 1716
## 13 4 14 17 MI 1655
## 16 10 27 21 OH 1686
Finding the oppenent ratings:
for (i in 1:nrow(data2)) {
for(j in 4:10) {
data2[i,j] <- data2[data2$pair_num == data2[i,j],12][1]
}
}
head(data2)
## pair_num player_name total_points Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 1436 1563 1600 1610
## 4 2 DAKSHESH DARURI 6.0 1175 917 1716 1629
## 7 3 ADITYA BAJAJ 6.0 1641 955 1745 1563
## 10 4 PATRICK H SCHILLING 5.5 1363 1507 1553 1579
## 13 5 HANSHI ZUO 5.5 1242 980 1663 1666
## 16 6 HANSEN SONG 5.0 1399 1602 1712 1438
## Round5 Round6 Round7 player_state player_rating
## 1 1649 1663 1716 ON 1794
## 4 1604 1595 1649 MI 1553
## 7 1712 1666 1663 MI 1384
## 10 1655 1564 1794 MI 1716
## 13 1716 1610 1629 MI 1655
## 16 1365 1552 1563 OH 1686
Finding average rating of opponents:
for (i in 4:10) {
data2[, i] <- as.numeric(data2[, i])
}
data2$player_rating <- as.numeric(data2$player_rating)
data2$total_points <- as.numeric(data2$total_points)
data2$avg_opponent_rating <- round(round(rowMeans(data2[, c(4:10)], na.rm = TRUE), 1))
head(data2)
## pair_num player_name total_points Round1 Round2 Round3 Round4
## 1 1 GARY HUA 6.0 1436 1563 1600 1610
## 4 2 DAKSHESH DARURI 6.0 1175 917 1716 1629
## 7 3 ADITYA BAJAJ 6.0 1641 955 1745 1563
## 10 4 PATRICK H SCHILLING 5.5 1363 1507 1553 1579
## 13 5 HANSHI ZUO 5.5 1242 980 1663 1666
## 16 6 HANSEN SONG 5.0 1399 1602 1712 1438
## Round5 Round6 Round7 player_state player_rating avg_opponent_rating
## 1 1649 1663 1716 ON 1794 1605
## 4 1604 1595 1649 MI 1553 1469
## 7 1712 1666 1663 MI 1384 1564
## 10 1655 1564 1794 MI 1716 1574
## 13 1716 1610 1629 MI 1655 1501
## 16 1365 1552 1563 OH 1686 1519
Now writing only the required columns to the .csv file:
final_data <- subset(data2, TRUE, c(player_name, player_state, total_points, player_rating, avg_opponent_rating))
row.names(final_data) <- 1:64
write.table(final_data, file = "C:/Users/Arnav/Documents/GitHub/Data607_Project1/tournament_data.csv", sep = ",", row.names = FALSE)