Load required libraries
library(stringr)
Load the data file and have a look at the first few lines
tournamentinfo <- read.delim("https://raw.githubusercontent.com/ezaccountz/-ezaccountz-Data_607_Project_1/master/tournamentinfo.txt", header = FALSE)
tournamentinfo <- unlist(tournamentinfo)
head(tournamentinfo)
## V11
## -----------------------------------------------------------------------------------------
## V12
## Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round|
## V13
## Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
## V14
## -----------------------------------------------------------------------------------------
## V15
## 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## V16
## ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## 131 Levels: ----------------------------------------------------------------------------------------- ...
Remove the header
tournamentinfo <- tournamentinfo[c(5:length(tournamentinfo))]
head(tournamentinfo)
## V15
## 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## V16
## ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## V17
## -----------------------------------------------------------------------------------------
## V18
## 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## V19
## MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## V110
## -----------------------------------------------------------------------------------------
## 131 Levels: ----------------------------------------------------------------------------------------- ...
Extract the lines with information about a player’s pair number, name, total points and opponents and game results
name_score <- tournamentinfo[str_detect(tournamentinfo, "\\d+\\s\\|\\s[:alpha:]+")]
head(name_score)
## V15
## 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|
## V18
## 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|
## V111
## 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|
## V114
## 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|
## V117
## 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|
## V120
## 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|
## 131 Levels: ----------------------------------------------------------------------------------------- ...
Extract the lines with information about a player’s USCF ID, state, pre-rating and post rating
state_rating <- tournamentinfo[str_detect(tournamentinfo, "[:alpha:]+\\s\\|\\s\\d+")]
head(state_rating)
## V16
## ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |
## V19
## MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |
## V112
## MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |
## V115
## MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |
## V118
## MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |
## V121
## OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |
## 131 Levels: ----------------------------------------------------------------------------------------- ...
Extract the pair number
pair_num <- str_extract(str_extract(name_score, "\\d+\\s\\|\\s[:alpha:]+"),"\\d+")
head(pair_num)
## [1] "1" "2" "3" "4" "5" "6"
To obtain the players’ names, extract the string from the first “| |” area, then remove the “|”s and trim the string
player_name <- str_extract(name_score,"(\\|).+?\\|")
player_name <- str_remove_all(player_name, "[\\|]")
player_name <- str_trim(player_name)
head(player_name)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
Extract the total points from the second “| |”
name_score <- str_remove(name_score, ".*?\\|.*?\\|")
total_points <- str_extract(name_score, ".*?\\|")
total_points <- as.numeric(str_remove_all(total_points, "[\\s\\|]"))
head(total_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
Extract the opponents’ pair numbers and the game result recurring from the 3rd to 9th “| |”
opponents <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(opponents) <- str_c("opponent",1:7)
game_result <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(game_result) <- str_c("opponent",1:7)
for(i in c(1:7)) {
name_score <- str_remove(name_score, ".*?\\|")
opponents[,i] <- str_extract(name_score, ".*?\\|")
game_result[,i] <- str_extract(opponents[,i], "[:alpha:]")
opponents[,i] <- as.numeric(str_remove_all(opponents[,i], "[[:alpha:]\\s\\|]"))
}
head(opponents)
## opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1 39 21 18 14 7 12 4
## 2 63 58 4 17 16 20 7
## 3 8 61 25 21 11 13 12
## 4 23 28 2 26 5 19 1
## 5 45 37 12 13 4 14 17
## 6 34 29 11 35 10 27 21
head(game_result)
## opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1 W W W W W D D
## 2 W W L W W W W
## 3 L W W W W W W
## 4 W D W W D W D
## 5 W W D D D W W
## 6 W D L W D W W
Now we go to our other set of extracted lines. Extract the state information
state <- str_extract(state_rating, ".*?\\|")
state <- str_remove_all(state, "[\\s\\|]")
head(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
Extract the USCF ID, which is within the first “|” and the first “/”
state_rating <- str_remove(state_rating, ".*?\\|")
uscf_id <- str_extract(state_rating, ".*?\\/")
uscf_id <- str_remove_all(uscf_id, "[\\s\\/]")
head(uscf_id)
## [1] "15445895" "14598900" "14959604" "12616049" "14601533" "15055204"
Extract the pre rating, which is within the string after the first “/” and endings with the rating (number) followed by a space or character
state_rating <- str_remove(state_rating, ".*?\\/")
pre_rating <- str_extract(state_rating, ".*?\\d+[[:alpha:]\\s]")
pre_rating <- as.numeric(str_extract(pre_rating,"\\d+"))
head(pre_rating)
## [1] 1794 1553 1384 1716 1655 1686
Extract the post rating, which is within the string after “the first”->" and endings with the rating (number) followed by a space
state_rating <- str_remove(state_rating, ".*?\\-\\>")
post_rating <- str_extract(state_rating,".*?\\d+[[:alpha:]\\s]")
post_rating <- as.numeric(str_extract(post_rating,"\\d+"))
head(post_rating)
## [1] 1817 1663 1640 1744 1690 1687
According to the pair number of the opponents, we find the pre ratings of the opponents and store them in a table
opponents_rating <- data.frame(matrix(ncol = 7, nrow = length(name_score)))
colnames(opponents_rating) <- str_c("opponent",1:7)
for (i in (1:7)) {
for (j in (1: length(pair_num)))
opponents_rating[j,i] <- ifelse(is.na(opponents[j,i]),NA, as.numeric(pre_rating[pair_num == opponents[j,i]]))
}
head(opponents_rating)
## opponent1 opponent2 opponent3 opponent4 opponent5 opponent6 opponent7
## 1 1436 1563 1600 1610 1649 1663 1716
## 2 1175 917 1716 1629 1604 1595 1649
## 3 1641 955 1745 1563 1712 1666 1663
## 4 1363 1507 1553 1579 1655 1564 1794
## 5 1242 980 1663 1666 1716 1610 1629
## 6 1399 1602 1712 1438 1365 1552 1563
Finally we calculate the average pre rating of the opponents, rounded to the nearest integer
average_rating <- round(apply(opponents_rating, 1, mean,na.rm=TRUE),0)
head(average_rating)
## [1] 1605 1469 1564 1574 1501 1519
Combine all rate into a detailed table
detailed_data <- data.frame(pair_num,player_name,uscf_id,state,total_points,pre_rating, post_rating,
opponents$opponent1,opponents_rating$opponent1,game_result$opponent1,
opponents$opponent2,opponents_rating$opponent2,game_result$opponent2,
opponents$opponent3,opponents_rating$opponent3,game_result$opponent3,
opponents$opponent4,opponents_rating$opponent4,game_result$opponent4,
opponents$opponent5,opponents_rating$opponent5,game_result$opponent5,
opponents$opponent6,opponents_rating$opponent6,game_result$opponent6,
opponents$opponent7,opponents_rating$opponent7,game_result$opponent7,
average_rating)
colnames(detailed_data) <- c("Pair Number", "Name", "USCF ID", "State", "Total Points", "Pre Rating", "Post Rating",
"Opponent 1 Pair Number", "Opponent 1 Pre Rating", "Game Result vs.Opponent 1",
"Opponent 2 Pair Number", "Opponent 2 Pre Rating", "Game Result vs.Opponent 2",
"Opponent 3 Pair Number", "Opponent 3 Pre Rating", "Game Result vs.Opponent 3",
"Opponent 4 Pair Number", "Opponent 4 Pre Rating", "Game Result vs.Opponent 4",
"Opponent 5 Pair Number", "Opponent 5 Pre Rating", "Game Result vs.Opponent 5",
"Opponent 6 Pair Number", "Opponent 6 Pre Rating", "Game Result vs.Opponent 6",
"Opponent 7 Pair Number", "Opponent 7 Pre Rating", "Game Result vs.Opponent 7",
"Average Pre Rating of Opponents"
)
head(detailed_data)
## Pair Number Name USCF ID State Total Points Pre Rating
## 1 1 GARY HUA 15445895 ON 6.0 1794
## 2 2 DAKSHESH DARURI 14598900 MI 6.0 1553
## 3 3 ADITYA BAJAJ 14959604 MI 6.0 1384
## 4 4 PATRICK H SCHILLING 12616049 MI 5.5 1716
## 5 5 HANSHI ZUO 14601533 MI 5.5 1655
## 6 6 HANSEN SONG 15055204 OH 5.0 1686
## Post Rating Opponent 1 Pair Number Opponent 1 Pre Rating
## 1 1817 39 1436
## 2 1663 63 1175
## 3 1640 8 1641
## 4 1744 23 1363
## 5 1690 45 1242
## 6 1687 34 1399
## Game Result vs.Opponent 1 Opponent 2 Pair Number Opponent 2 Pre Rating
## 1 W 21 1563
## 2 W 58 917
## 3 L 61 955
## 4 W 28 1507
## 5 W 37 980
## 6 W 29 1602
## Game Result vs.Opponent 2 Opponent 3 Pair Number Opponent 3 Pre Rating
## 1 W 18 1600
## 2 W 4 1716
## 3 W 25 1745
## 4 D 2 1553
## 5 W 12 1663
## 6 D 11 1712
## Game Result vs.Opponent 3 Opponent 4 Pair Number Opponent 4 Pre Rating
## 1 W 14 1610
## 2 L 17 1629
## 3 W 21 1563
## 4 W 26 1579
## 5 D 13 1666
## 6 L 35 1438
## Game Result vs.Opponent 4 Opponent 5 Pair Number Opponent 5 Pre Rating
## 1 W 7 1649
## 2 W 16 1604
## 3 W 11 1712
## 4 W 5 1655
## 5 D 4 1716
## 6 W 10 1365
## Game Result vs.Opponent 5 Opponent 6 Pair Number Opponent 6 Pre Rating
## 1 W 12 1663
## 2 W 20 1595
## 3 W 13 1666
## 4 D 19 1564
## 5 D 14 1610
## 6 D 27 1552
## Game Result vs.Opponent 6 Opponent 7 Pair Number Opponent 7 Pre Rating
## 1 D 4 1716
## 2 W 7 1649
## 3 W 12 1663
## 4 W 1 1794
## 5 W 17 1629
## 6 W 21 1563
## Game Result vs.Opponent 7 Average Pre Rating of Opponents
## 1 D 1605
## 2 W 1469
## 3 W 1564
## 4 D 1574
## 5 W 1501
## 6 W 1519
Also, create a summaried data table as requested to be the answer to this project
summarized_data <- data.frame(player_name,state,total_points,pre_rating,average_rating)
colnames(summarized_data) <- c("Name", "State", "Total Points", "Pre Rating","Average Pre Rating of Opponents")
head(summarized_data)
## Name State Total Points Pre Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## Average Pre Rating of Opponents
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
Export the data tables into .csv files
write.csv(summarized_data, file = "summarized_data.csv")
write.csv(detailed_data, file = "detailed_data.csv",row.names=FALSE)