library(readr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
# Read the tournament cross table straight from GitHub. readLines() returns a
# character vector with one element per line of the text file.
chess_tournament <- readLines(
  con = "https://raw.githubusercontent.com/enidroman/data_607_data_aquisition_and_management_project/main/tournamentinfo.txt"
)
# Preview the first lines to confirm the layout of the file.
head(x = chess_tournament)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Summarise the raw input: reports its length, class and mode.
summary(object = chess_tournament)
## Length Class Mode
## 196 character character
# Hold the raw lines in a one-column character matrix so they can be indexed
# by line number.
deconstruct_chess_tournament <- matrix(
  data = unlist(chess_tournament),
  byrow = TRUE
)
# First line of each player record: every third line starting at line 5
# (pair number, player name, total points and the per-round results).
d1_chess_tournament <- deconstruct_chess_tournament[
  seq(from = 5, to = length(deconstruct_chess_tournament), by = 3)
]
head(x = d1_chess_tournament)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
# Second line of each player record: every third line starting at line 6
# (state, USCF ID and the pre/post ratings).
d2_chess_tournament <- deconstruct_chess_tournament[
  seq(from = 6, to = length(deconstruct_chess_tournament), by = 3)
]
head(x = d2_chess_tournament)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
# Parse the per-player fields out of the two record lines.
# d1_chess_tournament is a character vector holding the first line of each
# record (pair number, name, total points, round results);
# d2_chess_tournament holds the second line (state, USCF ID, ratings).

# Pair number: the first run of digits on the d1 line.
ID <- as.numeric(str_extract(d1_chess_tournament, "\\d+"))

# Player name: start at the first letter and take the rest of the fixed-width
# name column. `[A-Za-z]` is used instead of `[A-z]`, which would also match
# the punctuation between "Z" and "a" ([ \ ] ^ _ `).
Name <- str_extract(d1_chess_tournament, "[A-Za-z].{1,32}")
# Cut at the column-closing pipe and trim the padding. This does not rely on
# the name being followed by two or more spaces, so a name that fills the
# column is kept instead of becoming NA.
Player_Name <- str_trim(str_remove(Name, "\\|.*$"))

# State / province: the first two consecutive capital letters on the d2 line.
Player_State <- str_extract(d2_chess_tournament, "[A-Z]{2}")

# Total points: the first decimal number (e.g. "6.0") on the d1 line.
Total_Number_of_Points <- as.numeric(
  str_extract(d1_chess_tournament, "\\d+\\.\\d")
)

# Pre-tournament rating: the digits that directly follow "R:" on the d2 line.
# Capturing only the digit run ignores anything after it (the sample rows show
# whitespace followed by "->" and the post rating).
PreRating <- as.numeric(str_match(d2_chess_tournament, "R:\\s*(\\d+)")[, 2])

# Opponents per player (needed for the average opponent pre-rating).
# Step 1: keep only the played rounds on the d1 line, i.e. a result letter
#         followed by at least two spaces and the opponent's pair number.
# Step 2: pull the pair number out of each token, element by element, so the
#         result stays a list with one character vector per player. Applying
#         str_extract_all() to the list itself would coerce it to deparsed
#         text and emit an "argument is not an atomic vector" warning.
Rounds <- str_extract_all(d1_chess_tournament, "[A-Z]\\s{2,}\\d+")
Rounds <- lapply(Rounds, str_extract, pattern = "\\d+")
# Average pre-tournament rating of each player's opponents, rounded to the
# nearest whole number.
# - vapply() replaces growing a vector inside `for (i in 1:length(Rounds))`,
#   which would also try to read Rounds[[1]] when Rounds is empty.
# - Opponents are looked up through their pair number (ID) rather than by
#   assuming that row i always belongs to pair number i.
Avg_Pre_Chess_Rating_Opp <- vapply(
  Rounds,
  function(opponents) {
    opponent_rows <- match(as.numeric(opponents), ID)
    round(mean(PreRating[opponent_rows]), 0)
  },
  numeric(1)
)
Avg_Pre_Chess_Rating_Opp
## [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515 1484
## [16] 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522 1314 1144
## [31] 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150 1107 1327 1152
## [46] 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414 1363 1391 1319 1330
## [61] 1327 1186 1350 1263
# Assemble the parsed vectors into the final table, one row per player.
# Column names are spelled out and match the variable names.
new_chess_tournament <- data.frame(
  Player_Name = Player_Name,
  Player_State = Player_State,
  Total_Number_of_Points = Total_Number_of_Points,
  PreRating = PreRating,
  Avg_Pre_Chess_Rating_Opp = Avg_Pre_Chess_Rating_Opp
)
head(x = new_chess_tournament)
## Player_Name Player_State Total_Number_of_Points PreRating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## Avg_Pre_Chess_Rating_Opp
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
# Export the table to the working directory. row.names = FALSE keeps the
# automatic 1..n row index out of the file, so the CSV contains only the five
# data columns instead of an extra unnamed leading column.
write.csv(
  new_chess_tournament,
  file = "new_chess_tournament.csv",
  row.names = FALSE
)