url <- "https://raw.githubusercontent.com/emrahakin1985/DATA607/master/datasets/tournamentinfo.txt"
chess_x <- read.csv(url, stringsAsFactors = F, skip = 3)
#how regex sees it:
head(unlist(str_extract_all(chess_x, ".+")), 2)
## [1] "c(\" 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|\", \" ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |\", \"-----------------------------------------------------------------------------------------\", \" 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|\", \" MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |\", \"-----------------------------------------------------------------------------------------\", "
## [2] "\" 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|\", \" MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |\", \"-----------------------------------------------------------------------------------------\", \" 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|\", \" MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |\", \"-----------------------------------------------------------------------------------------\", "
chess <- unlist(chess_x)
chess_reg <- unlist(str_extract_all(chess, ".+"))
# how regex sees it:
head(chess_reg, 11)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
## [7] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [8] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [9] "-----------------------------------------------------------------------------------------"
## [10] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [11] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
First step is to narrow down, second step is to get the vector with full names only:
names_uncln <- unlist(str_extract_all(chess, "\\| +[A-Z]{2,}.+[A-Z]{2,} +\\|?"))
head(names_uncln, 10)
## [1] "| GARY HUA |"
## [2] "| DAKSHESH DARURI |"
## [3] "| ADITYA BAJAJ |"
## [4] "| PATRICK H SCHILLING |"
## [5] "| HANSHI ZUO |"
## [6] "| HANSEN SONG |"
## [7] "| GARY DEE SWATHELL |"
## [8] "| EZEKIEL HOUGHTON |"
## [9] "| STEFANO LEE |"
## [10] "| ANVIT RAO |"
Second step:
names <- unlist(str_extract_all(unlist(names_uncln), "[A-Z].+[A-Z]"))
head(names, 10)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON" "STEFANO LEE"
## [10] "ANVIT RAO"
States:
state <- unlist(str_extract_all(chess, "^ +[A-Z]{2}"))
state <- str_trim(state)
#
state
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
#
Points:
points <- unlist(str_extract_all(chess, "\\d\\.\\d"))
#
points
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
#
Pre_rates:
pre_rate_uncln <- unlist(str_extract_all(chess, "R\\: +\\d{3,4}P?(\\d{2})?"))
pre_rate_almost_done <- unlist(str_extract_all(unlist(pre_rate_uncln), "\\d.+\\d"))
pre_rate <- str_replace(unlist(pre_rate_almost_done), "P", ".")
#
pre_rate
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649"
## [8] "1641.17" "1411" "1365" "1712" "1663" "1666" "1610"
## [15] "1220.13" "1604" "1629" "1600" "1564" "1595" "1563.22"
## [22] "1555" "1363" "1229" "1745" "1579" "1552" "1507"
## [29] "1602" "1522" "1494" "1441" "1449" "1399" "1438"
## [36] "1355" "980.12" "1423" "1436.23" "1348" "1403" "1332"
## [43] "1283" "1199" "1242" "377" "1362" "1382" "1291.12"
## [50] "1056" "1011" "935" "1393" "1270" "1186" "1153"
## [57] "1092" "917" "853" "967" "955.11" "1530" "1175"
## [64] "1163"
#
2 step regex code to extract opponent id’s:
step1 <- "\\|[A-Z] {2,}\\d{1,2}"
step2 <- "\\d{1,2}"
finding the rows where we will extract the data for opponent id’s:
vector_test <- ""
for(i in 1:length(chess)){
if (str_detect(chess[i], step1) == T){
vector_test[i] <- i
}
}
vector_test <- vector_test[!is.na(vector_test)]
vector_test
## [1] "1" "4" "7" "10" "13" "16" "19" "22" "25" "28" "31"
## [12] "34" "37" "40" "43" "46" "49" "52" "55" "58" "61" "64"
## [23] "67" "70" "73" "76" "79" "82" "85" "88" "91" "94" "97"
## [34] "100" "103" "106" "109" "112" "115" "118" "121" "124" "127" "130"
## [45] "133" "136" "139" "142" "145" "148" "151" "154" "157" "160" "163"
## [56] "166" "169" "172" "175" "178" "181" "184" "187" "190"
# creating data frame with established columns
final_df <- data.frame(names, state, points, pre_rate)
final_df[1:10,]
## names state points pre_rate
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641.17
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
Extracting and placing opponent id’s for each player in a new column:
k <- 1
for(i in as.numeric(vector_test)){
#print(i)
final_df$Opponent_ID[k] <- str_c(unlist(str_extract_all(unlist(unlist(str_extract_all(chess[i], step1))), step2)), collapse = " ")
k <- k + 1
}
final_df[c(1:5,55:59),]
## names state points pre_rate Opponent_ID
## 1 GARY HUA ON 6.0 1794 39 21 18 14 7 12 4
## 2 DAKSHESH DARURI MI 6.0 1553 63 58 4 17 16 20 7
## 3 ADITYA BAJAJ MI 6.0 1384 8 61 25 21 11 13 12
## 4 PATRICK H SCHILLING MI 5.5 1716 23 28 2 26 5 19 1
## 5 HANSHI ZUO MI 5.5 1655 45 37 12 13 4 14 17
## 55 ALEX KONG MI 2.0 1186 62 31 10 30 45 43
## 56 MARISA RICCI MI 2.0 1153 11 35 45 40 42
## 57 MICHAEL LU MI 2.0 1092 7 36 42 51 35 53
## 58 VIRAJ MOHILE MI 2.0 917 31 2 41 23 49 45
## 59 SEAN M MC CORMICK MI 2.0 853 41 9 40 43 54 44
First, we need to change the type of pre_rate values (from factors to characters first, then numerical values) to make the calculations.
final_df$pre_rate <- as.numeric(as.character(final_df$pre_rate))
Matching opponent id and their rate and calculating their average for each row (player):
x <- 0
for( i in 1:nrow(final_df) ) {
for ( u in 1:length(unlist(str_split(final_df$Opponent_ID[i], " ") ) ) ){
x <- x + final_df$pre_rate[as.numeric( unlist(str_split(final_df$Opponent_ID[i], " "))[u] )]
#print(x)
}
final_df$Average_Opponent_Rate[i] <- x / length(unlist(str_split(final_df$Opponent_ID[i], " ") ) )
#print(x)
x<-0
}
final_df$Average_Opponent_Rate <- round(final_df$Average_Opponent_Rate,0)
final_df <- subset(final_df,,-5)
knitr::kable(final_df)
names | state | points | pre_rate | Average_Opponent_Rate |
---|---|---|---|---|
GARY HUA | ON | 6.0 | 1794.00 | 1605 |
DAKSHESH DARURI | MI | 6.0 | 1553.00 | 1469 |
ADITYA BAJAJ | MI | 6.0 | 1384.00 | 1564 |
PATRICK H SCHILLING | MI | 5.5 | 1716.00 | 1574 |
HANSHI ZUO | MI | 5.5 | 1655.00 | 1501 |
HANSEN SONG | OH | 5.0 | 1686.00 | 1519 |
GARY DEE SWATHELL | MI | 5.0 | 1649.00 | 1372 |
EZEKIEL HOUGHTON | MI | 5.0 | 1641.17 | 1468 |
STEFANO LEE | ON | 5.0 | 1411.00 | 1523 |
ANVIT RAO | MI | 5.0 | 1365.00 | 1554 |
CAMERON WILLIAM MC LEMAN | MI | 4.5 | 1712.00 | 1468 |
KENNETH J TACK | MI | 4.5 | 1663.00 | 1506 |
TORRANCE HENRY JR | MI | 4.5 | 1666.00 | 1498 |
BRADLEY SHAW | MI | 4.5 | 1610.00 | 1515 |
ZACHARY JAMES HOUGHTON | MI | 4.5 | 1220.13 | 1484 |
MIKE NIKITIN | MI | 4.0 | 1604.00 | 1386 |
RONALD GRZEGORCZYK | MI | 4.0 | 1629.00 | 1499 |
DAVID SUNDEEN | MI | 4.0 | 1600.00 | 1480 |
DIPANKAR ROY | MI | 4.0 | 1564.00 | 1426 |
JASON ZHENG | MI | 4.0 | 1595.00 | 1411 |
DINH DANG BUI | ON | 4.0 | 1563.22 | 1470 |
EUGENE L MCCLURE | MI | 4.0 | 1555.00 | 1300 |
ALAN BUI | ON | 4.0 | 1363.00 | 1214 |
MICHAEL R ALDRICH | MI | 4.0 | 1229.00 | 1357 |
LOREN SCHWIEBERT | MI | 3.5 | 1745.00 | 1363 |
MAX ZHU | ON | 3.5 | 1579.00 | 1507 |
GAURAV GIDWANI | MI | 3.5 | 1552.00 | 1222 |
SOFIA ADINA STANESCU-BELLU | MI | 3.5 | 1507.00 | 1522 |
CHIEDOZIE OKORIE | MI | 3.5 | 1602.00 | 1314 |
GEORGE AVERY JONES | ON | 3.5 | 1522.00 | 1144 |
RISHI SHETTY | MI | 3.5 | 1494.00 | 1260 |
JOSHUA PHILIP MATHEWS | ON | 3.5 | 1441.00 | 1379 |
JADE GE | MI | 3.5 | 1449.00 | 1277 |
MICHAEL JEFFERY THOMAS | MI | 3.5 | 1399.00 | 1375 |
JOSHUA DAVID LEE | MI | 3.5 | 1438.00 | 1150 |
SIDDHARTH JHA | MI | 3.5 | 1355.00 | 1388 |
AMIYATOSH PWNANANDAM | MI | 3.5 | 980.12 | 1385 |
BRIAN LIU | MI | 3.0 | 1423.00 | 1539 |
JOEL R HENDON | MI | 3.0 | 1436.23 | 1430 |
FOREST ZHANG | MI | 3.0 | 1348.00 | 1391 |
KYLE WILLIAM MURPHY | MI | 3.0 | 1403.00 | 1248 |
JARED GE | MI | 3.0 | 1332.00 | 1150 |
ROBERT GLEN VASEY | MI | 3.0 | 1283.00 | 1107 |
JUSTIN D SCHILLING | MI | 3.0 | 1199.00 | 1327 |
DEREK YAN | MI | 3.0 | 1242.00 | 1152 |
JACOB ALEXANDER LAVALLEY | MI | 3.0 | 377.00 | 1358 |
ERIC WRIGHT | MI | 2.5 | 1362.00 | 1392 |
DANIEL KHAIN | MI | 2.5 | 1382.00 | 1356 |
MICHAEL J MARTIN | MI | 2.5 | 1291.12 | 1286 |
SHIVAM JHA | MI | 2.5 | 1056.00 | 1296 |
TEJAS AYYAGARI | MI | 2.5 | 1011.00 | 1356 |
ETHAN GUO | MI | 2.5 | 935.00 | 1495 |
JOSE C YBARRA | MI | 2.0 | 1393.00 | 1345 |
LARRY HODGE | MI | 2.0 | 1270.00 | 1206 |
ALEX KONG | MI | 2.0 | 1186.00 | 1406 |
MARISA RICCI | MI | 2.0 | 1153.00 | 1414 |
MICHAEL LU | MI | 2.0 | 1092.00 | 1363 |
VIRAJ MOHILE | MI | 2.0 | 917.00 | 1391 |
SEAN M MC CORMICK | MI | 2.0 | 853.00 | 1319 |
JULIA SHEN | MI | 1.5 | 967.00 | 1330 |
JEZZEL FARKAS | ON | 1.5 | 955.11 | 1327 |
ASHWIN BALAJI | MI | 1.0 | 1530.00 | 1186 |
THOMAS JOSEPH HOSMER | MI | 1.0 | 1175.00 | 1350 |
BEN LI | MI | 1.0 | 1163.00 | 1263 |
write.csv(final_df, "chess.csv")