In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents


url <- "https://raw.githubusercontent.com/emrahakin1985/DATA607/master/datasets/tournamentinfo.txt"
chess_x <- read.csv(url, stringsAsFactors = F, skip = 3)

#how regex sees it:
head(unlist(str_extract_all(chess_x, ".+")), 2)
## [1] "c(\"    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|\", \"   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |\", \"-----------------------------------------------------------------------------------------\", \"    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|\", \"   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |\", \"-----------------------------------------------------------------------------------------\", "
## [2] "\"    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|\", \"   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |\", \"-----------------------------------------------------------------------------------------\", \"    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|\", \"   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |\", \"-----------------------------------------------------------------------------------------\", "
chess <- unlist(chess_x)
chess_reg <- unlist(str_extract_all(chess, ".+"))
# how regex sees it:
head(chess_reg, 11)
##  [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
##  [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [3] "-----------------------------------------------------------------------------------------"
##  [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
##  [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
##  [6] "-----------------------------------------------------------------------------------------"
##  [7] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
##  [8] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
##  [9] "-----------------------------------------------------------------------------------------"
## [10] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [11] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"


First step is to narrow down, second step is to get the vector with full names only:

names_uncln <- unlist(str_extract_all(chess, "\\| +[A-Z]{2,}.+[A-Z]{2,} +\\|?"))

head(names_uncln, 10)
##  [1] "| GARY HUA                        |"
##  [2] "| DAKSHESH DARURI                 |"
##  [3] "| ADITYA BAJAJ                    |"
##  [4] "| PATRICK H SCHILLING             |"
##  [5] "| HANSHI ZUO                      |"
##  [6] "| HANSEN SONG                     |"
##  [7] "| GARY DEE SWATHELL               |"
##  [8] "| EZEKIEL HOUGHTON                |"
##  [9] "| STEFANO LEE                     |"
## [10] "| ANVIT RAO                       |"


Second step:

names <- unlist(str_extract_all(unlist(names_uncln), "[A-Z].+[A-Z]"))
head(names, 10)
##  [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
##  [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"        
##  [7] "GARY DEE SWATHELL"   "EZEKIEL HOUGHTON"    "STEFANO LEE"        
## [10] "ANVIT RAO"


States:

state <- unlist(str_extract_all(chess, "^ +[A-Z]{2}"))
state <- str_trim(state)
#
state
##  [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
#


Points:

points <- unlist(str_extract_all(chess, "\\d\\.\\d"))
#
points
##  [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
#


Pre_rates:

pre_rate_uncln <- unlist(str_extract_all(chess, "R\\: +\\d{3,4}P?(\\d{2})?"))
pre_rate_almost_done <- unlist(str_extract_all(unlist(pre_rate_uncln), "\\d.+\\d"))
pre_rate <- str_replace(unlist(pre_rate_almost_done), "P", ".")
#
pre_rate
##  [1] "1794"    "1553"    "1384"    "1716"    "1655"    "1686"    "1649"   
##  [8] "1641.17" "1411"    "1365"    "1712"    "1663"    "1666"    "1610"   
## [15] "1220.13" "1604"    "1629"    "1600"    "1564"    "1595"    "1563.22"
## [22] "1555"    "1363"    "1229"    "1745"    "1579"    "1552"    "1507"   
## [29] "1602"    "1522"    "1494"    "1441"    "1449"    "1399"    "1438"   
## [36] "1355"    "980.12"  "1423"    "1436.23" "1348"    "1403"    "1332"   
## [43] "1283"    "1199"    "1242"    "377"     "1362"    "1382"    "1291.12"
## [50] "1056"    "1011"    "935"     "1393"    "1270"    "1186"    "1153"   
## [57] "1092"    "917"     "853"     "967"     "955.11"  "1530"    "1175"   
## [64] "1163"
#


2 step regex code to extract opponent id’s:

step1 <- "\\|[A-Z] {2,}\\d{1,2}"
step2 <- "\\d{1,2}"


finding the rows where we will extract the data for opponent id’s:

vector_test <- ""
for(i in 1:length(chess)){
  if (str_detect(chess[i], step1) == T){
    vector_test[i] <- i
  } 
}
vector_test <- vector_test[!is.na(vector_test)]
vector_test
##  [1] "1"   "4"   "7"   "10"  "13"  "16"  "19"  "22"  "25"  "28"  "31" 
## [12] "34"  "37"  "40"  "43"  "46"  "49"  "52"  "55"  "58"  "61"  "64" 
## [23] "67"  "70"  "73"  "76"  "79"  "82"  "85"  "88"  "91"  "94"  "97" 
## [34] "100" "103" "106" "109" "112" "115" "118" "121" "124" "127" "130"
## [45] "133" "136" "139" "142" "145" "148" "151" "154" "157" "160" "163"
## [56] "166" "169" "172" "175" "178" "181" "184" "187" "190"
# creating data frame with established columns
final_df <- data.frame(names, state, points, pre_rate)
final_df[1:10,]
##                  names state points pre_rate
## 1             GARY HUA    ON    6.0     1794
## 2      DAKSHESH DARURI    MI    6.0     1553
## 3         ADITYA BAJAJ    MI    6.0     1384
## 4  PATRICK H SCHILLING    MI    5.5     1716
## 5           HANSHI ZUO    MI    5.5     1655
## 6          HANSEN SONG    OH    5.0     1686
## 7    GARY DEE SWATHELL    MI    5.0     1649
## 8     EZEKIEL HOUGHTON    MI    5.0  1641.17
## 9          STEFANO LEE    ON    5.0     1411
## 10           ANVIT RAO    MI    5.0     1365


Extracting and placing opponent id’s for each player in a new column:

k <- 1
for(i in as.numeric(vector_test)){
  #print(i)
  final_df$Opponent_ID[k] <- str_c(unlist(str_extract_all(unlist(unlist(str_extract_all(chess[i], step1))), step2)), collapse = " ")
  k <- k + 1
}

final_df[c(1:5,55:59),]
##                  names state points pre_rate         Opponent_ID
## 1             GARY HUA    ON    6.0     1794  39 21 18 14 7 12 4
## 2      DAKSHESH DARURI    MI    6.0     1553  63 58 4 17 16 20 7
## 3         ADITYA BAJAJ    MI    6.0     1384 8 61 25 21 11 13 12
## 4  PATRICK H SCHILLING    MI    5.5     1716   23 28 2 26 5 19 1
## 5           HANSHI ZUO    MI    5.5     1655 45 37 12 13 4 14 17
## 55           ALEX KONG    MI    2.0     1186   62 31 10 30 45 43
## 56        MARISA RICCI    MI    2.0     1153      11 35 45 40 42
## 57          MICHAEL LU    MI    2.0     1092    7 36 42 51 35 53
## 58        VIRAJ MOHILE    MI    2.0      917    31 2 41 23 49 45
## 59   SEAN M MC CORMICK    MI    2.0      853    41 9 40 43 54 44


First, we need to change the type of pre_rate values (from factors to characters first, then numerical values) to make the calculations.

final_df$pre_rate <- as.numeric(as.character(final_df$pre_rate))


Matching opponent id and their rate and calculating their average for each row (player):

x <- 0

for( i in 1:nrow(final_df) ) {
  
  for ( u in 1:length(unlist(str_split(final_df$Opponent_ID[i], " ")  )  )   ){
    
    x <- x + final_df$pre_rate[as.numeric(   unlist(str_split(final_df$Opponent_ID[i], " "))[u]   )]
    #print(x)
    
  }
  final_df$Average_Opponent_Rate[i] <- x / length(unlist(str_split(final_df$Opponent_ID[i], " ")  )  )
  #print(x)
  x<-0
}
final_df$Average_Opponent_Rate <- round(final_df$Average_Opponent_Rate,0)
final_df <- subset(final_df,,-5)
knitr::kable(final_df)
names state points pre_rate Average_Opponent_Rate
GARY HUA ON 6.0 1794.00 1605
DAKSHESH DARURI MI 6.0 1553.00 1469
ADITYA BAJAJ MI 6.0 1384.00 1564
PATRICK H SCHILLING MI 5.5 1716.00 1574
HANSHI ZUO MI 5.5 1655.00 1501
HANSEN SONG OH 5.0 1686.00 1519
GARY DEE SWATHELL MI 5.0 1649.00 1372
EZEKIEL HOUGHTON MI 5.0 1641.17 1468
STEFANO LEE ON 5.0 1411.00 1523
ANVIT RAO MI 5.0 1365.00 1554
CAMERON WILLIAM MC LEMAN MI 4.5 1712.00 1468
KENNETH J TACK MI 4.5 1663.00 1506
TORRANCE HENRY JR MI 4.5 1666.00 1498
BRADLEY SHAW MI 4.5 1610.00 1515
ZACHARY JAMES HOUGHTON MI 4.5 1220.13 1484
MIKE NIKITIN MI 4.0 1604.00 1386
RONALD GRZEGORCZYK MI 4.0 1629.00 1499
DAVID SUNDEEN MI 4.0 1600.00 1480
DIPANKAR ROY MI 4.0 1564.00 1426
JASON ZHENG MI 4.0 1595.00 1411
DINH DANG BUI ON 4.0 1563.22 1470
EUGENE L MCCLURE MI 4.0 1555.00 1300
ALAN BUI ON 4.0 1363.00 1214
MICHAEL R ALDRICH MI 4.0 1229.00 1357
LOREN SCHWIEBERT MI 3.5 1745.00 1363
MAX ZHU ON 3.5 1579.00 1507
GAURAV GIDWANI MI 3.5 1552.00 1222
SOFIA ADINA STANESCU-BELLU MI 3.5 1507.00 1522
CHIEDOZIE OKORIE MI 3.5 1602.00 1314
GEORGE AVERY JONES ON 3.5 1522.00 1144
RISHI SHETTY MI 3.5 1494.00 1260
JOSHUA PHILIP MATHEWS ON 3.5 1441.00 1379
JADE GE MI 3.5 1449.00 1277
MICHAEL JEFFERY THOMAS MI 3.5 1399.00 1375
JOSHUA DAVID LEE MI 3.5 1438.00 1150
SIDDHARTH JHA MI 3.5 1355.00 1388
AMIYATOSH PWNANANDAM MI 3.5 980.12 1385
BRIAN LIU MI 3.0 1423.00 1539
JOEL R HENDON MI 3.0 1436.23 1430
FOREST ZHANG MI 3.0 1348.00 1391
KYLE WILLIAM MURPHY MI 3.0 1403.00 1248
JARED GE MI 3.0 1332.00 1150
ROBERT GLEN VASEY MI 3.0 1283.00 1107
JUSTIN D SCHILLING MI 3.0 1199.00 1327
DEREK YAN MI 3.0 1242.00 1152
JACOB ALEXANDER LAVALLEY MI 3.0 377.00 1358
ERIC WRIGHT MI 2.5 1362.00 1392
DANIEL KHAIN MI 2.5 1382.00 1356
MICHAEL J MARTIN MI 2.5 1291.12 1286
SHIVAM JHA MI 2.5 1056.00 1296
TEJAS AYYAGARI MI 2.5 1011.00 1356
ETHAN GUO MI 2.5 935.00 1495
JOSE C YBARRA MI 2.0 1393.00 1345
LARRY HODGE MI 2.0 1270.00 1206
ALEX KONG MI 2.0 1186.00 1406
MARISA RICCI MI 2.0 1153.00 1414
MICHAEL LU MI 2.0 1092.00 1363
VIRAJ MOHILE MI 2.0 917.00 1391
SEAN M MC CORMICK MI 2.0 853.00 1319
JULIA SHEN MI 1.5 967.00 1330
JEZZEL FARKAS ON 1.5 955.11 1327
ASHWIN BALAJI MI 1.0 1530.00 1186
THOMAS JOSEPH HOSMER MI 1.0 1175.00 1350
BEN LI MI 1.0 1163.00 1263
write.csv(final_df, "chess.csv")