The following code transforms an ASCII-formatted chess tournament results table into a .csv file.
library("stringr")
## Warning: package 'stringr' was built under R version 3.1.3
# Switch to the author's project folder only when it exists on this machine.
# Elsewhere the input file is looked up in the current working directory
# instead of the script aborting with "cannot change working directory".
project_dir <- "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Project 1"
if (file.exists(project_dir)) {
  setwd(project_dir)
}
# Read the raw tournament table as a character vector, one element per line.
chess_grid <- readLines("tournamentinfo.txt")
## Warning in readLines("tournamentinfo.txt"): incomplete final line found on
## 'tournamentinfo.txt'
str(chess_grid)
## chr [1:196] "-----------------------------------------------------------------------------------------" ...
# Player names: collect every run of 15 to 31 letters/spaces (each optionally
# followed by a hyphen), trim the padding, and keep matches 2 through 65.
player_name <- local({
  name_hits <- str_extract_all(chess_grid, "([[:alpha:] ]-?){15,31}")
  str_trim(unlist(name_hits))[2:65]
})
str(player_name)
## chr [1:64] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ" ...
# State codes: match the three codes listed in the pattern (each padded by a
# space on both sides) and strip the padding.
state <- local({
  state_hits <- str_extract_all(chess_grid, " MI | ON | OH ")
  str_trim(unlist(state_hits))
})
str(state)
## chr [1:64] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" ...
# Player total points: a digit, a literal decimal point, and a digit
# (e.g. "6.0"). The dot is escaped so that only a decimal point matches;
# the earlier class `[//.]` also accepted a forward slash. The values are
# kept as character strings, exactly as they appear in the table.
results <- unlist(
  str_extract_all(chess_grid, "[[:digit:]]\\.[[:digit:]]")
)
str(results)
## chr [1:64] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" ...
# Player pre-ratings: grab "R: " followed by four characters that are digits
# or spaces, drop the "R: " label, and convert what is left to numeric.
prerating <- as.numeric(
  sub(
    pattern = "R: ",
    replacement = "",
    x = unlist(str_extract_all(chess_grid, "R: [[:digit:] ]{4}"))
  )
)
# Label each rating with its position in the vector. The labels are derived
# from the data instead of being fixed at 64, so a table with a different
# number of players still gets one label per rating.
names(prerating) <- seq_along(prerating)
str(prerating)
## Named num [1:64] 1794 1553 1384 1716 1655 ...
## - attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
# Opponent data: starting at each total-points value (digit, decimal point,
# digit), take the following 44 characters made of result letters, digits,
# pipes and spaces. Characters 10 through 47 of each match are kept and the
# pipes are turned into spaces.
# The dot is escaped so it matches only a decimal point; the earlier class
# `[//.]` also matched a forward slash.
opp_dat <- gsub(
  "\\|",
  " ",
  str_sub(
    unlist(
      str_extract_all(
        chess_grid,
        "[[:digit:]]\\.[[:digit:]][|DLWUXBH[:digit:] ]{44}"
      )
    ),
    start = 10,
    end = 47
  )
)
str(opp_dat)
## chr [1:64] "39 W 21 W 18 W 14 W 7 D 12 D 4" ...
# Opponent numbers: cut each player's string at the space-padded result
# letters and convert the remaining pieces to numbers, flattened into a
# single vector.
opp_num <- local({
  pieces <- strsplit(opp_dat, " B | L | W | H | U | X | D ")
  as.numeric(unlist(pieces))
})
str(opp_num)
## num [1:448] 39 21 18 14 7 12 4 63 58 4 ...
# Average pre-rating of each player's opponents.
# Number of consecutive entries in opp_num that belong to one player.
rounds_per_player <- 7
# Pre-rating of every opponent, looked up by opponent number; a missing
# opponent number yields NA.
opp_prerating_vec <- prerating[opp_num]
# Group label per entry: entries 1-7 get 1, entries 8-14 get 2, and so on.
ind <- ceiling(seq_along(opp_prerating_vec) / rounds_per_player)
# Mean per group with NA entries ignored, rounded to a whole number.
# round() replaces the earlier format(digits = 4) round trip through text,
# which only produced whole numbers while every mean had four integer digits.
opp_avg_prerating <- as.numeric(
  round(tapply(opp_prerating_vec, ind, mean, na.rm = TRUE))
)
str(opp_avg_prerating)
## num [1:64] 1605 1469 1564 1574 1501 ...
# Assemble the per-player vectors into one data frame, one column per vector,
# then show its structure and print it.
tourn_results <- data.frame(
  player_name = player_name,
  state = state,
  results = results,
  prerating = prerating,
  opp_avg_prerating = opp_avg_prerating
)
str(tourn_results)
## 'data.frame': 64 obs. of 5 variables:
## $ player_name : Factor w/ 64 levels "ADITYA BAJAJ",..: 24 12 1 51 28 27 23 21 59 5 ...
## $ state : Factor w/ 3 levels "MI","OH","ON": 3 1 1 1 1 2 1 1 3 1 ...
## $ results : Factor w/ 11 levels "1.0","1.5","2.0",..: 11 11 11 10 10 9 9 9 9 9 ...
## $ prerating : num 1794 1553 1384 1716 1655 ...
## $ opp_avg_prerating: num 1605 1469 1564 1574 1501 ...
tourn_results
## player_name state results prerating opp_avg_prerating
## 1 GARY HUA ON 6.0 1794 1605
## 2 DAKSHESH DARURI MI 6.0 1553 1469
## 3 ADITYA BAJAJ MI 6.0 1384 1564
## 4 PATRICK H SCHILLING MI 5.5 1716 1574
## 5 HANSHI ZUO MI 5.5 1655 1501
## 6 HANSEN SONG OH 5.0 1686 1519
## 7 GARY DEE SWATHELL MI 5.0 1649 1372
## 8 EZEKIEL HOUGHTON MI 5.0 1641 1468
## 9 STEFANO LEE ON 5.0 1411 1523
## 10 ANVIT RAO MI 5.0 1365 1554
## 11 CAMERON WILLIAM MC LEMAN MI 4.5 1712 1468
## 12 KENNETH J TACK MI 4.5 1663 1506
## 13 TORRANCE HENRY JR MI 4.5 1666 1498
## 14 BRADLEY SHAW MI 4.5 1610 1515
## 15 ZACHARY JAMES HOUGHTON MI 4.5 1220 1484
## 16 MIKE NIKITIN MI 4.0 1604 1386
## 17 RONALD GRZEGORCZYK MI 4.0 1629 1499
## 18 DAVID SUNDEEN MI 4.0 1600 1480
## 19 DIPANKAR ROY MI 4.0 1564 1426
## 20 JASON ZHENG MI 4.0 1595 1411
## 21 DINH DANG BUI ON 4.0 1563 1470
## 22 EUGENE L MCCLURE MI 4.0 1555 1300
## 23 ALAN BUI ON 4.0 1363 1214
## 24 MICHAEL R ALDRICH MI 4.0 1229 1357
## 25 LOREN SCHWIEBERT MI 3.5 1745 1363
## 26 MAX ZHU ON 3.5 1579 1507
## 27 GAURAV GIDWANI MI 3.5 1552 1222
## 28 SOFIA ADINA STANESCU-BELLU MI 3.5 1507 1522
## 29 CHIEDOZIE OKORIE MI 3.5 1602 1314
## 30 GEORGE AVERY JONES ON 3.5 1522 1144
## 31 RISHI SHETTY MI 3.5 1494 1260
## 32 JOSHUA PHILIP MATHEWS ON 3.5 1441 1379
## 33 JADE GE MI 3.5 1449 1277
## 34 MICHAEL JEFFERY THOMAS MI 3.5 1399 1375
## 35 JOSHUA DAVID LEE MI 3.5 1438 1150
## 36 SIDDHARTH JHA MI 3.5 1355 1388
## 37 AMIYATOSH PWNANANDAM MI 3.5 980 1385
## 38 BRIAN LIU MI 3.0 1423 1539
## 39 JOEL R HENDON MI 3.0 1436 1430
## 40 FOREST ZHANG MI 3.0 1348 1391
## 41 KYLE WILLIAM MURPHY MI 3.0 1403 1248
## 42 JARED GE MI 3.0 1332 1150
## 43 ROBERT GLEN VASEY MI 3.0 1283 1107
## 44 JUSTIN D SCHILLING MI 3.0 1199 1327
## 45 DEREK YAN MI 3.0 1242 1152
## 46 JACOB ALEXANDER LAVALLEY MI 3.0 377 1358
## 47 ERIC WRIGHT MI 2.5 1362 1392
## 48 DANIEL KHAIN MI 2.5 1382 1356
## 49 MICHAEL J MARTIN MI 2.5 1291 1286
## 50 SHIVAM JHA MI 2.5 1056 1296
## 51 TEJAS AYYAGARI MI 2.5 1011 1356
## 52 ETHAN GUO MI 2.5 935 1495
## 53 JOSE C YBARRA MI 2.0 1393 1345
## 54 LARRY HODGE MI 2.0 1270 1206
## 55 ALEX KONG MI 2.0 1186 1406
## 56 MARISA RICCI MI 2.0 1153 1414
## 57 MICHAEL LU MI 2.0 1092 1363
## 58 VIRAJ MOHILE MI 2.0 917 1391
## 59 SEAN M MC CORMICK MI 2.0 853 1319
## 60 JULIA SHEN MI 1.5 967 1330
## 61 JEZZEL FARKAS ON 1.5 955 1327
## 62 ASHWIN BALAJI MI 1.0 1530 1186
## 63 THOMAS JOSEPH HOSMER MI 1.0 1175 1350
## 64 BEN LI MI 1.0 1163 1263
# Write the assembled results table to a .csv file in the working directory.
write.csv(
  x = tourn_results,
  file = "chess_results.csv"
)