To start, we read in the data in a way that each line is treated as one string.
Then, we use as.vector to make the result a vector instead of a data frame.
#Run this script from the directory that holds tournamentinfo.txt (the file is read from the current working directory).
#The earlier setwd(getwd()) call only reset the working directory to itself, so it has been removed.
#Read in text file: sep="\n" makes every line a single field, and quote=""/comment.char="" keep quotes and "#" as ordinary text.
#stringsAsFactors=FALSE keeps the column as character on R versions where read.table would otherwise build a factor.
info <- as.vector(read.table("tournamentinfo.txt",quote="",comment.char="",sep="\n",stringsAsFactors=FALSE)$V1)
length(info)
## [1] 196
head(info)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
tail(info)
## [1] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [2] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
## [5] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
## [6] "-----------------------------------------------------------------------------------------"
Now, remove the lines that are all dashes.
For this we use the grep function, returning content of lines NOT matching a regex for a line of all dashes.
library(stringr)
#Keep only the lines that are not made up entirely of dashes.
is_dash_rule <- grepl("^-+$", info, perl = TRUE)
info <- info[!is_dash_rule]
length(info)
## [1] 130
Set aside the first two items in info, as these were the headers.
#The first two remaining lines are the column headers; keep them aside.
#head/tail replace info[3:length(info)] because 3:length(info) counts backwards (3, 2) when only the two header lines are present.
info_headers <- head(info, 2)
info <- tail(info, -2)
Each record actually spans two lines. Let’s separate the first line in each record from the second line.
#Records start on every odd-numbered line; the line right after each start completes that record.
record_starts <- seq(from = 1, to = length(info) - 1, by = 2)
first_lines_of_records <- info[record_starts]
second_lines_of_records <- info[record_starts + 1]
#Show the count plus the first and last few entries.
length(first_lines_of_records)
head(first_lines_of_records)
tail(first_lines_of_records)
## [1] 64
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
## [1] " 59 | SEAN M MC CORMICK |2.0 |L 41|B |L 9|L 40|L 43|W 54|L 44|"
## [2] " 60 | JULIA SHEN |1.5 |L 33|L 34|D 45|D 42|L 24|H |U |"
## [3] " 61 | JEZZEL FARKAS |1.5 |L 32|L 3|W 54|L 47|D 42|L 30|L 37|"
## [4] " 62 | ASHWIN BALAJI |1.0 |W 55|U |U |U |U |U |U |"
## [5] " 63 | THOMAS JOSEPH HOSMER |1.0 |L 2|L 48|D 49|L 43|L 45|H |U |"
## [6] " 64 | BEN LI |1.0 |L 22|D 30|L 31|D 49|L 46|L 42|L 54|"
length(second_lines_of_records);head(second_lines_of_records);tail(second_lines_of_records)
## [1] 64
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
## [1] " MI | 12841036 / R: 853 -> 878 | |W | |B |B |W |W |B |"
## [2] " MI | 14579262 / R: 967 -> 984 | |W |B |B |W |B | | |"
## [3] " ON | 15771592 / R: 955P11-> 979P18 | |B |W |B |W |B |W |B |"
## [4] " MI | 15219542 / R: 1530 ->1535 | |B | | | | | | |"
## [5] " MI | 15057092 / R: 1175 ->1125 | |W |B |W |B |B | | |"
## [6] " MI | 15006561 / R: 1163 ->1112 | |B |W |W |B |W |B |B |"
Now, we want to get each field from first_lines_of_records and second_lines_of_records.
Then, the information we want will be contained within different fields in either first line or second line.
Fields are separated by vertical bars (the "|" character).
#Split every record line into its fields on the "|" character.
first_lines_of_records_separate_fields <- str_split(first_lines_of_records, "\\|")
second_lines_of_records_separate_fields <- str_split(second_lines_of_records, "\\|")
#Helper: take field number `position` from every split line and strip leading/trailing whitespace with base R trimws.
field_at <- function(split_lines, position) {
  trimws(vapply(split_lines, function(fields) fields[[position]], character(1)))
}
#Player number: first field of the first line.
#The numbers happen to be in numerical order here, but that will not always be the case, so extract them anyway.
player_ids <- field_at(first_lines_of_records_separate_fields, 1)
#Name and total points: second and third fields of the first line.
#Some fields, like name, need additional processing later; for now just pull each raw field into a vector.
player_names <- field_at(first_lines_of_records_separate_fields, 2)
total_points <- field_at(first_lines_of_records_separate_fields, 3)
#State and the ID/rating string: first and second fields of the second line.
player_states <- field_at(second_lines_of_records_separate_fields, 1)
ratings <- field_at(second_lines_of_records_separate_fields, 2)
#Show all these vectors.
player_ids
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28"
## [29] "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42"
## [43] "43" "44" "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56"
## [57] "57" "58" "59" "60" "61" "62" "63" "64"
player_names
## [1] "GARY HUA" "DAKSHESH DARURI"
## [3] "ADITYA BAJAJ" "PATRICK H SCHILLING"
## [5] "HANSHI ZUO" "HANSEN SONG"
## [7] "GARY DEE SWATHELL" "EZEKIEL HOUGHTON"
## [9] "STEFANO LEE" "ANVIT RAO"
## [11] "CAMERON WILLIAM MC LEMAN" "KENNETH J TACK"
## [13] "TORRANCE HENRY JR" "BRADLEY SHAW"
## [15] "ZACHARY JAMES HOUGHTON" "MIKE NIKITIN"
## [17] "RONALD GRZEGORCZYK" "DAVID SUNDEEN"
## [19] "DIPANKAR ROY" "JASON ZHENG"
## [21] "DINH DANG BUI" "EUGENE L MCCLURE"
## [23] "ALAN BUI" "MICHAEL R ALDRICH"
## [25] "LOREN SCHWIEBERT" "MAX ZHU"
## [27] "GAURAV GIDWANI" "SOFIA ADINA STANESCU-BELLU"
## [29] "CHIEDOZIE OKORIE" "GEORGE AVERY JONES"
## [31] "RISHI SHETTY" "JOSHUA PHILIP MATHEWS"
## [33] "JADE GE" "MICHAEL JEFFERY THOMAS"
## [35] "JOSHUA DAVID LEE" "SIDDHARTH JHA"
## [37] "AMIYATOSH PWNANANDAM" "BRIAN LIU"
## [39] "JOEL R HENDON" "FOREST ZHANG"
## [41] "KYLE WILLIAM MURPHY" "JARED GE"
## [43] "ROBERT GLEN VASEY" "JUSTIN D SCHILLING"
## [45] "DEREK YAN" "JACOB ALEXANDER LAVALLEY"
## [47] "ERIC WRIGHT" "DANIEL KHAIN"
## [49] "MICHAEL J MARTIN" "SHIVAM JHA"
## [51] "TEJAS AYYAGARI" "ETHAN GUO"
## [53] "JOSE C YBARRA" "LARRY HODGE"
## [55] "ALEX KONG" "MARISA RICCI"
## [57] "MICHAEL LU" "VIRAJ MOHILE"
## [59] "SEAN M MC CORMICK" "JULIA SHEN"
## [61] "JEZZEL FARKAS" "ASHWIN BALAJI"
## [63] "THOMAS JOSEPH HOSMER" "BEN LI"
player_states
## [1] "ON" "MI" "MI" "MI" "MI" "OH" "MI" "MI" "ON" "MI" "MI" "MI" "MI" "MI"
## [15] "MI" "MI" "MI" "MI" "MI" "MI" "ON" "MI" "ON" "MI" "MI" "ON" "MI" "MI"
## [29] "MI" "ON" "MI" "ON" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [43] "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI" "MI"
## [57] "MI" "MI" "MI" "MI" "ON" "MI" "MI" "MI"
total_points
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0" "5.0" "5.0" "5.0" "5.0" "4.5"
## [12] "4.5" "4.5" "4.5" "4.5" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0" "4.0"
## [23] "4.0" "4.0" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5" "3.5"
## [34] "3.5" "3.5" "3.5" "3.5" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0" "3.0"
## [45] "3.0" "3.0" "2.5" "2.5" "2.5" "2.5" "2.5" "2.5" "2.0" "2.0" "2.0"
## [56] "2.0" "2.0" "2.0" "2.0" "1.5" "1.5" "1.0" "1.0" "1.0"
ratings
## [1] "15445895 / R: 1794 ->1817" "14598900 / R: 1553 ->1663"
## [3] "14959604 / R: 1384 ->1640" "12616049 / R: 1716 ->1744"
## [5] "14601533 / R: 1655 ->1690" "15055204 / R: 1686 ->1687"
## [7] "11146376 / R: 1649 ->1673" "15142253 / R: 1641P17->1657P24"
## [9] "14954524 / R: 1411 ->1564" "14150362 / R: 1365 ->1544"
## [11] "12581589 / R: 1712 ->1696" "12681257 / R: 1663 ->1670"
## [13] "15082995 / R: 1666 ->1662" "10131499 / R: 1610 ->1618"
## [15] "15619130 / R: 1220P13->1416P20" "10295068 / R: 1604 ->1613"
## [17] "10297702 / R: 1629 ->1610" "11342094 / R: 1600 ->1600"
## [19] "14862333 / R: 1564 ->1570" "14529060 / R: 1595 ->1569"
## [21] "15495066 / R: 1563P22->1562" "12405534 / R: 1555 ->1529"
## [23] "15030142 / R: 1363 ->1371" "13469010 / R: 1229 ->1300"
## [25] "12486656 / R: 1745 ->1681" "15131520 / R: 1579 ->1564"
## [27] "14476567 / R: 1552 ->1539" "14882954 / R: 1507 ->1513"
## [29] "15323285 / R: 1602P6 ->1508P12" "12577178 / R: 1522 ->1444"
## [31] "15131618 / R: 1494 ->1444" "14073750 / R: 1441 ->1433"
## [33] "14691842 / R: 1449 ->1421" "15051807 / R: 1399 ->1400"
## [35] "14601397 / R: 1438 ->1392" "14773163 / R: 1355 ->1367"
## [37] "15489571 / R: 980P12->1077P17" "15108523 / R: 1423 ->1439"
## [39] "12923035 / R: 1436P23->1413" "14892710 / R: 1348 ->1346"
## [41] "15761443 / R: 1403P5 ->1341P9" "14462326 / R: 1332 ->1256"
## [43] "14101068 / R: 1283 ->1244" "15323504 / R: 1199 ->1199"
## [45] "15372807 / R: 1242 ->1191" "15490981 / R: 377P3 ->1076P10"
## [47] "12533115 / R: 1362 ->1341" "14369165 / R: 1382 ->1335"
## [49] "12531685 / R: 1291P12->1259P17" "14773178 / R: 1056 ->1111"
## [51] "15205474 / R: 1011 ->1097" "14918803 / R: 935 ->1092"
## [53] "12578849 / R: 1393 ->1359" "12836773 / R: 1270 ->1200"
## [55] "15412571 / R: 1186 ->1163" "14679887 / R: 1153 ->1140"
## [57] "15113330 / R: 1092 ->1079" "14700365 / R: 917 -> 941"
## [59] "12841036 / R: 853 -> 878" "14579262 / R: 967 -> 984"
## [61] "15771592 / R: 955P11-> 979P18" "15219542 / R: 1530 ->1535"
## [63] "15057092 / R: 1175 ->1125" "15006561 / R: 1163 ->1112"
The name and ratings fields require additional processing.
Let’s focus on the name field first.
We’ll want to have only the first letter of names be capitalized instead of the whole name.
It might be nice to add a period after initials and suffixes like Jr.
Finally, it seems there is a space in between for some last names that have two capitalized letters (like MC CORMICK for McCormick).
There also seems to be an error for one name. Assuming MC CORMICK is actually for McCormick, MCCLURE should be MC CLURE.
Let’s start with last names.
#Step one: repair the McClure entry so it follows the same "MC <NAME>" pattern as the other names.
player_names <- str_replace(player_names, pattern = "MCCLURE", replacement = "MC CLURE")
#Break every name into its whitespace-separated words.
player_names_split <- str_split(player_names, "[[:blank:]]+")
#The logical vectors below record properties that change how a name gets pasted back together.
#Word count per name; most of the checks only matter for names of three or more words.
number_name_fields <- lengths(player_names_split)
has_three_plus_words <- number_name_fields >= 3
#Flag names whose next-to-last word (found with stringr's word function) is "MC".
#"%in% TRUE" turns any NA comparison into FALSE, so every flag is strictly TRUE or FALSE.
two_capitals_last_name <- (has_three_plus_words & word(player_names, -2) == "MC") %in% TRUE
#Flag names that end in the JR suffix.
junior_suffix <- (has_three_plus_words & word(player_names, -1) == "JR") %in% TRUE
#Flag hyphenated last names.
hyphenated_last_name <- str_detect(word(player_names, -1), "\\-")
#Now, let's get each last name with only the appropriate letters capitalized.
#For Jr. suffix, paste this to the second-to-last "word" in the name.
#Use the capitalize function from the Hmisc package to capitalize the first letter of each "word".
#Capitalize function used as suggested here: https://stackoverflow.com/questions/6364783/capitalize-the-first-letter-of-both-words-in-a-two-word-string.
#Combining this with the "tolower" function so only the first letter capitalized instead of the whole thing.
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
#Build last names with only the appropriate letters capitalized.
#Default case: the last word, lower-cased, then given an initial capital with Hmisc's capitalize.
last_names <- capitalize(tolower(word(player_names, -1)))
#"MC <NAME>" surnames: put the "Mc" prefix in front of the already-capitalized last word.
has_mc_prefix <- which(two_capitals_last_name)
last_names[has_mc_prefix] <- paste0("Mc", last_names[has_mc_prefix])
#JR suffix: the surname is the next-to-last word, followed by " Jr.".
has_jr_suffix <- which(junior_suffix)
last_names[has_jr_suffix] <- paste0(capitalize(tolower(word(player_names[has_jr_suffix], -2))), " Jr.")
#Hyphenated surnames: capitalize every hyphen-separated part and join the parts back together.
#The earlier loop rebuilt the name from its first two parts only, so a surname with
#two or more hyphens lost everything after the second part.
is_hyphenated <- which(hyphenated_last_name)
last_names[is_hyphenated] <- vapply(
  str_split(last_names[is_hyphenated], "-"),
  function(parts) paste(capitalize(parts), collapse = "-"),
  character(1)
)
#Let's check our work.
data.frame(Full.name = player_names, Last.name = last_names)
## Full.name Last.name
## 1 GARY HUA Hua
## 2 DAKSHESH DARURI Daruri
## 3 ADITYA BAJAJ Bajaj
## 4 PATRICK H SCHILLING Schilling
## 5 HANSHI ZUO Zuo
## 6 HANSEN SONG Song
## 7 GARY DEE SWATHELL Swathell
## 8 EZEKIEL HOUGHTON Houghton
## 9 STEFANO LEE Lee
## 10 ANVIT RAO Rao
## 11 CAMERON WILLIAM MC LEMAN McLeman
## 12 KENNETH J TACK Tack
## 13 TORRANCE HENRY JR Henry Jr.
## 14 BRADLEY SHAW Shaw
## 15 ZACHARY JAMES HOUGHTON Houghton
## 16 MIKE NIKITIN Nikitin
## 17 RONALD GRZEGORCZYK Grzegorczyk
## 18 DAVID SUNDEEN Sundeen
## 19 DIPANKAR ROY Roy
## 20 JASON ZHENG Zheng
## 21 DINH DANG BUI Bui
## 22 EUGENE L MC CLURE McClure
## 23 ALAN BUI Bui
## 24 MICHAEL R ALDRICH Aldrich
## 25 LOREN SCHWIEBERT Schwiebert
## 26 MAX ZHU Zhu
## 27 GAURAV GIDWANI Gidwani
## 28 SOFIA ADINA STANESCU-BELLU Stanescu-Bellu
## 29 CHIEDOZIE OKORIE Okorie
## 30 GEORGE AVERY JONES Jones
## 31 RISHI SHETTY Shetty
## 32 JOSHUA PHILIP MATHEWS Mathews
## 33 JADE GE Ge
## 34 MICHAEL JEFFERY THOMAS Thomas
## 35 JOSHUA DAVID LEE Lee
## 36 SIDDHARTH JHA Jha
## 37 AMIYATOSH PWNANANDAM Pwnanandam
## 38 BRIAN LIU Liu
## 39 JOEL R HENDON Hendon
## 40 FOREST ZHANG Zhang
## 41 KYLE WILLIAM MURPHY Murphy
## 42 JARED GE Ge
## 43 ROBERT GLEN VASEY Vasey
## 44 JUSTIN D SCHILLING Schilling
## 45 DEREK YAN Yan
## 46 JACOB ALEXANDER LAVALLEY Lavalley
## 47 ERIC WRIGHT Wright
## 48 DANIEL KHAIN Khain
## 49 MICHAEL J MARTIN Martin
## 50 SHIVAM JHA Jha
## 51 TEJAS AYYAGARI Ayyagari
## 52 ETHAN GUO Guo
## 53 JOSE C YBARRA Ybarra
## 54 LARRY HODGE Hodge
## 55 ALEX KONG Kong
## 56 MARISA RICCI Ricci
## 57 MICHAEL LU Lu
## 58 VIRAJ MOHILE Mohile
## 59 SEAN M MC CORMICK McCormick
## 60 JULIA SHEN Shen
## 61 JEZZEL FARKAS Farkas
## 62 ASHWIN BALAJI Balaji
## 63 THOMAS JOSEPH HOSMER Hosmer
## 64 BEN LI Li
Looks good! Now we need to get just the first and middle names/initials.
#First name: the first word, lower-cased and then given an initial capital.
first_names <- capitalize(tolower(word(player_names, 1)))
#An "MC" prefix or a JR suffix uses up one word that really belongs to the last name,
#so count one word fewer for those names.
extra_surname_word <- two_capitals_last_name | junior_suffix
number_name_fields_adjusted <- number_name_fields - as.numeric(extra_surname_word)
#After the adjustment, how many names have each word count?
table(number_name_fields_adjusted)
## number_name_fields_adjusted
## 2 3
## 42 22
#The adjusted counts are only ever 2 or 3, so no one has two middle names.
#A count of 2 means there is no middle name or initial; a count of 3 means the second word is the middle name or initial.
has_middle <- which(number_name_fields_adjusted == 3)
middle_names_or_initials <- character(length(player_names))
middle_names_or_initials[has_middle] <- word(player_names[has_middle], 2)
#Give every non-empty entry an initial capital followed by lower case.
non_empty <- which(nchar(middle_names_or_initials) > 0)
middle_names_or_initials[non_empty] <- capitalize(tolower(middle_names_or_initials[non_empty]))
#A single letter is assumed to be an initial, so it gets a trailing period.
single_letter <- which(nchar(middle_names_or_initials) == 1)
middle_names_or_initials[single_letter] <- paste0(middle_names_or_initials[single_letter], ".")
middle_names_or_initials
## [1] "" "" "" "H." ""
## [6] "" "Dee" "" "" ""
## [11] "William" "J." "" "" "James"
## [16] "" "" "" "" ""
## [21] "Dang" "L." "" "R." ""
## [26] "" "" "Adina" "" "Avery"
## [31] "" "Philip" "" "Jeffery" "David"
## [36] "" "" "" "R." ""
## [41] "William" "" "Glen" "D." ""
## [46] "Alexander" "" "" "J." ""
## [51] "" "" "C." "" ""
## [56] "" "" "" "M." ""
## [61] "" "" "Joseph" ""
Think we are good to go! Combine the first/middle/last information.
Then check everything at once versus the original names to see if it all looks right.
#Join first, middle and last parts with single spaces between them.
player_names_reformatted <- paste(first_names, middle_names_or_initials, last_names, sep = " ")
player_names_reformatted
## [1] "Gary Hua" "Dakshesh Daruri"
## [3] "Aditya Bajaj" "Patrick H. Schilling"
## [5] "Hanshi Zuo" "Hansen Song"
## [7] "Gary Dee Swathell" "Ezekiel Houghton"
## [9] "Stefano Lee" "Anvit Rao"
## [11] "Cameron William McLeman" "Kenneth J. Tack"
## [13] "Torrance Henry Jr." "Bradley Shaw"
## [15] "Zachary James Houghton" "Mike Nikitin"
## [17] "Ronald Grzegorczyk" "David Sundeen"
## [19] "Dipankar Roy" "Jason Zheng"
## [21] "Dinh Dang Bui" "Eugene L. McClure"
## [23] "Alan Bui" "Michael R. Aldrich"
## [25] "Loren Schwiebert" "Max Zhu"
## [27] "Gaurav Gidwani" "Sofia Adina Stanescu-Bellu"
## [29] "Chiedozie Okorie" "George Avery Jones"
## [31] "Rishi Shetty" "Joshua Philip Mathews"
## [33] "Jade Ge" "Michael Jeffery Thomas"
## [35] "Joshua David Lee" "Siddharth Jha"
## [37] "Amiyatosh Pwnanandam" "Brian Liu"
## [39] "Joel R. Hendon" "Forest Zhang"
## [41] "Kyle William Murphy" "Jared Ge"
## [43] "Robert Glen Vasey" "Justin D. Schilling"
## [45] "Derek Yan" "Jacob Alexander Lavalley"
## [47] "Eric Wright" "Daniel Khain"
## [49] "Michael J. Martin" "Shivam Jha"
## [51] "Tejas Ayyagari" "Ethan Guo"
## [53] "Jose C. Ybarra" "Larry Hodge"
## [55] "Alex Kong" "Marisa Ricci"
## [57] "Michael Lu" "Viraj Mohile"
## [59] "Sean M. McCormick" "Julia Shen"
## [61] "Jezzel Farkas" "Ashwin Balaji"
## [63] "Thomas Joseph Hosmer" "Ben Li"
#With an empty middle name the two separators end up back to back, leaving an extra space.
#Rebuild those entries from the first and last name only.
no_middle <- which(!nzchar(middle_names_or_initials))
player_names_reformatted[no_middle] <- paste(first_names[no_middle], last_names[no_middle])
data.frame(Original = player_names, Reformat = player_names_reformatted)
## Original Reformat
## 1 GARY HUA Gary Hua
## 2 DAKSHESH DARURI Dakshesh Daruri
## 3 ADITYA BAJAJ Aditya Bajaj
## 4 PATRICK H SCHILLING Patrick H. Schilling
## 5 HANSHI ZUO Hanshi Zuo
## 6 HANSEN SONG Hansen Song
## 7 GARY DEE SWATHELL Gary Dee Swathell
## 8 EZEKIEL HOUGHTON Ezekiel Houghton
## 9 STEFANO LEE Stefano Lee
## 10 ANVIT RAO Anvit Rao
## 11 CAMERON WILLIAM MC LEMAN Cameron William McLeman
## 12 KENNETH J TACK Kenneth J. Tack
## 13 TORRANCE HENRY JR Torrance Henry Jr.
## 14 BRADLEY SHAW Bradley Shaw
## 15 ZACHARY JAMES HOUGHTON Zachary James Houghton
## 16 MIKE NIKITIN Mike Nikitin
## 17 RONALD GRZEGORCZYK Ronald Grzegorczyk
## 18 DAVID SUNDEEN David Sundeen
## 19 DIPANKAR ROY Dipankar Roy
## 20 JASON ZHENG Jason Zheng
## 21 DINH DANG BUI Dinh Dang Bui
## 22 EUGENE L MC CLURE Eugene L. McClure
## 23 ALAN BUI Alan Bui
## 24 MICHAEL R ALDRICH Michael R. Aldrich
## 25 LOREN SCHWIEBERT Loren Schwiebert
## 26 MAX ZHU Max Zhu
## 27 GAURAV GIDWANI Gaurav Gidwani
## 28 SOFIA ADINA STANESCU-BELLU Sofia Adina Stanescu-Bellu
## 29 CHIEDOZIE OKORIE Chiedozie Okorie
## 30 GEORGE AVERY JONES George Avery Jones
## 31 RISHI SHETTY Rishi Shetty
## 32 JOSHUA PHILIP MATHEWS Joshua Philip Mathews
## 33 JADE GE Jade Ge
## 34 MICHAEL JEFFERY THOMAS Michael Jeffery Thomas
## 35 JOSHUA DAVID LEE Joshua David Lee
## 36 SIDDHARTH JHA Siddharth Jha
## 37 AMIYATOSH PWNANANDAM Amiyatosh Pwnanandam
## 38 BRIAN LIU Brian Liu
## 39 JOEL R HENDON Joel R. Hendon
## 40 FOREST ZHANG Forest Zhang
## 41 KYLE WILLIAM MURPHY Kyle William Murphy
## 42 JARED GE Jared Ge
## 43 ROBERT GLEN VASEY Robert Glen Vasey
## 44 JUSTIN D SCHILLING Justin D. Schilling
## 45 DEREK YAN Derek Yan
## 46 JACOB ALEXANDER LAVALLEY Jacob Alexander Lavalley
## 47 ERIC WRIGHT Eric Wright
## 48 DANIEL KHAIN Daniel Khain
## 49 MICHAEL J MARTIN Michael J. Martin
## 50 SHIVAM JHA Shivam Jha
## 51 TEJAS AYYAGARI Tejas Ayyagari
## 52 ETHAN GUO Ethan Guo
## 53 JOSE C YBARRA Jose C. Ybarra
## 54 LARRY HODGE Larry Hodge
## 55 ALEX KONG Alex Kong
## 56 MARISA RICCI Marisa Ricci
## 57 MICHAEL LU Michael Lu
## 58 VIRAJ MOHILE Viraj Mohile
## 59 SEAN M MC CORMICK Sean M. McCormick
## 60 JULIA SHEN Julia Shen
## 61 JEZZEL FARKAS Jezzel Farkas
## 62 ASHWIN BALAJI Ashwin Balaji
## 63 THOMAS JOSEPH HOSMER Thomas Joseph Hosmer
## 64 BEN LI Ben Li
For the ratings field, we can simplify a lot by taking everything before the arrow “->”.
This is because we are interested in pre-ratings only.
Once we get everything before the “->”, take the last part of THAT, which will always be after a colon.
#Everything before the "->" arrow is the ID plus the pre-rating, so keep only that piece.
pieces_around_arrow <- str_split(ratings, "->")
ratings <- vapply(pieces_around_arrow, function(pieces) pieces[[1]], character(1))
#Strip the padding that sat between the pre-rating and the arrow.
ratings <- trimws(ratings, which = "right")
ratings
## [1] "15445895 / R: 1794" "14598900 / R: 1553"
## [3] "14959604 / R: 1384" "12616049 / R: 1716"
## [5] "14601533 / R: 1655" "15055204 / R: 1686"
## [7] "11146376 / R: 1649" "15142253 / R: 1641P17"
## [9] "14954524 / R: 1411" "14150362 / R: 1365"
## [11] "12581589 / R: 1712" "12681257 / R: 1663"
## [13] "15082995 / R: 1666" "10131499 / R: 1610"
## [15] "15619130 / R: 1220P13" "10295068 / R: 1604"
## [17] "10297702 / R: 1629" "11342094 / R: 1600"
## [19] "14862333 / R: 1564" "14529060 / R: 1595"
## [21] "15495066 / R: 1563P22" "12405534 / R: 1555"
## [23] "15030142 / R: 1363" "13469010 / R: 1229"
## [25] "12486656 / R: 1745" "15131520 / R: 1579"
## [27] "14476567 / R: 1552" "14882954 / R: 1507"
## [29] "15323285 / R: 1602P6" "12577178 / R: 1522"
## [31] "15131618 / R: 1494" "14073750 / R: 1441"
## [33] "14691842 / R: 1449" "15051807 / R: 1399"
## [35] "14601397 / R: 1438" "14773163 / R: 1355"
## [37] "15489571 / R: 980P12" "15108523 / R: 1423"
## [39] "12923035 / R: 1436P23" "14892710 / R: 1348"
## [41] "15761443 / R: 1403P5" "14462326 / R: 1332"
## [43] "14101068 / R: 1283" "15323504 / R: 1199"
## [45] "15372807 / R: 1242" "15490981 / R: 377P3"
## [47] "12533115 / R: 1362" "14369165 / R: 1382"
## [49] "12531685 / R: 1291P12" "14773178 / R: 1056"
## [51] "15205474 / R: 1011" "14918803 / R: 935"
## [53] "12578849 / R: 1393" "12836773 / R: 1270"
## [55] "15412571 / R: 1186" "14679887 / R: 1153"
## [57] "15113330 / R: 1092" "14700365 / R: 917"
## [59] "12841036 / R: 853" "14579262 / R: 967"
## [61] "15771592 / R: 955P11" "15219542 / R: 1530"
## [63] "15057092 / R: 1175" "15006561 / R: 1163"
#The pre-rating is the piece after the colon; take it and trim the surrounding spaces.
pieces_around_colon <- str_split(ratings, ":")
ratings <- trimws(vapply(pieces_around_colon, `[[`, character(1), 2))
ratings
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649"
## [8] "1641P17" "1411" "1365" "1712" "1663" "1666" "1610"
## [15] "1220P13" "1604" "1629" "1600" "1564" "1595" "1563P22"
## [22] "1555" "1363" "1229" "1745" "1579" "1552" "1507"
## [29] "1602P6" "1522" "1494" "1441" "1449" "1399" "1438"
## [36] "1355" "980P12" "1423" "1436P23" "1348" "1403P5" "1332"
## [43] "1283" "1199" "1242" "377P3" "1362" "1382" "1291P12"
## [50] "1056" "1011" "935" "1393" "1270" "1186" "1153"
## [57] "1092" "917" "853" "967" "955P11" "1530" "1175"
## [64] "1163"
Finally, if there is any provisional notation, remove this by taking only the part before the “P” where appropriate.
#A provisional rating looks like "1641P17"; keeping the piece before the "P" leaves just the rating.
ratings <- vapply(str_split(ratings, "P"), function(pieces) pieces[[1]], character(1))
ratings
## [1] "1794" "1553" "1384" "1716" "1655" "1686" "1649" "1641" "1411" "1365"
## [11] "1712" "1663" "1666" "1610" "1220" "1604" "1629" "1600" "1564" "1595"
## [21] "1563" "1555" "1363" "1229" "1745" "1579" "1552" "1507" "1602" "1522"
## [31] "1494" "1441" "1449" "1399" "1438" "1355" "980" "1423" "1436" "1348"
## [41] "1403" "1332" "1283" "1199" "1242" "377" "1362" "1382" "1291" "1056"
## [51] "1011" "935" "1393" "1270" "1186" "1153" "1092" "917" "853" "967"
## [61] "955" "1530" "1175" "1163"
Looks good! We’ll need to convert to numeric, but we can do that later when we are assembling the data frame.
We’ll need to process the remaining fields in first_lines_of_records_separate_fields (4th and onward) to get the player numbers of each player’s opponents.
First, get just these values from first_lines_of_records_separate_fields. These will be fields 4-10.
#Fields 4 through 10 of each first line hold the results for rounds 1 through 7.
players_per_round <- lapply(first_lines_of_records_separate_fields, function(fields) fields[4:10])
Now, use the word function from stringr to get the last “word” of each value in players_per_round.
We combine this with the trimws function so we will always extract either a number or a letter, never an empty string.
#Trim each round field first, then keep its last whitespace-separated word.
players_per_round <- lapply(players_per_round, function(round_fields) {
  trimmed <- trimws(round_fields)
  word(trimmed, -1)
})
Finally, we extract only numbers, not letters.
Use the “+” to extract two-digit numbers as one string.
#Keep only runs of digits, so rounds without an opponent number contribute nothing.
players_per_round <- lapply(players_per_round, function(round_fields) {
  unlist(str_extract_all(round_fields, "\\d+"))
})
Let’s paste this together with the original strings to check our work.
#Comma-join the raw round fields and the extracted opponent numbers for each player,
#then show the two side by side.
join_with_commas <- function(values) paste0(values, collapse = ",")
raw_round_fields <- lapply(first_lines_of_records_separate_fields, function(fields) fields[4:10])
original_strings_for_players_per_round <- vapply(raw_round_fields, join_with_commas, character(1))
players_per_round_as_vector <- vapply(players_per_round, join_with_commas, character(1))
data.frame(Original.string = original_strings_for_players_per_round, New.string = players_per_round_as_vector)
## Original.string New.string
## 1 W 39,W 21,W 18,W 14,W 7,D 12,D 4 39,21,18,14,7,12,4
## 2 W 63,W 58,L 4,W 17,W 16,W 20,W 7 63,58,4,17,16,20,7
## 3 L 8,W 61,W 25,W 21,W 11,W 13,W 12 8,61,25,21,11,13,12
## 4 W 23,D 28,W 2,W 26,D 5,W 19,D 1 23,28,2,26,5,19,1
## 5 W 45,W 37,D 12,D 13,D 4,W 14,W 17 45,37,12,13,4,14,17
## 6 W 34,D 29,L 11,W 35,D 10,W 27,W 21 34,29,11,35,10,27,21
## 7 W 57,W 46,W 13,W 11,L 1,W 9,L 2 57,46,13,11,1,9,2
## 8 W 3,W 32,L 14,L 9,W 47,W 28,W 19 3,32,14,9,47,28,19
## 9 W 25,L 18,W 59,W 8,W 26,L 7,W 20 25,18,59,8,26,7,20
## 10 D 16,L 19,W 55,W 31,D 6,W 25,W 18 16,19,55,31,6,25,18
## 11 D 38,W 56,W 6,L 7,L 3,W 34,W 26 38,56,6,7,3,34,26
## 12 W 42,W 33,D 5,W 38,H ,D 1,L 3 42,33,5,38,1,3
## 13 W 36,W 27,L 7,D 5,W 33,L 3,W 32 36,27,7,5,33,3,32
## 14 W 54,W 44,W 8,L 1,D 27,L 5,W 31 54,44,8,1,27,5,31
## 15 D 19,L 16,W 30,L 22,W 54,W 33,W 38 19,16,30,22,54,33,38
## 16 D 10,W 15,H ,W 39,L 2,W 36,U 10,15,39,2,36
## 17 W 48,W 41,L 26,L 2,W 23,W 22,L 5 48,41,26,2,23,22,5
## 18 W 47,W 9,L 1,W 32,L 19,W 38,L 10 47,9,1,32,19,38,10
## 19 D 15,W 10,W 52,D 28,W 18,L 4,L 8 15,10,52,28,18,4,8
## 20 L 40,W 49,W 23,W 41,W 28,L 2,L 9 40,49,23,41,28,2,9
## 21 W 43,L 1,W 47,L 3,W 40,W 39,L 6 43,1,47,3,40,39,6
## 22 W 64,D 52,L 28,W 15,H ,L 17,W 40 64,52,28,15,17,40
## 23 L 4,W 43,L 20,W 58,L 17,W 37,W 46 4,43,20,58,17,37,46
## 24 L 28,L 47,W 43,L 25,W 60,W 44,W 39 28,47,43,25,60,44,39
## 25 L 9,W 53,L 3,W 24,D 34,L 10,W 47 9,53,3,24,34,10,47
## 26 W 49,W 40,W 17,L 4,L 9,D 32,L 11 49,40,17,4,9,32,11
## 27 W 51,L 13,W 46,W 37,D 14,L 6,U 51,13,46,37,14,6
## 28 W 24,D 4,W 22,D 19,L 20,L 8,D 36 24,4,22,19,20,8,36
## 29 W 50,D 6,L 38,L 34,W 52,W 48,U 50,6,38,34,52,48
## 30 L 52,D 64,L 15,W 55,L 31,W 61,W 50 52,64,15,55,31,61,50
## 31 L 58,D 55,W 64,L 10,W 30,W 50,L 14 58,55,64,10,30,50,14
## 32 W 61,L 8,W 44,L 18,W 51,D 26,L 13 61,8,44,18,51,26,13
## 33 W 60,L 12,W 50,D 36,L 13,L 15,W 51 60,12,50,36,13,15,51
## 34 L 6,W 60,L 37,W 29,D 25,L 11,W 52 6,60,37,29,25,11,52
## 35 L 46,L 38,W 56,L 6,W 57,D 52,W 48 46,38,56,6,57,52,48
## 36 L 13,W 57,W 51,D 33,H ,L 16,D 28 13,57,51,33,16,28
## 37 B ,L 5,W 34,L 27,H ,L 23,W 61 5,34,27,23,61
## 38 D 11,W 35,W 29,L 12,H ,L 18,L 15 11,35,29,12,18,15
## 39 L 1,W 54,W 40,L 16,W 44,L 21,L 24 1,54,40,16,44,21,24
## 40 W 20,L 26,L 39,W 59,L 21,W 56,L 22 20,26,39,59,21,56,22
## 41 W 59,L 17,W 58,L 20,X ,U ,U 59,17,58,20
## 42 L 12,L 50,L 57,D 60,D 61,W 64,W 56 12,50,57,60,61,64,56
## 43 L 21,L 23,L 24,W 63,W 59,L 46,W 55 21,23,24,63,59,46,55
## 44 B ,L 14,L 32,W 53,L 39,L 24,W 59 14,32,53,39,24,59
## 45 L 5,L 51,D 60,L 56,W 63,D 55,W 58 5,51,60,56,63,55,58
## 46 W 35,L 7,L 27,L 50,W 64,W 43,L 23 35,7,27,50,64,43,23
## 47 L 18,W 24,L 21,W 61,L 8,D 51,L 25 18,24,21,61,8,51,25
## 48 L 17,W 63,H ,D 52,H ,L 29,L 35 17,63,52,29,35
## 49 L 26,L 20,D 63,D 64,W 58,H ,U 26,20,63,64,58
## 50 L 29,W 42,L 33,W 46,H ,L 31,L 30 29,42,33,46,31,30
## 51 L 27,W 45,L 36,W 57,L 32,D 47,L 33 27,45,36,57,32,47,33
## 52 W 30,D 22,L 19,D 48,L 29,D 35,L 34 30,22,19,48,29,35,34
## 53 H ,L 25,H ,L 44,U ,W 57,U 25,44,57
## 54 L 14,L 39,L 61,B ,L 15,L 59,W 64 14,39,61,15,59,64
## 55 L 62,D 31,L 10,L 30,B ,D 45,L 43 62,31,10,30,45,43
## 56 H ,L 11,L 35,W 45,H ,L 40,L 42 11,35,45,40,42
## 57 L 7,L 36,W 42,L 51,L 35,L 53,B 7,36,42,51,35,53
## 58 W 31,L 2,L 41,L 23,L 49,B ,L 45 31,2,41,23,49,45
## 59 L 41,B ,L 9,L 40,L 43,W 54,L 44 41,9,40,43,54,44
## 60 L 33,L 34,D 45,D 42,L 24,H ,U 33,34,45,42,24
## 61 L 32,L 3,W 54,L 47,D 42,L 30,L 37 32,3,54,47,42,30,37
## 62 W 55,U ,U ,U ,U ,U ,U 55
## 63 L 2,L 48,D 49,L 43,L 45,H ,U 2,48,49,43,45
## 64 L 22,D 30,L 31,D 49,L 46,L 42,L 54 22,30,31,49,46,42,54
Looks good!
Now, we need to use players_per_round to get the average pre-rating for the other players.
Then, combine this information with all the other information we need.
#Pre-ratings were kept as strings until now; make them numeric so they can be averaged.
ratings <- as.numeric(ratings)
#Store each player's opponent numbers as a numeric vector (players_per_round is updated, as before).
players_per_round <- lapply(players_per_round, as.numeric)
#Find each opponent's row by player number instead of using the number directly as a row index.
#The two agree when rows are in player-number order, but matching stays correct when they are not.
pair_numbers <- as.numeric(player_ids)
#vapply builds the result in one pass (no vector grown with c() inside a loop)
#and returns numeric(0) rather than failing when there are no players.
#A player with no opponents gets NaN, the mean of an empty vector.
mean_rating_other_players <- vapply(
  players_per_round,
  function(opponents) mean(ratings[match(opponents, pair_numbers)]),
  numeric(1)
)
mean_rating_other_players <- round(mean_rating_other_players, digits = 0)
#Assemble everything needed for the output file, one row per player.
data_frame_for_csv <- data.frame(Player.name = player_names_reformatted,
                                 State = player_states,
                                 Total.points = as.numeric(total_points),
                                 Pre.rating = ratings,
                                 Average.pre.rating.opponents = mean_rating_other_players,
                                 stringsAsFactors = FALSE)
#Increase display width a bit so we can see everything side-by-side.
options(width = 100)
data_frame_for_csv
## Player.name State Total.points Pre.rating Average.pre.rating.opponents
## 1 Gary Hua ON 6.0 1794 1605
## 2 Dakshesh Daruri MI 6.0 1553 1469
## 3 Aditya Bajaj MI 6.0 1384 1564
## 4 Patrick H. Schilling MI 5.5 1716 1574
## 5 Hanshi Zuo MI 5.5 1655 1501
## 6 Hansen Song OH 5.0 1686 1519
## 7 Gary Dee Swathell MI 5.0 1649 1372
## 8 Ezekiel Houghton MI 5.0 1641 1468
## 9 Stefano Lee ON 5.0 1411 1523
## 10 Anvit Rao MI 5.0 1365 1554
## 11 Cameron William McLeman MI 4.5 1712 1468
## 12 Kenneth J. Tack MI 4.5 1663 1506
## 13 Torrance Henry Jr. MI 4.5 1666 1498
## 14 Bradley Shaw MI 4.5 1610 1515
## 15 Zachary James Houghton MI 4.5 1220 1484
## 16 Mike Nikitin MI 4.0 1604 1386
## 17 Ronald Grzegorczyk MI 4.0 1629 1499
## 18 David Sundeen MI 4.0 1600 1480
## 19 Dipankar Roy MI 4.0 1564 1426
## 20 Jason Zheng MI 4.0 1595 1411
## 21 Dinh Dang Bui ON 4.0 1563 1470
## 22 Eugene L. McClure MI 4.0 1555 1300
## 23 Alan Bui ON 4.0 1363 1214
## 24 Michael R. Aldrich MI 4.0 1229 1357
## 25 Loren Schwiebert MI 3.5 1745 1363
## 26 Max Zhu ON 3.5 1579 1507
## 27 Gaurav Gidwani MI 3.5 1552 1222
## 28 Sofia Adina Stanescu-Bellu MI 3.5 1507 1522
## 29 Chiedozie Okorie MI 3.5 1602 1314
## 30 George Avery Jones ON 3.5 1522 1144
## 31 Rishi Shetty MI 3.5 1494 1260
## 32 Joshua Philip Mathews ON 3.5 1441 1379
## 33 Jade Ge MI 3.5 1449 1277
## 34 Michael Jeffery Thomas MI 3.5 1399 1375
## 35 Joshua David Lee MI 3.5 1438 1150
## 36 Siddharth Jha MI 3.5 1355 1388
## 37 Amiyatosh Pwnanandam MI 3.5 980 1385
## 38 Brian Liu MI 3.0 1423 1539
## 39 Joel R. Hendon MI 3.0 1436 1430
## 40 Forest Zhang MI 3.0 1348 1391
## 41 Kyle William Murphy MI 3.0 1403 1248
## 42 Jared Ge MI 3.0 1332 1150
## 43 Robert Glen Vasey MI 3.0 1283 1107
## 44 Justin D. Schilling MI 3.0 1199 1327
## 45 Derek Yan MI 3.0 1242 1152
## 46 Jacob Alexander Lavalley MI 3.0 377 1358
## 47 Eric Wright MI 2.5 1362 1392
## 48 Daniel Khain MI 2.5 1382 1356
## 49 Michael J. Martin MI 2.5 1291 1286
## 50 Shivam Jha MI 2.5 1056 1296
## 51 Tejas Ayyagari MI 2.5 1011 1356
## 52 Ethan Guo MI 2.5 935 1495
## 53 Jose C. Ybarra MI 2.0 1393 1345
## 54 Larry Hodge MI 2.0 1270 1206
## 55 Alex Kong MI 2.0 1186 1406
## 56 Marisa Ricci MI 2.0 1153 1414
## 57 Michael Lu MI 2.0 1092 1363
## 58 Viraj Mohile MI 2.0 917 1391
## 59 Sean M. McCormick MI 2.0 853 1319
## 60 Julia Shen MI 1.5 967 1330
## 61 Jezzel Farkas ON 1.5 955 1327
## 62 Ashwin Balaji MI 1.0 1530 1186
## 63 Thomas Joseph Hosmer MI 1.0 1175 1350
## 64 Ben Li MI 1.0 1163 1263
Finally, let’s change column names, then print to a CSV file.
#Swap in the descriptive column headings wanted in the output file.
names(data_frame_for_csv) <- c(
  "Player’s Name",
  "Player’s State",
  "Total Number of Points",
  "Player’s Pre-Rating",
  "Average Pre-Chess Rating of Opponents"
)
#Write a comma-separated file with a header row, quoted strings and no row names.
write.table(
  data_frame_for_csv,
  file = "tournamentinfo_processed.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE,
  col.names = TRUE
)