Load Package
- Read all lines
- Remove lines that aren't required, such as the "----" separator lines
- Split the lines into two lists. One for the rounds info and another with pregame and state info
- Using regular expressions, parse all required items and store them in individual lists. All individual lists must have 64 items, one per player. I show the length of each list to make sure all lists are the same size.
- Use the individual lists to create a data frame and write it to a file.
require(stringr)
## Loading required package: stringr
Load data and clean up
# Read the raw tournament report, one vector element per line of text.
# warn = FALSE suppresses readLines() warnings (spelled out as FALSE because
# the F alias is an ordinary variable that can be reassigned).
raw_file_lines <- readLines('tournamentinfo.txt', warn = FALSE)
# Keep only the lines that contain at least one letter or digit; this drops
# the "----" separator lines. Plain logical indexing is used instead of
# subset(), which is meant for interactive use.
file_lines <- raw_file_lines[str_detect(raw_file_lines, '[[:alnum:]]')]
Data Summary
#Summary of the original file. The original file contains 196 lines, including the "----" lines.
summary(raw_file_lines)
## Length Class Mode
## 196 character character
#Summary of the lines after removing "----" lines
summary(file_lines)
## Length Class Mode
## 130 character character
Parse lines
# The cleaned lines are read in alternating fashion starting at position 3
# (the first two entries are skipped -- presumably the column headers).
# Odd positions (3, 5, ...) are the player lines: pair number, name, total
# points and one field per round. Split each of them on the "|" delimiter.
player_row <- str_split(
  string = file_lines[seq(from = 3, to = length(file_lines), by = 2)],
  pattern = '[|]'
)
head(player_row, n=1)
## [[1]]
## [1] " 1 "
## [2] " GARY HUA "
## [3] "6.0 "
## [4] "W 39"
## [5] "W 21"
## [6] "W 18"
## [7] "W 14"
## [8] "W 7"
## [9] "D 12"
## [10] "D 4"
## [11] ""
# Even positions (4, 6, ...) are the companion lines that follow each player
# line: state, the id / pre-rating -> post-rating text, and further per-round
# fields. Split each of them on the "|" delimiter as well.
ratings_row <- str_split(
  string = file_lines[seq(from = 4, to = length(file_lines), by = 2)],
  pattern = '[|]'
)
head(ratings_row, n = 1)
## [[1]]
## [1] " ON "
## [2] " 15445895 / R: 1794 ->1817 "
## [3] "N:2 "
## [4] "W "
## [5] "B "
## [6] "W "
## [7] "B "
## [8] "W "
## [9] "B "
## [10] "W "
## [11] ""
#Both lists are of the same size.
length(ratings_row)
## [1] 64
length(player_row)
## [1] 64
Player names
# The player's name is the second "|"-delimited field of each player line;
# strip the padding around it. vapply() yields exactly one string per player.
names <- vapply(
  player_row,
  function(fields) str_trim(fields[2]),
  character(1)
)
head(names)
## [1] "GARY HUA" "DAKSHESH DARURI" "ADITYA BAJAJ"
## [4] "PATRICK H SCHILLING" "HANSHI ZUO" "HANSEN SONG"
# Number of players
length(names)
## [1] 64
Player total points
# Total points are the third field of each player line (e.g. "6.0 ");
# convert the text to numbers.
points_text <- NULL
player_points <- as.numeric(
  vapply(player_row, function(fields) fields[3], character(1))
)
head(player_points)
## [1] 6.0 6.0 6.0 5.5 5.5 5.0
length(player_points)
## [1] 64
Player states
# The state code is the first field of each ratings line (e.g. " ON ");
# strip the surrounding whitespace.
state <- vapply(
  ratings_row,
  function(fields) str_trim(fields[1]),
  character(1)
)
head(state)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"
length(state)
## [1] 64
Player pre ratings
# The second column of each ratings line holds the id and rating text,
# e.g. " 15445895 / R: 1794 ->1817 "; collect that raw text per player.
ratings_col <- vapply(
  ratings_row,
  function(fields) fields[2],
  character(1)
)
head(ratings_col)
## [1] " 15445895 / R: 1794 ->1817 " " 14598900 / R: 1553 ->1663 "
## [3] " 14959604 / R: 1384 ->1640 " " 12616049 / R: 1716 ->1744 "
## [5] " 14601533 / R: 1655 ->1690 " " 15055204 / R: 1686 ->1687 "
length(ratings_col)
## [1] 64
# Extract the pre-rating from text such as " 15445895 / R: 1794 ->1817 ".
# The inner str_extract() grabs the first colon together with the digits that
# follow it (": 1794"); the outer one keeps only those digits, which are then
# converted to numbers. str_extract() is vectorised over its input, so the
# whole column is handled in one call and the result is always a numeric
# vector with one entry per element of ratings_col.
player_ratings <- as.numeric(
  str_extract(str_extract(ratings_col, ":\\s*\\d{1,}"), "\\d{1,}")
)
head(player_ratings)
## [1] 1794 1553 1384 1716 1655 1686
length(player_ratings)
## [1] 64
Opponents rating
# Fields 4 through 10 of each player line hold the per-round results
# (e.g. "W 39"); keep just those seven fields for every player.
player_rounds <- lapply(player_row, `[`, 4:10)
head(player_rounds, n = 1)
## [[1]]
## [1] "W 39" "W 21" "W 18" "W 14" "W 7" "D 12" "D 4"
length(player_rounds)
## [1] 64
# Get the opponents' ids using a regular expression: pull every run of digits
# out of a player's round fields and flatten the matches into one character
# vector per player. Round fields without digits contribute nothing.
opponents <- lapply(
  player_rounds,
  function(rounds) unlist(str_extract_all(rounds, '\\d{1,}'))
)
head(opponents , n =1)
## [[1]]
## [1] "39" "21" "18" "14" "7" "12" "4"
length(opponents)
## [1] 64
# Average pre-rating of each player's opponents. Opponent ids are used as
# positions into player_ratings; the looked-up ratings are summed, divided by
# the number of opponents and rounded to a whole number.
opponents_ratings <- vapply(
  opponents,
  function(ids) {
    opponent_pre <- player_ratings[as.numeric(ids)]
    round(sum(opponent_pre) / length(ids))
  },
  FUN.VALUE = numeric(1)
)
head(opponents_ratings, n =1)
## [1] 1605
length(opponents_ratings)
## [1] 64
Create a data frame and view summary
# Assemble the per-player vectors into one table and give the columns their
# report headings in a single step.
df <- setNames(
  data.frame(names, state, player_points, player_ratings, opponents_ratings),
  c(
    'Name',
    'State',
    'Total Points',
    'Pre-Rating',
    'Average Pre-Rating of Opponents'
  )
)
nrow(df)
## [1] 64
head(df)
## Name State Total Points Pre-Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## Average Pre-Rating of Opponents
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
Write to file
# Save the table as comma-separated text, without row names and without
# quoting character fields.
write.table(
  x = df,
  file = 'output.txt',
  quote = FALSE,
  sep = ',',
  row.names = FALSE
)
Read the file
# Load the exported file back in to check that it parses as CSV.
data <- read.csv(file = 'output.txt')
head(data, n =10)
## Name State Total.Points Pre.Rating
## 1 GARY HUA ON 6.0 1794
## 2 DAKSHESH DARURI MI 6.0 1553
## 3 ADITYA BAJAJ MI 6.0 1384
## 4 PATRICK H SCHILLING MI 5.5 1716
## 5 HANSHI ZUO MI 5.5 1655
## 6 HANSEN SONG OH 5.0 1686
## 7 GARY DEE SWATHELL MI 5.0 1649
## 8 EZEKIEL HOUGHTON MI 5.0 1641
## 9 STEFANO LEE ON 5.0 1411
## 10 ANVIT RAO MI 5.0 1365
## Average.Pre.Rating.of.Opponents
## 1 1605
## 2 1469
## 3 1564
## 4 1574
## 5 1501
## 6 1519
## 7 1372
## 8 1468
## 9 1523
## 10 1554
nrow(data)
## [1] 64