Project 1

Load Package

Read all lines
Remove lines that aren't required, such as the "----" separator lines
Split the lines into two lists. One for the rounds info and another with pregame and state info
Using regular expressions, parse all required items and store them in individual lists. All individual lists must have 64 items, one per player. I show the length of each list to make sure all lists are the same size.
Use the individual lists to create a data frame and write it to a file.

# library() stops with an error when stringr is not installed; require() would
# only return FALSE and let the script fail later at the first str_*() call.
library(stringr)

Load data and clean up

# Read the raw tournament table, one string per line of the file.
# warn = FALSE suppresses readLines()' warning about a missing final newline.
raw_file_lines <- readLines("tournamentinfo.txt", warn = FALSE)

# Keep only the lines that contain at least one letter or digit. This drops
# the "----" separator lines (and any empty lines).
file_lines <- raw_file_lines[str_detect(raw_file_lines, "[[:alnum:]]")]

Data Summary

# Summary of the original file: it contains 196 lines, including the "----" separator lines.
summary(raw_file_lines)

##    Length     Class      Mode 
##       196 character character

# Summary of the lines that remain after removing the "----" separator lines.
summary(file_lines)

##    Length     Class      Mode 
##       130 character character

Parse lines

# From the third retained line onward the rows alternate between two kinds.
# Every other line starting at line 3 is a player row (number, name, total
# points and the per-round results); it is split into fields on "|".
# The first two retained lines are skipped (presumably the table header --
# confirm against the input file).
player_row <- str_split(
  string = file_lines[seq(from = 3, to = length(file_lines), by = 2)],
  pattern = "[|]"
)
head(player_row, n = 1)

## [[1]]
##  [1] "    1 "                           
##  [2] " GARY HUA                        "
##  [3] "6.0  "                            
##  [4] "W  39"                            
##  [5] "W  21"                            
##  [6] "W  18"                            
##  [7] "W  14"                            
##  [8] "W   7"                            
##  [9] "D  12"                            
## [10] "D   4"                            
## [11] ""

# Every other line starting at line 4 is a ratings row: it carries the
# player's state and pre-game rating, split into fields on "|".
ratings_row <- str_split(
  string = file_lines[seq(from = 4, to = length(file_lines), by = 2)],
  pattern = "[|]"
)
head(ratings_row, n = 1)

## [[1]]
##  [1] "   ON "                           
##  [2] " 15445895 / R: 1794   ->1817     "
##  [3] "N:2  "                            
##  [4] "W    "                            
##  [5] "B    "                            
##  [6] "W    "                            
##  [7] "B    "                            
##  [8] "W    "                            
##  [9] "B    "                            
## [10] "W    "                            
## [11] ""

# Sanity check: both lists should have the same length, one entry per player.
length(ratings_row)

## [1] 64

length(player_row)

## [1] 64

Player names

# Player name: the second "|"-delimited field of each player row, with the
# padding whitespace trimmed. vapply() guarantees exactly one string per row
# and returns character(0) rather than NULL if there are no rows.
# NOTE: this variable shadows base::names(); calls such as names(df) still
# resolve to the function.
names <- vapply(
  player_row,
  function(fields) str_trim(fields[2]),
  FUN.VALUE = character(1)
)
head(names)

## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

# Number of players
length(names)

## [1] 64

Player total points

# Total points: the third field of each player row (e.g. "6.0  ").
# vapply() enforces one string per row; as.numeric() ignores the padding
# whitespace around the number.
player_points <- as.numeric(
  vapply(player_row, function(fields) fields[3], FUN.VALUE = character(1))
)

head(player_points)

## [1] 6.0 6.0 6.0 5.5 5.5 5.0

length(player_points)

## [1] 64

Player states

# Player state: the first field of each ratings row (e.g. "   ON "), trimmed.
# vapply() enforces exactly one string per row.
state <- vapply(
  ratings_row,
  function(fields) str_trim(fields[1]),
  FUN.VALUE = character(1)
)

head(state)

## [1] "ON" "MI" "MI" "MI" "MI" "OH"

length(state)

## [1] 64

Player pre ratings

# The second column of each ratings row contains the pre-rating, embedded in
# a string such as " 15445895 / R: 1794   ->1817     ". Keep the raw string
# here; the number is extracted in the next step.
ratings_col <- vapply(
  ratings_row,
  function(fields) fields[2],
  FUN.VALUE = character(1)
)

head(ratings_col)

## [1] " 15445895 / R: 1794   ->1817     " " 14598900 / R: 1553   ->1663     "
## [3] " 14959604 / R: 1384   ->1640     " " 12616049 / R: 1716   ->1744     "
## [5] " 14601533 / R: 1655   ->1690     " " 15055204 / R: 1686   ->1687     "

length(ratings_col)

## [1] 64

# Pre-rating via two regular expressions: first take the ":" together with
# the digits that follow it (e.g. ": 1794"), then keep only the digits.
# str_extract() is vectorised, so the whole column is handled in one call
# and the result is always a numeric vector (no per-element lapply needed).
player_ratings <- as.numeric(
  str_extract(str_extract(ratings_col, ":\\s*\\d{1,}"), "\\d{1,}")
)

head(player_ratings)

## [1] 1794 1553 1384 1716 1655 1686

length(player_ratings)

## [1] 64

Opponents rating

# Round results: fields 4 through 10 of each player row, one entry per round
# (e.g. "W  39").
player_rounds <- lapply(player_row, `[`, 4:10)
head(player_rounds, n = 1)

## [[1]]
## [1] "W  39" "W  21" "W  18" "W  14" "W   7" "D  12" "D   4"

length(player_rounds)

## [1] 64

# Opponent ids: use a regular expression to pull every run of digits out of
# each round entry. Entries without a number contribute nothing, so a
# player's id vector can be shorter than the number of rounds.
opponents <- lapply(
  player_rounds,
  function(rounds) {
    ids_by_round <- str_extract_all(rounds, "\\d{1,}")
    unlist(ids_by_round)
  }
)
head(opponents, n = 1)

## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4"

length(opponents)

## [1] 64

# Average pre-rating of each player's opponents, rounded to a whole number.
# Opponent ids are used directly as positions in player_ratings, which
# assumes a player's number equals their row position -- TODO confirm.
# The divisor is the number of opponent ids found for that player.
opponents_ratings <- vapply(
  opponents,
  function(opponent_ids) {
    faced <- player_ratings[as.numeric(opponent_ids)]
    round(sum(faced) / length(opponent_ids))
  },
  FUN.VALUE = numeric(1)
)

head(opponents_ratings, n = 1)

## [1] 1605

length(opponents_ratings)

## [1] 64

Create a data frame and view summary

# Assemble one row per player. The report headers contain spaces and hyphens,
# so check.names = FALSE keeps them exactly as written.
df <- data.frame(
  "Name" = names,
  "State" = state,
  "Total Points" = player_points,
  "Pre-Rating" = player_ratings,
  "Average Pre-Rating of Opponents" = opponents_ratings,
  check.names = FALSE
)
nrow(df)

## [1] 64

head(df)

##                  Name State Total Points Pre-Rating
## 1            GARY HUA    ON          6.0       1794
## 2     DAKSHESH DARURI    MI          6.0       1553
## 3        ADITYA BAJAJ    MI          6.0       1384
## 4 PATRICK H SCHILLING    MI          5.5       1716
## 5          HANSHI ZUO    MI          5.5       1655
## 6         HANSEN SONG    OH          5.0       1686
##   Average Pre-Rating of Opponents
## 1                            1605
## 2                            1469
## 3                            1564
## 4                            1574
## 5                            1501
## 6                            1519

Write to file

# Save the table as comma-separated text. write.csv() fixes sep = ",";
# quote = FALSE writes values and headers without surrounding quotes, and
# row.names = FALSE omits the row-index column.
write.csv(df, file = "output.txt", quote = FALSE, row.names = FALSE)

Read the file

# Load the saved table back in to verify it. read.csv() makes the headers
# syntactically valid by default (check.names = TRUE), which is why
# "Total Points" is shown as "Total.Points" in the output.
data <- read.csv(file = "output.txt", header = TRUE)

head(data, n = 10)

##                   Name State Total.Points Pre.Rating
## 1             GARY HUA    ON          6.0       1794
## 2      DAKSHESH DARURI    MI          6.0       1553
## 3         ADITYA BAJAJ    MI          6.0       1384
## 4  PATRICK H SCHILLING    MI          5.5       1716
## 5           HANSHI ZUO    MI          5.5       1655
## 6          HANSEN SONG    OH          5.0       1686
## 7    GARY DEE SWATHELL    MI          5.0       1649
## 8     EZEKIEL HOUGHTON    MI          5.0       1641
## 9          STEFANO LEE    ON          5.0       1411
## 10           ANVIT RAO    MI          5.0       1365
##    Average.Pre.Rating.of.Opponents
## 1                             1605
## 2                             1469
## 3                             1564
## 4                             1574
## 5                             1501
## 6                             1519
## 7                             1372
## 8                             1468
## 9                             1523
## 10                            1554

# Row count after the round trip through output.txt; it should match nrow(df).
nrow(data)

## [1] 64

Project 1

J John

Sep 20, 2018

Load Package

Load data and clean up

Data Summary

Parse lines

Player names

Player total points

Player states

Player pre ratings

Opponents rating

Create a data frame and view summary

Write to file

Read the file