Read in Text file

Being very sick this week, I took the most simplest approach.

First read in the txt file using | as delimiter.
In addition, align the data so that all the information needed is in a single row instead of two. Also drop all the columns that’s not needed.

tournament_raw <- read.delim("C:\\Users\\soh1\\Box Sync\\CUNY\\Fall 2017\\Data 607\\Week3\\Project\\tournamentinfo.txt", sep="|", header = FALSE, stringsAsFactors = FALSE)

tournament_raw <- tournament_raw[5:nrow(tournament_raw),]

name_row <- c()
for (i in seq(1,nrow(tournament_raw), by=3)){
  name_row <- rbind(name_row,tournament_raw[i,]) 
}

state_row <- c()
for (i in seq(2,nrow(tournament_raw), by=3)){
  state_row <- rbind(state_row,tournament_raw[i,]) 
}

name_state <- cbind(name_row, state_row)

##clean up
name_state <- name_state[,1:13]
names(name_state) <- c("playerid","name", "Total Pts", "Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7","drop","state","rank")
name_state <- name_state[,!(names(name_state) %in% "drop")]

head(name_state)
##    playerid                              name Total Pts Round1 Round2
## 5        1   GARY HUA                             6.0    W  39  W  21
## 8        2   DAKSHESH DARURI                      6.0    W  63  W  58
## 11       3   ADITYA BAJAJ                         6.0    L   8  W  61
## 14       4   PATRICK H SCHILLING                  5.5    W  23  D  28
## 17       5   HANSHI ZUO                           5.5    W  45  W  37
## 20       6   HANSEN SONG                          5.0    W  34  D  29
##    Round3 Round4 Round5 Round6 Round7  state
## 5   W  18  W  14  W   7  D  12  D   4    ON 
## 8   L   4  W  17  W  16  W  20  W   7    MI 
## 11  W  25  W  21  W  11  W  13  W  12    MI 
## 14  W   2  W  26  D   5  W  19  D   1    MI 
## 17  D  12  D  13  D   4  W  14  W  17    MI 
## 20  L  11  W  35  D  10  W  27  W  21    OH 
##                                 rank
## 5   15445895 / R: 1794   ->1817     
## 8   14598900 / R: 1553   ->1663     
## 11  14959604 / R: 1384   ->1640     
## 14  12616049 / R: 1716   ->1744     
## 17  14601533 / R: 1655   ->1690     
## 20  15055204 / R: 1686   ->1687

Extract Rating and Opponent Players

I called Rating, ranking for now. Using str_split_fixed to separate out Rating string and keep only the Pre-Rate. Using str_extract to get the opponent playerid (keep numbers only)

##extract rank
library(stringr)
library(data.table)
rank <- data.table((str_split_fixed(name_state$rank," ",5)))
rank_final <- data.table(str_extract(rank$V5,"[:digit:]+"))
name_state$rank <- rank_final$V1

##extract players played against
playerlist <- c()
for (i in 1:7){
  Roundname <- name_state[,names(name_state) %in% paste0("Round",i)]
  Roundname <- str_extract(Roundname,"[:digit:]+")
  name_state[,names(name_state) %in% paste0("Round",i)] <- Roundname
}

head(name_state)
##    playerid                              name Total Pts Round1 Round2
## 5        1   GARY HUA                             6.0       39     21
## 8        2   DAKSHESH DARURI                      6.0       63     58
## 11       3   ADITYA BAJAJ                         6.0        8     61
## 14       4   PATRICK H SCHILLING                  5.5       23     28
## 17       5   HANSHI ZUO                           5.5       45     37
## 20       6   HANSEN SONG                          5.0       34     29
##    Round3 Round4 Round5 Round6 Round7  state rank
## 5      18     14      7     12      4    ON  1794
## 8       4     17     16     20      7    MI  1553
## 11     25     21     11     13     12    MI  1384
## 14      2     26      5     19      1    MI  1716
## 17     12     13      4     14     17    MI  1655
## 20     11     35     10     27     21    OH  1686

Merge Rating of the Opponent Players

First, format the db so that the variables can be used for calculation and lookup.

Create another table with just playerid and rating. Then just merge on the rating by each Round’s opponent players.

##Format 
name_state$playerid <- as.integer(name_state$playerid)
name_state[,4:10] <- lapply(name_state[,4:10], as.integer)
name_state$rank <- as.integer(name_state$rank)

##Another table of playerid and rank and merge
ranking_table <- name_state[,c("playerid","rank")]

##merge
for (i in 1:7){
  names(ranking_table) <- c(paste0("Round",i), paste0("rank",i))
  name_state <- merge(name_state,ranking_table, paste0("Round",i), all.x = TRUE)
}

head(name_state)
##   Round7 Round6 Round5 Round4 Round3 Round2 Round1 playerid
## 1      1     19      5     26      2     28     23        4
## 2      2      9      1     11     13     46     57        7
## 3      3      1     NA     38      5     33     42       12
## 4      4     12      7     14     18     21     39        1
## 5      5     22     23      2     26     41     48       17
## 6      6     39     40      3     47      1     43       21
##                                name Total Pts  state rank rank1 rank2
## 1  PATRICK H SCHILLING                  5.5      MI  1716  1363  1507
## 2  GARY DEE SWATHELL                    5.0      MI  1649  1092   377
## 3  KENNETH J TACK                       4.5      MI  1663  1332  1449
## 4  GARY HUA                             6.0      ON  1794  1436  1563
## 5  RONALD GRZEGORCZYK                   4.0      MI  1629  1382  1403
## 6  DINH DANG BUI                        4.0      ON  1563  1283  1794
##   rank3 rank4 rank5 rank6 rank7
## 1  1553  1579  1655  1564  1794
## 2  1666  1712  1794  1411  1553
## 3  1655  1423    NA  1794  1384
## 4  1600  1610  1649  1663  1716
## 5  1579  1553  1363  1555  1655
## 6  1362  1384  1348  1436  1686

Calculate the Mean and create Final DB

Here I renamed some variables to be consistent with definition of the text file.

##calculate the mean
name_state$avgOppo <- rowMeans(name_state[,13:19], na.rm=TRUE)

##Final condense DB
FinalDB <- name_state[,c("name","state","Total Pts", "rank","avgOppo")]
names(FinalDB) <- c("name","state","Total Pts", "Pre_Rating","Avg Pre Opponent Rating")

head(FinalDB)
##                                name  state Total Pts Pre_Rating
## 1  PATRICK H SCHILLING                 MI      5.5         1716
## 2  GARY DEE SWATHELL                   MI      5.0         1649
## 3  KENNETH J TACK                      MI      4.5         1663
## 4  GARY HUA                            ON      6.0         1794
## 5  RONALD GRZEGORCZYK                  MI      4.0         1629
## 6  DINH DANG BUI                       ON      4.0         1563
##   Avg Pre Opponent Rating
## 1                1573.571
## 2                1372.143
## 3                1506.167
## 4                1605.286
## 5                1498.571
## 6                1470.429