Being very sick this week, I took the most simplest approach.
First read in the txt file using | as delimiter.
In addition, align the data so that all the information needed is in a single row instead of two. Also drop all the columns that’s not needed.
tournament_raw <- read.delim("C:\\Users\\soh1\\Box Sync\\CUNY\\Fall 2017\\Data 607\\Week3\\Project\\tournamentinfo.txt", sep="|", header = FALSE, stringsAsFactors = FALSE)
tournament_raw <- tournament_raw[5:nrow(tournament_raw),]
name_row <- c()
for (i in seq(1,nrow(tournament_raw), by=3)){
name_row <- rbind(name_row,tournament_raw[i,])
}
state_row <- c()
for (i in seq(2,nrow(tournament_raw), by=3)){
state_row <- rbind(state_row,tournament_raw[i,])
}
name_state <- cbind(name_row, state_row)
##clean up
name_state <- name_state[,1:13]
names(name_state) <- c("playerid","name", "Total Pts", "Round1", "Round2", "Round3", "Round4","Round5","Round6","Round7","drop","state","rank")
name_state <- name_state[,!(names(name_state) %in% "drop")]
head(name_state)
## playerid name Total Pts Round1 Round2
## 5 1 GARY HUA 6.0 W 39 W 21
## 8 2 DAKSHESH DARURI 6.0 W 63 W 58
## 11 3 ADITYA BAJAJ 6.0 L 8 W 61
## 14 4 PATRICK H SCHILLING 5.5 W 23 D 28
## 17 5 HANSHI ZUO 5.5 W 45 W 37
## 20 6 HANSEN SONG 5.0 W 34 D 29
## Round3 Round4 Round5 Round6 Round7 state
## 5 W 18 W 14 W 7 D 12 D 4 ON
## 8 L 4 W 17 W 16 W 20 W 7 MI
## 11 W 25 W 21 W 11 W 13 W 12 MI
## 14 W 2 W 26 D 5 W 19 D 1 MI
## 17 D 12 D 13 D 4 W 14 W 17 MI
## 20 L 11 W 35 D 10 W 27 W 21 OH
## rank
## 5 15445895 / R: 1794 ->1817
## 8 14598900 / R: 1553 ->1663
## 11 14959604 / R: 1384 ->1640
## 14 12616049 / R: 1716 ->1744
## 17 14601533 / R: 1655 ->1690
## 20 15055204 / R: 1686 ->1687
I called Rating, ranking for now. Using str_split_fixed to separate out Rating string and keep only the Pre-Rate. Using str_extract to get the opponent playerid (keep numbers only)
##extract rank
library(stringr)
library(data.table)
rank <- data.table((str_split_fixed(name_state$rank," ",5)))
rank_final <- data.table(str_extract(rank$V5,"[:digit:]+"))
name_state$rank <- rank_final$V1
##extract players played against
playerlist <- c()
for (i in 1:7){
Roundname <- name_state[,names(name_state) %in% paste0("Round",i)]
Roundname <- str_extract(Roundname,"[:digit:]+")
name_state[,names(name_state) %in% paste0("Round",i)] <- Roundname
}
head(name_state)
## playerid name Total Pts Round1 Round2
## 5 1 GARY HUA 6.0 39 21
## 8 2 DAKSHESH DARURI 6.0 63 58
## 11 3 ADITYA BAJAJ 6.0 8 61
## 14 4 PATRICK H SCHILLING 5.5 23 28
## 17 5 HANSHI ZUO 5.5 45 37
## 20 6 HANSEN SONG 5.0 34 29
## Round3 Round4 Round5 Round6 Round7 state rank
## 5 18 14 7 12 4 ON 1794
## 8 4 17 16 20 7 MI 1553
## 11 25 21 11 13 12 MI 1384
## 14 2 26 5 19 1 MI 1716
## 17 12 13 4 14 17 MI 1655
## 20 11 35 10 27 21 OH 1686
First, format the db so that the variables can be used for calculation and lookup.
Create another table with just playerid and rating. Then just merge on the rating by each Round’s opponent players.
##Format
name_state$playerid <- as.integer(name_state$playerid)
name_state[,4:10] <- lapply(name_state[,4:10], as.integer)
name_state$rank <- as.integer(name_state$rank)
##Another table of playerid and rank and merge
ranking_table <- name_state[,c("playerid","rank")]
##merge
for (i in 1:7){
names(ranking_table) <- c(paste0("Round",i), paste0("rank",i))
name_state <- merge(name_state,ranking_table, paste0("Round",i), all.x = TRUE)
}
head(name_state)
## Round7 Round6 Round5 Round4 Round3 Round2 Round1 playerid
## 1 1 19 5 26 2 28 23 4
## 2 2 9 1 11 13 46 57 7
## 3 3 1 NA 38 5 33 42 12
## 4 4 12 7 14 18 21 39 1
## 5 5 22 23 2 26 41 48 17
## 6 6 39 40 3 47 1 43 21
## name Total Pts state rank rank1 rank2
## 1 PATRICK H SCHILLING 5.5 MI 1716 1363 1507
## 2 GARY DEE SWATHELL 5.0 MI 1649 1092 377
## 3 KENNETH J TACK 4.5 MI 1663 1332 1449
## 4 GARY HUA 6.0 ON 1794 1436 1563
## 5 RONALD GRZEGORCZYK 4.0 MI 1629 1382 1403
## 6 DINH DANG BUI 4.0 ON 1563 1283 1794
## rank3 rank4 rank5 rank6 rank7
## 1 1553 1579 1655 1564 1794
## 2 1666 1712 1794 1411 1553
## 3 1655 1423 NA 1794 1384
## 4 1600 1610 1649 1663 1716
## 5 1579 1553 1363 1555 1655
## 6 1362 1384 1348 1436 1686
Here I renamed some variables to be consistent with definition of the text file.
##calculate the mean
name_state$avgOppo <- rowMeans(name_state[,13:19], na.rm=TRUE)
##Final condense DB
FinalDB <- name_state[,c("name","state","Total Pts", "rank","avgOppo")]
names(FinalDB) <- c("name","state","Total Pts", "Pre_Rating","Avg Pre Opponent Rating")
head(FinalDB)
## name state Total Pts Pre_Rating
## 1 PATRICK H SCHILLING MI 5.5 1716
## 2 GARY DEE SWATHELL MI 5.0 1649
## 3 KENNETH J TACK MI 4.5 1663
## 4 GARY HUA ON 6.0 1794
## 5 RONALD GRZEGORCZYK MI 4.0 1629
## 6 DINH DANG BUI ON 4.0 1563
## Avg Pre Opponent Rating
## 1 1573.571
## 2 1372.143
## 3 1506.167
## 4 1605.286
## 5 1498.571
## 6 1470.429