OBJECTIVE

To convert text file into a dataframe with the following columns: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

IMPORT PACKAGES

library(readr)
library(stringr)

READ IN FILE

file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week4/data/tournamentinfo.txt"
df <- read.table(file, fill=TRUE, header = FALSE, sep='|',stringsAsFactors = FALSE)

# dim(df) [1] 196  11
# str(df,vec.len=1)
#'data.frame':  196 obs. of  11 variables:
# $ V1 : chr  "-----------------------------------------------------------------------------------------" ...
# $ V2 : chr  "" ...
# $ V3 : chr  "" ...
# $ V4 : chr  "" ...
# $ V5 : chr  "" ...
# $ V6 : chr  "" ...
# $ V7 : chr  "" ...
# $ V8 : chr  "" ...
# $ V9 : chr  "" ...
# $ V10: chr  "" ...
# $ V11: logi  NA ...

INITIAL CLEANING

# remove weird dashes
df$V1 <- str_replace_all(df$V1,pattern ="[-]","")
class(df[1,c("V1")]) # confirm the first column is character
## [1] "character"
# reset header 
colnames(df) = df[3,] # make the third row the header
df<-df[-1:-4,] # remove first 3 rows

# rename columns 
colnames(df)[1] <-"num" # rename first column with player # 
colnames(df)[2] <-"name" # rename second column with player name
colnames(df)[3] <- "totpts" # rename rest of columns
colnames(df)[4] <- "round1"
colnames(df)[5] <- "round2"
colnames(df)[6] <- "round3"
colnames(df)[7] <- "round4"
colnames(df)[8] <- "round5"
colnames(df)[9] <- "round6"
colnames(df)[10] <- "round7"
df[11] <- NULL # delete column
rownames(df) <- 1:nrow(df) # reset index

Nth.delete<-function(dataframe, n)dataframe[-(seq(n,to=nrow(dataframe),by=n)),]  # create a function that deletes every n row
df <- Nth.delete(df,3) # delete every 3rd row 
rownames(df) <- 1:nrow(df) # reset index

df.players = df[seq(1, nrow(df), 2), ] # create a subset dataframe of just player name rows
rownames(df.players) <- 1:nrow(df.players)  # reset index 
df.stats = df[seq(2, nrow(df), 2), ] # create a subset dataframe of the 2nd row of stats for ea player 
rownames(df.stats) <- 1:nrow(df.stats)  # reset index 

# create new columns
colnames(df.stats)[1] <-"state"
colnames(df.stats)[2] <- "id"
df.stats <- df.stats[,c("state","id")]
newdf <- cbind(df.players, df.stats) # join df.stats back wtih df.players on index
newdf$player_id <- str_trim(lapply(str_split(newdf$id, "/"), '[[',1),side="both")
newdf$scores <- str_trim(lapply(str_split(newdf$id,":"), '[[',2), side="both")
newdf$prescore <- str_trim(lapply(str_split(newdf$scores,"->"), '[[',1), side="both")
newdf$postscore <- str_trim(lapply(str_split(newdf$scores,"->"), '[[',2), side="both")
newdf$id <- NULL
newdf$scores <- NULL
newdf$prescore <- str_extract(newdf$prescore,"^[^P]+") # extract only the digits from the prescore (not provisional indicator)
newdf$postscore <- str_extract(newdf$postscore,"^[^P]+") # extract only the digits from the postscore (not provisional indicator)
newdf$prescore <- as.numeric(as.character(newdf$prescore)) # convert both scores to numeric
newdf$postscore <- as.numeric(as.character(newdf$postscore))
newdf$round1 <- str_replace_all(str_replace_all(newdf$round1, pattern = "[[:alpha:]]", ""), pattern="\\s+","")

# clean up "round"" columns
rounds <- c('round1','round2','round3','round4','round5','round6','round7')
therounds = newdf[rounds]
therounds <- lapply(therounds, function(x) {
  str_replace_all(str_replace_all(x, pattern = "[[:alpha:]]", ""),pattern="\\s+","")
  })

# remove old round columns and join with clean round columns
newdf <- subset(newdf, select = -c(round1,round2,round3,round4,round5,round6,round7))
newdf <- cbind(newdf, therounds)

head(newdf,n=10)
##       num                              name totpts  state player_id
## 1      1   GARY HUA                          6.0      ON   15445895
## 2      2   DAKSHESH DARURI                   6.0      MI   14598900
## 3      3   ADITYA BAJAJ                      6.0      MI   14959604
## 4      4   PATRICK H SCHILLING               5.5      MI   12616049
## 5      5   HANSHI ZUO                        5.5      MI   14601533
## 6      6   HANSEN SONG                       5.0      OH   15055204
## 7      7   GARY DEE SWATHELL                 5.0      MI   11146376
## 8      8   EZEKIEL HOUGHTON                  5.0      MI   15142253
## 9      9   STEFANO LEE                       5.0      ON   14954524
## 10    10   ANVIT RAO                         5.0      MI   14150362
##    prescore postscore round1 round2 round3 round4 round5 round6 round7
## 1      1794      1817     39     21     18     14      7     12      4
## 2      1553      1663     63     58      4     17     16     20      7
## 3      1384      1640      8     61     25     21     11     13     12
## 4      1716      1744     23     28      2     26      5     19      1
## 5      1655      1690     45     37     12     13      4     14     17
## 6      1686      1687     34     29     11     35     10     27     21
## 7      1649      1673     57     46     13     11      1      9      2
## 8      1641      1657      3     32     14      9     47     28     19
## 9      1411      1564     25     18     59      8     26      7     20
## 10     1365      1544     16     19     55     31      6     25     18

OUTPUT TO CSV

# not sure how to average the opponents scores in R. write temporary csv to complete rest of assignment in python
write.csv(newdf, file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week4/output/tournamenttmpdf.csv")