To convert text file into a dataframe with the following columns: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
library(readr)
library(stringr)
file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week4/data/tournamentinfo.txt"
df <- read.table(file, fill=TRUE, header = FALSE, sep='|',stringsAsFactors = FALSE)
# dim(df) [1] 196 11
# str(df,vec.len=1)
#'data.frame': 196 obs. of 11 variables:
# $ V1 : chr "-----------------------------------------------------------------------------------------" ...
# $ V2 : chr "" ...
# $ V3 : chr "" ...
# $ V4 : chr "" ...
# $ V5 : chr "" ...
# $ V6 : chr "" ...
# $ V7 : chr "" ...
# $ V8 : chr "" ...
# $ V9 : chr "" ...
# $ V10: chr "" ...
# $ V11: logi NA ...
# remove weird dashes
df$V1 <- str_replace_all(df$V1,pattern ="[-]","")
class(df[1,c("V1")]) # confirm the first column is character
## [1] "character"
# reset header
colnames(df) = df[3,] # make the third row the header
df<-df[-1:-4,] # remove first 3 rows
# rename columns
colnames(df)[1] <-"num" # rename first column with player #
colnames(df)[2] <-"name" # rename second column with player name
colnames(df)[3] <- "totpts" # rename rest of columns
colnames(df)[4] <- "round1"
colnames(df)[5] <- "round2"
colnames(df)[6] <- "round3"
colnames(df)[7] <- "round4"
colnames(df)[8] <- "round5"
colnames(df)[9] <- "round6"
colnames(df)[10] <- "round7"
df[11] <- NULL # delete column
rownames(df) <- 1:nrow(df) # reset index
Nth.delete<-function(dataframe, n)dataframe[-(seq(n,to=nrow(dataframe),by=n)),] # create a function that deletes every n row
df <- Nth.delete(df,3) # delete every 3rd row
rownames(df) <- 1:nrow(df) # reset index
df.players = df[seq(1, nrow(df), 2), ] # create a subset dataframe of just player name rows
rownames(df.players) <- 1:nrow(df.players) # reset index
df.stats = df[seq(2, nrow(df), 2), ] # create a subset dataframe of the 2nd row of stats for ea player
rownames(df.stats) <- 1:nrow(df.stats) # reset index
# create new columns
colnames(df.stats)[1] <-"state"
colnames(df.stats)[2] <- "id"
df.stats <- df.stats[,c("state","id")]
newdf <- cbind(df.players, df.stats) # join df.stats back wtih df.players on index
newdf$player_id <- str_trim(lapply(str_split(newdf$id, "/"), '[[',1),side="both")
newdf$scores <- str_trim(lapply(str_split(newdf$id,":"), '[[',2), side="both")
newdf$prescore <- str_trim(lapply(str_split(newdf$scores,"->"), '[[',1), side="both")
newdf$postscore <- str_trim(lapply(str_split(newdf$scores,"->"), '[[',2), side="both")
newdf$id <- NULL
newdf$scores <- NULL
newdf$prescore <- str_extract(newdf$prescore,"^[^P]+") # extract only the digits from the prescore (not provisional indicator)
newdf$postscore <- str_extract(newdf$postscore,"^[^P]+") # extract only the digits from the postscore (not provisional indicator)
newdf$prescore <- as.numeric(as.character(newdf$prescore)) # convert both scores to numeric
newdf$postscore <- as.numeric(as.character(newdf$postscore))
newdf$round1 <- str_replace_all(str_replace_all(newdf$round1, pattern = "[[:alpha:]]", ""), pattern="\\s+","")
# clean up "round"" columns
rounds <- c('round1','round2','round3','round4','round5','round6','round7')
therounds = newdf[rounds]
therounds <- lapply(therounds, function(x) {
str_replace_all(str_replace_all(x, pattern = "[[:alpha:]]", ""),pattern="\\s+","")
})
# remove old round columns and join with clean round columns
newdf <- subset(newdf, select = -c(round1,round2,round3,round4,round5,round6,round7))
newdf <- cbind(newdf, therounds)
head(newdf,n=10)
## num name totpts state player_id
## 1 1 GARY HUA 6.0 ON 15445895
## 2 2 DAKSHESH DARURI 6.0 MI 14598900
## 3 3 ADITYA BAJAJ 6.0 MI 14959604
## 4 4 PATRICK H SCHILLING 5.5 MI 12616049
## 5 5 HANSHI ZUO 5.5 MI 14601533
## 6 6 HANSEN SONG 5.0 OH 15055204
## 7 7 GARY DEE SWATHELL 5.0 MI 11146376
## 8 8 EZEKIEL HOUGHTON 5.0 MI 15142253
## 9 9 STEFANO LEE 5.0 ON 14954524
## 10 10 ANVIT RAO 5.0 MI 14150362
## prescore postscore round1 round2 round3 round4 round5 round6 round7
## 1 1794 1817 39 21 18 14 7 12 4
## 2 1553 1663 63 58 4 17 16 20 7
## 3 1384 1640 8 61 25 21 11 13 12
## 4 1716 1744 23 28 2 26 5 19 1
## 5 1655 1690 45 37 12 13 4 14 17
## 6 1686 1687 34 29 11 35 10 27 21
## 7 1649 1673 57 46 13 11 1 9 2
## 8 1641 1657 3 32 14 9 47 28 19
## 9 1411 1564 25 18 59 8 26 7 20
## 10 1365 1544 16 19 55 31 6 25 18
# not sure how to average the opponents scores in R. write temporary csv to complete rest of assignment in python
write.csv(newdf, file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week4/output/tournamenttmpdf.csv")