# Read the tournament report as one flat character vector of "|"-delimited
# fields, skipping the first four lines of the file.
raw_data <- scan(
  file = "https://raw.githubusercontent.com/NNedd/CUNYMSDA/master/Data607/Project1/tournamentinfo.txt",
  what = "character",
  sep = "|",
  skip = 4
)
# Preview the first 23 fields.
head(raw_data, n = 23)
## [1] " 1 "
## [2] " GARY HUA "
## [3] "6.0 "
## [4] "W 39"
## [5] "W 21"
## [6] "W 18"
## [7] "W 14"
## [8] "W 7"
## [9] "D 12"
## [10] "D 4"
## [11] ""
## [12] " ON "
## [13] " 15445895 / R: 1794 ->1817 "
## [14] "N:2 "
## [15] "W "
## [16] "B "
## [17] "W "
## [18] "B "
## [19] "W "
## [20] "B "
## [21] "W "
## [22] ""
## [23] "-----------------------------------------------------------------------------------------"
# Location of the raw tournament report.
tourn_all <- "https://raw.githubusercontent.com/NNedd/CUNYMSDA/master/Data607/Project1/tournamentinfo.txt"
# Read the report again, this time as one string per text line.
tlines <- readLines(con = tourn_all)
From examination of the data, the first record starts at element 1 and ends at element 23 (the 23rd element being the dashed separator line). Therefore the records start at the following elements: 1, 24, 47, 70, etc.
# stringr provides str_extract()/str_replace()/str_replace_all() used below.
# NOTE(review): no library() call is visible in this script; attaching it
# here is harmless if it was already loaded elsewhere.
library(stringr)

# Build one row per player from the flat field vector `raw_data`.
# Each record occupies 23 consecutive elements; relative to the record start:
#   +0 pair number, +1 name, +2 total points, +3..+9 the seven round results,
#   +11 state, +12 the rating string (e.g. "15445895 / R: 1794 ->1817").
record_len <- 23
n_fields <- length(raw_data)

# Start offsets 1, 24, 47, ... strictly below length(raw_data), i.e. the same
# positions the former `while (i < length(raw_data))` walk visited.
starts <- if (n_fields > 1) {
  seq(1, n_fields - 1, by = record_len)
} else {
  integer(0)
}

# Opponent pair number for each of the seven rounds: the first run of digits
# in the round field (NA when the field holds no digits).
round_ids <- lapply(3:9, function(offset) {
  as.numeric(str_extract(raw_data[starts + offset], "[[:digit:]]+"))
})
names(round_ids) <- paste0("Round", 1:7)

# Pre-tournament rating: the 3-4 digits that follow the ":" once all spaces
# have been removed from the rating string.
ranking_info <- str_replace_all(raw_data[starts + 12], " ", "")
player_rank <- as.numeric(
  str_replace(str_extract(ranking_info, "[:][[:digit:]]{3,4}"), ":", "")
)

# Assemble the table in one step instead of rbind()-ing a one-row data frame
# per record, which copies the whole table on every iteration.
format_data <- data.frame(
  Number = as.numeric(raw_data[starts]),
  Name = raw_data[starts + 1],
  Total = as.numeric(raw_data[starts + 2]),
  round_ids,
  Player_state = raw_data[starts + 11],
  player_rank = player_rank,
  stringsAsFactors = FALSE
)
head(format_data)
## Number Name Total Round1 Round2 Round3
## 1 1 GARY HUA 6.0 39 21 18
## 2 2 DAKSHESH DARURI 6.0 63 58 4
## 3 3 ADITYA BAJAJ 6.0 8 61 25
## 4 4 PATRICK H SCHILLING 5.5 23 28 2
## 5 5 HANSHI ZUO 5.5 45 37 12
## 6 6 HANSEN SONG 5.0 34 29 11
## Round4 Round5 Round6 Round7 Player_state player_rank
## 1 14 7 12 4 ON 1794
## 2 17 16 20 7 MI 1553
## 3 21 11 13 12 MI 1384
## 4 26 5 19 1 MI 1716
## 5 13 4 14 17 MI 1655
## 6 35 10 27 21 OH 1686
# For every player, average the pre-tournament ratings of the opponents faced
# and store the result in column 13 of `format_data`.
Rowlen <- nrow(format_data)
opponent_cols <- 4:10 # Round1..Round7: opponent pair numbers
rating_col <- 12      # player_rank: pre-tournament rating

ratings <- format_data[, rating_col]
opponent_ids <- as.matrix(format_data[, opponent_cols, drop = FALSE])

# seq_len() yields an empty sequence for an empty table, whereas 1:Rowlen
# would iterate over c(1, 0).
Avg_Pre_chess_rating <- vapply(
  seq_len(Rowlen),
  function(player) {
    # Indexing by pair number relies on row k holding pair number k.
    faced <- ratings[opponent_ids[player, ]]
    # Rounds without an opponent (NA id) or with a missing rating are left
    # out, so the divisor is the number of games that actually count.
    faced <- faced[!is.na(faced)]
    sum(faced) / length(faced)
  },
  numeric(1)
)
format_data[, 13] <- Avg_Pre_chess_rating
# Keep, in this order: name, state, total points, pre-rating and the
# opponents' average pre-rating.
col_index <- c(2, 11, 3, 12, 13)
tournament_data <- format_data[, col_index]

# Human-readable headers for the exported table.
names(tournament_data) <- c(
  "Player's Name",
  "Player's State",
  "Total Number of Points",
  "Player's Pre Rating",
  "Average Pre Chess Rating of Opponents"
)

# Strip surrounding whitespace from the two text columns.
for (text_col in 1:2) {
  tournament_data[, text_col] <- str_trim(tournament_data[, text_col], side = "both")
}
# Report the opponents' average as a whole number.
tournament_data[, 5] <- round(tournament_data[, 5])

# Export as comma-separated values, header included, NAs written as empty.
write.table(
  tournament_data,
  file = "tournamentData.csv",
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = TRUE
)
# Drop every third line of the report, starting with the first, then split
# the remaining lines on "|" into a character data frame.
all <- seq(from = 1, to = length(tlines), by = 1)  # every line number
skip <- seq(from = 1, to = length(tlines), by = 3) # line numbers to drop
sub_rows <- is.na(match(all, skip))                # TRUE for lines to keep
tourn_all_chess <- tlines[sub_rows]

# One character vector of fields per kept line.
tourn_all_chess_list <- str_split(tourn_all_chess, pattern = "[|]")

# Lay the fields out row by row, one row per kept line.
n <- length(tourn_all_chess)
tourn_all_chess_df <- data.frame(
  matrix(unlist(tourn_all_chess_list), nrow = n, byrow = TRUE),
  stringsAsFactors = FALSE
)
# Each player spans two consecutive rows of `tourn_all_chess_df`. Keep the
# odd rows as the base table and append the first ten fields of the matching
# even rows as extra columns.
n <- nrow(tourn_all_chess_df) # rows of unstructured df
r1 <- seq(2, n, 2)            # even rows: pivoted into columns
r2 <- seq(1, n - 1, 2)        # odd rows: kept as rows

# Bind all ten columns in one call instead of growing the frame inside a
# loop, and give them distinct names (the loop labelled every appended
# column with the same deparsed expression).
second_line <- tourn_all_chess_df[r1, 1:10]
names(second_line) <- paste0(names(second_line), "_line2")
tourn_all_sub <- cbind(tourn_all_chess_df[r2, ], second_line)
# Keep columns 1-10 plus 12 and 13 (column 11 is dropped).
col_index <- (1:13)[-11]
tourn_all_data <- tourn_all_sub[, col_index]

# Promote the first row to the header, then remove it from the data.
names(tourn_all_data) <- tourn_all_data[1, ]
tourn_all_data <- tourn_all_data[-1, ]
# Count wins, draws and losses per player from the seven round-result columns.
Row_no <- nrow(tourn_all_data)
round_cols <- 4:10

# paste() is vectorised: joining the seven columns element-wise gives one
# space-separated string per player with no row loop (and no 1:Row_no, which
# iterates over c(1, 0) on an empty table). unname() keeps column names from
# being mistaken for paste() arguments.
stage_str <- do.call(
  paste,
  c(unname(as.list(tourn_all_data[, round_cols])), sep = " ")
)

# Occurrences of each result letter across a player's seven rounds.
tourn_all_data[, 13] <- str_count(stage_str, '[W]')
tourn_all_data[, 14] <- str_count(stage_str, '[D]')
tourn_all_data[, 15] <- str_count(stage_str, '[L]')

colnames(tourn_all_data) <- c(
  "Pair Number", "Player Name", "Total",
  "Round 1", "Round 2", "Round 3", "Round 4", "Round 5", "Round 6", "Round 7",
  "Player State", "Player Pre-Ranking",
  "Number of Wins", "Number of Draws", "Number of Loses"
)
#View(tourn_all_data)
# Post-tournament rating of every player, taken from the rating string found
# 12 elements after the start of each 23-element record in `raw_data`.
n_fields <- length(raw_data)

# Record starts 1, 24, 47, ... strictly below length(raw_data): the same
# positions the former while loop visited, computed up front so the result
# is built in one vectorised pass instead of being grown element by element.
record_starts <- if (n_fields > 1) {
  seq(1, n_fields - 1, by = 23)
} else {
  integer(0)
}

# Remove all spaces, then take the 3-4 digits that follow the ">" of "->".
postranking_info <- str_replace_all(raw_data[record_starts + 12], " ", "")
extra_data <- as.numeric(
  str_replace(str_extract(postranking_info, "[>][[:digit:]]{3,4}"), ">", "")
)
# Extend a copy of the export table with the post-tournament ratings.
newtournamentData <- tournament_data
newtournamentData[["Post Ratings"]] <- extra_data

# Scatter plot of post-tournament rating against pre-tournament rating.
plot(
  x = newtournamentData$`Player's Pre Rating`,
  y = newtournamentData$`Post Ratings`
)
The scatterplot shows that, for the most part, there is a positive correlation between the pre- and post-tournament ratings, as expected.
# Copy the win/draw/loss counts across from the pivoted table.
count_cols <- c('Number of Wins','Number of Draws', 'Number of Loses')
newtournamentData[, count_cols] <- tourn_all_data[, count_cols]

# Players with the most wins first; show name and win count.
wins_desc <- order(-newtournamentData$`Number of Wins`)
head(newtournamentData[wins_desc, c(1, 7)])
## Player's Name Number of Wins
## 2 DAKSHESH DARURI 6
## 3 ADITYA BAJAJ 6
## 1 GARY HUA 5
## 7 GARY DEE SWATHELL 5
## 8 EZEKIEL HOUGHTON 5
## 9 STEFANO LEE 5
# Players with the most draws first; show name and draw count.
draws_desc <- order(-newtournamentData$`Number of Draws`)
head(newtournamentData[draws_desc, c(1, 8)])
## Player's Name Number of Draws
## 4 PATRICK H SCHILLING 3
## 5 HANSHI ZUO 3
## 28 SOFIA ADINA STANESCU-BELLU 3
## 52 ETHAN GUO 3
## 1 GARY HUA 2
## 6 HANSEN SONG 2
# Players with the most losses first; show name and loss count.
loses_desc <- order(-newtournamentData$`Number of Loses`)
head(newtournamentData[loses_desc, c(1, 9)])
## Player's Name Number of Loses
## 54 LARRY HODGE 5
## 57 MICHAEL LU 5
## 58 VIRAJ MOHILE 5
## 59 SEAN M MC CORMICK 5
## 61 JEZZEL FARKAS 5
## 64 BEN LI 5
# Distribution of the per-player result counts.
#Histogram of number Wins
hist(newtournamentData$`Number of Wins`)
#Histogram of number Draws
hist(newtournamentData$`Number of Draws`)
#Histogram of number Loses
hist(newtournamentData$`Number of Loses`)