Chess Data.

# Attach stringr first; its string helpers are used by the later steps.
library(stringr)

# Read the raw tournament report over HTTP. The separator is a tab and no
# header row is assumed; the listing below shows each text line arriving as
# one string.
tournament_url <- "https://raw.githubusercontent.com/mikegankhuyag/607-Projects/master/tournamentinfo.txt"
project1_data <- read.table(
  file = tournament_url,
  header = FALSE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Preview the default head, then the first ten lines.
head(project1_data)
project1_data[seq_len(10), ]
##  [1] "-----------------------------------------------------------------------------------------" 
##  [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
##  [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
##  [4] "-----------------------------------------------------------------------------------------" 
##  [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
##  [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |" 
##  [7] "-----------------------------------------------------------------------------------------" 
##  [8] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|" 
##  [9] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |" 
## [10] "-----------------------------------------------------------------------------------------"

Each player's data spans two lines, so I combined each player's first and second rows by generating the sequences of line numbers on which those rows fall.

# Each player occupies two consecutive lines followed by a dashed rule, so
# the pattern repeats every three lines, starting with the first player on
# line 5. The indices are derived from the number of lines actually read, and
# the sequence stops one short of the end so every first line has a partner.
raw_lines <- project1_data[[1]]
first_rows <- seq(5, length(raw_lines) - 1, by = 3)
second_rows <- first_rows + 1

# Glue each player's two lines into a single string.
Chess_Data <- paste0(raw_lines[first_rows], raw_lines[second_rows])
head(Chess_Data)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Where needed, I created separators using the same '|' mark that the data uses. Also, I decided to insert an "end" marker to show where the data becomes irrelevant.

# Turn every colon, together with the non-space character right after it
# (if there is one), into a "|" so the rating fields get their own delimiter.
Chess_data1 <- str_replace_all(
  string = Chess_Data,
  pattern = "\\:\\S?",
  replacement = "|"
)

# Overwrite the "->" between the two ratings (characters 119-120) with a
# delimiter. The replacement has the same width, so later positions are
# unaffected.
str_sub(Chess_data1, start = 119, end = 120) <- " |"

# Stamp an "end" marker over characters 130-134, right after the ratings.
str_sub(Chess_data1, start = 130, end = 134) <- "|end"

head(Chess_data1)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|   ON | 15445895 / R| 1794    |1817     |end|W    |B    |W    |B    |W    |B    |W    |"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|   MI | 14598900 / R| 1553    |1663     |end|B    |W    |B    |W    |B    |W    |B    |"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|   MI | 14959604 / R| 1384    |1640     |end|W    |B    |W    |B    |W    |B    |W    |"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|   MI | 12616049 / R| 1716    |1744     |end|W    |B    |W    |B    |W    |B    |B    |"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|   MI | 14601533 / R| 1655    |1690     |end|B    |W    |B    |W    |B    |W    |B    |"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|   OH | 15055204 / R| 1686    |1687     |end|W    |B    |W    |B    |B    |W    |B    |"

Using the | as a separator, I created a table.

# Split each combined line into columns on "|".
# Quote and comment handling are switched off so that an apostrophe, a double
# quote or a "#" inside a field is kept as literal text rather than being
# interpreted by the parser. stringsAsFactors is pinned so the column types
# do not depend on the default of the running R version.
Chess_data2 <- read.table(
  text = Chess_data1,
  sep = "|",
  header = FALSE,
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)
head(Chess_data2)

Create a data frame out of the new table and name the columns.

# Keep the first 14 parsed columns and label them in a single step.
chess_columns <- c(
  "ID", "Player", "Score",
  "Game1", "Game2", "Game3", "Game4", "Game5", "Game6", "Game7",
  "State", "USCF_ID", "Pre_Rating", "Post_Rating"
)
chess_data3 <- setNames(data.frame(Chess_data2[, 1:14]), chess_columns)

# Show the first ten rows.
chess_data3[seq_len(10), ]

Clean out the data frame.

# USCF ID: take the first run of eight digits in each field. str_extract()
# returns exactly one value per row (NA when nothing matches), so the result
# always lines up with the data frame regardless of how many matches a row
# contains.
chess_data3$USCF_ID <- str_extract(
  as.character(chess_data3$USCF_ID),
  "\\d{8}"
)

# Ratings: keep only the first run of digits in each field. This drops the
# padding spaces around the number and any non-digit text that follows it.
# The columns stay character, as before.
chess_data3$Pre_Rating <- str_extract(
  as.character(chess_data3$Pre_Rating),
  "\\d+"
)
chess_data3$Post_Rating <- str_extract(
  as.character(chess_data3$Post_Rating),
  "\\d+"
)

chess_data3[1:10,]

Create a new data frame with just the opponent number and not the outcome of the match

# Work on a copy so chess_data3 keeps the full result strings.
Chess_data_4 <- chess_data3

# Reduce every game column to the opponent's number: the first run of digits
# in the cell, or NA when the cell holds no digits.
game_cols <- paste0("Game", 1:7)
Chess_data_4[game_cols] <- lapply(
  Chess_data_4[game_cols],
  function(result) str_extract(result, "\\d+")
)
head(Chess_data_4)

Create a matrix from the data frame containing just each player's ID and pre-rating; it serves as the lookup for opponents' ratings.

# Build the ID -> pre-rating lookup as a two-column character matrix.
# cbind() takes the row count from the data itself, so the lookup follows
# however many players were parsed.
Chess_data_Ranks <- cbind(
  ID = as.character(Chess_data_4$ID),
  Pre_Rank = as.character(Chess_data_4$Pre_Rating)
)
Chess_data_Ranks <- data.frame(Chess_data_Ranks)

head(Chess_data_Ranks)

Match each opponent ID with that opponent's pre-rating and create a new table holding the opponents' pre-ratings, from which the averages are computed below.

# Start from the opponent-number table, then swap every opponent number for
# the Pre_Rank stored against that ID in Chess_data_Ranks.
Chess_data_Ranks2 <- Chess_data_4

game_cols <- paste0("Game", 1:7)
Chess_data_Ranks2[game_cols] <- lapply(
  Chess_data_Ranks2[game_cols],
  function(opponent) {
    # Position of each opponent in the lookup; NA when the value is not
    # found among the IDs.
    lookup_row <- match(opponent, Chess_data_Ranks$ID)
    Chess_data_Ranks$Pre_Rank[lookup_row]
  }
)

head(Chess_data_Ranks2)

Turn all values in the game columns into numeric values in order to calculate the new rating.

# Convert each game column to numeric. Going through as.character() first
# means a factor column yields its labels rather than its integer codes.
game_cols <- paste0("Game", 1:7)
Chess_data_Ranks2[game_cols] <- lapply(
  Chess_data_Ranks2[game_cols],
  function(rating) as.numeric(as.character(rating))
)

Create a new column calculating the average pre-rating of each player's opponents.

# Per-player mean of the seven game columns (which hold opponents'
# pre-ratings at this point), ignoring missing games and rounded to a whole
# number.
opponent_ratings <- Chess_data_Ranks2[, paste0("Game", 1:7)]
New_Rating <- round(rowMeans(opponent_ratings, na.rm = TRUE))
Chess_data_Ranks2$New_Rating <- New_Rating

# Show the identifying columns next to the pre-rating and the new value.
Chess_data_Ranks2[, c("ID", "Player", "Pre_Rating", "New_Rating")]

# Make the player's own pre-rating numeric, then store how far the opponent
# average sits above (positive) or below (negative) it.
Chess_data_Ranks2$Pre_Rating <- as.numeric(
  as.character(Chess_data_Ranks2$Pre_Rating)
)
Ranking_differences <- Chess_data_Ranks2$New_Rating -
  Chess_data_Ranks2$Pre_Rating
Chess_data_Ranks2$Ranking_differences <- Ranking_differences

head(Chess_data_Ranks2)
summary(Chess_data_Ranks2$Ranking_differences)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -382.000 -172.250  -68.000    0.125  139.750  981.000
# Print the complete final table.
print(Chess_data_Ranks2)