In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:
Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents
For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by summing the pre-tournament ratings of his opponents (1436, 1563, 1600, 1610, 1649, 1663, 1716) and dividing by the total number of games played.
If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…
The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments:
# Load the raw tournament cross-table (tournamentinfo.txt) from GitHub.
theUrl <- "https://raw.githubusercontent.com/kamathvk1982/Data607-MajorAssignment-Project1/master/tournamentinfo.txt"
# readLines() returns a character vector with one element per line of the
# file, so despite the ".df" suffix this object is not a data frame.
# It warns about an "incomplete final line" when the file does not end with a
# newline; the last line is still read.
chess.tour.df <- readLines(theUrl)
## Warning in readLines(theUrl): incomplete final line found on 'https://
## raw.githubusercontent.com/kamathvk1982/Data607-MajorAssignment-Project1/master/
## tournamentinfo.txt'
# Sample rows from the dataset:
head(chess.tour.df)
## [1] "-----------------------------------------------------------------------------------------"
## [2] " Pair | Player Name |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num | USCF ID / Rtg (Pre->Post) | Pts | 1 | 2 | 3 | 4 | 5 | 6 | 7 | "
## [4] "-----------------------------------------------------------------------------------------"
## [5] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [6] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
# Drop the four banner/header lines so the vector starts at the first player.
chess.tour.df <- chess.tour.df[-seq_len(4)]
head(chess.tour.df)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [5] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [6] "-----------------------------------------------------------------------------------------"
# Split the repeating three-line records into two parallel character vectors.
# Once the header is removed, every player occupies three consecutive lines:
#   position 1 (index %% 3 == 1): pair number, name, total points, round results
#   position 2 (index %% 3 == 2): state, USCF ID / ratings, per-round letters
#   position 3 (index %% 3 == 0): dashed separator line, discarded
# Logical subsetting replaces the index loop: nothing is grown element by
# element, and empty input yields empty vectors, whereas 1:length(x) iterates
# over c(1, 0) and would store an NA row.
line.position <- seq_along(chess.tour.df) %% 3
data1.df <- chess.tour.df[line.position == 1]
data2.df <- chess.tour.df[line.position == 2]
# Sample rows from the dataset:
head(data1.df)
## [1] " 1 | GARY HUA |6.0 |W 39|W 21|W 18|W 14|W 7|D 12|D 4|"
## [2] " 2 | DAKSHESH DARURI |6.0 |W 63|W 58|L 4|W 17|W 16|W 20|W 7|"
## [3] " 3 | ADITYA BAJAJ |6.0 |L 8|W 61|W 25|W 21|W 11|W 13|W 12|"
## [4] " 4 | PATRICK H SCHILLING |5.5 |W 23|D 28|W 2|W 26|D 5|W 19|D 1|"
## [5] " 5 | HANSHI ZUO |5.5 |W 45|W 37|D 12|D 13|D 4|W 14|W 17|"
## [6] " 6 | HANSEN SONG |5.0 |W 34|D 29|L 11|W 35|D 10|W 27|W 21|"
head(data2.df)
## [1] " ON | 15445895 / R: 1794 ->1817 |N:2 |W |B |W |B |W |B |W |"
## [2] " MI | 14598900 / R: 1553 ->1663 |N:2 |B |W |B |W |B |W |B |"
## [3] " MI | 14959604 / R: 1384 ->1640 |N:2 |W |B |W |B |W |B |W |"
## [4] " MI | 12616049 / R: 1716 ->1744 |N:2 |W |B |W |B |W |B |B |"
## [5] " MI | 14601533 / R: 1655 ->1690 |N:2 |B |W |B |W |B |W |B |"
## [6] " OH | 15055204 / R: 1686 ->1687 |N:3 |W |B |W |B |B |W |B |"
# Split each record line into its "|"-delimited fields and strip the padding.
# stringr is called through its namespace so this step does not depend on
# library(stringr) having been attached earlier in the session.
split_record_fields <- function(record.lines) {
  # A record line holds ten "|" delimiters, so n = 11 gives the ten data
  # fields plus the empty piece after the trailing "|". simplify = TRUE
  # returns a character matrix with one row per input line.
  fields <- stringr::str_split(
    record.lines,
    pattern = stringr::fixed("|"),
    simplify = TRUE,
    n = 11
  )
  # trimws() removes the column padding and keeps the matrix dimensions.
  trimws(fields)
}

data1.cols.df <- split_record_fields(data1.df)
data2.cols.df <- split_record_fields(data2.df)
# Sample rows from the dataset:
head(data1.cols.df)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "1" "GARY HUA" "6.0" "W 39" "W 21" "W 18" "W 14" "W 7"
## [2,] "2" "DAKSHESH DARURI" "6.0" "W 63" "W 58" "L 4" "W 17" "W 16"
## [3,] "3" "ADITYA BAJAJ" "6.0" "L 8" "W 61" "W 25" "W 21" "W 11"
## [4,] "4" "PATRICK H SCHILLING" "5.5" "W 23" "D 28" "W 2" "W 26" "D 5"
## [5,] "5" "HANSHI ZUO" "5.5" "W 45" "W 37" "D 12" "D 13" "D 4"
## [6,] "6" "HANSEN SONG" "5.0" "W 34" "D 29" "L 11" "W 35" "D 10"
## [,9] [,10] [,11]
## [1,] "D 12" "D 4" ""
## [2,] "W 20" "W 7" ""
## [3,] "W 13" "W 12" ""
## [4,] "W 19" "D 1" ""
## [5,] "W 14" "W 17" ""
## [6,] "W 27" "W 21" ""
head(data2.cols.df)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] "ON" "15445895 / R: 1794 ->1817" "N:2" "W" "B" "W" "B" "W" "B"
## [2,] "MI" "14598900 / R: 1553 ->1663" "N:2" "B" "W" "B" "W" "B" "W"
## [3,] "MI" "14959604 / R: 1384 ->1640" "N:2" "W" "B" "W" "B" "W" "B"
## [4,] "MI" "12616049 / R: 1716 ->1744" "N:2" "W" "B" "W" "B" "W" "B"
## [5,] "MI" "14601533 / R: 1655 ->1690" "N:2" "B" "W" "B" "W" "B" "W"
## [6,] "OH" "15055204 / R: 1686 ->1687" "N:3" "W" "B" "W" "B" "B" "W"
## [,10] [,11]
## [1,] "W" ""
## [2,] "B" ""
## [3,] "W" ""
## [4,] "B" ""
## [5,] "B" ""
## [6,] "B" ""
# Pre-allocate the result table with one row per player: text columns start
# as "" and numeric columns as 0. The values are filled in afterwards.
player.count <- nrow(data1.cols.df)
empty.text <- character(player.count)
empty.number <- numeric(player.count)
final.dataset <- data.frame(
  id = empty.text,
  name = empty.text,
  state = empty.text,
  total.points = empty.number,
  pre.rating = empty.number,
  avg.pre.rating.oppo = empty.number,
  rd1 = empty.text,
  rd2 = empty.text,
  rd3 = empty.text,
  rd4 = empty.text,
  rd5 = empty.text,
  rd6 = empty.text,
  rd7 = empty.text
)
# Assign a value to every column of the final dataset except the average
# opponent rating, which is calculated in the next step.
final.dataset$id <- trimws(data1.cols.df[, 1])
final.dataset$name <- data1.cols.df[, 2]
final.dataset$state <- data2.cols.df[, 1]
final.dataset$total.points <- as.double(data1.cols.df[, 3])

# Pre-rating: the text between "R:" and "->" in the ID / rating field.
rating.field <- data2.cols.df[, 2]
rating.start <- regexpr("R:", rating.field) + 2
rating.stop <- regexpr("->", rating.field) - 1
rating.text <- trimws(substr(rating.field, rating.start, rating.stop))
# The first "P" is swapped for "." so that as.integer() keeps only the digits
# in front of it. Presumably this targets provisional ratings of the form
# "1641P17", which do not appear in the sample rows -- confirm against the
# full file.
final.dataset$pre.rating <- as.integer(sub("P", ".", rating.text))

# Rounds 1-7 sit in matrix columns 4-10. Removing the first letter (the
# result code) leaves the opponent's pair number, still padded with spaces.
for (round.no in seq_len(7)) {
  round.result <- data1.cols.df[, round.no + 3]
  final.dataset[[paste0("rd", round.no)]] <- sub("[[:alpha:]]", "", round.result)
}
# Assign a value to avg.pre.rating.oppo: the mean pre-tournament rating of the
# opponents each player actually faced.
# rd1..rd7 hold opponent pair numbers as text. All seven rounds are looked up
# in one pass with match() instead of seven hand-copied subsets per player.
round.cols <- paste0("rd", 1:7)
opponent.ids <- trimws(as.matrix(final.dataset[round.cols]))
opponent.ratings <- final.dataset$pre.rating[match(opponent.ids, final.dataset$id)]
# match() returns a flat vector; restore the players-by-rounds layout.
dim(opponent.ratings) <- dim(opponent.ids)
# Entries that match no player id become NA and are left out of the mean, so
# the divisor is the number of opponents found.
avg.opponent.rating <- rowMeans(opponent.ratings, na.rm = TRUE)
# A player with no matched opponent yields NaN; report it as NA.
avg.opponent.rating[is.nan(avg.opponent.rating)] <- NA
# round() gives the nearest whole rating. as.integer() truncated instead, so
# an average of 1563.57 was reported as 1563 rather than 1564.
final.dataset$avg.pre.rating.oppo <- round(avg.opponent.rating)
# Sample rows from the dataset:
head(final.dataset)
# Keep only the five required fields, in output order: player's name, state,
# total points, pre-rating, and average pre-rating of opponents.
csv.columns <- c("name", "state", "total.points", "pre.rating", "avg.pre.rating.oppo")
final.csv.dataset <- final.dataset[, csv.columns, drop = FALSE]
#Write into a csv file in current working directory
getwd()
## [1] "C:/Users/kamat/OneDrive/Documents/Vinayak/CUNY/Data607/MajorAssignment/Project1"
# row.names = FALSE keeps R's automatic 1..n row labels out of the file, so the
# CSV holds exactly the five requested fields with no unnamed leading column.
write.csv(final.csv.dataset, "tournament_player_info.csv", row.names = FALSE)