Data607-MajorAssignment-Project1-Chess Tournament

In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could for example be imported into a SQL database) with the following information for all of the players:

Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents

For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by adding the pre-tournament ratings of his opponents (1436, 1563, 1600, 1610, 1649, 1663, and 1716) and dividing by the total number of games played.

If you have questions about the meaning of the data or the results, please post them on the discussion forum. Data science, like chess, is a game of back and forth…

The chess rating system (invented by a Minnesota statistician named Arpad Elo) has been used in many other contexts, including assessing relative strength of employment candidates by human resource departments:

Step: Loading the raw tournament information file from Git Repository:

# Loading the tournamentinfo.txt:
# readLines() returns the file as a character vector with one element per
# line. It warns about an "incomplete final line" when the file does not end
# with a newline; the lines are still read.
theUrl <- "https://raw.githubusercontent.com/kamathvk1982/Data607-MajorAssignment-Project1/master/tournamentinfo.txt"
chess.tour.df <- readLines(theUrl)

## Warning in readLines(theUrl): incomplete final line found on 'https://
## raw.githubusercontent.com/kamathvk1982/Data607-MajorAssignment-Project1/master/
## tournamentinfo.txt'

# Preview the first six raw lines to see how the report is laid out:
head(chess.tour.df, n = 6L)

## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

# Drop the four-line banner (rule, two header rows, rule) so that only the
# player records remain:
chess.tour.df <- chess.tour.df[-seq_len(4)]
head(chess.tour.df, n = 6L)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Step: Create two vectors that capture the two lines of data for each player in the original data. Each player's information spans three lines, the third of which is the row delimiter:

# Each player record occupies three consecutive lines: the result line, the
# state/rating line and a dashed separator. Select the first two by their
# position within the record instead of looping: the vectors are then not
# grown one element at a time, and an empty input simply yields empty
# results (1:length(x) would iterate over c(1, 0) in that case).
line.pos <- seq_along(chess.tour.df) %% 3
data1.df <- chess.tour.df[line.pos == 1]  # 1st line: pair number, name, points, rounds
data2.df <- chess.tour.df[line.pos == 2]  # 2nd line: state, USCF ID, ratings

# Sample rows from the dataset:
head(data1.df)

## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"

# First six state/rating lines:
head(data2.df, n = 6L)

## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

Step: We will now split each line of data into individual columns based on the pipe “|” delimiter:

# Break every record line into its "|"-separated fields (at most 11 pieces
# per line, returned as a character matrix) and strip the padding around
# each field in the same step:
data1.cols.df <- trimws(
  str_split(data1.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)
data2.cols.df <- trimws(
  str_split(data2.df, pattern = fixed("|"), n = 11, simplify = TRUE)
)

# First six parsed result lines:
head(data1.cols.df, n = 6L)

##      [,1] [,2]                  [,3]  [,4]    [,5]    [,6]    [,7]    [,8]   
## [1,] "1"  "GARY HUA"            "6.0" "W  39" "W  21" "W  18" "W  14" "W   7"
## [2,] "2"  "DAKSHESH DARURI"     "6.0" "W  63" "W  58" "L   4" "W  17" "W  16"
## [3,] "3"  "ADITYA BAJAJ"        "6.0" "L   8" "W  61" "W  25" "W  21" "W  11"
## [4,] "4"  "PATRICK H SCHILLING" "5.5" "W  23" "D  28" "W   2" "W  26" "D   5"
## [5,] "5"  "HANSHI ZUO"          "5.5" "W  45" "W  37" "D  12" "D  13" "D   4"
## [6,] "6"  "HANSEN SONG"         "5.0" "W  34" "D  29" "L  11" "W  35" "D  10"
##      [,9]    [,10]   [,11]
## [1,] "D  12" "D   4" ""   
## [2,] "W  20" "W   7" ""   
## [3,] "W  13" "W  12" ""   
## [4,] "W  19" "D   1" ""   
## [5,] "W  14" "W  17" ""   
## [6,] "W  27" "W  21" ""

# First six parsed state/rating lines:
head(data2.cols.df, n = 6L)

##      [,1] [,2]                          [,3]  [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] "ON" "15445895 / R: 1794   ->1817" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [2,] "MI" "14598900 / R: 1553   ->1663" "N:2" "B"  "W"  "B"  "W"  "B"  "W" 
## [3,] "MI" "14959604 / R: 1384   ->1640" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [4,] "MI" "12616049 / R: 1716   ->1744" "N:2" "W"  "B"  "W"  "B"  "W"  "B" 
## [5,] "MI" "14601533 / R: 1655   ->1690" "N:2" "B"  "W"  "B"  "W"  "B"  "W" 
## [6,] "OH" "15055204 / R: 1686   ->1687" "N:3" "W"  "B"  "W"  "B"  "B"  "W" 
##      [,10] [,11]
## [1,] "W"   ""   
## [2,] "B"   ""   
## [3,] "W"   ""   
## [4,] "B"   ""   
## [5,] "B"   ""   
## [6,] "B"   ""

Step: We will create the final data set by merging the data from the two data sets above, with each row of the final data set representing a single player's information:

# Build the final dataset with one row per player by combining the two parsed
# record lines. Columns, in order: id, name, state, total.points, pre.rating,
# avg.pre.rating.oppo and rd1..rd7 (the opponent's pair number for each
# round, blank when the round field holds no number).

player.id <- data1.cols.df[, 1]

# Pre-tournament rating: the text between "R:" and "->" in the second field
# of the state/rating line. A "P" inside that text is turned into a decimal
# point so that as.integer() keeps only the digits in front of it
# (presumably provisional ratings such as "1641P17" -- confirm against the
# data).
uscf.info <- data2.cols.df[, 2]
rating.start <- regexpr("R:", uscf.info, fixed = TRUE) + 2
rating.end <- regexpr("->", uscf.info, fixed = TRUE) - 1
rating.text <- trimws(substr(uscf.info, rating.start, rating.end))
pre.rating <- as.integer(sub("P", ".", rating.text, fixed = TRUE))

# Round results sit in fields 4 to 10 (e.g. "W  39"). Removing the leading
# result letter leaves the opponent's pair number; assigning through [] keeps
# the matrix shape.
round.cols <- 4:10
opponent.id <- data1.cols.df[, round.cols, drop = FALSE]
opponent.id[] <- sub("[[:alpha:]]", "", opponent.id)
colnames(opponent.id) <- paste0("rd", seq_along(round.cols))

# Look up every opponent's pre-rating in one step. Rounds without an opponent
# do not match any id and become NA, which rowMeans() then ignores.
opponent.rating <- pre.rating[match(trimws(opponent.id), player.id)]
dim(opponent.rating) <- dim(opponent.id)

# Average opponent pre-rating per player. as.integer() truncates the mean
# (it does not round); a player with no opponents at all gets NA.
avg.pre.rating.oppo <- as.integer(rowMeans(opponent.rating, na.rm = TRUE))

final.dataset <- data.frame(
  id = player.id,
  name = data1.cols.df[, 2],
  state = data2.cols.df[, 1],
  total.points = as.double(data1.cols.df[, 3]),
  pre.rating = pre.rating,
  avg.pre.rating.oppo = avg.pre.rating.oppo,
  stringsAsFactors = FALSE
)
final.dataset <- cbind(final.dataset, opponent.id, stringsAsFactors = FALSE)

# Sample rows from the dataset:
head(final.dataset)

Step: Creating the CSV file with the required columns:

# Keep only the columns the assignment asks for: player's name, player's
# state, total number of points, player's pre-rating and the average
# pre-rating of the opponents.
final.csv.dataset <- final.dataset[
  ,
  c("name", "state", "total.points", "pre.rating", "avg.pre.rating.oppo")
]

# Show the current working directory, which is where the CSV file is written:
getwd()

## [1] "C:/Users/kamat/OneDrive/Documents/Vinayak/CUNY/Data607/MajorAssignment/Project1"

# Write the player summary to the current working directory. Row names are
# left out so the file holds only the five required columns; by default
# write.csv() would add an unnamed row-number column in front of them.
write.csv(final.csv.dataset, "tournament_player_info.csv", row.names = FALSE)