In this project, you’re given a text file with chess tournament results where the information has some structure. Your job is to create an R Markdown file that generates a .CSV file (that could, for example, be imported into a SQL database) with the following information for all of the players: Player’s Name, Player’s State, Total Number of Points, Player’s Pre-Rating, and Average Pre Chess Rating of Opponents. For the first player, the information would be: Gary Hua, ON, 6.0, 1794, 1605. The value 1605 was calculated by adding the pre-tournament opponents’ ratings of 1436, 1563, 1600, 1610, 1649, 1663, and 1716, and dividing by the total number of games played.

Load tidyverse library

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the chessGame.txt file

I uploaded the text file to my GitHub account, and R reads its lines directly from GitHub.

# Read the tournament report line by line straight from GitHub, skipping the
# first four lines of the file.
chess_url <- "https://raw.githubusercontent.com/deepasharma06/Data-607/main/ChessGame.txt"
chessGame <- read_lines(file = chess_url, skip = 4)
head(chessGame, n = 6)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [3] "-----------------------------------------------------------------------------------------"
## [4] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [5] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "-----------------------------------------------------------------------------------------"

Extract players names

# A name is a run of at least six letters, spaces or hyphens (padding
# included) that sits between "| " and the next "|".
pattern_names <- "(?<=\\| )[A-Za-z -]{6,}(?=\\|)"

# Names are on the first line of every three-line player record, so only those
# lines are searched. str_extract() returns one string per line (NA when a line
# has no match) instead of a list of match matrices that must be coerced.
name_rows <- chessGame[seq_along(chessGame) %% 3 == 1]
names2 <- str_trim(str_extract(name_rows, pattern_names), side = "both")
head(names2)
## [1] "GARY HUA"            "DAKSHESH DARURI"     "ADITYA BAJAJ"       
## [4] "PATRICK H SCHILLING" "HANSHI ZUO"          "HANSEN SONG"

Extract players’ states

# A state code is two capital letters followed by one whitespace character
# and a "|".
pattern_states <- "[A-Z]{2}(?=\\s\\|)"

# State codes are on the second line of every three-line player record.
# str_extract() gives one string per line (NA when nothing matches), so no
# list-to-character coercion is needed.
state_rows <- chessGame[seq_along(chessGame) %% 3 == 2]
states2 <- str_trim(str_extract(state_rows, pattern_states), side = "both")

head(states2)
## [1] "ON" "MI" "MI" "MI" "MI" "OH"

Extract players’ points

# Total points are written as a digit, a decimal point and a digit (e.g. 6.0).
pattern_points <- "\\d\\.\\d"

# Points are on the first line of every three-line player record.
# str_extract() returns one string per line (NA when nothing matches). The
# values are kept as character so they print exactly as they appear in the file.
point_rows <- chessGame[seq_along(chessGame) %% 3 == 1]
points2 <- str_trim(str_extract(point_rows, pattern_points), side = "both")
head(points2)
## [1] "6.0" "6.0" "6.0" "5.5" "5.5" "5.0"

Extract players’ pre-ratings

## Pattern Rating

# The pre-rating is the three- or four-digit number that follows "R: " or
# "R:  " (one or two spaces after the colon).
pattern_rating <- "((?<=R: )|(?<=R:  ))\\d{3,4}"

# Ratings are on the second line of every three-line player record.
# str_extract() returns the whole match for each line (NA when there is none),
# so no match matrix has to be unpacked, trimmed or unlisted.
rating_rows <- chessGame[seq_along(chessGame) %% 3 == 2]
rating2 <- as.numeric(str_extract(rating_rows, pattern_rating))

# Unrated players (no match) are recorded as 0.
rating2[is.na(rating2)] <- 0
head(rating2)
## [1] 1794 1553 1384 1716 1655 1686

Extract the opponents each player played

# This line extract every 3rd row
# Keep the first line of every three-line player record.
first_line_idx <- seq(from = 1, to = length(chessGame), by = 3)
MychessGame <- chessGame[first_line_idx]

# Match what follows a W, L or D result code: up to three spaces and up to
# two digits, ending right before a "|".
pattern_games <- "(((?<=W )|(?<=L ))|(?<=D ))\\s{0,3}\\d{0,2}(?=\\|)"

# One match matrix per player; column 1 holds the complete match and the other
# two columns hold the capture groups, which are not needed.
players_played <- str_match_all(MychessGame, pattern_games)

# Take the complete match for every game and strip the surrounding padding.
players2 <- lapply(players_played, function(hits) str_trim(hits[, 1]))
head(players2)
## [[1]]
## [1] "39" "21" "18" "14" "7"  "12" "4" 
## 
## [[2]]
## [1] "63" "58" "4"  "17" "16" "20" "7" 
## 
## [[3]]
## [1] "8"  "61" "25" "21" "11" "13" "12"
## 
## [[4]]
## [1] "23" "28" "2"  "26" "5"  "19" "1" 
## 
## [[5]]
## [1] "45" "37" "12" "13" "4"  "14" "17"
## 
## [[6]]
## [1] "34" "29" "11" "35" "10" "27" "21"

Average pre-rating of each player’s opponents

# Opponent numbers are stored as text; convert them to numeric indices.
index_players <- lapply(players2, as.numeric)

# For each player, look up the opponents' pre-ratings by index and average
# them, rounding to a whole number. Vectorised indexing replaces the nested
# 1:length() loop, which failed for a player with no played games; such a
# player now gets NaN.
avg_players <- round(vapply(
  index_players,
  function(opponents) mean(rating2[opponents]),
  numeric(1)
))

head(avg_players)
## [1] 1605 1469 1564 1574 1501 1519

Create CSV file

# Assemble the columns in a data frame rather than with cbind(): cbind()
# produces a character matrix, which turns the numeric columns into text and
# makes write.csv() quote them.
Result <- data.frame(names2, states2, points2, rating2, avg_players,
                     stringsAsFactors = FALSE)
write.csv(Result, "607project1.txt", row.names = FALSE)
##output
head(Result)
##                names2 states2 points2 rating2 avg_players
## 1            GARY HUA      ON     6.0    1794        1605
## 2     DAKSHESH DARURI      MI     6.0    1553        1469
## 3        ADITYA BAJAJ      MI     6.0    1384        1564
## 4 PATRICK H SCHILLING      MI     5.5    1716        1574
## 5          HANSHI ZUO      MI     5.5    1655        1501
## 6         HANSEN SONG      OH     5.0    1686        1519

Check the CSV file

# Read the exported file back in and show its first rows.
check_csv_file <- read.csv(file = "607project1.txt")
head(check_csv_file, n = 6)
##                names2 states2 points2 rating2 avg_players
## 1            GARY HUA      ON     6.0    1794        1605
## 2     DAKSHESH DARURI      MI     6.0    1553        1469
## 3        ADITYA BAJAJ      MI     6.0    1384        1564
## 4 PATRICK H SCHILLING      MI     5.5    1716        1574
## 5          HANSHI ZUO      MI     5.5    1655        1501
## 6         HANSEN SONG      OH     5.0    1686        1519

Reference:

YouTube link: https://www.youtube.com/watch?v=uJKpz9T7mAg&t=469s

https://regexone.com/