library(readr, quietly = TRUE)
library(stringr, quietly = TRUE)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)

I used the readLines() function because it reads text lines from an input file. It is well suited to text files: it reads the file line by line and returns a character vector with one element per line.

While loading the dataset into R I got a warning: incomplete final line found in the dataset. The message indicates that the last line of the file doesn’t end with an End Of Line (EOL) character (linefeed (`\n`) or carriage return + linefeed (`\r\n`)). The original intention of this message was to warn me that the file may be incomplete; most data files have an EOL character as the very last character in the file. I was able to correct this by going back to the original file, navigating to the very last line of the file, placing the cursor at the end of that line, pressing return, and re-saving the file.

I used head() to view the first rows of the dataset.

# Read the tournament report from GitHub into a character vector, one element
# per line of the text file.
chess_tournament <- readLines(
  con = "https://raw.githubusercontent.com/enidroman/data_607_data_aquisition_and_management_project/main/tournamentinfo.txt"
)

# Preview the first six raw lines.
head(x = chess_tournament, n = 6L)
## [1] "-----------------------------------------------------------------------------------------" 
## [2] " Pair | Player Name                     |Total|Round|Round|Round|Round|Round|Round|Round| "
## [3] " Num  | USCF ID / Rtg (Pre->Post)       | Pts |  1  |  2  |  3  |  4  |  5  |  6  |  7  | "
## [4] "-----------------------------------------------------------------------------------------" 
## [5] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|" 
## [6] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"

I did a summary of the dataset.

# Report the length, class, and mode of the raw character vector.
summary(object = chess_tournament)
##    Length     Class      Mode 
##       196 character character

I noticed that the first 4 lines were not needed. All I needed was the information from line 5 onward, where the pattern of player information and games played repeats every 3 lines. To extract the required fields, I separated the data into 2 matrices.

The warning shown below is only a warning, not an error.

# Put the raw lines into a one-column character matrix (matrix() returns a
# single column when neither nrow nor ncol is supplied).
deconstruct_chess_tournament <- matrix(
  data = unlist(chess_tournament),
  byrow = TRUE
)

# Player records repeat every three lines starting at line 5. The first line
# of each record holds the pair number, player name, total points, and the
# per-round results.
d1_chess_tournament <- deconstruct_chess_tournament[
  seq(from = 5, to = length(deconstruct_chess_tournament), by = 3)
]
head(x = d1_chess_tournament, n = 6L)
## [1] "    1 | GARY HUA                        |6.0  |W  39|W  21|W  18|W  14|W   7|D  12|D   4|"
## [2] "    2 | DAKSHESH DARURI                 |6.0  |W  63|W  58|L   4|W  17|W  16|W  20|W   7|"
## [3] "    3 | ADITYA BAJAJ                    |6.0  |L   8|W  61|W  25|W  21|W  11|W  13|W  12|"
## [4] "    4 | PATRICK H SCHILLING             |5.5  |W  23|D  28|W   2|W  26|D   5|W  19|D   1|"
## [5] "    5 | HANSHI ZUO                      |5.5  |W  45|W  37|D  12|D  13|D   4|W  14|W  17|"
## [6] "    6 | HANSEN SONG                     |5.0  |W  34|D  29|L  11|W  35|D  10|W  27|W  21|"
# The second line of each three-line record (starting at line 6) holds the
# player's state, USCF ID, and pre/post ratings.
d2_chess_tournament <- deconstruct_chess_tournament[
  seq(from = 6, to = length(deconstruct_chess_tournament), by = 3)
]
head(x = d2_chess_tournament, n = 6L)
## [1] "   ON | 15445895 / R: 1794   ->1817     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [2] "   MI | 14598900 / R: 1553   ->1663     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [3] "   MI | 14959604 / R: 1384   ->1640     |N:2  |W    |B    |W    |B    |W    |B    |W    |"
## [4] "   MI | 12616049 / R: 1716   ->1744     |N:2  |W    |B    |W    |B    |W    |B    |B    |"
## [5] "   MI | 14601533 / R: 1655   ->1690     |N:2  |B    |W    |B    |W    |B    |W    |B    |"
## [6] "   OH | 15055204 / R: 1686   ->1687     |N:3  |W    |B    |W    |B    |B    |W    |B    |"

I used string manipulation and regular expressions to extract ID, Player Name, Total Pts, and Rounds from matrix d1, and Player State and Pre Rating from matrix d2.

# Field extraction ----
# d1_chess_tournament holds the first line of each player record (pair number,
# name, total points, round results); d2_chess_tournament holds the second
# line (state, USCF ID, ratings).

# From d1: the pair number (ID) is the first run of digits on the line.
ID <- as.numeric(str_extract(d1_chess_tournament, "\\d+"))

# From d1: the name field starts at the first letter on the line.
# "[A-Za-z]" is used instead of "[A-z]" because the range A-z also covers the
# punctuation characters that sit between "Z" and "a" in ASCII ([ \ ] ^ _ `).
Name <- str_extract(d1_chess_tournament, "[A-Za-z].{1,32}")

# Keep the text up to the run of padding spaces, then trim that padding.
Player_Name <- str_trim(str_extract(Name, ".+\\s{2,}"))

# From d2: the state is the first pair of capital letters on the line.
Player_State <- str_extract(d2_chess_tournament, "[A-Z]{2}")

# From d1: total points is the first number written with one decimal place.
Total_Number_of_Points <- as.numeric(
  str_extract(d1_chess_tournament, "\\d+\\.\\d")
)

# From d2: isolate the "R: <pre-rating> -" part of the line, then take its
# leading digits as the pre-tournament rating.
PreRating <- str_extract(d2_chess_tournament, "R:.{8,}-")
PreRating <- as.numeric(str_extract(PreRating, "\\d{1,4}"))

# From d1: every played round looks like "<result letter><spaces><opponent>".
# (Needed for the average pre chess rating calculation.)
Rounds <- str_extract_all(d1_chess_tournament, "[A-Z]\\s{2,}\\d+")

# Rounds is a list with one character vector per player, so pull the opponent
# number out of each element individually. Passing the list itself to
# str_extract_all() only works through coercion of the list to deparsed text
# and triggers an "argument is not an atomic vector; coercing" warning.
Rounds <- lapply(Rounds, str_extract, pattern = "\\d+")
## Warning in stri_extract_all_regex(string, pattern, simplify = simplify, :
## argument is not an atomic vector; coercing

I used Rounds (the opponents' pair numbers) and Pre Rating to calculate the Average Pre Chess Rating of Opponents using a loop.

# Average pre-tournament rating of each player's opponents, rounded to the
# nearest whole number.
#
# Rounds[[i]] holds the pair numbers of player i's opponents, and those
# numbers are used directly as positions in PreRating. This relies on the
# players being listed in pair-number order.
#
# The result is preallocated and the loop uses seq_along(), so an empty Rounds
# yields an empty result rather than iterating over c(1, 0).
Avg_Pre_Chess_Rating_Opp <- numeric(length(Rounds))

for (i in seq_along(Rounds)) {
  opponent_ids <- as.numeric(Rounds[[i]])
  Avg_Pre_Chess_Rating_Opp[i] <- round(mean(PreRating[opponent_ids]), 0)
}
Avg_Pre_Chess_Rating_Opp
##  [1] 1605 1469 1564 1574 1501 1519 1372 1468 1523 1554 1468 1506 1498 1515 1484
## [16] 1386 1499 1480 1426 1411 1470 1300 1214 1357 1363 1507 1222 1522 1314 1144
## [31] 1260 1379 1277 1375 1150 1388 1385 1539 1430 1391 1248 1150 1107 1327 1152
## [46] 1358 1392 1356 1286 1296 1356 1495 1345 1206 1406 1414 1363 1391 1319 1330
## [61] 1327 1186 1350 1263

I constructed the data frame.

# Assemble the per-player summary table, one row per player.
new_chess_tournament <- data.frame(
  Player_Name = Player_Name,
  Player_State = Player_State,
  Total_Number_of_Points = Total_Number_of_Points,
  PreRating = PreRating,
  Avg_Pre_Chess_Rating_Opp = Avg_Pre_Chess_Rating_Opp
)

# Preview the first six rows of the table.
head(x = new_chess_tournament, n = 6L)
##           Player_Name Player_State Total_Number_of_Points PreRating
## 1            GARY HUA           ON                    6.0      1794
## 2     DAKSHESH DARURI           MI                    6.0      1553
## 3        ADITYA BAJAJ           MI                    6.0      1384
## 4 PATRICK H SCHILLING           MI                    5.5      1716
## 5          HANSHI ZUO           MI                    5.5      1655
## 6         HANSEN SONG           OH                    5.0      1686
##   Avg_Pre_Chess_Rating_Opp
## 1                     1605
## 2                     1469
## 3                     1564
## 4                     1574
## 5                     1501
## 6                     1519

I created an R Markdown file that generates a .csv file. You can see the new CSV file on GitHub.

# Export the summary table as a CSV file in the working directory. Row names
# are written as the first column (the write.csv() default).
write.csv(x = new_chess_tournament, file = "new_chess_tournament.csv")